In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [68]:
car = pd.read_csv('car_price.csv')

In [69]:
car['kms_driven'] = car['kms_driven'].str.replace(',', '').str.replace(' kms', '').astype(int)

In [70]:
def convert_price(price_str):
    if 'Crore' in price_str:
        price = float(price_str.replace(',', '').replace(' Crore', '')) * 10000000  # Convert Crore to Rupees
    elif 'Lakh' in price_str:
        price = float(price_str.replace(',', '').replace(' Lakh', '')) * 100000  # Convert Lakh to Rupees
    else:
        price = float(price_str.replace(',', ''))  # No conversion needed
    return int(price)

car['car_prices_in_rupee'] = car['car_prices_in_rupee'].apply(convert_price)

In [71]:
car['Seats'] = car['Seats'].str.extract('(\d+)').astype(float)

In [72]:
car.to_csv('this_clean_car_data.csv')

In [73]:
x = car[['car_name', 'kms_driven', 'fuel_type', 'transmission', 'ownership', 'manufacture', 'engine', 'Seats']]
y = car['car_prices_in_rupee']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [75]:
ohe = OneHotEncoder(categories='auto', drop=None, dtype=np.float64, handle_unknown='ignore', sparse=True)

In [76]:
column_trans = make_column_transformer(
    (ohe, ['car_name', 'fuel_type', 'transmission', 'ownership', 'engine', 'Seats']),
    remainder='passthrough'
)

In [77]:
lr = LinearRegression()

In [78]:
pipe = make_pipeline(column_trans, lr)

In [79]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                sparse=True),
                                                  ['car_name', 'fuel_type',
                                                   'transmission', 'ownership',
                                                   'engine', 'Seats'])],
                                 

In [80]:
y_pred = pipe.predict(X_test)

In [81]:
r2 = r2_score(y_test, y_pred)

In [82]:
new_car_features = {
    'car_name': 'Toyota Innova Crysta 2.4 ZX MT',
    'kms_driven': 50000,
    'fuel_type': 'Diesel',
    'transmission': 'Manual',
    'ownership': 'First Owner',
    'manufacture': 2019,
    'engine': '2393 CC',
    'Seats': 7
}

In [83]:
new_car_df = pd.DataFrame([new_car_features])

In [84]:
predicted_price = pipe.predict(new_car_df)

In [85]:
print("Predicted price of the car:", predicted_price[0])

Predicted price of the car: 1073614.4066155553


In [86]:
import pickle

In [87]:
pickle.dump(pipe,open('LinearRegressionModelForCarData.pkl','wb'))

In [88]:
pipe.predict(pd.DataFrame([new_car_features]))

array([1073614.40661556])