In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score
import pickle as pk
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import gzip
import compress_pickle

In [3]:
car_df  = pd.read_csv("Cardetails.csv")

In [4]:
car_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [5]:
car_df.drop("torque",axis = 1,inplace = True)

In [6]:
car_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [7]:
car_df.shape

(8128, 12)

In [8]:
car_df.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
seats            221
dtype: int64

In [9]:
car_df.dropna(inplace = True)

In [10]:
car_df.duplicated().sum()

1189

In [11]:
car_df.drop_duplicates(inplace= True)

In [12]:
car_df.shape

(6718, 12)

In [13]:
car_df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

car_df.info()

In [14]:
len(car_df["name"].unique())

1983

In [15]:
car_df["owner"]

0                First Owner
1               Second Owner
2                Third Owner
3                First Owner
4                First Owner
                ...         
8121            Second Owner
8122            Second Owner
8123             First Owner
8124    Fourth & Above Owner
8125             First Owner
Name: owner, Length: 6718, dtype: object

In [16]:
car = car_df.copy()

In [17]:
for i in car_df.columns:
    print("unique value "+ i)
    print(car_df[i].unique())
    print("------------------")

unique value name
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
------------------
unique value year
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2019 2008 2020 1999 2000 2003 2004 1994 1998 1997 1995 1996]
------------------
unique value selling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000   730000   650000   330000   366000
  1149000   425000  2100000   925000   675000   819999   390000  1500000
   700000  1450000  1090000   850000  1650000  1750000  1590000  1689999
  1425000   265000   190000   630000   540000   448000   745000  1025000
   235000  1700000  1200000   610000  2

In [18]:
def brand_name(car_name):
    car_name = car_name.split(" ")
    return car_name[0].strip()

In [19]:
car_df["model_name"] = car_df["name"].apply(lambda x: " ".join(x.split(" ")[1:]))


In [20]:
car_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,model_name
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0,Swift Dzire VDI
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0,Rapid 1.5 TDI Ambition
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0,City 2017-2020 EXi
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0,i20 Sportz Diesel
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0,Swift VXI BSIII
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,Maruti Wagon R VXI BS IV with ABS,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.9 kmpl,998 CC,67.1 bhp,5.0,Wagon R VXI BS IV with ABS
8122,Hyundai i20 Magna 1.4 CRDi,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54 kmpl,1396 CC,88.73 bhp,5.0,i20 Magna 1.4 CRDi
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,5.0,i20 Magna
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,5.0,Verna CRDi SX


In [21]:
for i in car_df['name']:
    a = i.split(" ")
    print(" ".join(a[1:]))

Swift Dzire VDI
Rapid 1.5 TDI Ambition
City 2017-2020 EXi
i20 Sportz Diesel
Swift VXI BSIII
Xcent 1.2 VTVT E Plus
Wagon R LXI DUO BSIII
800 DX BSII
Etios VXD
Figo Diesel Celebration Edition
Duster 110PS Diesel RxL
Zen LX
Swift Dzire VDi
Wagon R LXI Minor
KUV 100 mFALCON G80 K8 5str
Ertiga SHVS VDI
i20 1.4 CRDi Asta
Alto LX
i20 2015-2017 Asta 1.4 CRDi
Verito 1.5 D4 BSIII
WR-V i-DTEC VX
Swift Dzire ZDI
SX4 ZDI
Tigor 2017-2020 XZ
Baleno Delta 1.2
Alto 800 LXI
Enjoy TCDi LTZ 7 Seater
Wagon R LXI Minor
Omni E MPI STD BS IV
Vitara Brezza LDi Option
Omni 8 Seater BSIV
i20 1.2 Asta Dual Tone
Vitara Brezza LDi
Alto 800 CNG LXI Optional
Alto K10 VXI Airbag
Verna VTVT 1.6 SX Option
GO D
Safari DICOR 2.2 LX 4x2
800 Std
Compass 1.4 Limited Plus BSIV
City i VTEC VX
City V MT
Swift Dzire VXi AT
Vitara Brezza VDi
Alto K10 VXI
Fortuner 4x4 MT
Innova 2.5 G (Diesel) 7 Seater BS IV
B Class B180
Amaze S Diesel
Pajero Sport 4X4
City i DTEC S
Ciaz Zeta
Jazz VX Diesel
Innova Crysta 2.8 ZX AT BSIV
Innova 2.5 G

In [22]:
car_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,model_name
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0,Swift Dzire VDI
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0,Rapid 1.5 TDI Ambition
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0,City 2017-2020 EXi
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0,i20 Sportz Diesel
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0,Swift VXI BSIII
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,Maruti Wagon R VXI BS IV with ABS,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.9 kmpl,998 CC,67.1 bhp,5.0,Wagon R VXI BS IV with ABS
8122,Hyundai i20 Magna 1.4 CRDi,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54 kmpl,1396 CC,88.73 bhp,5.0,i20 Magna 1.4 CRDi
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,5.0,i20 Magna
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,5.0,Verna CRDi SX


In [23]:
def clean_data(value):
    value = value.split(" ")[0]
    value = value.strip()
    if value == "":
        value = 0
    return float(value)

In [24]:
car_df["name"] = car_df["name"].apply(brand_name)

In [25]:
car_df["name"].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [26]:
car_df["mileage"] = car_df["mileage"].apply(clean_data)

In [27]:
car_df["max_power"] = car_df["max_power"].apply(clean_data)

In [28]:
car_df["engine"] = car_df["engine"].apply(clean_data)

In [29]:
car_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,model_name
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.40,1248.0,74.00,5.0,Swift Dzire VDI
1,Skoda,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Rapid 1.5 TDI Ambition
2,Honda,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.70,1497.0,78.00,5.0,City 2017-2020 EXi
3,Hyundai,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.00,1396.0,90.00,5.0,i20 Sportz Diesel
4,Maruti,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.10,1298.0,88.20,5.0,Swift VXI BSIII
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,Maruti,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.90,998.0,67.10,5.0,Wagon R VXI BS IV with ABS
8122,Hyundai,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54,1396.0,88.73,5.0,i20 Magna 1.4 CRDi
8123,Hyundai,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.50,1197.0,82.85,5.0,i20 Magna
8124,Hyundai,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.80,1493.0,110.00,5.0,Verna CRDi SX


In [30]:
lab = LabelEncoder()
car_df["name"] = lab.fit_transform(car_df["name"])
car_df["transmission"] = lab.fit_transform(car_df["transmission"])
car_df["seller_type"] = lab.fit_transform(car_df["seller_type"])
car_df["fuel"] = lab.fit_transform(car_df["fuel"])
car_df["owner"] = lab.fit_transform(car_df["owner"])
car_df["model_name"] = lab.fit_transform(car_df["model_name"])


In [31]:
car_df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,model_name
0,20,2014,450000,145500,1,1,1,0,23.40,1248.0,74.00,5.0,1492
1,26,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0,1236
2,10,2006,158000,140000,3,1,1,4,17.70,1497.0,78.00,5.0,325
3,11,2010,225000,127000,1,1,1,0,23.00,1396.0,90.00,5.0,1980
4,20,2007,130000,120000,3,1,1,0,16.10,1298.0,88.20,5.0,1533
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8121,20,2013,260000,50000,3,1,1,2,18.90,998.0,67.10,5.0,1760
8122,11,2014,475000,80000,1,1,1,2,22.54,1396.0,88.73,5.0,1972
8123,11,2013,320000,110000,3,1,1,0,18.50,1197.0,82.85,5.0,1970
8124,11,2007,135000,119000,1,1,1,1,16.80,1493.0,110.00,5.0,1696


In [32]:
car_df["transmission"].unique()
car_df["seller_type"].unique()
car_df["fuel"].unique()
car_df["owner"].unique()

array([0, 2, 4, 1, 3])

In [33]:
"""car_df["name"].replace(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'],[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],inplace = True)"""

'car_df["name"].replace([\'Maruti\', \'Skoda\', \'Honda\', \'Hyundai\', \'Toyota\', \'Ford\', \'Renault\',\n       \'Mahindra\', \'Tata\', \'Chevrolet\', \'Datsun\', \'Jeep\', \'Mercedes-Benz\',\n       \'Mitsubishi\', \'Audi\', \'Volkswagen\', \'BMW\', \'Nissan\', \'Lexus\',\n       \'Jaguar\', \'Land\', \'MG\', \'Volvo\', \'Daewoo\', \'Kia\', \'Fiat\', \'Force\',\n       \'Ambassador\', \'Ashok\', \'Isuzu\', \'Opel\'],[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],inplace = True)'

In [34]:
car_df.reset_index(inplace=True)

In [35]:
car_df.drop("index",axis=1,inplace=True)

In [36]:
X = car_df.drop("selling_price",axis = 1)

In [37]:
Y = car_df["selling_price"]

In [38]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [39]:
print(f"Length of X_train: {len(x_train)}")
print(f"Length of y_train: {len(y_train)}")
print(f"Length of X_test: {len(x_test)}")
print(f"Length of y_test: {len(y_test)}")

Length of X_train: 5374
Length of y_train: 5374
Length of X_test: 1344
Length of y_test: 1344


In [40]:
#model = ExtraTreesRegressor()

In [41]:
#model.fit(x_train,y_train)

In [42]:
#y_predict_  = model.predict(x_test)

In [43]:
#y_predict_

In [44]:
#R2_score = r2_score(y_test, y_predict_)
#R2_score

In [45]:

#car_df = pd.read_csv("Cardetails.csv")
#car_df.head()
    

In [46]:
"""
# Drop rows with missing values for simplicity
car_df.dropna(inplace=True)

# Select features and target variable
X = car_df.drop(columns=['selling_price','torque'])  # Features
y = car_df['selling_price']  # Target

# Convert categorical variables to numeric using Label Encoding
label_encoders = {}
categorical_columns = ['name','fuel', 'seller_type', 'transmission', 'owner','mileage','engine','max_power']
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    """

"\n# Drop rows with missing values for simplicity\ncar_df.dropna(inplace=True)\n\n# Select features and target variable\nX = car_df.drop(columns=['selling_price','torque'])  # Features\ny = car_df['selling_price']  # Target\n\n# Convert categorical variables to numeric using Label Encoding\nlabel_encoders = {}\ncategorical_columns = ['name','fuel', 'seller_type', 'transmission', 'owner','mileage','engine','max_power']\nfor column in categorical_columns:\n    label_encoders[column] = LabelEncoder()\n    X[column] = label_encoders[column].fit_transform(X[column])\n\n# Split data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n    "

In [47]:

# Define the model
model = RandomForestRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup the GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
    

Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [48]:

# Train the model with the best parameters
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(x_test)

# Evaluate the model
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error: ", mse)
print("R^2 Score: ", r2)
    

Mean Squared Error:  18883005206.629974
R^2 Score:  0.8928440275652627


In [49]:
#pk.dump(best_model,open("best_model.pkl.gz","wb"))
compress_pickle.dump(best_model, 'best_model.gz', compression="gzip")