In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
df = pd.read_excel("Data_Train.xlsx")
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [26]:
columns = ['Name', 'Location','Fuel_Type','Transmission', 'Owner_Type', 'Year', 'Kilometers_Driven',  'Mileage', 'Engine', 'Power', 'Seats','Price']
df = df.reindex(columns=columns)
df.head()

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,CNG,Manual,First,2010,72000,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,Diesel,Manual,First,2015,41000,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,Petrol,Manual,First,2011,46000,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,Diesel,Manual,First,2012,87000,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,Diesel,Automatic,Second,2013,40670,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [27]:
df = df[df.Power!='null bhp']

In [28]:
#type conversion
df['Mileage'] = df['Mileage'].str.split().str[0].astype('float64')
df['Engine'] = df['Engine'].str.split().str[0].astype('float64')
df['Power'] = df['Power'].str.split().str[0].astype('float64')

In [29]:
df.head()

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,CNG,Manual,First,2010,72000,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,Diesel,Manual,First,2015,41000,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,Petrol,Manual,First,2011,46000,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,Diesel,Manual,First,2012,87000,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,Diesel,Automatic,Second,2013,40670,15.2,1968.0,140.8,5.0,17.74


In [30]:
#missing rows deletion
df = df.dropna(how='any')

In [31]:
#correalted column removal
df = df.drop('Engine',axis=1)
df = df.drop('Mileage',axis=1)

In [32]:
X = df.iloc[:,:-1].values
y = df.iloc[:, -1].values

In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,0] = le.fit_transform(X[:,0])
X[0]

array([1148, 'Mumbai', 'CNG', 'Manual', 'First', 2010, 72000, 58.16, 5.0],
      dtype=object)

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,3,4])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1148, 2010, 72000, 58.16,
       5.0], dtype=object)

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [36]:
#Required only for SVR
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
X_train_scaled = X_train
X_test_scaled = X_test
X_train_scaled[:,21:26] = sc_x.fit_transform(X_train[:,21:26])
X_test_scaled[:,21:26] = sc_x.transform(X_test[:,21:26])
y_train_scaled = sc_y.fit_transform(y_train.reshape(-1,1))

In [37]:
#for knn
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
params = {'n_neighbors':[3,5,7,9,11,13,15,17,19]}
knn =neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params)
model.fit(X_train,y_train)
model.best_params_

{'n_neighbors': 7}

In [38]:
#for fitting models

def models(X_train, y_train):
    
    #random forest regressor
    from sklearn.ensemble import RandomForestRegressor
    forest = RandomForestRegressor(n_estimators = 106, random_state = 0)
    forest.fit(X_train, y_train)
    
    #decision tree regressor
    from sklearn.tree import DecisionTreeRegressor
    tree = DecisionTreeRegressor(random_state = 0)
    tree.fit(X_train, y_train)
    
    #KNN regressor
    from sklearn import neighbors
    knn=neighbors.KNeighborsRegressor(n_neighbors = 7)
    knn.fit(X_train, y_train)
    
    #SVM regressor
    from sklearn.svm import SVR
    svr=SVR(kernel='rbf')
    svr.fit(X_train_scaled,y_train_scaled)
    
    return forest, tree, knn, svr
    
    


In [39]:
forest, tree, knn, svr = models(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [40]:
#for predicting 

def predictions(f, t, s ,k ) :
    
    #random forest
    y_f=f.predict(X_test)
    
    #decision tree
    y_t=t.predict(X_test)
    
    #knn
    y_k=k.predict(X_test)
    
    #SVM
    y_s=s.predict(X_test_scaled)
    y_s=sc_y.inverse_transform(y_s.reshape(-1,1))

    np.set_printoptions(precision=2)
    print(np.concatenate((y_test.reshape(len(y_test),1), y_f.reshape(len(y_f),1), y_t.reshape(len(y_t),1), y_k.reshape(len(y_k),1),y_s.reshape(len(y_s),1)),1))
    
    return y_f, y_t, y_k, y_s

yf,yt,yk,ys = predictions(forest, tree, knn, svr)


[[ 0.85  1.24  1.05  1.49  1.36]
 [10.94 12.59 13.54 12.76 12.02]
 [ 3.    2.2   2.25  2.7   2.08]
 ...
 [ 3.2   3.73  3.2   5.74  4.  ]
 [14.   14.53 23.5  16.68 11.23]
 [ 2.9   3.76  3.75  3.95  3.27]]


In [41]:
#for r2 score and mean squared error

def accuracy(yf, yt, yk, ys) :
    
    
    from sklearn.metrics import mean_squared_error
    mse_f= mean_squared_error(y_test, yf)
    mse_t= mean_squared_error(y_test, yt)
    mse_k= mean_squared_error(y_test, yk)
    mse_s= mean_squared_error(y_test, ys)
    
    print('mse of random forest :', mse_f)
    print('mse of decision tree :', mse_t)
    print('mse of knn regression :', mse_k)
    print('mse of SVR model     :', mse_s)

    from sklearn.metrics import r2_score
    r2_f= r2_score(y_test, yf)
    r2_t= r2_score(y_test, yt)
    r2_k= r2_score(y_test, yk)
    r2_s= r2_score(y_test, ys)
    
    print('r2 score of random forest :', r2_f)
    print('r2 score of decision tree :', r2_t)
    print('r2 score of knn regression :', r2_k)
    print('r2 score of SVR model     :', r2_s)
    

accuracy(yf, yt, yk, ys)

mse of random forest : 11.033974549830527
mse of decision tree : 24.16300165957447
mse of knn regression : 21.11241028745115
mse of SVR model     : 17.05195375938832
r2 score of random forest : 0.9208658853245052
r2 score of decision tree : 0.8267063481433982
r2 score of knn regression : 0.8485847607117314
r2 score of SVR model     : 0.8777057842445908
