In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import re
import pickle
from collections import Counter
from processing import *
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
data=pd.read_csv(PATH+'vehicles.csv')

In [3]:
valid_features=['year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'drive', 'size', 'type', 'paint_color','price']
cars=data[valid_features]
cars=fill_nulls(cars)

Removed rows lack of information
Removed irreasonable rows which price==0
Filled odometer
Filled default models
revised 10282 car's manufacturers.
droped 8539 cars.
left 289261 cars.
Filled manufacturers
Added a column (manufacturer,model)
Filled partial na by default values of the certain car model
Filled all nulls


In [4]:
cars.isnull().sum()

year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
price           0
dtype: int64

In [5]:
X=cars[['year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'drive', 'size', 'type', 'paint_color']]
y=cars['price']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.8,random_state=42)

In [7]:
X.isnull().sum()

year            0
manufacturer    0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
dtype: int64

In [8]:
X_train

Unnamed: 0,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color
279307,2016.0,CHEVROLET,CRUZE,excellent,4 cylinders,gas,105236.0,clean,automatic,fwd,compact,sedan,blue
307070,2018.0,CHEVROLET,MALIBU,excellent,4 cylinders,gas,58307.0,clean,automatic,fwd,mid-size,sedan,white
208056,2010.0,HONDA,INSIGHT,excellent,4 cylinders,hybrid,162194.0,clean,automatic,fwd,sub-compact,sedan,silver
358455,2013.0,GMC,SIERRA 2500HD SLE,good,8 cylinders,other,70900.0,clean,automatic,4wd,full-size,pickup,silver
169588,2018.0,HONDA,CIVIC HATCHBACK,excellent,4 cylinders,gas,43000.0,clean,automatic,fwd,compact,sedan,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222837,2019.0,DODGE,JOURNEY,excellent,6 cylinders,gas,36798.0,clean,automatic,fwd,mid-size,SUV,white
390019,1996.0,RAM,2500,excellent,10 cylinders,gas,151785.0,clean,automatic,4wd,full-size,truck,white
236509,2004.0,VOLKSWAGEN,JETTA 18T SEL SEDAN,good,4 cylinders,gas,155000.0,clean,automatic,fwd,mid-size,sedan,grey
253692,2013.0,CHEVROLET,CRUZE,excellent,4 cylinders,gas,77199.0,rebuilt,automatic,fwd,compact,sedan,silver


In [9]:
X_train=ready_to_predict(X_train)

In [10]:
X_test=ready_to_predict(X_test)

In [14]:
X_test

Unnamed: 0,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color
60650,102,9,16248,0,6,2,0.114434,0,0,0,1,10,8
341989,99,47,10830,0,3,2,0.166912,0,0,0,0,9,9
286503,101,9,17737,0,3,2,0.116000,0,0,0,1,0,5
12080,96,20,16793,0,5,2,0.273600,0,0,1,1,9,9
390881,101,24,13710,2,3,2,0.028764,0,0,0,2,7,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
335190,93,19,2484,2,3,2,0.233626,0,0,1,1,9,8
142064,95,5,1767,0,5,2,0.303306,0,0,0,2,9,10
329217,95,50,14003,0,3,2,0.219980,0,0,1,2,4,1
9353,87,35,9891,0,3,2,0.307345,0,0,1,2,9,10


# Model

In [11]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,accuracy_score,r2_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import GridSearchCV

In [15]:
def modelCompare(X_train,X_test,y_train,y_test):
    results_dict = {'Regressor':[],
                'R2':[], 
                'Test mse':[]
               }

    models = {
        'Linear Regression': LinearRegression(),
        'SGDRegressor' : SGDRegressor(),
        'RidgeCV' : Ridge(),
        'RandomForestRegressor': RandomForestRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'BaggingRegressor' : BaggingRegressor(),
        'DecisionTreeRegressor': DecisionTreeRegressor()
    }
    for model_name, model in models.items():
        print("Fitting %s..." % model_name)
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        mse = np.sqrt(mean_squared_error(y_pred, y_test))
        r2=r2_score(y_pred,y_test)
        results_dict['Regressor'].append(model_name)
        results_dict['R2'].append(r2)
        results_dict['Test mse'].append(mse)  
        
    results_df = pd.DataFrame(results_dict)

    return results_dict

In [16]:
result1=modelCompare(X_train,X_test,y_train,y_test)
pd.DataFrame(result1).sort_values(by='Test mse',ascending=True)

Fitting Linear Regression...
Fitting SGDRegressor...
Fitting RidgeCV...
Fitting RandomForestRegressor...
Fitting GradientBoostingRegressor...
Fitting BaggingRegressor...
Fitting DecisionTreeRegressor...


Unnamed: 0,Regressor,R2,Test mse
0,Linear Regression,-551037.941848,11456610.0
2,RidgeCV,-551853.964013,11456610.0
4,GradientBoostingRegressor,-907.899195,11462860.0
3,RandomForestRegressor,-737.415079,11464350.0
5,BaggingRegressor,-327.509584,11474090.0
6,DecisionTreeRegressor,-177.866685,11488690.0
1,SGDRegressor,-3.106815,5692824000000000.0


In [17]:
def modelCompare(X_train,X_test,y_train,y_test):
    results_dict = {'Regressor':[],
                'R2':[], 
                'Test mse':[]
               }

    models = {
        'Linear Regression': LinearRegression(),
        'SGDRegressor' : SGDRegressor(),
        'RidgeCV' : Ridge(),
        'RandomForestRegressor': RandomForestRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'BaggingRegressor' : BaggingRegressor(),
        'DecisionTreeRegressor': DecisionTreeRegressor()
    }
    pca=PCA(n_components=5)
    X_train=pca.fit_transform(X_train)
    X_test=pca.transform(X_test)
    for model_name, model in models.items():
        print("Fitting %s..." % model_name)
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        mse = np.sqrt(mean_squared_error(y_pred, y_test))
        r2=r2_score(y_pred,y_test)
        results_dict['Regressor'].append(model_name)
        results_dict['R2'].append(r2)
        results_dict['Test mse'].append(mse) 
    results_df = pd.DataFrame(results_dict)

    return results_dict

In [18]:
result2=modelCompare(X_train,X_test,y_train,y_test)
pd.DataFrame(result2).sort_values(by='Test mse',ascending=True)

Fitting Linear Regression...
Fitting SGDRegressor...
Fitting RidgeCV...
Fitting RandomForestRegressor...
Fitting GradientBoostingRegressor...
Fitting BaggingRegressor...
Fitting DecisionTreeRegressor...


Unnamed: 0,Regressor,R2,Test mse
0,Linear Regression,-1540499.0,11456640.0
2,RidgeCV,-1540500.0,11456640.0
5,BaggingRegressor,-1177.252,11461470.0
3,RandomForestRegressor,-809.6157,11463660.0
4,GradientBoostingRegressor,-288.1427,11476440.0
6,DecisionTreeRegressor,-274.9101,11477330.0
1,SGDRegressor,-6.047557e-05,507758200000000.0


In [33]:
pipe=Pipeline([('pca',PCA()),
                ('reg',GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                            criterion='mse', init=None,
                                            learning_rate=1, loss='huber',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                                            n_iter_no_change=None,
                                            presort='deprecated',
                                            random_state=None, subsample=1.0,
                                            tol=0.0001, validation_fraction=0.1,
                                            verbose=0, warm_start=False))])

In [None]:
parameters = \
        [ \
            {
                'reg': [LinearRegression()],
                'pca__n_components': range(5,10,1),
                'reg__fit_intercept': [True, False]
            },
            {
                'reg': [Ridge()],
                'pca__n_components': range(5,10,1),
                'reg__alpha': [0.0001,0.001,0.01,0.1,1,10,100,1000],
                'reg__fit_intercept': [True, False],
                'reg__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
            },
            {
                'reg': [RandomForestRegressor()],
                'pca__n_components': range(5,10,1),
                'reg__n_estimators': range(4,13,2),
                'reg__max_depth':range(10,X_train.shape[1],1),
                'reg__max_leaf_nodes':range(5,20,2)
            },
            {
                'reg': [GradientBoostingRegressor()],
                'pca__n_components': range(5,10,1),
                'reg__learning_rate': [0.001,0.01,0.1,1,10],
                'reg__loss' : ['ls', 'lad', 'huber', 'quantile'],
                'reg__criterion':['mse']
            },
            {
                'reg': [BaggingRegressor()],
                'pca__n_components': range(5,10,1),
                'reg__n_estimators': range(5,13,1),
                'reg__max_features': range(5,X_train.shape[1],1)
            },
            {
                'reg': [DecisionTreeRegressor()],
                'pca__n_components': range(5,10,1),
                'reg__max_depth': range(3,X_train.shape[1],1),
                'reg__max_leaf_nodes':range(6,X_train.shape[1],1)
            },
        ]

#evaluating multiple reg
#based on pipeline parameters
#-------------------------------
result=[]

for params in parameters:
    
    #reg
    reg = params['reg'][0]
    print(f'searching {reg}')

    #getting arguments by
    #popping out classifier
    params.pop('reg')

    #pipeline
    pipe=Pipeline([('pca',PCA(n_components=5)),
                ('reg',reg)])

    #cross validation using Grid Search
    grid = GridSearchCV(pipe, param_grid=params, cv=3)
    grid.fit(X_train, y_train)

    #storing result
    result.append\
    (
        {
            'grid': grid,
            'classifier': grid.best_estimator_,
            'best score': grid.best_score_,
            'best params': grid.best_params_,
            'cv': grid.cv
        }
    )

result

#saving best classifier
#grid = result[0]['grid']


In [None]:
result

In [27]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score 
degrees = [2, 3, 4, 5]
best_score = 0
best_degree = 0
for degree in degrees:
  print(degree)
  poly_features = PolynomialFeatures(degree = degree)
  X_train_poly = poly_features.fit_transform(X_train)
  polynomial_regressor = LinearRegression(normalize=False)
  polynomial_regressor.fit(X_train_poly, y_train)
  scores = cross_val_score(polynomial_regressor, X_train_poly, y_train, cv=5) # Change k-fold cv value here
  if max(scores) > best_score:
      best_score = max(scores)
      best_degree = degree

2
3
4
5


In [28]:
best_degree

3

In [29]:
best_score

0.00012911661612147185

In [30]:
poly=PolynomialFeatures(degree=3)
X_train_poly=poly.fit_transform(X_train)
lr=LinearRegression()
lr.fit(X_train_poly,y_train)
y_lr_pred=lr.predict(poly.transform(X_test))

In [31]:
mean_squared_error(y_lr_pred,y_test)

131260206622746.73

In [32]:
r2_score(y_lr_pred,y_test)

-13848.135866036953