In [61]:
#importing library
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR,SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [4]:
#loading the dataset
df=pd.read_csv('../input/usa-cers-dataset/USA_cars_datasets.csv')
#showing the data
df

Unnamed: 0.1,Unnamed: 0,price,brand,model,year,title_status,mileage,color,vin,lot,state,country,condition
0,0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,jtezu11f88k007763,159348797,new jersey,usa,10 days left
1,1,2899,ford,se,2011,clean vehicle,190552.0,silver,2fmdk3gc4bbb02217,166951262,tennessee,usa,6 days left
2,2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,3c4pdcgg5jt346413,167655728,georgia,usa,2 days left
3,3,25000,ford,door,2014,clean vehicle,64146.0,blue,1ftfw1et4efc23745,167753855,virginia,usa,22 hours left
4,4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,3gcpcrec2jg473991,167763266,florida,usa,22 hours left
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,2494,7800,nissan,versa,2019,clean vehicle,23609.0,red,3n1cn7ap9kl880319,167722715,california,usa,1 days left
2495,2495,9200,nissan,versa,2018,clean vehicle,34553.0,silver,3n1cn7ap5jl884088,167762225,florida,usa,21 hours left
2496,2496,9200,nissan,versa,2018,clean vehicle,31594.0,silver,3n1cn7ap9jl884191,167762226,florida,usa,21 hours left
2497,2497,9200,nissan,versa,2018,clean vehicle,32557.0,black,3n1cn7ap3jl883263,167762227,florida,usa,2 days left


In [5]:
#checking for information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2499 non-null   int64  
 1   price         2499 non-null   int64  
 2   brand         2499 non-null   object 
 3   model         2499 non-null   object 
 4   year          2499 non-null   int64  
 5   title_status  2499 non-null   object 
 6   mileage       2499 non-null   float64
 7   color         2499 non-null   object 
 8   vin           2499 non-null   object 
 9   lot           2499 non-null   int64  
 10  state         2499 non-null   object 
 11  country       2499 non-null   object 
 12  condition     2499 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 253.9+ KB


# #Preprocessing

In [16]:
def binary_encode(df,columns_with_positive_values):
    #copying the dataset
    df=df.copy()
    #looping through the column and their respective positive values
    for column,positive_values in columns_with_positive_values:
        #applying the lambda function to convert binary into 1 and 0
        df[column]=df[column].apply(lambda x:1 if x==positive_values else 0)
        
    return df


    
def onehot_encode(df,column_with_prefixes):
    #copying the dataframe
    df=df.copy()
    #reating for loop to looping through column and their respective prefiex
    for column,prefix in column_with_prefixes:
        #creating dummies column and storing it in a variable
        dummies=pd.get_dummies(df[column],prefix=prefix)
        #concating the dummies column into original dataset
        df=pd.concat([df,dummies],axis=1)
        #Dropping the original dataframe
        df=df.drop(column,axis=1)
    return df
    
    
    
    
    
    

In [49]:
def preprocess_inputs(df):
    #copying the dataset
    df=df.copy()
    #showing the dataset
    df=df.drop(['Unnamed: 0','vin','lot'],axis=1)
    #binary encode the title status an country columns
    df=binary_encode(
        df,columns_with_positive_values=[
            ('title_status','salvage insurance'),
            ('country',' canada')
        ])
    #One-hot encode the brand,model,color,state, and condition columns
    df=onehot_encode(
        df,column_with_prefixes=[
            ('brand','br'),
            ('model','md'),
            ('color','cl'),
            ('state','st'),
            ('condition','cd')
        ]
    )
    #Split df into x and y
    y=df['price'].copy()
    x=df.drop('price',axis=1).copy()
    #Train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)
    #Scaling x with standard scaler
    scaler=StandardScaler()
    #fitting the x_train dataset
    scaler.fit(x_train)
    #scaling the data 
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)
    return x_train,x_test,y_train,y_test

In [50]:
x['title_status'].value_counts()

0    2336
1     163
Name: title_status, dtype: int64

In [51]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
print(x_train.shape)
print(x_test.shape)

(1749, 299)
(750, 299)


In [52]:
x_train

Unnamed: 0,year,title_status,mileage,country,br_acura,br_audi,br_bmw,br_buick,br_cadillac,br_chevrolet,...,cd_5 hours left,cd_53 minutes,cd_6 days left,cd_6 hours left,cd_7 days left,cd_7 hours left,cd_8 days left,cd_9 days left,cd_9 minutes,cd_Listing Expired
137,0.102109,-0.261568,-0.335485,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
2278,-1.001995,-0.261568,0.731790,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,5.173191,-0.156858,-0.033835,-0.093008
1851,-0.725969,3.823097,1.033230,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
1563,0.654161,-0.261568,-0.248061,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,5.173191,-0.156858,-0.033835,-0.093008
2022,-0.449943,-0.261568,1.141934,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,0.654161,-0.261568,-0.410876,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
905,0.378135,-0.261568,-0.155866,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
1096,-0.725969,-0.261568,0.708157,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008
235,-0.173917,-0.261568,0.234919,-0.053544,-0.041451,-0.041451,-0.083117,-0.079556,-0.058671,-0.356277,...,-0.071919,-0.023918,-0.140802,-0.067787,-0.134329,-0.047878,-0.193304,-0.156858,-0.033835,-0.093008


In [32]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         2499 non-null   int64  
 1   brand         2499 non-null   object 
 2   model         2499 non-null   object 
 3   year          2499 non-null   int64  
 4   title_status  2499 non-null   int64  
 5   mileage       2499 non-null   float64
 6   color         2499 non-null   object 
 7   state         2499 non-null   object 
 8   country       2499 non-null   int64  
 9   condition     2499 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 195.4+ KB


In [26]:
x['country'].value_counts()

0    2492
1       7
Name: country, dtype: int64

In [33]:
x['country'].unique()

array([0, 1])

In [54]:
x_train.var()

year                  1.000572
title_status          1.000572
mileage               1.000572
country               1.000572
br_acura              1.000572
                        ...   
cd_7 hours left       1.000572
cd_8 days left        1.000572
cd_9 days left        1.000572
cd_9 minutes          1.000572
cd_Listing Expired    1.000572
Length: 299, dtype: float64

In [55]:
y_train

137     19500
2278     3300
1851    10500
1563    16500
2022    31700
        ...  
960     23800
905     17300
1096     3700
235      3650
1061    17000
Name: price, Length: 1749, dtype: int64

In [12]:
{column:len(x[column].unique()) for column in x.columns}

{'Unnamed: 0': 2499,
 'price': 790,
 'brand': 28,
 'model': 127,
 'year': 30,
 'title_status': 2,
 'mileage': 2439,
 'color': 49,
 'vin': 2495,
 'lot': 2495,
 'state': 44,
 'country': 2,
 'condition': 47}

In [58]:
models={
    'Linear Regression': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Neural Network': MLPRegressor(),
    'Support Vector Machine': LinearSVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0)
}

In [60]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'trained')
    

Linear Regressiontrained
K-Nearest Neighborstrained




Neural Networktrained
Support Vector Machinetrained
Decision Treetrained
Random Foresttrained
Gradient Boostingtrained
XGBoosttrained


[LightGBM] [Fatal] Do not support special JSON characters in feature name.


LightGBMError: Do not support special JSON characters in feature name.

# Results


In [62]:
for name,model in models.items():
    print(name+'R^2 Score: {:.5f}'.format(model.score(x_test,y_test)))

Linear RegressionR^2 Score: -22247055328394150370344960.00000
K-Nearest NeighborsR^2 Score: 0.53720
Neural NetworkR^2 Score: -1.23408
Support Vector MachineR^2 Score: -2.04768
Decision TreeR^2 Score: 0.40611
Random ForestR^2 Score: 0.68885
Gradient BoostingR^2 Score: 0.64434
XGBoostR^2 Score: 0.71385


NotFittedError: Estimator not fitted, call fit before exploiting the model.