# Predicting the Price of Used Cars

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
#neighbors
from sklearn.neighbors import KNeighborsRegressor
#neural_network
from sklearn.neural_network import MLPRegressor
#Support Vector Machine
from sklearn.svm import SVR,LinearSVR
#ensemble
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
#xgboost
from xgboost import XGBRegressor
#lightgbm
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor



# Seed shared by every estimator that accepts one, so repeated runs of the
# notebook produce the same scores.
RANDOM_STATE=42

# Candidate regressors, keyed by the label that is printed next to each score.
# Apart from the seed (and CatBoost's verbosity) every estimator keeps its
# library defaults.
models={'Linear Regression':LinearRegression(),
       'KNeighborsRegressor':KNeighborsRegressor(),
       'MLPRegressor':MLPRegressor(random_state=RANDOM_STATE),
       'Support Vector Machine':SVR(),
       'Support Vector Machine (Linear)':LinearSVR(random_state=RANDOM_STATE),
       'Random Forest':RandomForestRegressor(random_state=RANDOM_STATE),
       'Gradient Boosting Regressor':GradientBoostingRegressor(random_state=RANDOM_STATE),
       'XGBoost':XGBRegressor(random_state=RANDOM_STATE),
       'Light GBM':LGBMRegressor(random_state=RANDOM_STATE),
       # verbose=0 silences CatBoost's per-iteration training log.
       'Cat Boost':CatBoostRegressor(random_seed=RANDOM_STATE,verbose=0)}

# Loading the Dataset

In [16]:
# Kaggle input path of the Craigslist cars/trucks listings file.
DATA_PATH='/kaggle/input/craigslist-carstrucks-data/vehicles.csv'
# Only this many rows are read from the top of the file.
N_ROWS=100000

df=pd.read_csv(DATA_PATH,nrows=N_ROWS)
# Display the loaded frame.
df

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,7309358353,https://jacksonville.craigslist.org/ctd/d/jack...,jacksonville,https://jacksonville.craigslist.org,49395,2019.0,chevrolet,silverado trail boss,new,8 cylinders,...,full-size,pickup,red,https://images.craigslist.org/00x0x_cg9hmWpOhy...,"*** MINT CONDITION *** BRAND NEW 7"" PRO COMP. ...",,fl,30.207288,-81.738969,2021-04-19T13:50:58-0400
99996,7309358251,https://jacksonville.craigslist.org/ctd/d/jack...,jacksonville,https://jacksonville.craigslist.org,18590,2018.0,kia,sportage lx sport utility 4d,good,,...,,other,silver,https://images.craigslist.org/00o0o_4QCkqtOcFy...,Carvana is the safer way to buy a car During t...,,fl,30.330000,-81.650000,2021-04-19T13:50:50-0400
99997,7309355294,https://jacksonville.craigslist.org/ctd/d/jack...,jacksonville,https://jacksonville.craigslist.org,49495,2019.0,chevrolet,silverado trail boss,new,8 cylinders,...,full-size,pickup,red,https://images.craigslist.org/00x0x_cg9hmWpOhy...,"*** MINT CONDITION *** BRAND NEW 7"" PRO COMP. ...",,fl,30.207288,-81.738969,2021-04-19T13:46:36-0400
99998,7309354677,https://jacksonville.craigslist.org/ctd/d/jack...,jacksonville,https://jacksonville.craigslist.org,24495,2014.0,chevrolet,silverado 1500 lt 4x4,excellent,8 cylinders,...,full-size,truck,white,https://images.craigslist.org/00f0f_djNCWjHFxp...,*** MINT CONDITION *** CLEAN CARFAX - NO ACCID...,,fl,30.207288,-81.738969,2021-04-19T13:45:40-0400


# Checking for Preliminary Information

In [17]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            100000 non-null  int64  
 1   url           100000 non-null  object 
 2   region        100000 non-null  object 
 3   region_url    100000 non-null  object 
 4   price         100000 non-null  int64  
 5   year          99572 non-null   float64
 6   manufacturer  95580 non-null   object 
 7   model         98715 non-null   object 
 8   condition     61711 non-null   object 
 9   cylinders     59841 non-null   object 
 10  fuel          99397 non-null   object 
 11  odometer      98479 non-null   float64
 12  title_status  97721 non-null   object 
 13  transmission  99486 non-null   object 
 14  VIN           63962 non-null   object 
 15  drive         69526 non-null   object 
 16  size          27846 non-null   object 
 17  type          80389 non-null   object 
 18  paint

# Checking for Missing Values in the Dataset

In [18]:
# Share of missing entries per column (mean of the boolean missing-value mask).
missing_share=df.isna().mean()
missing_share

id              0.00000
url             0.00000
region          0.00000
region_url      0.00000
price           0.00000
year            0.00428
manufacturer    0.04420
model           0.01285
condition       0.38289
cylinders       0.40159
fuel            0.00603
odometer        0.01521
title_status    0.02279
transmission    0.00514
VIN             0.36038
drive           0.30474
size            0.72154
type            0.19611
paint_color     0.29776
image_url       0.00038
description     0.00039
county          1.00000
state           0.00000
lat             0.00539
long            0.00539
posting_date    0.00038
dtype: float64

# Preprocessing the Inputs

In [54]:
def onehot_encode(df,columns):
    """Return a copy of `df` with each of `columns` replaced by indicator columns.

    Every listed column is expanded with `pd.get_dummies`, using the column
    name as the prefix of the new indicator columns. The untouched columns
    keep their original order and come first; the indicator columns follow,
    grouped in the order given by `columns`. The input frame is not modified.
    """
    columns=list(columns)
    # Build every indicator frame first so the result is assembled by a
    # single concat.
    indicator_frames=[pd.get_dummies(df[name],prefix=name) for name in columns]
    untouched=df.drop(columns,axis=1)
    return pd.concat([untouched]+indicator_frames,axis=1)

In [60]:
def preprocess_inputs(df,random_state=42):
    """Clean the raw listings frame and return scaled train/test splits.

    Processing steps, in order:

    1. Drop `size` (mostly missing) together with `id`, `url`, `region_url`,
       `image_url`, `county`, `VIN` and `description`.
    2. Drop the rows whose `cylinders` value is missing.
    3. Fill missing values: most frequent value for the categorical columns
       and `year`, column mean for `lat`, `long` and `odometer`.
    4. Keep only the first space-separated token of `cylinders` and `model`.
    5. Replace `posting_date` with integer `posting_year` and
       `posting_month` columns.
    6. One-hot encode every remaining object-dtype column.
    7. Split 70/30 into train and test sets and standardise the features
       with a scaler fitted on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. The frame is copied, never modified.
    random_state : int or None, default 42
        Seed forwarded to `train_test_split` so the split is reproducible.
        Pass None to get a different split on every call.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Standardised feature matrices. They keep the row labels of `df`, so
        they stay aligned with the target series.
    y_train, y_test : pandas.Series
        The matching `price` targets.

    Notes
    -----
    NOTE(review): the fill values in step 3 are computed on the whole frame
    before the split, so test rows influence them. Confirm whether that is
    acceptable for this analysis.
    """
    df=df.copy()

    # `size` is dropped because of its high share of missing values; the
    # other columns are identifiers, URLs, free text, or have no values.
    df=df.drop(['size','id','url','region_url','image_url','county','VIN','description'],axis=1)

    # Rows without a cylinder count are discarded rather than imputed.
    df=df.dropna(subset=['cylinders'])

    # Most-frequent-value imputation.
    mode_filled=['condition','paint_color','manufacturer','type','fuel',
                 'drive','title_status','model','year','transmission']
    for column in mode_filled:
        df[column]=df[column].fillna(df[column].mode()[0])

    # Mean imputation for the continuous columns.
    for column in ['lat','long','odometer']:
        df[column]=df[column].fillna(df[column].mean())

    # Keep only the first token of the cylinders text.
    df['cylinders']=df['cylinders'].str.split(' ').str[0]

    # The first two '-'-separated fields of posting_date are year and month.
    # Missing dates stay NaN through the split and are filled with the most
    # frequent year/month before the integer cast. The builtin `int` replaces
    # `np.int`, which was deprecated in NumPy 1.20 and later removed.
    date_parts=df['posting_date'].str.split('-')
    for position,column in enumerate(['posting_year','posting_month']):
        values=date_parts.str[position].astype(float)
        df[column]=values.fillna(values.mode()[0]).astype(int)
    df=df.drop('posting_date',axis=1)

    # Collapse each model string to its first word before encoding.
    df['model']=df['model'].str.split(' ').str[0]

    df=onehot_encode(df,df.select_dtypes('object').columns.tolist())

    y=df['price']
    x=df.drop('price',axis=1)
    x_train,x_test,y_train,y_test=train_test_split(
        x,y,train_size=0.7,random_state=random_state)

    # Fit the scaler on the training rows only, then apply it to both splits.
    # The original index is kept so features and targets share row labels.
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)

    return x_train,x_test,y_train,y_test

In [61]:
# Build the scaled train/test splits and print the shape of each one
# (features first, then targets).
x_train,x_test,y_train,y_test=preprocess_inputs(df)
for split in (x_train,x_test,y_train,y_test):
    print(split.shape)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


(41888, 2318)
(17953, 2318)
(41888,)
(17953,)


# Training the Model

In [None]:
# Fit every candidate on the training split, then print its label followed by
# the value returned by its score() method on the held-out split.
for name in models:
    model=models[name]
    model.fit(x_train,y_train)
    print(name)
    test_score=model.score(x_test,y_test)
    print(test_score)
    

Linear Regression
-1.047729573545104e+30
KNeighborsRegressor
-262322.548969836




MLPRegressor
-0.21740584974406651


In [51]:
# Lists the object-dtype columns of `x`.
# NOTE(review): in the cells visible here `x` is only assigned inside
# preprocess_inputs, so this cell appears to rely on leftover kernel state —
# confirm which frame was intended, or remove the cell.
[column for column in x.select_dtypes('object')]

['region',
 'manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'title_status',
 'transmission',
 'drive',
 'type',
 'paint_color',
 'state']

In [48]:
# Distinct values of the `drive` column of `x`.
# NOTE(review): `x` is not defined at notebook level in the cells visible
# here — this cell appears to depend on leftover kernel state.
x['drive'].unique()

array(['4wd', 'rwd', 'fwd'], dtype=object)

# Element-wise missing-value mask for the whole frame.
df.isna()

In [38]:
# Scratch check: pull the third '-'-separated field of the date part out of a
# sample timestamp string.
y='2021-05-04 12:31:18-05:00'
y.partition(' ')[0].rsplit('-',1)[-1]

'04'

In [None]:
# Row labels of the listings whose `cylinders` value is missing.
df.index[df['cylinders'].isna().values]

In [None]:
# Share of missing entries per column of `x`.
# NOTE(review): `x` is not defined at notebook level in the cells visible
# here — this cell appears to depend on leftover kernel state.
x.isna().mean()

In [None]:
# First space-separated token of every `cylinders` entry of `x`.
# The lambda parameter is also named `x` and shadows the frame inside the lambda.
# NOTE(review): `x` is not defined at notebook level in the cells visible
# here — this cell appears to depend on leftover kernel state.
x['cylinders'].apply(lambda x:x.split(' ')[0])