In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Read the prepared modelling table from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='df_modeling.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,0,15000.0,2013.0,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al
1,1,4500.0,1992.0,jeep,excellent,6 cylinders,gas,192000.0,clean,automatic,4wd,full-size,sedan,white,al
2,2,14000.0,2012.0,honda,excellent,6 cylinders,gas,95000.0,clean,automatic,fwd,full-size,mini-van,silver,al
3,3,15000.0,2017.0,dodge,excellent,8 cylinders,gas,90000.0,rebuilt,automatic,rwd,mid-size,sedan,grey,al
4,4,3000.0,2004.0,chrysler,good,6 cylinders,gas,176144.0,clean,automatic,fwd,mid-size,mini-van,silver,al


In [4]:
# 'Unnamed: 0' is the old row index that was written into the CSV as a column.
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so this cell can be re-run without raising a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Per-column count of missing values (expected to be all zeros here).
df.isna().sum()

price           0
year            0
manufacturer    0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
dtype: int64

In [6]:
# Show column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266762 entries, 0 to 266761
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         266762 non-null  float64
 1   year          266762 non-null  float64
 2   manufacturer  266762 non-null  object 
 3   condition     266762 non-null  object 
 4   cylinders     266762 non-null  object 
 5   fuel          266762 non-null  object 
 6   odometer      266762 non-null  float64
 7   title_status  266762 non-null  object 
 8   transmission  266762 non-null  object 
 9   drive         266762 non-null  object 
 10  size          266762 non-null  object 
 11  type          266762 non-null  object 
 12  paint_color   266762 non-null  object 
 13  state         266762 non-null  object 
dtypes: float64(3), object(11)
memory usage: 28.5+ MB


In [12]:
# Names of the string-typed (object) columns, i.e. the categorical features.
df.select_dtypes(include='object').columns.tolist()

['manufacturer',
 'condition',
 'cylinders',
 'fuel',
 'title_status',
 'transmission',
 'drive',
 'size',
 'type',
 'paint_color',
 'state']

In [13]:
# One-hot encode the categorical features; drop_first=True keeps k-1 dummy
# columns for a feature with k levels.
categorical_cols = ['manufacturer', 'condition', 'cylinders', 'fuel', 'title_status',
                    'transmission', 'drive', 'size', 'type', 'paint_color', 'state']
# Re-running this cell after encoding used to raise a KeyError because the raw
# columns had already been replaced; only encode the ones still present.
pending_cols = [col for col in categorical_cols if col in df.columns]
if pending_cols:
    df = pd.get_dummies(df, columns=pending_cols, drop_first=True)
df.head()

Unnamed: 0,price,year,odometer,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
0,15000.0,2013.0,128000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4500.0,1992.0,192000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14000.0,2012.0,95000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15000.0,2017.0,90000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3000.0,2004.0,176144.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# The target is the 'price' column; every remaining column is a model input.
target_col = 'price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [15]:
# Hold out a test split (scikit-learn's default size) with a fixed seed.
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [18]:
# Baseline model: ordinary least squares fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [19]:
# Score the linear model on the training split, then on the held-out split.
tuple(lr.score(features, target)
      for features, target in ((X_train, y_train), (X_test, y_test)))

(0.49276370082016785, 0.4950501486637672)

In [16]:
# Seed the forest so the fitted model and its scores are reproducible across
# kernel restarts (same seed as the train/test split). n_jobs=-1 builds the
# trees on all available cores; it does not change the fitted result.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [17]:
# Score the random forest on the training split, then on the held-out split.
tuple(rf.score(features, target)
      for features, target in ((X_train, y_train), (X_test, y_test)))

(0.9779295655618356, 0.8464623447558085)

In [20]:
# Seed the extra-trees ensemble so the fitted model and its scores are
# reproducible across kernel restarts (same seed as the train/test split).
# n_jobs=-1 builds the trees on all available cores; it does not change the
# fitted result.
etr = ExtraTreesRegressor(random_state=42, n_jobs=-1)
etr.fit(X_train, y_train)

In [21]:
# Score the extra-trees model on the training split, then on the held-out split.
tuple(etr.score(features, target)
      for features, target in ((X_train, y_train), (X_test, y_test)))

(0.9994594489843648, 0.8438580692636297)