In [2]:
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import mean_squared_error

from warnings import filterwarnings

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector


In [31]:
# Load the vehicle listings CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; confirm whether it should be
# made configurable before the notebook is shared.
vehicles_csv_path = r'C:\Users\pbthakke\Documents\Learning\Machine Learning\practical_application_II_starter\data\vehicles.csv'
df = pd.read_csv(vehicles_csv_path)

In [32]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state
0,7222695916,prescott,6000,,,,,,,,,,,,,,,az
1,7218891961,fayetteville,11900,,,,,,,,,,,,,,,ar
2,7221797935,florida keys,21000,,,,,,,,,,,,,,,fl
3,7222270760,worcester / central MA,1500,,,,,,,,,,,,,,,ma
4,7210384030,greensboro,4900,,,,,,,,,,,,,,,nc


In [33]:
# Region names kept as a list; nothing else in this cell reads it.
regions = ['statesboro', 'lafayette', 'logan', 'southwest TX', 'san marcos', 'central louisiana', 'northeast SD', 'northwest KS', 'st louis']

# Keep only fully populated rows and discard the two identifier columns.
df = df.dropna().drop(['id', 'VIN'], axis=1)

# Remove listings whose price is exactly zero.
zero_price_rows = df.loc[df['price'] == 0].index
df = df.drop(zero_price_rows)

# z-score the target and the two numeric features: (value - column mean) / column std.
# NOTE(review): the mean/std are taken over all rows before the train/test split below,
# so test rows contribute to the scaling statistics - confirm this is acceptable.
for numeric_column in ('price', 'year', 'odometer'):
    column_values = df[numeric_column]
    df[numeric_column] = (column_values - column_values.mean()) / column_values.std()

# Separate the standardized target from the features; `df` keeps only the features afterwards.
y = df['price']
X = df.drop('price', axis=1)
df = df.drop('price', axis=1)

# 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)


In [34]:
# Explicit category order for `condition` (presumably worst to best - confirm);
# the OrdinalEncoder maps each label to its position in this list.
condition_order = ['salvage', 'fair', 'good', 'excellent', 'like new', 'new']
oe_condition = OrdinalEncoder(categories=[condition_order])

# Show how many listings carry each condition label.
df['condition'].value_counts()

excellent    18259
good          9772
like new      3772
fair           488
new            169
salvage         36
Name: condition, dtype: int64

In [35]:
# Explicit category order for `title_status`; the OrdinalEncoder maps each label to
# its position in this list.
oe_title_status = OrdinalEncoder(categories = [['parts only', 'salvage', 'rebuilt', 'missing', 'lien', 'clean']])

# Keep the frequency table as the cell's last expression so the notebook actually
# displays it (a non-final bare expression is evaluated and thrown away).
df['title_status'].value_counts()

In [36]:
# Explicit category order for `size`; the OrdinalEncoder maps each label to its
# position in this list.
oe_size = OrdinalEncoder(categories = [['sub-compact', 'compact', 'mid-size', 'full-size']])

# Keep the frequency table as the cell's last expression so the notebook actually
# displays it (a non-final bare expression is evaluated and thrown away).
df['size'].value_counts()

In [37]:
# Explicit category order for `type`; the OrdinalEncoder maps each label to its
# position in this list.
# NOTE(review): body types have no obvious natural ranking - confirm that an ordinal
# encoding (rather than one-hot) is intended here.
oe_type = OrdinalEncoder(categories = [['other', 'offroad', 'coupe', 'hatchback', 'convertible', 'sedan', 'wagon', 'SUV', 'truck', 'pickup', 'mini-van', 'van', 'bus']])

# Keep the frequency table as the cell's last expression so the notebook actually
# displays it (a non-final bare expression is evaluated and thrown away).
df['type'].value_counts()

In [38]:
# Explicit category order for `cylinders` ('other' first, then ascending cylinder
# count); the OrdinalEncoder maps each label to its position in this list.
oe_cylinders = OrdinalEncoder(categories = [['other', '3 cylinders', '4 cylinders', '5 cylinders', '6 cylinders', '8 cylinders', '10 cylinders', '12 cylinders']])

# Keep the frequency table as the cell's last expression so the notebook actually
# displays it (a non-final bare expression is evaluated and thrown away).
df['cylinders'].value_counts()

In [39]:
# Number of listings per paint colour, most frequent first.
paint_color_counts = df['paint_color'].value_counts()
paint_color_counts

white     9092
black     5994
silver    4483
grey      3989
blue      3141
red       2900
custom     868
green      842
brown      748
yellow     215
orange     152
purple      72
Name: paint_color, dtype: int64

In [40]:
# Number of listings per region, most frequent first.
region_counts = df['region'].value_counts()
region_counts

vermont               676
jacksonville          577
anchorage / mat-su    399
nashville             364
central NJ            359
                     ... 
northeast SD            1
the thumb               1
st louis                1
houma                   1
imperial county         1
Name: region, Length: 390, dtype: int64

In [60]:
# Sweep the polynomial degree (1-5) applied to the numeric columns and record the
# train/test MSE of a linear regression for each degree.
train_mses = []
test_mses = []

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder (degree-independent).
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

for degree in range(1, 6):
    poly = make_column_transformer(
        (PolynomialFeatures(degree=degree), make_column_selector(dtype_include=np.number)),
        *ordinal_steps,
        *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
        remainder='passthrough',
    )
    pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

    # Pipeline.fit fits the 'transformer' step itself, so no separate
    # poly.fit_transform call is required (it would only repeat the expansion).
    pipe.fit(X_train, y_train)
    p1 = pipe.predict(X_train)
    p2 = pipe.predict(X_test)
    train_mses.append(mean_squared_error(y_train, p1))
    test_mses.append(mean_squared_error(y_test, p2))

print(train_mses)
print(test_mses)
# Display the pipeline from the last iteration (degree 5).
pipe

[0.20022151006569494, 0.13988435890660958, 0.13527787338523553, 0.28383977442647523, 0.9861671478852588]
[0.49232538710956253, 0.4701003138415885, 0.23877186656910634, 0.31222652283560265, 1.0236071783402207]


Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('polynomialfeatures',
                                                  PolynomialFeatures(degree=5),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000026EA5C58220>),
                                                 ('ordinalencoder-1',
                                                  OrdinalEncoder(categories=[['salvage',
                                                                              'fair',
                                                                              'good',
                                                                              'excellent',
                                                                              'like '
                                                                              'new',
                       

In [61]:
    # Fit the degree-2 pipeline on every feature and report train/test MSE plus the
    # number of fitted linear-regression coefficients.
    # Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
    ordinal_steps = [
        (oe_condition, ['condition']),
        (oe_title_status, ['title_status']),
        (oe_size, ['size']),
        (oe_type, ['type']),
        (oe_cylinders, ['cylinders']),
    ]
    # Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
    nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                       'transmission', 'drive', 'paint_color', 'state']

    poly = make_column_transformer(
        (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
        *ordinal_steps,
        *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
        remainder='passthrough',
    )
    pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

    # Pipeline.fit fits the 'transformer' step itself, so no separate
    # poly.fit_transform call is required.
    pipe.fit(X_train, y_train)
    p1 = pipe.predict(X_train)
    p2 = pipe.predict(X_test)
    train_mse = mean_squared_error(y_train, p1)
    test_mse = mean_squared_error(y_test, p2)
    print(train_mse)
    print(test_mse)
    print(len(pipe.named_steps['linreg'].coef_))

0.13988435890660958
0.4701003138415885
4631


In [62]:
# Refit the degree-2 pipeline with the `condition` column removed and report train/test MSE.
dropped_column = 'condition'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Remaining rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14093884882754681
0.4907137125951561


In [63]:
# Refit the degree-2 pipeline with the `title_status` column removed and report train/test MSE.
dropped_column = 'title_status'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Remaining rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.1412395196082924
0.4585611267376086


In [64]:
# Refit the degree-2 pipeline with the `size` column removed and report train/test MSE.
dropped_column = 'size'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Remaining rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14023509458057995
0.46678693836133983


In [65]:
# Refit the degree-2 pipeline with the `type` column removed and report train/test MSE.
dropped_column = 'type'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Remaining rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_cylinders, ['cylinders']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.1399266520312012
0.47119661529999163


In [66]:
# Refit the degree-2 pipeline with the `cylinders` column removed and report train/test MSE.
dropped_column = 'cylinders'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Remaining rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
]
# Columns that each get their own OneHotEncoder; categories unseen at fit time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)
    

0.13989598602610404
0.469695737942557


In [67]:
# Refit the degree-2 pipeline with the `region` column removed and report train/test MSE.
dropped_column = 'region'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14921624659684848
0.48161926665065175


In [68]:
# Refit the degree-2 pipeline with the `manufacturer` column removed and report train/test MSE.
dropped_column = 'manufacturer'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14146324450134332
0.28007155257471555


In [69]:
# Refit the degree-2 pipeline with the `model` column removed and report train/test MSE.
dropped_column = 'model'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'fuel',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.31830613387448864
0.3337206132479505


In [70]:
# Refit the degree-2 pipeline with the `fuel` column removed and report train/test MSE.
dropped_column = 'fuel'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'model',
                   'transmission', 'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14738837040183503
0.4743136368554628


In [71]:
# Refit the degree-2 pipeline with the `transmission` column removed and report train/test MSE.
dropped_column = 'transmission'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'drive', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14769649059203974
0.46955383256957745


In [72]:
# Refit the degree-2 pipeline with the `drive` column removed and report train/test MSE.
dropped_column = 'drive'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'paint_color', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.1413953045137322
0.4574601916308741


In [73]:
# Refit the degree-2 pipeline with the `paint_color` column removed and report train/test MSE.
dropped_column = 'paint_color'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'state']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14058352143269226
0.48646791164042247


In [74]:
# Refit the degree-2 pipeline with the `state` column removed and report train/test MSE.
dropped_column = 'state'
X_train1 = X_train.drop(dropped_column, axis=1)
X_test1 = X_test.drop(dropped_column, axis=1)

# Rank-encoded columns, each paired with its pre-built OrdinalEncoder.
ordinal_steps = [
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
]
# Remaining columns that each get their own OneHotEncoder; categories unseen at fit
# time are ignored.
nominal_columns = ['region', 'manufacturer', 'model', 'fuel',
                   'transmission', 'drive', 'paint_color']

poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    *ordinal_steps,
    *[(OneHotEncoder(handle_unknown='ignore'), [column]) for column in nominal_columns],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the 'transformer' step itself, so no separate
# poly.fit_transform call is required.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)
p2 = pipe.predict(X_test1)
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.14012826644704912
0.46941942586508245


In [75]:
# Refit the polynomial + linear-regression pipeline with the `odometer` column removed.
X_train1 = X_train.drop(columns='odometer')
X_test1 = X_test.drop(columns='odometer')

# Column-wise preprocessing:
#   * degree-2 polynomial expansion of every remaining numeric column,
#   * the oe_* encoders defined in an earlier cell (presumably ordinal encoders -- verify),
#   * one independent one-hot encoder per remaining categorical column;
#     handle_unknown='ignore' keeps categories unseen during fit from raising at predict time.
poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
    *[(OneHotEncoder(handle_unknown='ignore'), [column])
      for column in ('region', 'manufacturer', 'model', 'fuel',
                     'transmission', 'drive', 'paint_color', 'state')],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the `poly` step itself, so a separate poly.fit_transform call
# (whose result was discarded) would only fit the transformer a second time.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)  # in-sample predictions
p2 = pipe.predict(X_test1)   # held-out predictions

# `price` was z-scored before the split, so these MSEs are in standardized units.
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.1550570580685754
0.35202437966453215


In [77]:
# Refit the polynomial + linear-regression pipeline with the `year` column removed.
X_train1 = X_train.drop(columns='year')
X_test1 = X_test.drop(columns='year')

# Column-wise preprocessing:
#   * degree-2 polynomial expansion of every remaining numeric column,
#   * the oe_* encoders defined in an earlier cell (presumably ordinal encoders -- verify),
#   * one independent one-hot encoder per remaining categorical column;
#     handle_unknown='ignore' keeps categories unseen during fit from raising at predict time.
poly = make_column_transformer(
    (PolynomialFeatures(degree=2), make_column_selector(dtype_include=np.number)),
    (oe_condition, ['condition']),
    (oe_title_status, ['title_status']),
    (oe_size, ['size']),
    (oe_type, ['type']),
    (oe_cylinders, ['cylinders']),
    *[(OneHotEncoder(handle_unknown='ignore'), [column])
      for column in ('region', 'manufacturer', 'model', 'fuel',
                     'transmission', 'drive', 'paint_color', 'state')],
    remainder='passthrough',
)
pipe = Pipeline([('transformer', poly), ('linreg', LinearRegression())])

# Pipeline.fit fits the `poly` step itself, so a separate poly.fit_transform call
# (whose result was discarded) would only fit the transformer a second time.
pipe.fit(X_train1, y_train)
p1 = pipe.predict(X_train1)  # in-sample predictions
p2 = pipe.predict(X_test1)   # held-out predictions

# `price` was z-scored before the split, so these MSEs are in standardized units.
train_mse = mean_squared_error(y_train, p1)
test_mse = mean_squared_error(y_test, p2)
print(train_mse)
print(test_mse)

0.21219100977451366
0.34755985681105234
