In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
newcars = pd.read_csv('./data/car_data_2015-17.csv')

In [3]:
newcars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5995 entries, 0 to 5994
Data columns (total 16 columns):
make             5995 non-null object
model            5995 non-null object
year             5995 non-null int64
fuel             5995 non-null object
hp               5995 non-null float64
num_cylinders    5987 non-null float64
transmission     5995 non-null object
driven_wheels    5995 non-null object
num_doors        5995 non-null float64
category         4671 non-null object
size             5995 non-null object
style            5995 non-null object
mpg_highway      5995 non-null int64
mpg_city         5995 non-null int64
popularity       5995 non-null int64
msrp             5995 non-null int64
dtypes: float64(3), int64(5), object(8)
memory usage: 749.5+ KB


### Feature engineering

In [4]:
newcars['mpg_avg'] = (newcars['mpg_highway'] + newcars['mpg_city']) / 2

In [5]:
newcars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5995 entries, 0 to 5994
Data columns (total 17 columns):
make             5995 non-null object
model            5995 non-null object
year             5995 non-null int64
fuel             5995 non-null object
hp               5995 non-null float64
num_cylinders    5987 non-null float64
transmission     5995 non-null object
driven_wheels    5995 non-null object
num_doors        5995 non-null float64
category         4671 non-null object
size             5995 non-null object
style            5995 non-null object
mpg_highway      5995 non-null int64
mpg_city         5995 non-null int64
popularity       5995 non-null int64
msrp             5995 non-null int64
mpg_avg          5995 non-null float64
dtypes: float64(4), int64(5), object(8)
memory usage: 796.3+ KB


In [6]:
newcars['num_cylinders'] = newcars['num_cylinders'].astype(object)
newcars['num_doors'] = newcars['num_doors'].astype(object)
newcars_dummied = pd.get_dummies(newcars, columns=['make','year', 'fuel', 'num_cylinders', 'num_doors', 
                                                   'transmission','driven_wheels', 'size', 'style'], 
                                 drop_first=True)

In [7]:
newcars_dummied.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5995 entries, 0 to 5994
Data columns (total 86 columns):
model                                    5995 non-null object
hp                                       5995 non-null float64
category                                 4671 non-null object
mpg_highway                              5995 non-null int64
mpg_city                                 5995 non-null int64
popularity                               5995 non-null int64
msrp                                     5995 non-null int64
mpg_avg                                  5995 non-null float64
make_Alfa Romeo                          5995 non-null uint8
make_Aston Martin                        5995 non-null uint8
make_Audi                                5995 non-null uint8
make_BMW                                 5995 non-null uint8
make_Bentley                             5995 non-null uint8
make_Buick                               5995 non-null uint8
make_Cadillac                    

In [8]:
X = newcars_dummied.drop(columns=['model', 'category','msrp'])
y = newcars[['msrp']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7152019)

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [10]:
cross_val_score(lr, X_train, y_train, cv=3).mean()

0.8922547169823017

In [11]:
cross_val_score(lr, X_test, y_test, cv=3).mean()

0.8929980925195249