### Models for Car Prices

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
import seaborn as sns

In [2]:
#load the cars data

cars = pd.read_csv('data/mtcars.csv', index_col = 0)
cars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
#mpg vs. hp -- X and y (X as DataFrame)

X = cars[['mpg']]
y = cars['hp']

In [5]:
#Linear model instantiate

lr = LinearRegression()

In [7]:
#cross validate with neg_mean_squared_error as scorer

cross_val_score(lr, X, y, cv = 5, scoring = 'neg_mean_squared_error')

array([-1488.37123279, -1158.31066907, -2477.86137832, -1166.38301226,
       -6395.24822953])

In [8]:
#add a quadratic feature
X['mpg^2'] = cars['mpg']**2
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['mpg^2'] = cars['mpg']**2


Unnamed: 0,mpg,mpg^2
Mazda RX4,21.0,441.0
Mazda RX4 Wag,21.0,441.0
Datsun 710,22.8,519.84
Hornet 4 Drive,21.4,457.96
Hornet Sportabout,18.7,349.69


In [9]:
#cross validate linear model

cross_val_score(lr, X, y, cv = 5, scoring = 'neg_mean_squared_error')

array([-1017.45026951,  -754.50829529, -8854.77987111, -1005.40693089,
       -6247.55292699])

In [11]:
import statsmodels.api as sm

In [12]:
model = sm.OLS(y, X).fit()

In [13]:
model.summary()

0,1,2,3
Dep. Variable:,hp,R-squared (uncentered):,0.871
Model:,OLS,Adj. R-squared (uncentered):,0.862
Method:,Least Squares,F-statistic:,101.3
Date:,"Wed, 09 Mar 2022",Prob (F-statistic):,4.55e-14
Time:,19:27:58,Log-Likelihood:,-175.33
No. Observations:,32,AIC:,354.7
Df Residuals:,30,BIC:,357.6
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
mpg,20.7703,1.952,10.640,0.000,16.783,24.757
mpg^2,-0.6251,0.080,-7.829,0.000,-0.788,-0.462

0,1,2,3
Omnibus:,3.335,Durbin-Watson:,1.756
Prob(Omnibus):,0.189,Jarque-Bera (JB):,2.797
Skew:,0.718,Prob(JB):,0.247
Kurtosis:,2.813,Cond. No.,94.6


In [9]:
#Which is better???

#hard to say 

In [10]:
#plot histograms of residuals


### `PolynomialFeatures`

Redo the above the `scikitlearn` way.

In [15]:
#train and test split 

X_train, X_test, y_train, y_test = train_test_split(cars[['mpg']], cars['hp'], random_state = 42)
X_train.head()

Unnamed: 0,mpg
Merc 450SL,17.3
Mazda RX4,21.0
Hornet Sportabout,18.7
Chrysler Imperial,14.7
Valiant,18.1


In [16]:
#instantiate

#will generate a polynomial feature 
poly_features = PolynomialFeatures(degree = 2, include_bias = False)



In [18]:
#fit and transform train

X_train_quad = poly_features.fit_transform(X_train) #second column becomes mpg^2

In [22]:
#transform test 

X_test_quad = poly_features.transform(X_test) #note not fitting here, do not want to learn from test data
X_test_quad

array([[  19.7 ,  388.09],
       [  10.4 ,  108.16],
       [  19.2 ,  368.64],
       [  32.4 , 1049.76],
       [  22.8 ,  519.84],
       [  19.2 ,  368.64],
       [  15.  ,  225.  ],
       [  27.3 ,  745.29]])

In [23]:
#fit the model 

quad = LinearRegression().fit(X_train_quad, y_train)

In [24]:
#make predictions

quad.predict(X_test_quad)

array([131.23560303, 264.76071571, 136.54489949,  67.60639551,
       103.05962838, 136.54489949, 189.53012563,  76.68992407])

In [25]:
from sklearn.metrics import mean_squared_error

In [27]:
mean_squared_error(quad.predict(X_test_quad), y_test, squared = False)

58.520303388705614

In [28]:
#what if we want to use mpg and disp to predict horsepower?

X2 = cars[['mpg', 'disp']]

In [36]:
pd.DataFrame(poly_features.fit_transform(X2), columns = poly_features.get_feature_names()).head()

Unnamed: 0,x0,x1,x0^2,x0 x1,x1^2
0,21.0,160.0,441.0,3360.0,25600.0
1,21.0,160.0,441.0,3360.0,25600.0
2,22.8,108.0,519.84,2462.4,11664.0
3,21.4,258.0,457.96,5521.2,66564.0
4,18.7,360.0,349.69,6732.0,129600.0


### Using a `Pipeline`

Combine transformer with preprocessing.

In [None]:
from sklearn.pipeline import Pipeline

In [44]:
from sklearn import set_config 
set_config('display')

In [None]:
poly_features = PolynomialFeatures()
X_train_polyfeatures = poly_features.fit_transform(X_train)
X_test_polyfeatures = poly_features.transform(X_test)

In [None]:
quad_reg_nopipe = LinearRegression().fit(X_train_polyfeatures)
quad_red_nopipe_preds = quad_reg_nopipe.predict(X_test_polyfeatures)

In [38]:
#pipeline to poly then regressor

quad_pipe = Pipeline([('polynomial_features', PolynomialFeatures(include_bias = False)),
                     ('regressor', LinearRegression())])


In [45]:
quad_pipe

Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                ('regressor', LinearRegression())])

In [40]:
#cross validate the pipeline

#works just like an estimator 

cross_val_score(quad_pipe, X_train, y_train, scoring = 'neg_mean_squared_error')

#cross validated scores based on quadratic model 


array([-1038.73412944,  -335.21266277, -1428.57434733,  -926.28033774,
       -4783.35272794])

In [42]:
#evaluate on test

quad_pipe.fit(X_train, y_train)
preds = quad_pipe.predict(X_test)
mean_squared_error(y_test, preds)

3424.62590870615