Steps to Perform:
 - Perform initial preprocessing of data
 - Perform preprocessing for statsmodels
 - Run the model in statsmodels and produce a results summary
 - Evaluate the model on the test set

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import mean_squared_error, r2_score

## Standard Statsmodels import
import statsmodels.api as sm

## fixing random for lesson generation
np.random.seed(321)

df = pd.read_csv('Data/CarPrice_Assignment.csv')
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [2]:
print("Number of null values : ", df.isna().sum().sum())
print("Number of duplicated values : ", df.duplicated().sum())

Number of null values :  0
Number of duplicated values :  0


In [3]:
## Make x and y variables
y = df['price']
X = df.drop(columns=['price'])

# Train test split
X_train,X_test, y_train, y_test = train_test_split(X,y, random_state=321)

# Begin cleaning

In [4]:
# Lets make our column selectors

# Object selector and columns
obj_selector = make_column_selector(dtype_include='object')
obj_cols = obj_selector(X_train)

# Numeric seector and columns
num_selector = make_column_selector(dtype_include='number')
num_cols = num_selector(X_train)

In [5]:
# Make the nessecary transformers
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [6]:
# Lets make our pipelines
obj_pipe = make_pipeline(ohe)
num_pipe = make_pipeline(scaler)

In [7]:
# Make column transformer
preprocessor = make_column_transformer((obj_pipe, obj_cols),(num_pipe, num_cols), remainder='passthrough')

# Fit the preprocessor
preprocessor.fit(X_train)

# Convvert processed data into data frame
pd.DataFrame(preprocessor.transform(X_train).round(2))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,167
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.08,1.86,1.95,1.10,0.38,-0.55,2.03,0.56,-1.53,-1.52
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.19,-0.49,-0.39,-0.68,0.99,-0.31,-0.46,1.36,0.26,0.32
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.19,-0.36,-0.39,-0.68,0.99,-0.31,-0.46,1.36,0.26,0.32
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.33,2.95,3.10,1.14,2.91,-0.52,1.87,-0.75,-1.53,-1.66
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.49,-0.04,-0.11,0.08,0.60,-0.43,-0.41,-0.25,-0.19,-0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.76,0.78,0.34,1.71,-0.40,-0.19,0.26,0.56,-0.34,-0.39
149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.19,-0.17,-0.39,-0.68,0.99,-0.31,-0.07,1.36,-0.19,-0.39
150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.14,-1.44,-0.82,-1.59,0.44,-0.26,-0.72,1.77,0.86,1.02
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.38,0.51,0.69,0.99,1.90,-0.78,1.07,-0.25,-0.94,-0.95


# Preprocessing for statsmodels

In [8]:
# This list will hold the names for out columns
final_features = []

# Grab the OheHotEncoded step
ohe_step = preprocessor.named_transformers_['pipeline-1'][0]

# Extracting the feature names
cat_features = ohe_step.get_feature_names_out(obj_cols)

# Add these column names to our feature list
final_features.extend(cat_features)

# Add the numeric column names
final_features.extend(num_cols)

In [9]:
X_train_df = pd.DataFrame(preprocessor.transform(X_train), columns=final_features, index=X_train.index)
X_train_df.head()

Unnamed: 0,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100 ls,CarName_audi 100ls,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,CarName_bmw x3,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.078184,1.855775,1.945552,1.104185,0.376132,-0.546313,2.025098,0.557617,-1.532551,-1.517697
39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.192373,-0.489363,-0.390498,-0.677725,0.992489,-0.308461,-0.461496,1.36343,0.258514,0.318495
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.192373,-0.356583,-0.390498,-0.677725,0.992489,-0.308461,-0.461496,1.36343,0.258514,0.318495
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.331061,2.951215,3.101778,1.142097,2.90644,-0.522527,1.869686,-0.751829,-1.532551,-1.658942
123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.48743,-0.0383,-0.107341,0.080534,0.603211,-0.427387,-0.409692,-0.248196,-0.189252,-0.105242


In [10]:
X_test_df = pd.DataFrame(preprocessor.transform(X_test), columns=final_features, index=X_test.index)
X_test_df.head()

Unnamed: 0,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100 ls,CarName_audi 100ls,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,CarName_bmw x3,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.331061,2.951215,3.101778,1.142097,2.90644,-0.522527,1.869686,-0.751829,-1.532551,-1.658942
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.03792,0.742762,0.340992,1.710792,-0.402424,-0.189535,0.26376,0.557617,-0.189252,-0.387733
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.514486,-0.262855,-0.437691,1.104185,-2.056855,-0.308461,-0.565104,-0.651102,-0.338507,-0.246487
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.192373,-1.190367,-0.838831,-1.132681,-0.402424,-0.308461,-0.927733,-0.248196,0.855536,1.024722
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.433958,-0.45031,-0.154534,0.004708,0.635651,-0.427387,-0.176574,0.154711,0.258514,0.45974


In [11]:
# Add in the constant column
X_train_df = sm.add_constant(X_train_df, has_constant='add', prepend=False)
X_test_df = sm.add_constant(X_test_df,has_constant='add', prepend=False)

In [12]:
# Instantiate model
model = sm.OLS(y_train, X_train_df, hasconst=True)

# Store result
result = model.fit()

# Print metrics and other stuff
result.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,65.75
Date:,"Wed, 02 Nov 2022",Prob (F-statistic):,3.04e-06
Time:,15:35:02,Log-Likelihood:,-1033.6
No. Observations:,153,AIC:,2359.0
Df Residuals:,7,BIC:,2802.0
Df Model:,145,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CarName_alfa-romero giulia,2624.6105,8791.359,0.299,0.774,-1.82e+04,2.34e+04
CarName_alfa-romero stelvio,5602.9150,8681.425,0.645,0.539,-1.49e+04,2.61e+04
CarName_audi 100 ls,4935.0408,1.2e+04,0.413,0.692,-2.33e+04,3.32e+04
CarName_audi 100ls,-4899.0854,8124.540,-0.603,0.566,-2.41e+04,1.43e+04
CarName_audi 5000,1853.2394,5826.819,0.318,0.760,-1.19e+04,1.56e+04
CarName_audi 5000s (diesel),-3971.7227,6803.341,-0.584,0.578,-2.01e+04,1.21e+04
CarName_audi fox,1950.4632,5491.551,0.355,0.733,-1.1e+04,1.49e+04
CarName_bmw 320i,8518.6392,1.02e+04,0.832,0.433,-1.57e+04,3.27e+04
CarName_bmw x1,2.387e+04,1.13e+04,2.121,0.072,-2745.642,5.05e+04

0,1,2,3
Omnibus:,28.187,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,144.639
Skew:,-0.431,Prob(JB):,3.91e-32
Kurtosis:,7.685,Cond. No.,1.27e+17


In [13]:
# Because the model above only runs on training data we need to explicitly run the model on our test data.
test_pred = result.predict(X_test_df)

print("Testing data R-2 score ", r2_score(y_test, test_pred))
print("Testing data MeanSquaredError score ", mean_squared_error(y_test, test_pred))

Testing data R-2 score  -0.14593316429346537
Testing data MeanSquaredError score  87664932.7328863
