# Imports

In [1]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
data_URL = "./Cleaned Data/Data.csv"

data = pd.read_csv(data_URL, index_col=0)

print(data.shape)

data.head()

(205, 53)


Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,cylindernumber_eight,cylindernumber_twelve,fuelsystem_mpfi,fuelsystem_2bbl,fuelsystem_idi,fuelsystem_1bbl,fuelsystem_spdi,fuelsystem_4bbl,fuelsystem_mfi,fuelsystem_spfi
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,1,0,0,0,0,0,0,0
2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,1,0,0,0,0,0,0,0
3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,0,0,1,0,0,0,0,0,0,0
4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,...,0,0,1,0,0,0,0,0,0,0
5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,...,0,0,1,0,0,0,0,0,0,0


# Scaling Data

In [3]:
scaler = StandardScaler()

numerical_columns = data.columns.tolist()
numerical_columns = list(filter(lambda x: '_' not in x, numerical_columns))
scaling_columns = list(filter(lambda x: x != "price", numerical_columns))

standard_data = data.copy()
standard_data[scaling_columns] = scaler.fit_transform(data[scaling_columns])
standard_data[scaling_columns].head()

Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.74347,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
2,1.74347,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
3,0.133509,-0.708596,-0.231513,-0.190566,-0.543527,0.514882,0.604046,-2.40488,0.685946,-0.288349,1.264536,-0.26296,-0.953012,-0.691627
4,0.93849,0.173698,0.207256,0.136542,0.235942,-0.420797,-0.431076,-0.517266,0.462183,-0.035973,-0.053668,0.787855,-0.186865,-0.109354
5,0.93849,0.10711,0.207256,0.230001,0.235942,0.516807,0.218885,-0.517266,0.462183,-0.540725,0.275883,0.787855,-1.106241,-1.2739


In [4]:
scaler = MinMaxScaler()

numerical_columns = data.columns.tolist()
numerical_columns = list(filter(lambda x: '_' not in x, numerical_columns))
scaling_columns = list(filter(lambda x: x != "price", numerical_columns))
data[scaling_columns].head()

minmax_data = data.copy()
minmax_data[scaling_columns] = scaler.fit_transform(data[scaling_columns])

# Splitting Data

In [5]:
standard_X = standard_data.drop(columns="price")
standard_Y = standard_data["price"]

minmax_X = minmax_data.drop(columns="price")
minmax_Y = minmax_data["price"]

standard_train_X, standard_test_X, standard_train_Y, standard_test_Y = \
    train_test_split(standard_X, standard_Y, train_size=0.8, test_size=0.2, shuffle=True)
minmax_train_X, minmax_test_X, minmax_train_Y, minmax_test_Y = \
    train_test_split(minmax_X, minmax_Y, train_size=0.8, test_size=0.2, shuffle=True)

In [6]:
useful_standard_train_X = standard_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_standard_test_X = standard_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

useful_minmax_train_X = minmax_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_minmax_test_X = minmax_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

# Statistical Significance
## Train

In [7]:
minmax_OLS_train_X = sm.add_constant(np.column_stack((useful_standard_train_X["wheelbase"], np.ones(useful_minmax_train_X.shape[0]))))

for column in useful_standard_train_X.columns[1:]:
    minmax_OLS_train_X = sm.add_constant(np.column_stack((useful_minmax_train_X[column], minmax_OLS_train_X)))
minmax_OLS_train_Y = np.log(minmax_train_Y.copy())

minimax_results = sm.OLS(minmax_OLS_train_Y, minmax_OLS_train_X).fit()
print(minimax_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.943
Model:                            OLS   Adj. R-squared:                  0.928
Method:                 Least Squares   F-statistic:                     60.97
Date:                Fri, 28 Apr 2023   Prob (F-statistic):           3.42e-64
Time:                        18:03:29   Log-Likelihood:                 116.66
No. Observations:                 164   AIC:                            -161.3
Df Residuals:                     128   BIC:                            -49.73
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.2300      0.125      1.838      0.0

## Test

In [8]:
minmax_OLS_test_X = sm.add_constant(np.column_stack((useful_standard_test_X["wheelbase"], np.ones(useful_minmax_test_X.shape[0]))))

for column in useful_standard_test_X.columns[1:]:
    minmax_OLS_test_X = sm.add_constant(np.column_stack((useful_minmax_test_X[column], minmax_OLS_test_X)))
minmax_OLS_test_Y = np.log(minmax_test_Y.copy())

minimax_results = sm.OLS(minmax_OLS_test_Y, minmax_OLS_test_X).fit()
print(minimax_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.985
Model:                            OLS   Adj. R-squared:                  0.939
Method:                 Least Squares   F-statistic:                     21.48
Date:                Fri, 28 Apr 2023   Prob (F-statistic):           8.11e-06
Time:                        18:03:29   Log-Likelihood:                 55.634
No. Observations:                  41   AIC:                            -49.27
Df Residuals:                      10   BIC:                             3.852
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.757e-14   7.57e-15      2.322      0.0