# Imports

In [66]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Data Preprocessing

In [39]:
data_URL = "./Cleaned Data/Data.csv"

data = pd.read_csv(data_URL, index_col=0)

print(data.shape)

data.head()

(205, 53)


Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,cylindernumber_eight,cylindernumber_twelve,fuelsystem_mpfi,fuelsystem_2bbl,fuelsystem_idi,fuelsystem_1bbl,fuelsystem_spdi,fuelsystem_4bbl,fuelsystem_mfi,fuelsystem_spfi
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,1,0,0,0,0,0,0,0
2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,1,0,0,0,0,0,0,0
3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,0,0,1,0,0,0,0,0,0,0
4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,...,0,0,1,0,0,0,0,0,0,0
5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,...,0,0,1,0,0,0,0,0,0,0


## Data Scaling (Numerical)
### Standard

In [38]:
scaler = StandardScaler()

In [24]:
numerical_columns = data.columns.tolist()
numerical_columns = list(filter(lambda x: '_' not in x, temp))
scaling_columns = list(filter(lambda x: x != "price", temp))
# []"symboling", "carheight", "compressionratio", "peakrpm", "stroke"]
data[scaling_columns].head()

Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26
4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30
5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22


In [40]:
standard_data = data.copy()
standard_data[scaling_columns] = scaler.fit_transform(data[scaling_columns])
standard_data[scaling_columns].head()

Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.74347,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
2,1.74347,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
3,0.133509,-0.708596,-0.231513,-0.190566,-0.543527,0.514882,0.604046,-2.40488,0.685946,-0.288349,1.264536,-0.26296,-0.953012,-0.691627
4,0.93849,0.173698,0.207256,0.136542,0.235942,-0.420797,-0.431076,-0.517266,0.462183,-0.035973,-0.053668,0.787855,-0.186865,-0.109354
5,0.93849,0.10711,0.207256,0.230001,0.235942,0.516807,0.218885,-0.517266,0.462183,-0.540725,0.275883,0.787855,-1.106241,-1.2739


### MinMax

In [44]:
scaler = MinMaxScaler()

In [45]:
numerical_columns = data.columns.tolist()
numerical_columns = list(filter(lambda x: '_' not in x, temp))
scaling_columns = list(filter(lambda x: x != "price", temp))
# ["symboling", "carheight", "compressionratio", "peakrpm", "stroke"]
data[scaling_columns].head()

Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26
4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30
5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22


In [46]:
minmax_data = data.copy()
minmax_data[scaling_columns] = scaler.fit_transform(data[scaling_columns])
minmax_data[scaling_columns].head()

Unnamed: 0_level_0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,0.2625,0.346939,0.222222,0.289474
2,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,0.2625,0.346939,0.222222,0.289474
3,0.6,0.230321,0.449254,0.433333,0.383333,0.517843,0.343396,0.1,0.666667,0.125,0.441667,0.346939,0.166667,0.263158
4,0.8,0.38484,0.529851,0.491667,0.541667,0.329325,0.181132,0.464286,0.633333,0.1875,0.225,0.55102,0.305556,0.368421
5,0.8,0.373178,0.529851,0.508333,0.541667,0.518231,0.283019,0.464286,0.633333,0.0625,0.279167,0.55102,0.138889,0.157895


# Data Splitting

In [71]:
standard_X = standard_data.drop(columns="price")
standard_Y = standard_data["price"]

minmax_X = minmax_data.drop(columns="price")
minmax_Y = minmax_data["price"]

print(standard_X.shape)
print(standard_Y.shape)

(205, 52)
(205,)


In [72]:
standard_train_X, standard_test_X, standard_train_Y, standard_test_Y = train_test_split(standard_X, standard_Y, train_size=0.8, test_size=0.2, shuffle=True)
minmax_train_X, minmax_test_X, minmax_train_Y, minmax_test_Y = train_test_split(minmax_X, minmax_Y, train_size=0.8, test_size=0.2, shuffle=True)

In [49]:
print(standard_train_X.shape, standard_train_Y.shape)
print(standard_test_X.shape, standard_test_Y.shape)

(164, 52) (164,)
(41, 52) (41,)


# Models
## Model 1 (Naive Standard Scale)

In [64]:
model1 = LinearRegression()
model1.fit(standard_train_X, standard_train_Y)

model1_pred_train = model1.predict(standard_train_X)
model1_pred_test = model1.predict(standard_test_X)

model1_train_mse = mean_squared_error(standard_train_Y, model1_pred_train, squared=False)
model1_test_mse = mean_squared_error(standard_test_Y, model1_pred_test, squared=False)

model1_train_r2 = r2_score(standard_train_Y, model1_pred_train) * 100
model1_test_r2 = r2_score(standard_test_Y, model1_pred_test) * 100

print("Train MSE:", model1_train_mse)
print("Test MSE:", model1_test_mse)

print("Train r2:", model1_train_r2)
print("Test r2:", model1_test_r2)

Train MSE: 1892.2508401788127
Test MSE: 2724.225634935857
Train r2: 93.65299696737146
Test r2: 91.52564162739456


## Model 2 (Remove Unnecessary Features Standard Scale)

In [78]:
useful_standard_train_X = standard_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_standard_test_X = standard_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

In [79]:
model2 = LinearRegression()
model2.fit(useful_standard_train_X, standard_train_Y)

model2_pred_train = model2.predict(useful_standard_train_X)
model2_pred_test = model2.predict(useful_standard_test_X)

model2_train_mse = mean_squared_error(standard_train_Y, model2_pred_train, squared=False)
model2_test_mse = mean_squared_error(standard_test_Y, model2_pred_test, squared=False)

model2_train_r2 = r2_score(standard_train_Y, model2_pred_train) * 100
model2_test_r2 = r2_score(standard_test_Y, model2_pred_test) * 100

print("Train MSE:", model2_train_mse)
print("Test MSE:", model2_test_mse)

print("Train r2:", model2_train_r2)
print("Test r2:", model2_test_r2)

Train MSE: 2081.142886253688
Test MSE: 2990.730963874243
Train r2: 93.83084504570185
Test r2: 75.3820854413326


## Model 3 (Log transform Y Standard Scale)

In [68]:
model3 = LinearRegression()
model3.fit(standard_train_X, np.log(standard_train_Y))

model3_pred_train = model3.predict(standard_train_X)
model3_pred_test = model3.predict(standard_test_X)

model3_train_mse = mean_squared_error(np.log(standard_train_Y), model3_pred_train, squared=False)
model3_test_mse = mean_squared_error(np.log(standard_test_Y), model3_pred_test, squared=False)

model3_train_r2 = r2_score(np.log(standard_train_Y), model3_pred_train) * 100
model3_test_r2 = r2_score(np.log(standard_test_Y), model3_pred_test) * 100

print("Train MSE:", model3_train_mse)
print("Test MSE:", model3_test_mse)

print("Train r2:", model3_train_r2)
print("Test r2:", model3_test_r2)

Train MSE: 0.1198041124054464
Test MSE: 0.1601054103959153
Train r2: 93.95019850147833
Test r2: 91.41145369092611


## Model 4 (All of the Above)

In [69]:
useful_standard_train_X = standard_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_standard_test_X = standard_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

In [84]:
model4 = LinearRegression()
model4.fit(useful_standard_train_X, np.log(standard_train_Y))

model4_pred_train = model4.predict(useful_standard_train_X)
model4_pred_test = model4.predict(useful_standard_test_X)

model4_train_mse = mean_squared_error(np.log(standard_train_Y), model4_pred_train, squared=False)
model4_test_mse = mean_squared_error(np.log(standard_test_Y), model4_pred_test, squared=False)

model4_train_r2 = r2_score(np.log(standard_train_Y), model4_pred_train) * 100
model4_test_r2 = r2_score(np.log(standard_test_Y), model4_pred_test) * 100

print("Train MSE:", model4_train_mse)
print("Test MSE:", model4_test_mse)

print("Train r2:", model4_train_r2)
print("Test r2:", model4_test_r2)

Train MSE: 0.12215707989559983
Test MSE: 0.17744007557710176
Train r2: 94.41337993593626
Test r2: 83.8084651177566


## Model 5 (Naive MinMax Scale)

In [73]:
model2 = LinearRegression()
model2.fit(minmax_train_X, minmax_train_Y)

model2_pred_train = model2.predict(minmax_train_X)
model2_pred_test = model2.predict(minmax_test_X)

model2_train_mse = mean_squared_error(minmax_train_Y, model2_pred_train, squared=False)
model2_test_mse = mean_squared_error(minmax_test_Y, model2_pred_test, squared=False)

model2_train_r2 = r2_score(minmax_train_Y, model2_pred_train) * 100
model2_test_r2 = r2_score(minmax_test_Y, model2_pred_test) * 100

print("Train MSE:", model2_train_mse)
print("Test MSE:", model2_test_mse)

print("Train r2:", model2_train_r2)
print("Test r2:", model2_test_r2)

Train MSE: 1961.1755685059425
Test MSE: 2183.320345790528
Train r2: 94.2681707735583
Test r2: 90.08220663555716


## Model 6 (Remove Unnecessary Feature MinMax Scale)

In [75]:
useful_minmax_train_X = minmax_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_minmax_test_X = minmax_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

In [82]:
model6 = LinearRegression()
model6.fit(useful_minmax_train_X, minmax_train_Y)

model6_pred_train = model6.predict(useful_minmax_train_X)
model6_pred_test = model6.predict(useful_minmax_test_X)

model6_train_mse = mean_squared_error(minmax_train_Y, model6_pred_train, squared=False)
model6_test_mse = mean_squared_error(minmax_test_Y, model6_pred_test, squared=False)

model6_train_r2 = r2_score(minmax_train_Y, model6_pred_train) * 100
model6_test_r2 = r2_score(minmax_test_Y, model6_pred_test) * 100

print("Train MSE:", model6_train_mse)
print("Test MSE:", model6_test_mse)

print("Train r2:", model6_train_r2)
print("Test r2:", model6_test_r2)

Train MSE: 2155.8787473976513
Test MSE: 2422.454734446833
Train r2: 93.07357786376514
Test r2: 87.79068009433148


## Model 7 (Log Transform Y MinMax Scale)

In [83]:
model7 = LinearRegression()
model7.fit(minmax_train_X, np.log(minmax_train_Y))

model7_pred_train = model7.predict(minmax_train_X)
model7_pred_test = model7.predict(minmax_test_X)

model7_train_mse = mean_squared_error(np.log(minmax_train_Y), model7_pred_train, squared=False)
model7_test_mse = mean_squared_error(np.log(minmax_test_Y), model7_pred_test, squared=False)

model7_train_r2 = r2_score(np.log(minmax_train_Y), model7_pred_train) * 100
model7_test_r2 = r2_score(np.log(minmax_test_Y), model7_pred_test) * 100

print("Train MSE:", model7_train_mse)
print("Test MSE:", model7_test_mse)

print("Train r2:", model7_train_r2)
print("Test r2:", model7_test_r2)

Train MSE: 0.12475845516189948
Test MSE: 0.12792507642431156
Train r2: 94.14276946667442
Test r2: 91.70398576109784


## Model 8 (All of the Above MinMax Scale)

In [86]:
useful_minmax_train_X = minmax_train_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])
useful_minmax_test_X = minmax_test_X.drop(columns=["symboling", "carheight", "compressionratio", "peakrpm", "stroke"])

In [87]:
model8 = LinearRegression()
model8.fit(useful_minmax_train_X, np.log(minmax_train_Y))

model8_pred_train = model8.predict(useful_minmax_train_X)
model8_pred_test = model8.predict(useful_minmax_test_X)

model8_train_mse = mean_squared_error(np.log(minmax_train_Y), model8_pred_train, squared=False)
model8_test_mse = mean_squared_error(np.log(minmax_test_Y), model8_pred_test, squared=False)

model8_train_r2 = r2_score(np.log(minmax_train_Y), model8_pred_train) * 100
model8_test_r2 = r2_score(np.log(minmax_test_Y), model8_pred_test) * 100

print("Train MSE:", model8_train_mse)
print("Test MSE:", model8_test_mse)

print("Train r2:", model8_train_r2)
print("Test r2:", model8_test_r2)

Train MSE: 0.12920982424630784
Test MSE: 0.13942213606624163
Train r2: 93.71734210076161
Test r2: 90.14579522180398
