In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import random
import warnings
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
warnings.simplefilter('ignore')

In [3]:
# Load the listings table from a CSV file given by a path relative to the
# current working directory; every later cell derives its data from this frame.
data_grouping=pd.read_csv("data_with_grouping_operations_IS.csv")

In [4]:
# Fraction of the cleaned rows that goes into the training split.
train_proportion = 0.8
# Seed for the shuffle below, so the hold-out split is identical after a
# kernel restart (the shuffle was previously unseeded).
split_seed = 0
features_colname=['year','odometer','income_rank','median_income',
       'cylinders_onehot_1', 'cylinders_onehot_2', 'cylinders_onehot_3',
       'cylinders_onehot_4', 'cylinders_onehot_5', 'cylinders_onehot_6',
       'cylinders_onehot_7', 'cylinders_onehot_8', 'type_onehot_1',
       'type_onehot_2', 'type_onehot_3', 'type_onehot_4', 'type_onehot_5',
       'type_onehot_6', 'type_onehot_7', 'type_onehot_8', 'type_onehot_9',
       'type_onehot_10', 'type_onehot_11', 'type_onehot_12', 'type_onehot_13',
       'type_onehot_14','drive_onehot_1', 'drive_onehot_2', 'drive_onehot_3',
       'drive_onehot_4', 'title_status_mean_price','manufacturer_mean_price','manufacturer_make_mean_price','transmission_encoded', 'fuel_encoded']

# Keep only the model inputs plus the target, then drop every row with a
# missing value in any of those columns (rows are dropped rather than
# mean-imputed).
data_clean=data_grouping[features_colname+["price"]].dropna(axis=0, how='any')
print(len(data_clean))

# Take an explicit copy: the columns are reassigned below, and writing into a
# selection of `data_clean` is chained assignment (the resulting pandas warning
# would be hidden by the notebook-wide warnings filter).
features=data_clean[features_colname].copy()

# scaling: z-score the continuous columns (pandas `.std()` is the sample std).
# NOTE(review): mean/std/max are computed over all rows before the split, so
# test rows influence the scaling of training rows; 'year' and the remaining
# columns are left unscaled.
for col in ['odometer','median_income','manufacturer_mean_price',
            'manufacturer_make_mean_price','title_status_mean_price']:
    features[col]=(features[col]-features[col].mean())/features[col].std()
# income_rank is only rescaled by its maximum.
features['income_rank']=features['income_rank']/features['income_rank'].max()
target=data_clean["price"]

# Positional row indices, shuffled with a private seeded generator so the
# global `random` state is left untouched.
index_list=list(range(len(data_clean)))
random.Random(split_seed).shuffle(index_list)

# First `n_train` shuffled positions train the models, the rest are held out.
n_train=round(len(data_clean)*train_proportion)
train_list=index_list[:n_train]
test_list=index_list[n_train:]

# the following variable records the features of examples in the training set
train_x=features.iloc[train_list]
# the following variable records the features of examples in the test set
test_x=features.iloc[test_list]
# the following variable records the labels of examples in the training set
train_y=target.iloc[train_list]
# the following variable records the labels of examples in the test set
test_y=target.iloc[test_list]

326105


In [5]:
def MAE_relative(y,pred):
    """Return the mean relative absolute error, mean(|1 - pred / y|).

    Parameters
    ----------
    y : array-like
        True target values.
    pred : array-like
        Predicted values, same length as ``y``; element i of ``pred`` is
        compared with element i of ``y``.

    Returns
    -------
    float
        Average of ``|1 - pred / y|`` over all elements.

    Notes
    -----
    Both inputs are converted to NumPy arrays first, so the comparison is
    always positional. Without this, two pandas Series carrying different
    indexes would be aligned by label and produce NaN, and plain lists would
    raise. Entries where ``y`` is zero are not special-cased: the division
    result for them is infinite or NaN.
    """
    y_arr = np.asarray(y, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)
    return np.abs(1 - pred_arr / y_arr).mean()

# Lasso

In [15]:
# Fit an L1-penalised linear model (alpha=0.1) on the train_x / train_y
# currently held in the kernel, then show its weights and its bias term.
lasso = linear_model.Lasso(alpha=0.1).fit(train_x, train_y)
print(lasso.coef_)
print(lasso.intercept_)

[  573.62538794 -2794.98613234  -600.1856336   -250.66257134
   495.81419395 -2199.89897692 -1416.69464986 -1044.39261307
   -49.30308351   651.24970871  1049.91996257  5622.57780826
  -652.06887798  2023.30000807  2221.76931523   866.30129112
 -1037.2519826    205.20686812  -117.39729006  1264.03817062
  -584.1678379   1244.65952286  -732.46957809   221.21002209
    -0.          -966.9289573   2229.80578541  -455.54038539
    -0.           242.4151412    489.67186404   278.25013809
  5382.79354335  -371.14737491  -977.24306687]
-1137096.4546148474


In [16]:
# Score the fitted Lasso model on the split currently stored in
# train_x / test_x / train_y / test_y.
train_pred, test_pred = lasso.predict(train_x), lasso.predict(test_x)

# Mean relative absolute error on each side of the split.
train_MAE_relative, test_MAE_relative = (
    MAE_relative(train_y, train_pred),
    MAE_relative(test_y, test_pred),
)

print("Train relative error:",train_MAE_relative)
print("Test relative error:",test_MAE_relative)

Train relative error: 0.496780845138044
Test relative error: 0.4989699613440467


# Huber

In [8]:
# Build a Huber-loss linear regressor with library defaults, fit it on the
# train_x / train_y currently held in the kernel, and show weights and bias.
huber = linear_model.HuberRegressor()
huber.fit(train_x, train_y)
print(huber.coef_)
print(huber.intercept_)

[ 7.33088278e+00 -4.25164208e+03  8.39649071e+00 -1.58674279e+02
  6.48252579e+02 -1.26086832e+01 -6.29649834e+02 -7.44536211e+01
 -4.36437418e+01  8.38412762e+01 -1.44441636e+01 -8.32778224e-01
 -1.96976371e+02  6.15118627e+00 -1.42253939e+02 -1.48454472e+02
 -2.47947986e+02  3.75630807e+01 -3.32803252e+02 -1.56286413e+01
 -1.94142730e+01  9.82689602e+02 -3.94218059e+02  4.81311072e+02
  7.37085105e+01 -1.27265424e+02  1.55628406e+03 -4.86242117e+02
 -4.75142507e+02 -6.38438400e+02  4.44527923e+02  2.18756394e+02
  6.10734496e+03 -5.19468490e+02 -5.55210932e+02]
-43.539251305531074


In [9]:
# Predictions of the fitted Huber model for both parts of the current split.
train_pred = huber.predict(train_x)
train_MAE_relative = MAE_relative(train_y, train_pred)

test_pred = huber.predict(test_x)
test_MAE_relative = MAE_relative(test_y, test_pred)

# Report the mean relative absolute error: training part first, then test part.
print("Train relative error:",train_MAE_relative)
print("Test relative error:",test_MAE_relative)

Train relative error: 0.514904153241098
Test relative error: 0.5164665758939389


# Elastic Net (k-fold)

In [12]:
# ElasticNet: hyper-parameter grid search on the first fold, followed by a
# 5-fold cross-validation with the selected (alpha, l1_ratio).
#
# All per-fold data lives in grid_* / fold_* names so that the hold-out split
# stored in train_x / test_x / train_y / test_y is NOT overwritten; later
# single-split cells keep evaluating on that hold-out split.
#
# NOTE(review): KFold is created without shuffle, so each fold follows the row
# order of `features`; confirm the rows are not sorted in a way that matters.
kf = KFold(n_splits=5)
index=list(kf.split(target))
train_index_first, test_index_first=index[0]

# grid search l1:0.25-0.5 alpha:0.005-0.02
grid_train_x=features.iloc[train_index_first]
grid_test_x=features.iloc[test_index_first]
grid_train_y=target.iloc[train_index_first]
grid_test_y=target.iloc[test_index_first]
alpha_list=np.linspace(0.005,0.02,5)
l1_list=np.linspace(0.25,0.5,5)

# d maps (alpha, l1_ratio) -> relative MAE on the first fold's test part.
# NOTE(review): that test part is also one of the folds scored below, so the
# averaged test error is slightly optimistic for the selected parameters.
d={}
for alpha in alpha_list:
    for l1 in l1_list:
        regr = linear_model.ElasticNet(l1_ratio=l1, alpha=alpha)
        regr.fit(grid_train_x, grid_train_y)
        d[(alpha,l1)]=MAE_relative(grid_test_y,regr.predict(grid_test_x))

# First grid point (in insertion order) that attains the smallest error.
optim=min(d, key=d.get)
min_test_MAE_relative=d[optim]
optim_alpha=optim[0]
optim_l1=optim[1]

coef_elastic_k=[]
intercept_elastic_k=[]
train_MAE=[]
test_MAE=[]

for train_index, test_index in index:
    # features / labels of the examples in this fold's training part
    fold_train_x=features.iloc[train_index]
    fold_train_y=target.iloc[train_index]
    # features / labels of the examples in this fold's test part
    fold_test_x=features.iloc[test_index]
    fold_test_y=target.iloc[test_index]

    # fit and test
    regr = linear_model.ElasticNet(l1_ratio=optim_l1, alpha=optim_alpha)
    regr.fit(fold_train_x, fold_train_y)
    coef_elastic_k.append(regr.coef_)
    intercept_elastic_k.append(regr.intercept_)
    train_pred = regr.predict(fold_train_x)
    test_pred = regr.predict(fold_test_x)
    train_MAE_relative = MAE_relative(fold_train_y,train_pred)
    test_MAE_relative = MAE_relative(fold_test_y,test_pred)
    train_MAE.append(train_MAE_relative)
    test_MAE.append(test_MAE_relative)

# Average the per-fold coefficients, intercepts and errors.
coef=sum(coef_elastic_k)/len(coef_elastic_k)
intercept=sum(intercept_elastic_k)/len(intercept_elastic_k)
train_error=sum(train_MAE)/len(train_MAE)
test_error=sum(test_MAE)/len(test_MAE)
print("Coefficient:",coef)
print("Intercept:",intercept)
print("Train error:",train_error)
print("Test error:",test_error)


Coefficient: [ 5.75339815e+02 -2.75411379e+03 -8.56693058e+01 -1.03436314e+02
  5.54484080e+02 -9.72681239e+01 -1.25722836e+03 -2.56512074e+02
  3.76289825e+01  6.84498209e+02  2.02576665e+02  1.31152214e+02
 -6.35002170e+02  1.15102941e+02  1.15861523e+03  5.75485428e+02
 -7.39090862e+02  9.79889258e+01 -1.44908442e+02  1.27420150e+02
 -3.64842622e+02  1.08784435e+03 -7.57313652e+02  2.13990587e+02
 -8.51718311e+01 -6.50118498e+02  1.63261935e+03 -1.01085036e+03
 -4.89481231e+02 -1.31590235e+02  5.76938771e-01  3.48873466e+02
  5.32624123e+03 -3.36653576e+02 -9.85489567e+02]
Intercept: -1148221.9198178514
Train error: 0.495617078203981
Test error: 0.49592889445360644


# Huber (k-fold)

In [13]:
# 5-fold cross-validation of HuberRegressor with library-default settings.
#
# Per-fold data is kept in fold_* names so that the hold-out split stored in
# train_x / test_x / train_y / test_y is NOT overwritten; later single-split
# cells keep evaluating on that hold-out split.
#
# NOTE(review): warnings are silenced notebook-wide, so a solver convergence
# warning from HuberRegressor would not be visible here - worth checking.
kf = KFold(n_splits=5)
index=list(kf.split(target))

coef_huber_k=[]
intercept_huber_k=[]
train_MAE=[]
test_MAE=[]

for train_index, test_index in index:
    # features / labels of the examples in this fold's training part
    fold_train_x=features.iloc[train_index]
    fold_train_y=target.iloc[train_index]
    # features / labels of the examples in this fold's test part
    fold_test_x=features.iloc[test_index]
    fold_test_y=target.iloc[test_index]

    # fit and test
    regr = linear_model.HuberRegressor().fit(fold_train_x, fold_train_y)
    coef_huber_k.append(regr.coef_)
    intercept_huber_k.append(regr.intercept_)
    train_pred = regr.predict(fold_train_x)
    test_pred = regr.predict(fold_test_x)
    train_MAE_relative = MAE_relative(fold_train_y,train_pred)
    test_MAE_relative = MAE_relative(fold_test_y,test_pred)
    train_MAE.append(train_MAE_relative)
    test_MAE.append(test_MAE_relative)

# Average the per-fold coefficients, intercepts and errors.
coef=sum(coef_huber_k)/len(coef_huber_k)
intercept=sum(intercept_huber_k)/len(intercept_huber_k)
train_error=sum(train_MAE)/len(train_MAE)
test_error=sum(test_MAE)/len(test_MAE)
print("Coefficient:",coef)
print("Intercept:",intercept)
print("Train error:",train_error)
print("Test error:",test_error)


Coefficient: [ 2.93168687e+00 -3.74977691e+03  1.96473403e+01 -1.40693301e+02
  5.30424244e+02 -3.29473421e+00 -4.59693705e+02 -3.75463638e+01
 -1.66192998e+02  1.19433345e+02 -2.62057502e+00 -5.74453480e-01
 -1.68967644e+01  8.23834391e-01 -5.31899869e+01 -2.60242944e+01
 -1.06866348e+02 -1.52205503e+01 -2.35102855e+02 -5.31373891e+00
  6.79792595e+01  5.16125483e+02 -4.03809344e+02  3.03563501e+02
  2.69057849e+00 -4.88240142e+01  9.77427939e+02 -4.86458919e+02
 -2.60823004e+02 -2.50211258e+02  6.00176156e-01  3.87704934e+02
  4.48623388e+03 -1.59349673e+01 -4.85594772e+02]
Intercept: -20.065288295667273
Train error: 0.6160062466322502
Test error: 0.6100093165357316


# BayesianRidge Regression

In [13]:
# Bayesian ridge regression with library defaults, fitted on the train_x /
# train_y currently held in the kernel - make sure they contain the intended
# training split before running this cell.
reg = linear_model.BayesianRidge()
reg.fit(train_x, train_y)
print(reg.coef_)
print(reg.intercept_)

[  573.66478556 -2794.94229164  -631.1949129   -258.8200459
   160.32477627 -2524.10053484 -1752.97587776 -1387.10766046
  -386.35779579   314.97488099   732.06991828  4843.17229318
  -939.42361888  1758.58037374  1936.75688195   581.66336318
 -1325.88292919   -75.38056706  -405.16959687  1015.7608768
  -873.13171156   958.17461109 -1019.71392037   -65.10271387
  -291.2768399  -1255.85420907  1725.41939968  -960.02767339
  -504.75081483  -260.64091142   489.74290288   278.4234478
  5382.32185065  -371.4488836   -977.27695563]
-1136032.5065915978


In [14]:
# Evaluate the fitted Bayesian ridge model on the split currently stored in
# train_x / test_x / train_y / test_y.
train_pred, test_pred = reg.predict(train_x), reg.predict(test_x)

# Mean relative absolute error for the training part, then the test part.
train_MAE_relative = MAE_relative(train_y, train_pred)
test_MAE_relative = MAE_relative(test_y, test_pred)

print("Train relative error:",train_MAE_relative)
print("Test relative error:",test_MAE_relative)

Train relative error: 0.4967864252852826
Test relative error: 0.49898182153534193


In [17]:
# 5-fold cross-validation of BayesianRidge with library-default settings.
#
# Per-fold data is kept in fold_* names so that the hold-out split stored in
# train_x / test_x / train_y / test_y is NOT overwritten by this cell.
kf = KFold(n_splits=5)
index=list(kf.split(target))

coef_bay_k=[]
intercept_bay_k=[]
train_MAE=[]
test_MAE=[]

for train_index, test_index in index:
    # features / labels of the examples in this fold's training part
    fold_train_x=features.iloc[train_index]
    fold_train_y=target.iloc[train_index]
    # features / labels of the examples in this fold's test part
    fold_test_x=features.iloc[test_index]
    fold_test_y=target.iloc[test_index]

    # fit and test
    regr = linear_model.BayesianRidge().fit(fold_train_x, fold_train_y)
    coef_bay_k.append(regr.coef_)
    intercept_bay_k.append(regr.intercept_)
    train_pred = regr.predict(fold_train_x)
    test_pred = regr.predict(fold_test_x)
    train_MAE_relative = MAE_relative(fold_train_y,train_pred)
    test_MAE_relative = MAE_relative(fold_test_y,test_pred)
    train_MAE.append(train_MAE_relative)
    test_MAE.append(test_MAE_relative)

# Average the per-fold coefficients, intercepts and errors.
coef=sum(coef_bay_k)/len(coef_bay_k)
intercept=sum(intercept_bay_k)/len(intercept_bay_k)
train_error=sum(train_MAE)/len(train_MAE)
test_error=sum(test_MAE)/len(test_MAE)
print("Coefficient:",coef)
print("Intercept:",intercept)
print("Train error:",train_error)
print("Test error:",test_error)

Coefficient: [  572.75926499 -2790.39062982  -592.76675318  -237.09160968
   -11.55671508 -2494.46375202 -1928.32340698 -1483.49908888
  -570.18575119   102.27052284   407.93203964  5977.82615173
  -941.92740874  1619.46265931  1927.91937682   604.79167336
 -1317.80865208   -61.98855143  -396.69117837  1091.98223893
  -836.62541144   943.41084509 -1010.80949742   -36.58400337
  -317.46696615 -1267.66512451  1728.39563005  -969.27032482
  -502.71389225  -256.41141304   487.1193661    286.68603093
  5392.42946728  -368.67205382  -992.83834532]
Intercept: -1134010.6005650677
Train error: 0.4975586190802292
Test error: 0.4980129466272647
