In [26]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import pickle
from scipy.stats import uniform 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [27]:
# from google.colab import files
# uploaded = files.upload()

In [28]:
import io
df = pd.read_csv('cleaned2.csv')
# Dataset is now stored in a Pandas Dataframe

In [29]:
# df = pd.read_csv(r"D:\All_Docs\Masters\CS 584 Machine Learning\Project\Data\cleaned2.csv")

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,state
0,0,35990,2010.0,chevrolet,good,8 cylinders,gas,32742.0,clean,other,2,other,south
1,1,7500,2014.0,hyundai,excellent,4 cylinders,gas,93600.0,clean,automatic,2,sedan,south
2,2,4900,2006.0,bmw,good,6 cylinders,gas,87046.0,clean,automatic,2,SUV,south
3,5,29590,2016.0,toyota,good,6 cylinders,gas,33290.0,clean,other,4,pickup,south
4,6,39990,2012.0,ford,good,8 cylinders,gas,9692.0,clean,other,2,coupe,south


In [31]:
df["price"]=np.log1p(df["price"])

## Encoding

### For Numerical Data

#### year

In [32]:
year_min = df["year"].min()
year_max = df["year"].max()
df["year"] = ((df["year"] - year_min) / (year_max - year_min))

#### odometer

In [33]:
odo_min = df["odometer"].min()
odo_max = df["odometer"].max()
df["odometer"] = ((df["odometer"] - odo_min) / (odo_max - odo_min))

### For Ordinal Data

#### cylinders

In [34]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(list(df['cylinders'].astype(str).values))
df['cylinders'] = label_encoder.transform(list(df['cylinders'].astype(str).values))

In [35]:
#Normalizing Cylinder
cyl_min = df["cylinders"].min()
cyl_max = df["cylinders"].max()
df["cylinders"] = ((df["cylinders"] - cyl_min) / (cyl_max - cyl_min))

### OneHotEncoding

In [36]:
train_df = pd.get_dummies(df)

In [37]:
len(train_df)

318824

In [38]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,price,year,cylinders,odometer,drive,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,...,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon,state_midwest,state_northeast,state_south,state_west
0,0,10.491024,0.583333,0.857143,0.13095,2,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,8.922792,0.75,0.428571,0.374431,2,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,2,8.497195,0.416667,0.714286,0.34821,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,10.295226,0.833333,0.714286,0.133143,4,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,6,10.59641,0.666667,0.857143,0.038732,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Splitting the Data into test and train

In [39]:
y = train_df["price"]
x = train_df.drop(['price', 'Unnamed: 0'], axis=1)
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(x,y,test_size=0.2,random_state=42)

In [40]:
x_train.head()

Unnamed: 0,year,cylinders,odometer,drive,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,...,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon,state_midwest,state_northeast,state_south,state_west
220052,0.958333,0.714286,0.040324,2,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
10665,0.333333,0.428571,0.273475,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
102772,0.75,0.428571,0.308018,2,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
90016,0.791667,0.428571,0.528394,4,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
294161,0.708333,1.0,0.386033,4,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


# Lasso

In [41]:
parameters={'alpha': uniform()}
tuningModel = Lasso()
rand_search = RandomizedSearchCV(estimator=tuningModel,param_distributions=parameters,n_iter=150)
rand_search.fit(x_train,y_train)
print(rand_search.best_estimator_.alpha)
# print(rand_search.best_score_)

0.0025379360918370564


In [42]:
bestAlphaRange=rand_search.best_estimator_.alpha
alpha =[10**i for i in range(-6,2)]
alpha.append(bestAlphaRange)

In [43]:
lasso_regression_dict={}
for alphavalue in alpha:
  lassoRegressor=Lasso(alpha=alphavalue)
  lassoRegressor.fit(x_train,y_train)
  lassoTrainPrediction=lassoRegressor.predict(x_train)
  lassoTestPrediction=lassoRegressor.predict(x_test)
  testRmse=math.sqrt(mean_squared_error(y_test,lassoTestPrediction))
  trainRmse=math.sqrt(mean_squared_error(y_train,lassoTrainPrediction))
  print("ALPHA VALUE {}".format(alphavalue))
  print("Test RMSE {0}".format(testRmse))
  lassoRSquaredTest=lassoRegressor.score(x_test, y_test)
  print("Test R-Squared {0}".format(lassoRSquaredTest))
  print("Train RMSE {0}".format(trainRmse))
  lassoRSquaredTrain=lassoRegressor.score(x_train,y_train)
  print("Train R-Squared {0}".format(lassoRSquaredTrain))
  lasso_regression_dict[alphavalue]=[testRmse,lassoRSquaredTest,trainRmse,lassoRSquaredTrain]
  print('*********************************************************************')

  model = cd_fast.enet_coordinate_descent(


ALPHA VALUE 1e-06
Test RMSE 0.3577271286344668
Test R-Squared 0.758549273907897
Train RMSE 0.35572003162504234
Train R-Squared 0.7627789727756891
*********************************************************************


  model = cd_fast.enet_coordinate_descent(


ALPHA VALUE 1e-05
Test RMSE 0.3577376813511083
Test R-Squared 0.7585350284203565
Train RMSE 0.35573125958374835
Train R-Squared 0.7627639972348335
*********************************************************************
ALPHA VALUE 0.0001
Test RMSE 0.35800642124440807
Test R-Squared 0.7581721052623576
Train RMSE 0.3559426562152802
Train R-Squared 0.7624819540247998
*********************************************************************
ALPHA VALUE 0.001
Test RMSE 0.3645389359765643
Test R-Squared 0.7492663598641383
Train RMSE 0.3621956903368014
Train R-Squared 0.7540634397304572
*********************************************************************
ALPHA VALUE 0.01
Test RMSE 0.40863593075380894
Test R-Squared 0.6849366661143157
Train RMSE 0.40661855024290827
Train R-Squared 0.6900362454153848
*********************************************************************
ALPHA VALUE 0.1
Test RMSE 0.7053536275225668
Test R-Squared 0.061274596148572
Train RMSE 0.707228428227793
Train R-Squared 0.0623171

In [45]:
ridgeCSVResult=pd.DataFrame.from_dict(lasso_regression_dict,orient='index',columns=["testRmse","lassoRSquaredTest","trainRmse","lassoRSquaredTrain"])
ridgeCSVResult.to_csv('LassoResults.csv')

# Ridge 

In [46]:
parameters= {'alpha': uniform()}
tuningRegressorRidge=Ridge()
randomSearch = RandomizedSearchCV(estimator=tuningRegressorRidge,param_distributions=parameters,n_iter=250)
randomSearch.fit(x_train,y_train)
print(randomSearch.best_estimator_.alpha)

0.4160036442589067


In [47]:
bestAlphaRangeRidge=randomSearch.best_estimator_.alpha
alphaRidge =[10**i for i in range(-6,2)]
alphaRidge.append(bestAlphaRangeRidge)
print(alphaRidge)


[1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 0.4160036442589067]


In [48]:
print(alphaRidge)

[1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 0.4160036442589067]


In [49]:
ridge_regression_dict={}
for alphavalue1 in alphaRidge:
  ridgeRegressor=Ridge(alpha=alphavalue1)
  ridgeRegressor.fit(x_train,y_train)
  trainPrediction=ridgeRegressor.predict(x_train)
  testPrediction=ridgeRegressor.predict(x_test)
  testRmseRidge=math.sqrt(mean_squared_error(y_test,testPrediction))
  trainRmseRidge=math.sqrt(mean_squared_error(y_train,trainPrediction))
  print("ALPHA VALUE {}".format(alphavalue1))
  print("Test RMSE {0}".format(testRmseRidge))
  ridgeRSquaredTest=ridgeRegressor.score(x_test, y_test)
  print("Test R-Squared {0}".format(ridgeRSquaredTest))
  print("Train RMSE {0}".format(trainRmseRidge))
  ridgeRSquaredTrain=ridgeRegressor.score(x_train,y_train)
  print("Train R-Squared {0}".format(ridgeRSquaredTrain))
  ridge_regression_dict[alphavalue1]=[testRmseRidge,ridgeRSquaredTest,trainRmseRidge,ridgeRSquaredTrain]
  print('*********************************************************************')

ALPHA VALUE 1e-06
Test RMSE 0.3577267309769049
Test R-Squared 0.7585498107117679
Train RMSE 0.35571960463636026
Train R-Squared 0.7627795422720569
*********************************************************************
ALPHA VALUE 1e-05
Test RMSE 0.35772673098108665
Test R-Squared 0.7585498107061229
Train RMSE 0.3557196046363609
Train R-Squared 0.762779542272056
*********************************************************************
ALPHA VALUE 0.0001
Test RMSE 0.35772673102291663
Test R-Squared 0.758549810649656
Train RMSE 0.3557196046364203
Train R-Squared 0.7627795422719768
*********************************************************************
ALPHA VALUE 0.001
Test RMSE 0.3577267314424507
Test R-Squared 0.758549810083321
Train RMSE 0.355719604642346
Train R-Squared 0.7627795422640734
*********************************************************************
ALPHA VALUE 0.01
Test RMSE 0.35772673576111913
Test R-Squared 0.7585498042534893
Train RMSE 0.3557196052260141
Train R-Squared 0.7627795

In [50]:
ridge_regression_dict

{1e-06: [0.3577267309769049,
  0.7585498107117679,
  0.35571960463636026,
  0.7627795422720569],
 1e-05: [0.35772673098108665,
  0.7585498107061229,
  0.3557196046363609,
  0.762779542272056],
 0.0001: [0.35772673102291663,
  0.758549810649656,
  0.3557196046364203,
  0.7627795422719768],
 0.001: [0.3577267314424507,
  0.758549810083321,
  0.355719604642346,
  0.7627795422640734],
 0.01: [0.35772673576111913,
  0.7585498042534893,
  0.3557196052260141,
  0.7627795414856062],
 0.1: [0.3577267911611614,
  0.7585497294681811,
  0.35571965580819676,
  0.7627794740216305],
 1: [0.3577284003772513,
  0.7585475571593479,
  0.3557216703787405,
  0.76277678708078],
 10: [0.3577687714084471,
  0.7584930564033339,
  0.35574812503081765,
  0.7627415016796246],
 0.4160036442589067: [0.35772715242573583,
  0.7585492417916894,
  0.3557201966007355,
  0.7627787527390928]}

In [51]:
ridgeCSVResult=pd.DataFrame.from_dict(ridge_regression_dict,orient='index',columns=["testRmseRidge","ridgeRSquaredTest","trainRmseRidge","ridgeRSquaredTrain"])

In [52]:
ridgeCSVResult.to_csv('RidgeResults.csv')