# Imports Required

In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import scipy.stats as stats
import operator
from sklearn.metrics import roc_curve, auc
import random
import warnings
warnings.filterwarnings('ignore')


# Reading the data (Please run the cell and enter the input file path)

In [26]:
#Reading the data
train_df = pd.read_csv("/Users/vigneshsureshbabu/Desktop/imports-85.csv")
n = 1000#number of rows from the dataframe
train_data_points = 800 # number of training data points
#deleting the column 'id' from the dataframe as it is a unique and does not have any effect on the algorithm

print (train_df.head())

          make fuel-type aspiration doors        style drive engine  \
0  alfa-romero       gas        std   two  convertible   rwd  front   
1  alfa-romero       gas        std   two  convertible   rwd  front   
2  alfa-romero       gas        std   two    hatchback   rwd  front   
3         audi       gas        std  four        sedan   fwd  front   
4         audi       gas        std  four        sedan   4wd  front   

   wheel-base  length  width  ...    curb-weight  engine-size  bore  stroke  \
0        88.6   168.8   64.1  ...           2548          130  3.47    2.68   
1        88.6   168.8   64.1  ...           2548          130  3.47    2.68   
2        94.5   171.2   65.5  ...           2823          152  2.68    3.47   
3        99.8   176.6   66.2  ...           2337          109  3.19    3.40   
4        99.4   176.6   66.4  ...           2824          136  3.19    3.40   

   compression-ratio  horsepower  peak-rpm  city-mpg  highway-mpg  price  
0                9.0   

# Seperating the Categorical and Continuous Features

In [27]:
#creating 2 seperate dataframes for categorical and continuous features.
train_df_cat = pd.DataFrame()#training data frame with categorical features
train_df_cont = pd.DataFrame() #training data with continuous features
cat_list = []#list of categorical features
cont_list = []#list of continuous features

#populating the created data frames for categorical and continuous features
cat_list = ["make","fuel-type","aspiration","doors","style","drive","engine"]
cont_list = list(set (train_df.columns) - set (cat_list))

for i in range(0,len(cat_list)):
    train_df_cat[i] = train_df[cat_list[i]]
train_df_cat.columns =cat_list
for i in range(0,len(cont_list)):
    train_df_cont[i] = train_df[cont_list[i]]
train_df_cont.columns =cont_list


# Binarizing the continuous Attributes

In [36]:
#print (train_df_cat.head())
print (train_df_cont.head())
print (train_df_cont.shape)

   price  height  wheel-base  highway-mpg  stroke  compression-ratio  \
0  13495    48.8        88.6           27    2.68                9.0   
1  16500    48.8        88.6           27    2.68                9.0   
2  16500    52.4        94.5           26    3.47                9.0   
3  13950    54.3        99.8           30    3.40               10.0   
4  17450    54.3        99.4           22    3.40                8.0   

   curb-weight  horsepower  peak-rpm  engine-size  length  width  bore  \
0         2548         111      5000          130   168.8   64.1  3.47   
1         2548         111      5000          130   168.8   64.1  3.47   
2         2823         154      5000          152   171.2   65.5  2.68   
3         2337         102      5500          109   176.6   66.2  3.19   
4         2824         115      5500          136   176.6   66.4  3.19   

   city-mpg  
0        21  
1        21  
2        19  
3        24  
4        18  
(198, 14)


## Please do not run this cell more than once, it will binarize the already binarized attributes.

In [40]:

#df_mod_onehotencoded will be the pre processed complete data including test and train
train_df_cat = pd.get_dummies(train_df_cat)
train_df_cat = train_df_cat.astype(int)


print ("After Binarization:")
print (train_df_cat.head())
print (train_df_cat.shape)



After Binarization:
   make_alfa-romero  make_audi  make_bmw  make_chevrolet  make_dodge  \
0                 1          0         0               0           0   
1                 1          0         0               0           0   
2                 1          0         0               0           0   
3                 0          1         0               0           0   
4                 0          1         0               0           0   

   make_honda  make_isuzu  make_jaguar  make_mazda  make_mercedes-benz  \
0           0           0            0           0                   0   
1           0           0            0           0                   0   
2           0           0            0           0                   0   
3           0           0            0           0                   0   
4           0           0            0           0                   0   

      ...       style_convertible  style_hardtop  style_hatchback  \
0     ...                       1

In [44]:

complete_df = pd.DataFrame()
for each_column in train_df_cat.columns:
    complete_df[each_column] = train_df_cat[each_column]
for each_column in train_df_cont.columns:
    complete_df[each_column] = train_df_cont[each_column]
print (complete_df.shape)

(198, 52)


# Splitting the Training and Test Data

In [45]:
#Now that we have binarized the data using the one hot encoding, we have the data in the required format
#The next step is to apply feature selection and the machine learning algorithm

from sklearn.cross_validation import train_test_split
#getting the target column into a seperate dataframe
target_df = pd.DataFrame()
target_df ['price'] = complete_df['price']
#Hence deleting the target column from the features data frame
del complete_df['price']
#splitting tha training data and testing data
# use train/test split with different random_state values
x_train, x_test, y_train, y_test = train_test_split(complete_df, target_df, random_state=10)
#print (x_train.shape)
#print (y_train.shape)
#print (x_test.columns)


# Using 10 Fold Cross Valid with the above Combination for Tuning Depth in Decision Tree

In [63]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import validation_curve
count = 1

k_range = range(1, 30)
k_scores = []
for k in k_range:
    tree = DecisionTreeRegressor(max_depth=k)
    scores = cross_val_score(tree, x_train.values, y_train['price'].values, cv=7, scoring='mean_squared_error')
    
    k_scores.append(scores.mean())
    # plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
    dec_ftre_list.append(str(i))
    print ("The rmse for depth",k,"is ",max(k_scores))
    



The rmse for depth 1 is  -23401583.992826466
The rmse for depth 2 is  -12454534.623283688
The rmse for depth 3 is  -12454534.623283688
The rmse for depth 4 is  -9661476.937701674
The rmse for depth 5 is  -8896693.900742471
The rmse for depth 6 is  -8896693.900742471
The rmse for depth 7 is  -8896693.900742471
The rmse for depth 8 is  -8896693.900742471
The rmse for depth 9 is  -8896693.900742471
The rmse for depth 10 is  -8287949.76188132
The rmse for depth 11 is  -8287949.76188132
The rmse for depth 12 is  -8287949.76188132
The rmse for depth 13 is  -8287949.76188132
The rmse for depth 14 is  -8287949.76188132
The rmse for depth 15 is  -8287949.76188132
The rmse for depth 16 is  -8287949.76188132
The rmse for depth 17 is  -8287949.76188132
The rmse for depth 18 is  -8287949.76188132
The rmse for depth 19 is  -8287949.76188132
The rmse for depth 20 is  -8287949.76188132
The rmse for depth 21 is  -8287949.76188132
The rmse for depth 22 is  -8287949.76188132
The rmse for depth 23 is  -82

# Tuning of Depth for Random Forest (RF).

In [68]:
from sklearn import grid_search, datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

param_grid = {"max_depth": [3, 5,7,9],
              "bootstrap": [True, False],
              "n_estimators": [10, 20, 40, 80]}
gs = grid_search.GridSearchCV(RandomForestRegressor(), param_grid=param_grid)
gs.fit(x_train.values, y_train['price'].values)
rf_pred = gs.predict(x_test.values)

# Tuning of Depth for RF using Apache Spark.

In [71]:
from sklearn import grid_search, datasets
from sklearn.ensemble import RandomForestRegressor

#It is implmeneted by changing just this one line
from spark_sklearn import GridSearchCV

param_grid = {"max_depth": [3, 5,7,9],
              "bootstrap": [True, False],
              "n_estimators": [10, 20, 40, 80]}
gs = grid_search.GridSearchCV(RandomForestRegressor(), param_grid=param_grid)
gs.fit(x_train.values, y_train['price'].values)
rf_pred = gs.predict(x_test.values)

In [72]:
from sklearn.metrics import mean_squared_error
print (mean_squared_error(y_test, rf_pred))

26476138.159389578


# Tuning of Depth for Gradient Boosting using Apache Spark.

In [74]:
from sklearn import grid_search, datasets
from sklearn.ensemble import GradientBoostingRegressor

#It is implmeneted by changing just this one line
from spark_sklearn import GridSearchCV

param_grid = {"max_depth": [3, 5,7,9],
              "learning_rate": [0.01,0.1,1],
              "n_estimators": [10, 20, 40, 80]}
gbm = grid_search.GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid)
gbm.fit(x_train.values, y_train['price'].values)
gbm_pred = gs.predict(x_test.values)