In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


Using a house dataset we will try to build a house price estimator. First let's look at the data.

In [2]:
data = pd.read_csv("train.csv")
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In the following cell, in am taking care of missing data in the dataset. I iterate over the data and if a column contains numerical data I replace the missing data with the mean value of that column. If not I replace the missing values with missing.

In [3]:
for key, values in data.iteritems():
    if (pd.api.types.is_numeric_dtype(data[key])):
        data[key].fillna(value= data[key].mean(), inplace=True)
    else :
        data[key].fillna(value= "Missing", inplace=True) 
    
mising_data = pd.Series(data.isnull().sum())     


Here I am using a Label Encoder to deal with non numerical data. I also split the data into X and y.

In [4]:
one_hot_enc = LabelEncoder()
for key, values in data.iteritems():
    if (pd.api.types.is_string_dtype(data[key])):        
           data[key] = one_hot_enc.fit_transform(data[key])


scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)

X = data.drop('SalePrice', 1)
y = data.SalePrice

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Now that we scaled and split our data, let us find which is the best regressor. We will use grid search to achieve this. This notebook is useful : https://github.com/codebasics/py/blob/master/ML/15_gridsearch/15_grid_search.ipynb

In [5]:
grid_search_linear_reg = GridSearchCV(LinearRegression(), { 'fit_intercept': [True, False],
                                                        'normalize': [True, False], 
                                                        'copy_X': [True, False] 
                                                        }, cv=5)
grid_search_linear_reg.fit(X_train, y_train)

decision_tree_param_grid = {'criterion': ['mse', 'mae'],
              'min_samples_split': [10, 20, 40],
              'max_depth': [2, 6, 8],
              'min_samples_leaf': [20, 40, 100],
              'max_leaf_nodes': [5, 20, 100],
              }

grid_search_decision_trees = GridSearchCV(DecisionTreeRegressor(), decision_tree_param_grid, cv=5)
grid_search_decision_trees.fit(X_train, y_train)

print("linear reg score :", grid_search_linear_reg.best_score_)
print("decision trees score :", grid_search_decision_trees.best_score_)
print("decision trees score whitout grid search :", cross_val_score(DecisionTreeRegressor(), X_train, y_train, cv=5).mean())


print(grid_search_decision_trees.best_params_)
print(grid_search_linear_reg.best_params_)


linear reg score : 0.808134428933404
decision trees score : 0.7465866782452731
decision trees score whitout grid search : 0.7051327840799312
{'criterion': 'mae', 'max_depth': 8, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'min_samples_split': 10}
{'copy_X': True, 'fit_intercept': False, 'normalize': True}


It seems like linear regression makes better predictions on the cross validation set. We can now test our regressors on the test set.

In [6]:
Linear_Regressor = LinearRegression(copy_X= True, fit_intercept= False, normalize= True)
# Decision_Tree_Regressor = DecisionTreeRegressor(criterion= 'mae', max_depth= 8, max_leaf_nodes= 100, min_samples_leaf= 20, min_samples_split= 10)
Decision_Tree_Regressor = DecisionTreeRegressor()
Linear_Regressor.fit(X_train, y_train)
Decision_Tree_Regressor.fit(X_train, y_train)

print("linear regression score :", Linear_Regressor.score(X, y))
print("decision tree score :", Decision_Tree_Regressor.score(X, y))


linear regression score : 0.8326689734892734
decision tree score : 0.9467894217939107
