### Midterm project - Kaggle exercise project

https://www.kaggle.com/c/house-prices-advanced-regression-techniques

Goals

* Get familar with Kaggle competition since our final project may be a real Kaggle competition.
* Practice on large data set
* Data quality checking, missing data imputation
* Feature selection. What's your rationals to choose those features.
* Create at least 5 new features and explain your reason for every new features.
* Use feature importance to help you choose 10 features
* Choose 3 ML algorithms we covered in the class to build your prediction models
* For each model you build, you will need to evaluate and show your effort to improve it.
* Your jupyter submission should be in an article quality. Do NOT print huge data set in the notebook. Use head(). Use plotting to visualize your analysis and results. Use markdown to write your comments.
* Your score is based on completeness on every step.
* Submit in HTML and ipynb format on canvas

In [310]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [311]:
train_data = pd.read_csv('train.csv')
train_data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [312]:
#selecting features from existing train data
df_feat = train_data[['LotArea', 'YearBuilt', 'OverallQual',
                              'OverallCond','CentralAir','BsmtQual', 'GrLivArea', 
                              'TotRmsAbvGrd','YearRemodAdd','SaleCondition','YrSold']]

df_feat.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,YrSold
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,2008
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,2007
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,2008
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,2006
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,2008


In [313]:
#shows which columns have missing data
df_feat.isna().any()

LotArea          False
YearBuilt        False
OverallQual      False
OverallCond      False
CentralAir       False
BsmtQual          True
GrLivArea        False
TotRmsAbvGrd     False
YearRemodAdd     False
SaleCondition    False
YrSold           False
dtype: bool

In [314]:
#replacing NA for no basement and no garage with None before label encoding

df_feat['BsmtQual'].fillna('None',  inplace=True,)

#shows that none of the data columns are NA anymore
df_feat.isna().any()

LotArea          False
YearBuilt        False
OverallQual      False
OverallCond      False
CentralAir       False
BsmtQual         False
GrLivArea        False
TotRmsAbvGrd     False
YearRemodAdd     False
SaleCondition    False
YrSold           False
dtype: bool

In [315]:
#creating the 5 new features and adding to df_feat dataframe 

df_feat['FinishBsmtSF'] = train_data['TotalBsmtSF'] - train_data['BsmtUnfSF']

df_feat['YearsSinceRemodel'] = train_data['YrSold'] - train_data['YearRemodAdd']

df_feat['TotalLivArea'] = train_data['GrLivArea'] + df_feat['FinishBsmtSF'] 

df_feat['TotalRooms'] = train_data['BsmtFullBath'] + train_data['BsmtHalfBath'] + train_data['FullBath'] + train_data['HalfBath']+ train_data['BedroomAbvGr']+ train_data['KitchenAbvGr']

df_feat['TotalExteriorSF'] = train_data['GarageArea'] + train_data['WoodDeckSF'] + train_data['OpenPorchSF'] + train_data['3SsnPorch'] + train_data['ScreenPorch'] + train_data['PoolArea'] 

df_feat.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,YrSold,FinishBsmtSF,YearsSinceRemodel,TotalLivArea,TotalRooms,TotalExteriorSF
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,2008,706,5,2416,8,609
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,2007,978,31,2240,7,758
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,2008,486,6,2272,8,650
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,2006,216,36,1933,6,677
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,2008,655,8,2853,9,1112


In [316]:
#encoding the categorical features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# generating binary values using get_dummies

copy_df = pd.get_dummies(df_feat["SaleCondition"], columns=['SaleCondition'],  prefix='SaleCond')
df_feat = df_feat.join(copy_df)

copy_df = pd.get_dummies(df_feat["CentralAir"], columns=['CentralAir'], prefix='CentralAir')
df_feat = df_feat.join(copy_df)

copy_df = pd.get_dummies(df_feat["BsmtQual"], columns=['BsmtQual'], prefix='BsmtQual')
df_feat = df_feat.join(copy_df)


df_feat.head() 

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,...,0,1,0,0,1,0,0,1,0,0


In [317]:

df = df_feat.copy()
df = df.drop("SaleCondition",1)
df = df.drop("CentralAir",1)
df = df.drop("BsmtQual",1)

df.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,GrLivArea,TotRmsAbvGrd,YearRemodAdd,YrSold,FinishBsmtSF,YearsSinceRemodel,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,1710,8,2003,2008,706,5,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,1262,6,1976,2007,978,31,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,1786,6,2002,2008,486,6,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,1717,7,1970,2006,216,36,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,2198,9,2000,2008,655,8,...,0,1,0,0,1,0,0,1,0,0


In [318]:
df.columns

Index(['LotArea', 'YearBuilt', 'OverallQual', 'OverallCond', 'GrLivArea',
       'TotRmsAbvGrd', 'YearRemodAdd', 'YrSold', 'FinishBsmtSF',
       'YearsSinceRemodel', 'TotalLivArea', 'TotalRooms', 'TotalExteriorSF',
       'SaleCond_Abnorml', 'SaleCond_AdjLand', 'SaleCond_Alloca',
       'SaleCond_Family', 'SaleCond_Normal', 'SaleCond_Partial',
       'CentralAir_N', 'CentralAir_Y', 'BsmtQual_Ex', 'BsmtQual_Fa',
       'BsmtQual_Gd', 'BsmtQual_None', 'BsmtQual_TA'],
      dtype='object')

In [319]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

model = LinearRegression()
sc = StandardScaler()

In [320]:
X = df
X = sc.fit_transform(X)
y = train_data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

model.fit(X_train,y_train)


coeff_parameter = pd.DataFrame(model.coef_,df.columns,columns=['Coefficient'])
print(coeff_parameter)

                    Coefficient
LotArea             5827.551223
YearBuilt          11559.448977
OverallQual        24746.794334
OverallCond         6214.947107
GrLivArea          17321.646854
TotRmsAbvGrd        4357.872467
YearRemodAdd         963.137165
YrSold              -684.425068
FinishBsmtSF          74.036148
YearsSinceRemodel  -1007.397389
TotalLivArea       11787.003948
TotalRooms         -3980.274109
TotalExteriorSF     9023.721774
SaleCond_Abnorml   -2068.682992
SaleCond_AdjLand     536.531991
SaleCond_Alloca     -614.207077
SaleCond_Family    -1895.069872
SaleCond_Normal      266.961614
SaleCond_Partial    2395.264125
CentralAir_N        -299.556712
CentralAir_Y         299.556712
BsmtQual_Ex        11611.407293
BsmtQual_Fa         -888.889608
BsmtQual_Gd        -1815.750804
BsmtQual_None      -1014.173246
BsmtQual_TA        -4042.424061


In [321]:
import pandas 
#pandas.set_option('display.max_rows', None)
coeff_parameter.sort_values('Coefficient')

Unnamed: 0,Coefficient
BsmtQual_TA,-4042.424061
TotalRooms,-3980.274109
SaleCond_Abnorml,-2068.682992
SaleCond_Family,-1895.069872
BsmtQual_Gd,-1815.750804
BsmtQual_None,-1014.173246
YearsSinceRemodel,-1007.397389
BsmtQual_Fa,-888.889608
YrSold,-684.425068
SaleCond_Alloca,-614.207077


In [322]:
#dropping the features to drop that have low coeff
df.drop(["CentralAir_N", "CentralAir_Y"],1)
df.head()


Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,GrLivArea,TotRmsAbvGrd,YearRemodAdd,YrSold,FinishBsmtSF,YearsSinceRemodel,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,1710,8,2003,2008,706,5,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,1262,6,1976,2007,978,31,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,1786,6,2002,2008,486,6,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,1717,7,1970,2006,216,36,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,2198,9,2000,2008,655,8,...,0,1,0,0,1,0,0,1,0,0


In [323]:
from sklearn.model_selection import train_test_split
linear = LinearRegression()
sc = StandardScaler()
X = df
X = sc.fit_transform(X)
y = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)



In [324]:
#linear regression model
linear.fit(X_train,y_train)
y_pred = linear.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

#want to decrease the top 3 and increase r-squared

#print(sum(y_pred)/len(y_pred)) ==> this tells you average of the predicitons

Mean Absolute Error: 22526.17004471814
Mean Squared Error: 1064346700.2777876
Root Mean Squared Error: 32624.32681723544
R-squared Error: 0.8336172330068808


In [325]:
#decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 26006.592465753423
Mean Squared Error: 1448570628.6952055
Root Mean Squared Error: 38060.092336924325
R-squared Error: 0.7735538717559169


In [326]:
#knn regressor
from sklearn import neighbors
knn = neighbors.KNeighborsRegressor()
knn.fit(X_train, y_train)  #fit the model
y_pred=knn.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 24599.4
Mean Squared Error: 1269491906.2783563
Root Mean Squared Error: 35629.929922445204
R-squared Error: 0.8015481459313634


### Improved Algorithms

In [345]:
# Improved linear regression model

df_improve = train_data[['MSSubClass', 'YearBuilt', 'OverallQual',
                              'OverallCond','Neighborhood','BsmtQual', 'LotConfig', 
                              'YearRemodAdd','SaleCondition','YrSold']]
#adding the 5 new features

df_improve['FinishBsmtSF'] = train_data['TotalBsmtSF'] - train_data['BsmtUnfSF']

df_improve['YearsSinceRemodel'] = train_data['YrSold'] - train_data['YearRemodAdd']

df_improve['TotalLivArea'] = train_data['GrLivArea'] + df_improve['FinishBsmtSF'] 

df_improve['TotalRooms'] = train_data['BsmtFullBath'] + train_data['BsmtHalfBath'] + train_data['FullBath'] + train_data['HalfBath']+ train_data['BedroomAbvGr']+ train_data['KitchenAbvGr']

df_improve['TotalExteriorSF'] = train_data['GarageArea'] + train_data['WoodDeckSF'] + train_data['OpenPorchSF'] + train_data['3SsnPorch'] + train_data['ScreenPorch'] + train_data['PoolArea'] 



# generating binary values

copy_df = pd.get_dummies(df_improve["SaleCondition"], columns=['SaleCondition'],  prefix='SaleCond')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["Neighborhood"], columns=['Neighborhood'], prefix='Neighborhood')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["BsmtQual"], columns=['BsmtQual'], prefix='BsmtQual')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["LotConfig"], columns=['LotConfig'], prefix='LotConfig')
df_improve = df_improve.join(copy_df)


df_improve = df_improve.drop("SaleCondition",1)
df_improve = df_improve.drop("Neighborhood",1)
df_improve = df_improve.drop("BsmtQual",1)
df_improve = df_improve.drop("LotConfig",1)



linear = LinearRegression()
X_improve = df_improve
X_improve = sc.fit_transform(X)
y_improve = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X_improve, y_improve, test_size=0.2, random_state=15)

linear.fit(X_train,y_train)
y_pred = linear.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))



Mean Absolute Error: 20595.416958940816
Mean Squared Error: 921438570.1233048
Root Mean Squared Error: 30355.20663944334
R-squared Error: 0.8543404338349588


In [343]:
#Improved decision Tree Regressor


df_improve = train_data[['MSSubClass', 'OverallQual', 'GrLivArea', 
                              'OverallCond','Neighborhood','BsmtQual', 'LotConfig', 
                              'YearRemodAdd','SaleCondition','YrSold']]
#adding the 5 new features

df_improve['FinishBsmtSF'] = train_data['TotalBsmtSF'] - train_data['BsmtUnfSF']

df_improve['YearsSinceRemodel'] = train_data['YrSold'] - train_data['YearRemodAdd']

df_improve['TotalLivArea'] = train_data['GrLivArea'] + df_improve['FinishBsmtSF'] 

df_improve['TotalRooms'] = train_data['BsmtFullBath'] + train_data['BsmtHalfBath'] + train_data['FullBath'] + train_data['HalfBath']+ train_data['BedroomAbvGr']+ train_data['KitchenAbvGr']

df_improve['TotalExteriorSF'] = train_data['GarageArea'] + train_data['WoodDeckSF'] + train_data['OpenPorchSF'] + train_data['3SsnPorch'] + train_data['ScreenPorch'] + train_data['PoolArea'] 



# generating binary values

copy_df = pd.get_dummies(df_improve["SaleCondition"], columns=['SaleCondition'],  prefix='SaleCond')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["Neighborhood"], columns=['Neighborhood'], prefix='Neighborhood')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["BsmtQual"], columns=['BsmtQual'], prefix='BsmtQual')
df_improve = df_improve.join(copy_df)

copy_df = pd.get_dummies(df_improve["LotConfig"], columns=['LotConfig'], prefix='LotConfig')
df_improve = df_improve.join(copy_df)


df_improve = df_improve.drop("SaleCondition",1)
df_improve = df_improve.drop("Neighborhood",1)
df_improve = df_improve.drop("BsmtQual",1)
df_improve = df_improve.drop("LotConfig",1)

X_improve = df_improve
X_improve = sc.fit_transform(X)
y_improve = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X_improve, y_improve, test_size=0.2, random_state=10)


regressor = DecisionTreeRegressor(max_depth=20, criterion='mae')
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 25349.36301369863
Mean Squared Error: 1204400581.8150685
Root Mean Squared Error: 34704.47495374434
R-squared Error: 0.8117234719493067
