### Midterm project - Kaggle exercise project

https://www.kaggle.com/c/house-prices-advanced-regression-techniques

Goals

* Get familar with Kaggle competition since our final project may be a real Kaggle competition.
* Practice on large data set
* Data quality checking, missing data imputation
* Feature selection. What's your rationals to choose those features.
* Create at least 5 new features and explain your reason for every new features.
* Use feature importance to help you choose 10 features
* Choose 3 ML algorithms we covered in the class to build your prediction models
* For each model you build, you will need to evaluate and show your effort to improve it.
* Your jupyter submission should be in an article quality. Do NOT print huge data set in the notebook. Use head(). Use plotting to visualize your analysis and results. Use markdown to write your comments.
* Your score is based on completeness on every step.
* Submit in HTML and ipynb format on canvas

In [648]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [649]:
train_data = pd.read_csv('train.csv')
train_data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [650]:
#selecting features from existing train data
df_feat = train_data[['LotArea', 'YearBuilt', 'OverallQual',
                              'OverallCond','CentralAir','BsmtQual', 'GrLivArea', 
                              'TotRmsAbvGrd','YearRemodAdd','SaleCondition','YrSold']]

df_feat.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,YrSold
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,2008
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,2007
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,2008
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,2006
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,2008


In [651]:
#shows which columns have missing data
df_feat.isna().any()

LotArea          False
YearBuilt        False
OverallQual      False
OverallCond      False
CentralAir       False
BsmtQual          True
GrLivArea        False
TotRmsAbvGrd     False
YearRemodAdd     False
SaleCondition    False
YrSold           False
dtype: bool

In [652]:
#replacing NA for no basement and no garage with None before label encoding

df_feat['BsmtQual'].fillna('None',  inplace=True,)

#shows that none of the data columns are NA anymore
df_feat.isna().any()

LotArea          False
YearBuilt        False
OverallQual      False
OverallCond      False
CentralAir       False
BsmtQual         False
GrLivArea        False
TotRmsAbvGrd     False
YearRemodAdd     False
SaleCondition    False
YrSold           False
dtype: bool

In [653]:
#creating the 5 new features and adding to df_feat dataframe 

df_feat['FinishBsmtSF'] = train_data['TotalBsmtSF'] - train_data['BsmtUnfSF']

df_feat['YearsSinceRemodel'] = train_data['YrSold'] - train_data['YearRemodAdd']

df_feat['TotalLivArea'] = train_data['GrLivArea'] + df_feat['FinishBsmtSF'] 

df_feat['TotalRooms'] = train_data['BsmtFullBath'] + train_data['BsmtHalfBath'] + train_data['FullBath'] + train_data['HalfBath']+ train_data['BedroomAbvGr']+ train_data['KitchenAbvGr']

### NEEDD ONE MORE FEATURE CREATED HEREEEE

df_feat.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,YrSold,FinishBsmtSF,YearsSinceRemodel,TotalLivArea,TotalRooms
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,2008,706,5,2416,8
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,2007,978,31,2240,7
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,2008,486,6,2272,8
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,2006,216,36,1933,6
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,2008,655,8,2853,9


In [654]:
#encoding the categorical features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# generating binary values using get_dummies

copy_df = pd.get_dummies(df_feat["SaleCondition"], columns=['SaleCondition'],  prefix='SaleCond')
df_feat = df_feat.join(copy_df)

copy_df = pd.get_dummies(df_feat["CentralAir"], columns=['CentralAir'], prefix='CentralAir')
df_feat = df_feat.join(copy_df)

copy_df = pd.get_dummies(df_feat["BsmtQual"], columns=['BsmtQual'], prefix='BsmtQual')
df_feat = df_feat.join(copy_df)


df_feat.head() 

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,CentralAir,BsmtQual,GrLivArea,TotRmsAbvGrd,YearRemodAdd,SaleCondition,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,Y,Gd,1710,8,2003,Normal,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,Y,Gd,1262,6,1976,Normal,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,Y,Gd,1786,6,2002,Normal,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,Y,TA,1717,7,1970,Abnorml,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,Y,Gd,2198,9,2000,Normal,...,0,1,0,0,1,0,0,1,0,0


In [655]:

df = df_feat.copy()
df = df.drop("SaleCondition",1)
df = df.drop("CentralAir",1)
df = df.drop("BsmtQual",1)

df.head()

Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,GrLivArea,TotRmsAbvGrd,YearRemodAdd,YrSold,FinishBsmtSF,YearsSinceRemodel,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,1710,8,2003,2008,706,5,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,1262,6,1976,2007,978,31,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,1786,6,2002,2008,486,6,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,1717,7,1970,2006,216,36,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,2198,9,2000,2008,655,8,...,0,1,0,0,1,0,0,1,0,0


In [656]:
df.columns

Index(['LotArea', 'YearBuilt', 'OverallQual', 'OverallCond', 'GrLivArea',
       'TotRmsAbvGrd', 'YearRemodAdd', 'YrSold', 'FinishBsmtSF',
       'YearsSinceRemodel', 'TotalLivArea', 'TotalRooms', 'SaleCond_Abnorml',
       'SaleCond_AdjLand', 'SaleCond_Alloca', 'SaleCond_Family',
       'SaleCond_Normal', 'SaleCond_Partial', 'CentralAir_N', 'CentralAir_Y',
       'BsmtQual_Ex', 'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_None',
       'BsmtQual_TA'],
      dtype='object')

In [657]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

model = LinearRegression()
sc = StandardScaler()

In [658]:
X = df
X = sc.fit_transform(X)
y = train_data['SalePrice']
model.fit(X_train,y_train)

coeff_parameter = pd.DataFrame(model.coef_,jj.columns,columns=['Coefficient'])
print(coeff_parameter)

                    Coefficient
LotArea             6539.981134
YearBuilt          14078.349951
OverallQual        26122.644329
OverallCond         6683.526778
GrLivArea          19671.792638
TotRmsAbvGrd        4184.780305
YearRemodAdd        1068.486529
YrSold              -848.760964
FinishBsmtSF         332.411166
YearsSinceRemodel  -1123.344988
TotalLivArea       13538.740996
TotalRooms         -4794.210382
SaleCond_Abnorml   -2175.875203
SaleCond_AdjLand     351.651496
SaleCond_Alloca     -353.692995
SaleCond_Family    -1862.431386
SaleCond_Normal      215.208309
SaleCond_Partial    2500.373353
CentralAir_N        -280.712820
CentralAir_Y         280.712820
BsmtQual_Ex        12165.424624
BsmtQual_Fa         -644.080656
BsmtQual_Gd        -1992.909390
BsmtQual_None      -1034.160851
BsmtQual_TA        -4242.691279


In [659]:
import pandas 
#pandas.set_option('display.max_rows', None)
coeff_parameter.sort_values('Coefficient')

Unnamed: 0,Coefficient
TotalRooms,-4794.210382
BsmtQual_TA,-4242.691279
SaleCond_Abnorml,-2175.875203
BsmtQual_Gd,-1992.90939
SaleCond_Family,-1862.431386
YearsSinceRemodel,-1123.344988
BsmtQual_None,-1034.160851
YrSold,-848.760964
BsmtQual_Fa,-644.080656
SaleCond_Alloca,-353.692995


In [660]:
#dropping the features to drop that have low coeff
df.drop(["CentralAir_N", "CentralAir_Y"],1)
df.head()


Unnamed: 0,LotArea,YearBuilt,OverallQual,OverallCond,GrLivArea,TotRmsAbvGrd,YearRemodAdd,YrSold,FinishBsmtSF,YearsSinceRemodel,...,SaleCond_Family,SaleCond_Normal,SaleCond_Partial,CentralAir_N,CentralAir_Y,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA
0,8450,2003,7,5,1710,8,2003,2008,706,5,...,0,1,0,0,1,0,0,1,0,0
1,9600,1976,6,8,1262,6,1976,2007,978,31,...,0,1,0,0,1,0,0,1,0,0
2,11250,2001,7,5,1786,6,2002,2008,486,6,...,0,1,0,0,1,0,0,1,0,0
3,9550,1915,7,5,1717,7,1970,2006,216,36,...,0,0,0,0,1,0,0,0,0,1
4,14260,2000,8,5,2198,9,2000,2008,655,8,...,0,1,0,0,1,0,0,1,0,0


In [661]:


from sklearn.model_selection import train_test_split
linear = LinearRegression()
sc = StandardScaler()
X = df
X = sc.fit_transform(X)
y = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)



In [662]:
#linear regression model
linear.fit(X_train,y_train)
y_pred = linear.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

#want to decrease the top 3 and increase r-squared

#print(sum(y_pred)/len(y_pred)) ==> this tells you average of the predicitons

Mean Absolute Error: 22952.139712467106
Mean Squared Error: 1082339537.5603538
Root Mean Squared Error: 32898.92912482645
R-squared Error: 0.8308045235275834


In [663]:
#decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 25666.383561643837
Mean Squared Error: 1461548246.6164384
Root Mean Squared Error: 38230.20071378698
R-squared Error: 0.7715251606431276


In [664]:
#knn regressor
from sklearn import neighbors
knn = neighbors.KNeighborsRegressor()
knn.fit(X_train, y_train)  #fit the model
y_pred=knn.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 25482.5198630137
Mean Squared Error: 1383095029.910822
Root Mean Squared Error: 37189.98561321074
R-squared Error: 0.7837892690126884
