## BUILDING REGRESSION MODELS FOR PREDICTING HOUSE PRICES AND SELECT THE BEST MODEL THAT GIVES HIGHER SCORE ON KAGGLE COMPETITION

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from matplotlib.pyplot import figure
from sklearn.preprocessing import OrdinalEncoder
import statsmodels.api as sm
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
import warnings 
warnings.filterwarnings ("ignore")


In [2]:
df_HouseP = pd.read_csv("train.csv")
df_HouseP

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
df_HouseP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Deleting columns with more NaN values i.e with NaN values exceeding the half of data

In [4]:
df_HousePrice = df_HouseP.drop(columns=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'])
df_HousePrice

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,...,112,0,0,0,0,4,2010,WD,Normal,142125


### Separating categorical and numerical columns into 2 different data frames

In [5]:
df_categorical = df_HousePrice.loc[:,df_HousePrice.dtypes==np.object]
df_categorical

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,SBrkr,TA,Min1,Attchd,Unf,TA,TA,Y,WD,Normal
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,FuseA,Gd,Typ,Attchd,Unf,TA,TA,Y,WD,Normal


In [6]:
df_numerical = df_HousePrice.select_dtypes(include='number')
df_numerical

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,366,0,112,0,0,0,0,4,2010,142125


## Filling NaN values in numerical data

In [7]:
df_numerical = df_numerical.fillna(df_numerical.mean())
df_numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1460 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

## Dealing with categorical columns 

### Checking the unique categories in each categorical column

In [8]:
Unique = {}
for i in df_categorical.columns:
    Unique[i] = (len(df_categorical[i].unique()))
    
Unique

{'MSZoning': 5,
 'Street': 2,
 'LotShape': 4,
 'LandContour': 4,
 'Utilities': 2,
 'LotConfig': 5,
 'LandSlope': 3,
 'Neighborhood': 25,
 'Condition1': 9,
 'Condition2': 8,
 'BldgType': 5,
 'HouseStyle': 8,
 'RoofStyle': 6,
 'RoofMatl': 8,
 'Exterior1st': 15,
 'Exterior2nd': 16,
 'MasVnrType': 5,
 'ExterQual': 4,
 'ExterCond': 5,
 'Foundation': 6,
 'BsmtQual': 5,
 'BsmtCond': 5,
 'BsmtExposure': 5,
 'BsmtFinType1': 7,
 'BsmtFinType2': 7,
 'Heating': 6,
 'HeatingQC': 5,
 'CentralAir': 2,
 'Electrical': 6,
 'KitchenQual': 4,
 'Functional': 7,
 'GarageType': 7,
 'GarageFinish': 4,
 'GarageQual': 6,
 'GarageCond': 6,
 'PavedDrive': 3,
 'SaleType': 9,
 'SaleCondition': 6}

### deleting columns with more than 7 unique categories

In [9]:
df_categorical.drop(columns=['Neighborhood','Condition1','Condition2','HouseStyle','RoofMatl','Exterior1st','Exterior2nd',
                            'BsmtFinType1','BsmtFinType2','Functional','GarageType','SaleType'], inplace=True)

df_categorical

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,BldgType,RoofStyle,MasVnrType,...,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,1Fam,Gable,BrkFace,...,GasA,Ex,Y,SBrkr,Gd,RFn,TA,TA,Y,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,1Fam,Gable,,...,GasA,Ex,Y,SBrkr,TA,RFn,TA,TA,Y,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,1Fam,Gable,BrkFace,...,GasA,Ex,Y,SBrkr,Gd,RFn,TA,TA,Y,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,1Fam,Gable,,...,GasA,Gd,Y,SBrkr,Gd,Unf,TA,TA,Y,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,1Fam,Gable,BrkFace,...,GasA,Ex,Y,SBrkr,Gd,RFn,TA,TA,Y,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,1Fam,Gable,,...,GasA,Ex,Y,SBrkr,TA,RFn,TA,TA,Y,Normal
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,1Fam,Gable,Stone,...,GasA,TA,Y,SBrkr,TA,Unf,TA,TA,Y,Normal
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,1Fam,Gable,,...,GasA,Ex,Y,SBrkr,Gd,RFn,TA,TA,Y,Normal
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,1Fam,Hip,,...,GasA,Gd,Y,FuseA,Gd,Unf,TA,TA,Y,Normal


## filling NaN values in categorical data by the column mode

In [10]:
df_categorical = df_categorical.fillna(df_categorical.mode().iloc[0])
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   LotShape       1460 non-null   object
 3   LandContour    1460 non-null   object
 4   Utilities      1460 non-null   object
 5   LotConfig      1460 non-null   object
 6   LandSlope      1460 non-null   object
 7   BldgType       1460 non-null   object
 8   RoofStyle      1460 non-null   object
 9   MasVnrType     1460 non-null   object
 10  ExterQual      1460 non-null   object
 11  ExterCond      1460 non-null   object
 12  Foundation     1460 non-null   object
 13  BsmtQual       1460 non-null   object
 14  BsmtCond       1460 non-null   object
 15  BsmtExposure   1460 non-null   object
 16  Heating        1460 non-null   object
 17  HeatingQC      1460 non-null   object
 18  CentralAir     1460 non-null

## Encoding categorical columns

In [11]:
ord_enc = OrdinalEncoder()

encoded = []
for i in df_categorical.columns:
    df_categorical[i] = ord_enc.fit_transform(df_categorical[[i]])
    encoded.append(df_categorical)
    
df_categorical_encoded = encoded[25]
df_categorical_encoded

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,BldgType,RoofStyle,MasVnrType,...,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleCondition
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,...,1.0,0.0,1.0,4.0,3.0,1.0,4.0,4.0,2.0,4.0
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,...,1.0,2.0,1.0,4.0,2.0,2.0,4.0,4.0,2.0,0.0
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,1.0,0.0,1.0,4.0,3.0,1.0,4.0,4.0,2.0,4.0
1456,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,3.0,...,1.0,4.0,1.0,4.0,3.0,2.0,4.0,4.0,2.0,4.0
1457,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
1458,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,3.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,4.0,4.0,2.0,4.0


## Recalling Numerical and encoded categorical data frames

In [12]:
df_categorical_encoded.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,BldgType,RoofStyle,MasVnrType,...,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleCondition
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,...,1.0,0.0,1.0,4.0,3.0,1.0,4.0,4.0,2.0,4.0
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,...,1.0,2.0,1.0,4.0,2.0,2.0,4.0,4.0,2.0,0.0
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,4.0,2.0,1.0,4.0,4.0,2.0,4.0


In [13]:
df_numerical.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


## Joining both data frames to gether

In [14]:
House_Prices_df = df_categorical_encoded.join(df_numerical)
House_Prices_df

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,BldgType,RoofStyle,MasVnrType,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,0,61,0,0,0,0,0,2,2008,208500
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,...,298,0,0,0,0,0,0,5,2007,181500
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,0,42,0,0,0,0,0,9,2008,223500
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,...,0,35,272,0,0,0,0,2,2006,140000
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,0,40,0,0,0,0,0,8,2007,175000
1456,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,3.0,...,349,0,0,0,0,0,0,2,2010,210000
1457,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,0,60,0,0,0,0,2500,5,2010,266500
1458,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,3.0,2.0,...,366,0,112,0,0,0,0,4,2010,142125


## Extracting target variable and predictors from the final data frame

In [15]:
X_train_predictors = House_Prices_df.drop(columns=['SalePrice'])
y_train_target = House_Prices_df['SalePrice']
X_train_predictors

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,BldgType,RoofStyle,MasVnrType,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,548,0,61,0,0,0,0,0,2,2008
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,0.0,1.0,2.0,...,460,298,0,0,0,0,0,0,5,2007
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,1.0,...,608,0,42,0,0,0,0,0,9,2008
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,...,642,0,35,272,0,0,0,0,2,2006
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,1.0,1.0,...,836,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,460,0,40,0,0,0,0,0,8,2007
1456,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,3.0,...,500,349,0,0,0,0,0,0,2,2010
1457,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,1.0,2.0,...,252,0,60,0,0,0,0,2500,5,2010
1458,3.0,1.0,3.0,3.0,0.0,4.0,0.0,0.0,3.0,2.0,...,240,366,0,112,0,0,0,0,4,2010


## Using stepwise for feature selection

In [16]:
# Define predictors and target variables
X_dep = X_train_predictors
y = y_train_target
# define a function called forward_stepwise that will perform feature selection using forward stepwise approach
def forward_stepwise(X_dep, y,
                       threshold_in,
                       verbose=False):
    # creating a variable a_list and assign an empy list 
    a_list = []
    # creating a variable included that will have a list of a_list variable 
    included = list(a_list)
    # Using while loop to set a condition for selecting best features into included and worst features into excluded variables
    while True:
        # creating a variable excluded that will take the columns name excluding the ones from included variable
        excluded = list(set(X_dep.columns)-set(included)) 
        # creating a new variable new_pval that holds all the pvalues of the excluded columns
        new_pval = pd.Series(index=excluded)
#         print('The remaining independent variables in our dataset are:')   # displaying this message
#         print(excluded)     # printing or displaying all the columns in excluded list
#         print(' ')       # displaying empty space
        changed=False      # set the condition at this stage
        
        for new_column in excluded:           # use for loop to iterate through each variable
            ## use stepwise to fit the model 
            final_model = sm.OLS(y, sm.add_constant(pd.DataFrame(X_dep[included+[new_column]]))).fit()
            
            ## extracting pvalues from the model
            new_pval[new_column] = final_model.pvalues[new_column]
            
        ## best-pval are the minimum p-values from the list of p-values obtained above in new-pval
        best_pval = new_pval.min()
        
        
        ## condition to compare the best-pval we get with the threshold-in pvalue we put (as the input) in the function created
        ## if this pvalue is less than the threshold-in, then we should accept this pvalue and append it in the included list created
        ## and the best performed variables are the variables with the lowes p-values
        
        if best_pval < threshold_in:
            performed_variables = new_pval.idxmin()
            included.append(performed_variables)
            changed=True
            
            ## this will give us the most features using stepwise regression method i.e it will Add best features as performed variables and print it out            
            if verbose:
#                 print('now add this variable to the best performing features')
                print('Add  {:30} with p-value {:.6}'.format(performed_variables, best_pval))
#                 print(' ')
#                 print('The best performing features are:')
#                 print(included)
#                 print(' ')
        if not changed:
            break
    ## this will return the included list of features with p-values that are less than threshold-in and summary statistics of the model
    return included
## calling our function and pass in the parameters as defined i.e X,y are our variables, 0.05 is the threshold_in for pvalues
best_features = forward_stepwise(X_dep, y, 0.05, verbose = True)

best_features

Add  OverallQual                    with p-value 2.18568e-313
Add  GrLivArea                      with p-value 1.87051e-87
Add  BsmtFinSF1                     with p-value 3.42354e-39
Add  BsmtQual                       with p-value 6.32269e-33
Add  MSSubClass                     with p-value 5.65479e-23
Add  GarageCars                     with p-value 2.728e-18
Add  KitchenQual                    with p-value 5.15133e-20
Add  BsmtExposure                   with p-value 1.33768e-09
Add  ExterQual                      with p-value 8.34399e-08
Add  LotArea                        with p-value 4.58998e-07
Add  OverallCond                    with p-value 1.70485e-06
Add  YearBuilt                      with p-value 2.11778e-10
Add  MasVnrArea                     with p-value 2.72519e-06
Add  Fireplaces                     with p-value 1.55794e-05
Add  BsmtFullBath                   with p-value 0.000654671
Add  LotFrontage                    with p-value 0.000944266
Add  MasVnrType          

['OverallQual',
 'GrLivArea',
 'BsmtFinSF1',
 'BsmtQual',
 'MSSubClass',
 'GarageCars',
 'KitchenQual',
 'BsmtExposure',
 'ExterQual',
 'LotArea',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'Fireplaces',
 'BsmtFullBath',
 'LotFrontage',
 'MasVnrType',
 'ScreenPorch',
 'SaleCondition',
 'WoodDeckSF',
 'LandContour',
 'Street']

In [17]:
best_features

['OverallQual',
 'GrLivArea',
 'BsmtFinSF1',
 'BsmtQual',
 'MSSubClass',
 'GarageCars',
 'KitchenQual',
 'BsmtExposure',
 'ExterQual',
 'LotArea',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'Fireplaces',
 'BsmtFullBath',
 'LotFrontage',
 'MasVnrType',
 'ScreenPorch',
 'SaleCondition',
 'WoodDeckSF',
 'LandContour',
 'Street']

## Creating a function to prepare test data as done on training data

In [18]:
def wrangle(filepath):
    # Read CSV file
    HouseP_test = pd.read_csv(filepath)
    # deleting columns with large number of NaN values i.e exceeding the half of data given
    df_HouseP_test = HouseP_test.drop(columns=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'])
    # Separating categorical and numerical data and save them into 2 different data frames
    df_categorical = df_HouseP_test.loc[:,df_HouseP_test.dtypes==np.object]
    df_numerical = df_HouseP_test.select_dtypes(include='number')
    # filling NaN values in numerical and categorical data frames
    df_numerical = df_numerical.fillna(df_numerical.mean())
    df_categorical = df_categorical.fillna(df_categorical.mode().iloc[0])
    # deleting categorical columns with more than 7 unique categories
    df_categorical.drop(columns=['Neighborhood','Condition1','Condition2','HouseStyle','RoofMatl','Exterior1st','Exterior2nd',
                            'BsmtFinType1','BsmtFinType2','Functional','GarageType','SaleType'], inplace=True)
    
    # Encoding categorical columns
    ord_enc = OrdinalEncoder()

    encoded = []
    for i in df_categorical.columns:
        df_categorical[i] = ord_enc.fit_transform(df_categorical[[i]])
        encoded.append(df_categorical)

    df_categorical_encoded = encoded[25]
    df_categorical_encoded
    
    # joing both numerical and categorical dataframes together
    HouseP_test_df = df_categorical_encoded.join(df_numerical)
    
    return HouseP_test_df

In [25]:
X_test_predictors = wrangle('test.csv')
y_test_target = pd.read_csv('sample_submission.csv')
y_test = y_test_target['SalePrice']


## Using the best features obtained from stepwise to build the regression models

In [55]:
# Training data
X_train = X_train_predictors[best_features]
y_train = House_Prices_df['SalePrice']

# Testing data
X_test = X_test_predictors[best_features]
y_test = y_test_target['SalePrice']

# MODEL BUILDING

# 1. LINEAR REGRESSION MODEL

In [56]:
# creating object for linear regression model and fit the model with training data
model = LinearRegression().fit(X_train,y_train)
# generating predictions of the model
y_prediction = model.predict(X_test)
y_prediction

array([113243.04773924, 159869.31188651, 171348.01314642, ...,
       151306.4334879 , 109392.62294316, 232433.68618587])

In [57]:
# Evaluating the performance of the model by calculating R^2 and RMSE
r_sq = r2_score(y_test,y_prediction)
print('coefficient of determination:',r_sq)

# Evaluating the performance of the model by calculating mean square error (mse)
mean_square_error = mse(y_test, y_prediction)

# Evaluating the performance of the model by calculating root mean square error (rmse)
RMSE = np.sqrt(mean_square_error)
print(f'RMSE for Linear Regression is {RMSE}')

coefficient of determination: -16.962033285559674
RMSE for Linear Regression is 69983.28012265405


### Generating data frame of predictions

In [30]:
prediction_df1 = pd.DataFrame(y_prediction)
prediction_df = prediction_df1.rename(columns={0:'SalePrice'})
prediction_df

Unnamed: 0,SalePrice
0,113243.047739
1,159869.311887
2,171348.013146
3,182644.189807
4,182535.803294
...,...
1454,66772.128793
1455,59806.095690
1456,151306.433488
1457,109392.622943


### Extracting ID's for customers in the sample submission dataset given for testing data

In [31]:
Customer_ID = y_test_target[['Id']]
Customer_ID

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


## Joining data frames of predictions and customer's Id's

In [32]:
House_Prices_Predictions = Customer_ID.join(prediction_df)
House_Prices_Predictions

Unnamed: 0,Id,SalePrice
0,1461,113243.047739
1,1462,159869.311887
2,1463,171348.013146
3,1464,182644.189807
4,1465,182535.803294
...,...,...
1454,2915,66772.128793
1455,2916,59806.095690
1456,2917,151306.433488
1457,2918,109392.622943


## Saving predictions as csv to be submitted to kaggle competition

In [34]:
House_Prices_Predictions.to_csv(r'house-prices-advanced-regression-techniques.csv', index = False)

# CONCLUSION

By submitting my predictions on Kaggle competition, where I used linear regression model, I got a Score of 0.16646. However, to improve the score, lets build other two regression models to see which one is better than others. i.e to see which best model that gives high score 

# 2. K-Nearest Neighbors Regression model

In [58]:
# Ref: https://realpython.com/knn-python/

# create model object
knn_model = KNeighborsRegressor(n_neighbors=3)
# fit the model
knn_model.fit(X_train, y_train)
# generate predictions
y_predictions = knn_model.predict(X_test)
y_predictions

array([134333.33333333, 178666.66666667, 160833.33333333, ...,
       182000.        , 132416.66666667, 240766.66666667])

In [59]:
# Evaluating performance of the model by calculating RMSE and R-square

RMSE_KNN = np.sqrt(mse(y_test, y_predictions))
print(f'The root mean square for KNN is {RMSE_KNN}')

KNN_r_square = r2_score(y_test, y_predictions)
print(f'The r-square for the KNN model is {KNN_r_square}')

The root mean square for KNN is 59978.88474382531
The r-square for the KNN model is -12.19361117847178


### Generating data frame of predictions

In [43]:
predictions_df1 = pd.DataFrame(y_predictions)
predictions_df = predictions_df1.rename(columns={0:'SalePrice'})
predictions_df

Unnamed: 0,SalePrice
0,134333.333333
1,178666.666667
2,160833.333333
3,186666.666667
4,126766.666667
...,...
1454,90166.666667
1455,88166.666667
1456,182000.000000
1457,132416.666667


## Joining data frames of predictions and customer's Id's

In [44]:
House_Price_Predictions = Customer_ID.join(predictions_df)
House_Price_Predictions

Unnamed: 0,Id,SalePrice
0,1461,134333.333333
1,1462,178666.666667
2,1463,160833.333333
3,1464,186666.666667
4,1465,126766.666667
...,...,...
1454,2915,90166.666667
1455,2916,88166.666667
1456,2917,182000.000000
1457,2918,132416.666667


## Saving predictions as csv to be submitted on kaggle competition

In [53]:
House_Price_Predictions.to_csv(r'house-prices-KNN.csv', index = False)

# CONCLUSION

By submitting my predictions on Kaggle competition, where I used KNN regression model, I got a Score of 0.26. 

## Improving KNN performance in scikit-learn With Bagging
## Ref: https://realpython.com/knn-python/

In [47]:
# creating parameters variables to be selected the best performing
parameters = {"n_neighbors": range(1, 50),"weights": ["uniform", "distance"]}
# create object for grid search to select the best performing parameters
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
# fit the Grid Search object with training data
gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 50),
                         'weights': ['uniform', 'distance']})

In [48]:
# dispay the best parameters
gridsearch.best_params_

{'n_neighbors': 8, 'weights': 'distance'}

In [49]:
# selecting best neighbor and weights from best parameters
best_neighbor = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]

# initialize another object for bagging model and fit the model
MybaggedKNN = KNeighborsRegressor(n_neighbors = best_neighbor, weights = best_weights)
model_Bagging = BaggingRegressor(MybaggedKNN, n_estimators=100)
model_Bagging.fit(X_train, y_train)

# generate predictions
y_predictions_grid = model_Bagging.predict(X_test)
y_predictions_grid

array([135217.27795394, 178127.61068918, 175425.92590984, ...,
       220152.147422  , 127826.24126649, 231582.25149939])

In [50]:
# Evaluating the model by calculating RMSE and R-square

RMSE_KNN_Bag = np.sqrt(mse(y_test, y_predictions_grid))
print(f'The root mean square for KNN bagging model is {RMSE_KNN_Bag}')

KNN_r_square_Bag = r2_score(y_test, y_predictions_grid)
print(f'The r-square for the KNN bagging model is {KNN_r_square_Bag}')

The root mean square for KNN bagging model is 51693.181855609764
The r-square for the KNN bagging model is -8.800166151153835


### Generating data frame of predictions

In [51]:
predictions_KNNBag1 = pd.DataFrame(y_predictions_grid)
predictions_KNNBag = predictions_KNNBag1.rename(columns={0:'SalePrice'})
predictions_KNNBag

Unnamed: 0,SalePrice
0,135217.277954
1,178127.610689
2,175425.925910
3,181849.006926
4,130090.245195
...,...
1454,91303.779989
1455,96130.747037
1456,220152.147422
1457,127826.241266


## Joining data frames of predictions and customer's Id's

In [52]:
House_Price_Bag_Predictions = Customer_ID.join(predictions_KNNBag)
House_Price_Bag_Predictions

Unnamed: 0,Id,SalePrice
0,1461,135217.277954
1,1462,178127.610689
2,1463,175425.925910
3,1464,181849.006926
4,1465,130090.245195
...,...,...
1454,2915,91303.779989
1455,2916,96130.747037
1456,2917,220152.147422
1457,2918,127826.241266


## Saving predictions as csv to be submitted on kaggle competition

In [54]:
House_Price_Bag_Predictions.to_csv(r'house-prices-Bag-KNN.csv', index = False)

# CONCLUSION

By submitting my predictions on Kaggle competition, where I used Bag KNN regression model, I got a Score of 0.2696. 

# 3. Decision Trees regression model

### Checking the optimal number of max_depth that will perform better for Decision Tree

In [67]:
# creating empty dictionary that will store max depths as keys and mse as values
MSE = {}
# use for loop to iterate in the range of 100 max depths to see which one is optimal than others
for i in range(1,100):
    # create a regressor object
    Dec_regressor = DecisionTreeRegressor(random_state = 0, max_depth=i) 

    # fit the regressor with X_trains and y_trains data
    Dec_regressor.fit(X_train, y_train)

    # predicting the test data

    y_Decision_pred = Dec_regressor.predict(X_test)
    # Calculating mean square error
    MSE_DecisionTree = mse(y_test, y_Decision_pred)
    # saving max depth as key and mse as values in empty dictionary created
    MSE[i] = MSE_DecisionTree
    
# Extracting max depth that gives minimum mse and take it as an optimal max depth
Keymin = min(zip(MSE.values(), MSE.keys()))[1]
# printing the optimal max-depth
print(f'The optimal max_depth is {Keymin}')

The optimal max_depth is 1


In [68]:
# create a regressor object
Dec_regressor = DecisionTreeRegressor(random_state = 0, max_depth=Keymin) 
  
# fit the regressor with X_trains and y_trains data
Dec_regressor.fit(X_train, y_train)

# predicting the test data
y_Decision_pred = Dec_regressor.predict(X_test)
y_Decision_pred

array([157832.43298132, 157832.43298132, 157832.43298132, ...,
       157832.43298132, 157832.43298132, 157832.43298132])

In [69]:
# Evaluating the model by calculating RMSE and R-square

# calculating and print root mean square
RMSE_DecisionTree = np.sqrt(mse(y_test, y_Decision_pred))
print(f'The root mean square for Decision Tree regression model is {RMSE_DecisionTree}')

# calculating and print r_square
DecisionTree_r_square = r2_score(y_test, y_Decision_pred)
print(f'The r-square for the Decision Tree regression model is {DecisionTree_r_square}')

The root mean square for Decision Tree regression model is 57259.83556256904
The r-square for the Decision Tree regression model is -11.024502077961063


### Getting data frames of predictions

In [70]:
predictions_DecisionTrees = pd.DataFrame(y_Decision_pred)
predictions_DecisionTree = predictions_DecisionTrees.rename(columns={0:'SalePrice'})
predictions_DecisionTree

Unnamed: 0,SalePrice
0,157832.432981
1,157832.432981
2,157832.432981
3,157832.432981
4,305035.899563
...,...
1454,157832.432981
1455,157832.432981
1456,157832.432981
1457,157832.432981


## Joining data frames of predictions and customer's Id's

In [71]:
House_Price_DecisionTree_Predictions = Customer_ID.join(predictions_DecisionTree)
House_Price_DecisionTree_Predictions

Unnamed: 0,Id,SalePrice
0,1461,157832.432981
1,1462,157832.432981
2,1463,157832.432981
3,1464,157832.432981
4,1465,305035.899563
...,...,...
1454,2915,157832.432981
1455,2916,157832.432981
1456,2917,157832.432981
1457,2918,157832.432981


## Saving predictions as csv to be submitted on kaggle competition

In [72]:
House_Price_DecisionTree_Predictions.to_csv(r'house-prices-DecisionTree.csv', index = False)

# CONCLUSION

By submitting these predictions on Kaggle competition, where I used Decision Tree regression model, I got a Score of 0.33013. 

### Therefore, from the results obtained, Decision Tree regressor gives higher score than KNN and Linear regression model