# Ensemble learning



In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

train = pd.read_csv("train.csv")

# Split the column names by dtype: object columns are treated as categorical,
# every remaining column as numerical.
categorical_features = [
    column for column in train.columns
    if train[column].dtype == 'object'
]
numerical_features = [
    column for column in train.columns
    if column not in categorical_features
]

# EDA

In [None]:
# Distribution of the target variable SalePrice (histogram with a density curve)
sns.set_style('darkgrid')
plt.figure(figsize=(20, 10))
sns.distplot(
    train['SalePrice'],
    color='red',
    bins=100,
    hist_kws={'alpha': 0.4},
)

In [None]:
# Histograms of every numerical feature.
# DataFrame.hist creates its own figure (sized through figsize), so no separate
# plt.figure() call is made here: it would only leave an empty figure behind.
train_num=train[numerical_features]
train_num.hist(figsize=(20,25),bins=50, xlabelsize=5, ylabelsize=5)

In [None]:
# Correlation heatmap of the numerical features.
# The upper triangle (diagonal included) is masked so each pair is shown once.
corr = train_num.corr()
msk = np.triu(np.ones_like(corr))

plt.figure(figsize=(25, 25))
sns.heatmap(
    corr,
    mask=msk,
    cmap=plt.cm.RdBu_r,
    annot=True,
    annot_kws={'size': 10},
)

### Key takeaways from the heatmap

- GarageArea和GarageCars的相关系数为0.88。为了避免多重共线性，我们将只保留其中一个，即GarageCars，原因是它与SalePrice的相关系数略高于GarageArea。
- 1stFlrSF和TotalBsmtSF的相关系数为0.82，出于同样的原因，我们只保留1stFlrSF
- TotRmsAbvGr和GrLivArea的相关系数为0.83，出于同样的原因，我们将只保留GrLivArea
- GarageYrBlt和YearBuilt的相关性为0.83，因此我们将只保留YearBuilt

In [None]:
# Box plot of every numerical feature, used to spot extreme outliers.
# The grid is sized from the number of features instead of a fixed row count,
# so the cell keeps working if numerical_features changes.
n_col=2
n_row=-(-len(numerical_features)//n_col)   # ceiling division

fig,ax=plt.subplots(nrows=n_row,ncols=n_col,figsize=(20,6*n_row),squeeze=False)
panels=ax.ravel()   # row-major order: left-to-right, then top-to-bottom
for feature,panel in zip(numerical_features,panels):
    sns.boxplot(x=feature,data=train,palette='magma',ax=panel)
for panel in panels[len(numerical_features):]:
    panel.set_visible(False)   # hide the spare panel when the feature count is odd

### Observations from the above sets of boxplots

- 一些特征，如LotFrontage、BsmtFinSF1、TotalBsmtSF和MiscVal有一些极端的异常值。我们将在数据预处理部分更严格地研究它们。

In [None]:
# Number of distinct (non-null) categories in each categorical feature
dict_ = {
    feature: train[feature].nunique()
    for feature in categorical_features
}
pd.DataFrame(dict_, index=['unique_counts']).transpose()

In [None]:
# Count plots showing the distribution of categories in every categorical feature.
# Neighborhood, Exterior1st and Exterior2nd have a large number of categories
# (see the unique-count table above), so they are drawn separately on full-width panels.
n_col=2
features_1=['Neighborhood','Exterior1st','Exterior2nd']
features_2=[feature for feature in categorical_features if feature not in features_1]
n_row=-(-len(features_2)//n_col)   # ceiling division: just enough rows for features_2

# two-column grid for the features in features_2
fig,ax=plt.subplots(nrows=n_row,ncols=n_col,figsize=(20,6*n_row),squeeze=False)
panels=ax.ravel()   # row-major order: left-to-right, then top-to-bottom
for feature,panel in zip(features_2,panels):
    sns.countplot(x=feature,data=train,palette='magma',ax=panel)
for panel in panels[len(features_2):]:
    panel.set_visible(False)   # hide the spare panel when the feature count is odd

# one full-width panel per high-cardinality feature in features_1
fig,ax=plt.subplots(nrows=len(features_1),ncols=1,figsize=(20,10),squeeze=False)
for feature,panel in zip(features_1,ax.ravel()):
    sns.countplot(x=feature,data=train,palette='magma',ax=panel)

In [None]:
# Box plots of SalePrice against every categorical feature, to see how the
# price varies between categories.
# Neighborhood, Exterior1st and Exterior2nd have a large number of categories
# (see the unique-count table above), so they are drawn separately on full-width panels.
n_col=2
features_1=['Neighborhood','Exterior1st','Exterior2nd']
features_2=[feature for feature in categorical_features if feature not in features_1]
n_row=-(-len(features_2)//n_col)   # ceiling division: just enough rows for features_2

# two-column grid for the features in features_2
fig,ax=plt.subplots(nrows=n_row,ncols=n_col,figsize=(20,6*n_row),squeeze=False)
panels=ax.ravel()   # row-major order: left-to-right, then top-to-bottom
for feature,panel in zip(features_2,panels):
    sns.boxplot(x=feature,y='SalePrice',data=train,palette='magma',ax=panel)
for panel in panels[len(features_2):]:
    panel.set_visible(False)   # hide the spare panel when the feature count is odd

# one full-width panel per high-cardinality feature in features_1
fig,ax=plt.subplots(nrows=len(features_1),ncols=1,figsize=(20,10),squeeze=False)
for feature,panel in zip(features_1,ax.ravel()):
    sns.boxplot(x=feature,y='SalePrice',data=train,palette='magma',ax=panel)

### Observations from the above set of countplots and boxplots:

- Street（街道）有两个类别“Pave”:铺砌和“Grvl”:砾石，只有6个观察是砾石类型的街道。
- Alley有两个类别“Pave”和“Grvl”，两个类别总共有91个观察结果。铺砌型小巷通道的房屋价格更高，这符合预期，但在Grvl类型中也有一些异常值；数据描述中提到空值对应于无小巷通道。
- Utilities在数据集中有两个类别，只有一个属于NoSeWa。虽然在测试集中“Utility”的所有数据点都是AllPub
- Condition2共有8个类别，但仅类别Norm就有1445个数据点。我们可以去掉这个特征，因为根据我的理解，同时保留Condition1和Condition2两个特征会导致多重共线性。
- 屋顶材料（RoofMatl）共有8个类别，但1434个数据点来自“CompShg”，屋顶材料为“WdShngl”的房屋价格昂贵，在这一类别中也有一些极端的异常值。
- Fence共有288个非空值，其中157个属于MnPrv，数据描述中给出null值对应No Fence
- 我们将处理异常值，如果需要，在数据预处理时更深入到类别中去

In [None]:
# Work on an independent copy so the raw `train` frame stays untouched
train_ = train.copy(deep=True)

In [None]:
# Remove the Id column and the numerical features that are highly correlated
# with another retained feature (see the heatmap takeaways above).
columns_to_drop = ['Id', 'GarageArea', 'TotalBsmtSF', 'TotRmsAbvGrd', 'GarageYrBlt']
train_ = train_.drop(columns=columns_to_drop)

In [None]:
# Recompute the categorical / numerical column lists on the reduced frame:
# object-dtype columns are categorical, everything else is numerical.
categorical_features = [
    column for column in train_.columns
    if train_[column].dtype == 'object'
]
numerical_features = [
    column for column in train_.columns
    if column not in categorical_features
]

In [None]:
# Separate numerical and categorical working frames.
# Both are modified later (imputation, row/column drops), so take explicit
# copies: this keeps train_ untouched and avoids chained-assignment problems
# (SettingWithCopyWarning) when columns of these frames are overwritten.
df_num=train_[numerical_features].copy()
df_cat=train_[categorical_features].copy()

In [None]:
# Percentage of missing values in each numerical feature
dict_ = {
    feature: ((df_num[feature].isnull().sum()) / len(df_num)) * 100
    for feature in df_num.columns
}
pd.DataFrame(dict_, index=['%age null']).transpose()

在数值特征中，我们只有两个特征具有空值：LotFrontage和MasVnrArea。LotFrontage中有少量离群值，并且假设其空值是真正的缺失值，因此我们使用该特征的中位数来插值；而对于MasVnrArea，数据描述中给出None意味着没有砌体饰面，因此我们将简单地用零来插值

In [None]:
# LotFrontage: fill missing values with the column median
lot_frontage_median = df_num['LotFrontage'].median()
df_num['LotFrontage'] = df_num['LotFrontage'].fillna(lot_frontage_median)
# MasVnrArea: fill missing values with 0
df_num['MasVnrArea'] = df_num['MasVnrArea'].fillna(0)

从数值特征的箱形图中，我们将分析具有极端异常值的特征

In [None]:
# LotFrontage: rows with a value above 150, shown next to their SalePrice
wide_frontage = df_num['LotFrontage'] > 150
df_num.loc[wide_frontage, ['LotFrontage', 'SalePrice']]

In [None]:
# LotArea: rows with a value above 100000, with SalePrice and LotFrontage for context
df_num.loc[df_num['LotArea'] > 100000, ['LotArea', 'SalePrice', 'LotFrontage']]

在上面关于LotFrontage和LotArea的两张表中，LotArea最大的房屋其SalePrice和LotFrontage也都很高，因此保留最高值215245是合理的。但对于LotFrontage，删除极端离群值LotFrontage(313)更为合理

In [None]:
# BsmtFinSF1: rows with a value above 2000 (all numerical columns shown)
df_num.loc[df_num['BsmtFinSF1'] > 2000]

In [None]:
# 1stFlrSF: rows with a value above 3000 (all numerical columns shown)
df_num.loc[df_num['1stFlrSF'] > 3000]

In [None]:
# SalePrice: rows with a price above 700000 (all numerical columns shown)
df_num.loc[df_num['SalePrice'] > 700000]

我们只需要删除LotFrontage=313的行（这同时去掉了BsmtFinSF1和1stFlrSF中的极端异常值），以及SalePrice大于700000的房屋

In [None]:
# Index labels of the rows treated as outliers
idx_1 = df_num.index[df_num['LotFrontage'] == 313]     # LotFrontage equal to 313
idx_2 = df_num.index[df_num['SalePrice'] > 700000]     # SalePrice above 700000

In [None]:
# Remove the outlier rows from both the numerical frame df_num and the
# categorical frame df_cat so that the two stay row-aligned.
# The two index sets are merged first: dropping them one after the other would
# raise a KeyError if the same row appeared in both idx_1 and idx_2.
rows_to_drop=idx_1.union(idx_2)

df_num=df_num.drop(index=rows_to_drop).reset_index(drop=True)
df_cat=df_cat.drop(index=rows_to_drop).reset_index(drop=True)

当我们观察测试集时，有很多数值特征包含空值，因此我们将使用带有中位数策略的sklearn SimpleImputer对测试集进行插补

In [None]:
# Target is SalePrice; every other numerical column is a predictor
y = df_num['SalePrice']
X = df_num.drop(columns='SalePrice')

# Median imputer fitted on the predictors; the result is wrapped back into a
# DataFrame with the original column names.
imputer_num = SimpleImputer(strategy='median')
imputed_values = imputer_num.fit_transform(X)
X_ = pd.DataFrame(imputed_values, columns=X.columns)

In [None]:
# Standardise the imputed numerical predictors, keeping the column names
scaler_ = StandardScaler()
scaled_values = scaler_.fit_transform(X_)
X_ = pd.DataFrame(scaled_values, columns=X_.columns)

## Categorical features

In [None]:
# Percentage of missing values in each categorical feature
dict_ = {
    feature: ((df_cat[feature].isnull().sum()) / len(df_cat)) * 100
    for feature in df_cat.columns
}
pd.DataFrame(dict_, index=['%age null']).transpose()

对于“POOLQC”，它有超过99%的缺失值，从countplot和boxplot中，虽然POOLQC是优秀的房子是昂贵的，但在该特定类别中只有两个条目，并且在“SalePrice”中也有许多异常值，没有Pool，它将被删除

In [None]:
# PoolQC is almost entirely missing, so the column is removed
df_cat = df_cat.drop(columns='PoolQC')

对于MiscFeature，它有大约96%的缺失值，但数据描述中提到，空值代表没有MiscFeature；从countplot和boxplot中可以看到有少量(大约50个)值属于Shed类别，因此现在我只用No_misc替换缺失值

“Heating”功能在GasA类型类别中有1400多个条目，因此最好删除它，无论如何我们有加热qc功能。

In [None]:
# Heating is dominated by a single category, so the column is removed
df_cat = df_cat.drop(columns=['Heating'])

In [None]:
# Missing MiscFeature entries get the explicit label 'No_misc'
df_cat['MiscFeature'] = df_cat['MiscFeature'].fillna('No_misc')

对于Alley、Fence、MasVnrType和FireplaceQu，空值表示该特征不存在，因此我们将相应地替换空值

In [None]:
# For these columns a missing value is replaced by an explicit "not present" label
absent_labels = {
    'Alley': 'No_alley',
    'Fence': 'No_fence',
    'MasVnrType': 'No_venner',
    'FireplaceQu': 'No_fireplace',
}
for column, label in absent_labels.items():
    df_cat[column] = df_cat[column].fillna(label)

对于GarageType,GarageFinish,GarageQual和GarageCond，在所有这些空值中表示没有车库。

我相信GarageQual和GarageCond在某种程度上是一样的甚至我们从countplot中观察到的分布也是一样的，因此我们将删除GarageCond在其他三个中我们将null值替换为No_garage

In [None]:
# GarageCond is dropped; in the remaining garage columns a missing value is
# replaced by the label 'No_garage'.
df_cat = df_cat.drop(columns='GarageCond')
for column in ('GarageType', 'GarageFinish', 'GarageQual'):
    df_cat[column] = df_cat[column].fillna('No_garage')

在BsmtQual、BsmtCond、BsmtFinType1、BsmtFinType2和BsmtExposure中，我们将删除BsmtCond和BsmtFinType2(基于计数图的分布)，其余三个特征的空值替换为no_bsmt

In [None]:
# BsmtCond and BsmtFinType2 are dropped; in the remaining basement columns a
# missing value is replaced by the label 'no_bsmt'.
df_cat = df_cat.drop(columns=['BsmtCond', 'BsmtFinType2'])
for column in ('BsmtQual', 'BsmtFinType1', 'BsmtExposure'):
    df_cat[column] = df_cat[column].fillna('no_bsmt')

我们使用众数对electrical进行插值

In [None]:
# Electrical: fill missing values with the most frequent category
most_common_electrical = df_cat['Electrical'].mode()[0]
df_cat['Electrical'] = df_cat['Electrical'].fillna(most_common_electrical)

我们还将创建一个imputer类，就像我们为numerical_features所做的那样，如果存在除上述之外的任何其他空值，我们将使用它

In [None]:
# Most-frequent imputer fitted on the categorical frame; the result is wrapped
# back into a DataFrame with the original column names.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputed_categories = imputer_cat.fit_transform(df_cat)
df_cat_ = pd.DataFrame(imputed_categories, columns=df_cat.columns)

现在我们已经完成了插值，我们将进入编码部分，如果我们看一下分类特征，很明显，有些特征是有序的，有些应该被视为nominal的。我会试着把它们分开，并相应地对待它们

In [None]:
# Categorical columns encoded as ordered integers; all others are treated as nominal
ordinal_features = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtExposure',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageFinish',
    'CentralAir',
]
nominal_features = [column for column in df_cat.columns if column not in ordinal_features]

In [None]:
# Integer encodings for the ordinal columns. Values absent from a mapping become NaN.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

ordinal_maps = {
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': {**quality_scale, 'no_bsmt': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'no_bsmt': 0},
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'FireplaceQu': {**quality_scale, 'No_fireplace': 0},
    'GarageQual': {**quality_scale, 'No_garage': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'No_garage': 0},
    'CentralAir': {'Y': 1, 'N': 0},
}
for column, mapping in ordinal_maps.items():
    df_cat_[column] = df_cat_[column].map(mapping)

In [None]:
# Split the imputed categorical frame into its nominal and ordinal parts
df_cat_nominal = df_cat_.loc[:, nominal_features]
df_cat_ordinal = df_cat_.loc[:, ordinal_features]

In [None]:
# Print the column summary of the ordinal frame to check the result of the mapping step
df_cat_ordinal.info()

In [None]:
# One-hot encode the nominal features. Categories unseen during fitting are
# ignored at transform time (handle_unknown='ignore').
encoder_ = OneHotEncoder(handle_unknown='ignore')
nominal_encoded = encoder_.fit_transform(df_cat_nominal)
# densify the encoder output; the resulting columns are numbered 0..k-1
df_cat_nominal = pd.DataFrame(nominal_encoded.toarray())

我计划在RandomForest,xgboost和catboost上训练数据，由于catboost需要分类特征，因此我将以两种不同的方式连接所有特征

In [None]:
# Final design matrix: scaled numerical + ordinal-encoded + one-hot nominal columns, side by side
feature_blocks = [X_, df_cat_ordinal, df_cat_nominal]
train_data = pd.concat(feature_blocks, axis=1)

In [None]:
# Hold out 20% of the rows for validation (fixed seed) and convert every
# piece to a NumPy array.
X_train_, X_val_, y_train_, y_val_ = (
    part.to_numpy()
    for part in train_test_split(train_data, y, test_size=0.2, random_state=42)
)

In [None]:
# Random forest tuned with a randomized hyper-parameter search
# (20 sampled configurations, 5-fold CV, scored by negative MSE).
forest=RandomForestRegressor(random_state=42)   # fixed seed so repeated runs build the same trees

n_estimators=[int(x) for x in np.linspace(2,100,50)]   # 50 tree counts between 2 and 100
# 1.0 = consider every feature at each split; this is what the removed 'auto'
# option meant for regressors.
max_features=[1.0,'sqrt']
max_depth=[int(x) for x in np.linspace(2,50,10)]
min_samples_split=[2,3,4,5]   # an integer min_samples_split must be >= 2; 1 is rejected by scikit-learn
min_samples_leaf=[1,2,3,4,5]
bootstrap=[True,False]

#defining the parameter grid space
parameter_grid_forest={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,   # was defined above but missing from the search space
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
}

grid_search=RandomizedSearchCV(estimator=forest,param_distributions=parameter_grid_forest,n_iter=20,cv=5,scoring='neg_mean_squared_error',random_state=42,verbose=True)

In [None]:
# Run the random-forest randomized search on the training split
grid_search.fit(X_train_,y_train_)

In [None]:
# Keep the best random-forest configuration found by the search
model=grid_search.best_estimator_

In [None]:
# Fit the selected random forest on the training split.
# NOTE(review): with the default refit=True the search has already refitted
# best_estimator_ on this same data, so this call looks redundant — confirm.
model.fit(X_train_,y_train_)

In [None]:
# Random-forest predictions for the validation split
y_pred=model.predict(X_val_)

In [None]:
# Root-mean-squared error of the random-forest predictions on the validation split
np.sqrt(mean_squared_error(y_val_,y_pred))

In [None]:
# XGBoost regressor tuned with a randomized hyper-parameter search
# (50 sampled configurations, 5-fold CV, scored by negative MAE, 4 parallel jobs).
xgb = XGBRegressor()

# candidate values for each hyper-parameter
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]
min_child_weight = [1, 2, 3, 4]

hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
)

grid_ = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    verbose=5,
    random_state=42,
)


In [None]:
# Run the XGBoost randomized search on the training split
grid_.fit(X_train_,y_train_)

In [None]:
# Keep the best XGBoost configuration found by the search.
# This rebinds `model`, which previously held the random forest.
model=grid_.best_estimator_

In [None]:
# Fit the selected XGBoost model on the training split.
# NOTE(review): with the default refit=True the search has already refitted
# best_estimator_ on this same data, so this call looks redundant — confirm.
model.fit(X_train_,y_train_)

In [None]:
# XGBoost predictions for the validation split (overwrites the earlier y_pred)
y_pred=model.predict(X_val_)

In [None]:
# Root-mean-squared error of the XGBoost predictions on the validation split
np.sqrt(mean_squared_error(y_val_,y_pred))

In [None]:
# Comparison table of the two models.
# The scores are typed in by hand, not read from the cells above.
model_scores = [
    ('RandomForest Regressor', 23890.54),
    ('XGB Regressor', 20648.60),
]
models = pd.DataFrame(model_scores, columns=['Model', 'rms_Score'])

In [None]:
# Horizontal bar chart comparing the two models' scores
px.bar(
    data_frame=models,
    x='rms_Score',
    y='Model',
    color='rms_Score',
    template='plotly_dark',
    title='Models Comparison',
)