In [79]:
#Import libraries
import numpy as np 
import pandas as pd 
from sklearn import model_selection

In [124]:
# Read the three input files from the working directory:
# the training set, the test set, and the submission template.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
submission = pd.read_csv('sample_submission_new.csv')

### Check for and replace missing values

In [128]:
# Dimensions of the test set as (rows, columns).
test_data.shape

(1459, 80)

In [126]:
# Replace missing values in LotFrontage with the column mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves the frame untouched when copy-on-write
# is enabled, in which case the mode fill below would be applied instead.
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(test_data['LotFrontage'].mean())

# Replace missing values in MasVnrArea with the column mean.
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(test_data['MasVnrArea'].mean())

# Replace missing values in every remaining column with its mode
# (the most frequently occurring value). A column with no observed values is
# returned unchanged, because value_counts() is empty for it and taking
# index[0] would raise IndexError.
test_data = test_data.apply(
    lambda col: col.fillna(col.value_counts().index[0]) if col.notna().any() else col
)

In [129]:
# Replace missing values in LotFrontage with the column mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves the frame untouched when copy-on-write
# is enabled, in which case the mode fill below would be applied instead.
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(train_data['LotFrontage'].mean())

# Replace missing values in MasVnrArea with the column mean.
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(train_data['MasVnrArea'].mean())

# Replace missing values in every remaining column with its mode
# (the most frequently occurring value). A column with no observed values is
# returned unchanged, because value_counts() is empty for it and taking
# index[0] would raise IndexError.
train_data = train_data.apply(
    lambda col: col.fillna(col.value_counts().index[0]) if col.notna().any() else col
)

### Creating Folds
As shown below, let’s add a new column named `kfold` as the last column of the training data; it will hold the fold number assigned to each row.

In [130]:
# Add the fold-label column, filled with -1 for every row until folds are assigned.
train_data['kfold'] = -1

### Let's proceed to create 5 folds using the following code block:

In [131]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment reproducible.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# KFold.split yields *positional* row indices, so the fold label is written
# with .iloc (position-based) rather than .loc (label-based). The two only
# coincide when the frame has a default 0..n-1 index.
kfold_col = train_data.columns.get_loc("kfold")
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=train_data)):
    # Each row is labelled with the fold in which it serves as validation data.
    train_data.iloc[valid_indicies, kfold_col] = fold

### After running the cell above, we will output the new csv file (train_kfolds.csv) with kfolds by running the code block below:

In [132]:
# Write the training data (with its kfold column) to disk; the row index is not saved.
train_data.to_csv('train_kfolds.csv', index=False)

# Let's start building the regression Model

In [133]:
# Load the fold-labelled training data written to train_kfolds.csv;
# the modelling cells below read from this `data` frame.
data = pd.read_csv('train_kfolds.csv')

### Import libraries

In [134]:
from  sklearn.preprocessing  import  OrdinalEncoder
from  sklearn.model_selection  import  train_test_split
from  sklearn.ensemble  import  RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [11]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [88]:
import xgboost

In [135]:
from  xgboost  import  XGBRegressor

### Feature Selection

In [136]:
# Model inputs: every column except the row identifier, the target and the fold label.
useful_features = [i for i in data.columns if i not in ("Id", "SalePrice","kfold")]

# Names of the non-numeric (categorical) features, selected by dtype.
# A name-based filter on the substring "cat" would not work here: none of the
# column names shown in this notebook contain it.
object_cols = data[useful_features].select_dtypes(exclude="number").columns.tolist()

# Restrict the test set to the same feature columns, in the same order.
test_data = test_data[useful_features]

In [137]:
# Build a single train/validation/test split to sanity-check the encoded shapes.
# The fold is set explicitly instead of relying on a loop variable left over
# from an earlier cell; 4 is the value that variable holds after the
# fold-assignment loop has finished.
fold = 4

xtrain = data[data.kfold != fold].reset_index(drop=True)
xvalid = data[data.kfold == fold].reset_index(drop=True)
xtest = test_data.copy()

# Targets are taken before the frames are narrowed to the feature columns.
ytrain = xtrain.SalePrice
yvalid = xvalid.SalePrice

xtrain = xtrain[useful_features]
xvalid = xvalid[useful_features]

# One-hot encode the training, test and validation features independently.
xtrain = pd.get_dummies(xtrain)
xtest = pd.get_dummies(xtest)
xvalid = pd.get_dummies(xvalid)

# Align validation and test columns with the training columns: dummy columns
# missing from a split are added filled with 0, and columns the training
# split does not have are dropped.
xvalid = xvalid.reindex(columns=xtrain.columns, fill_value=0)
xtest = xtest.reindex(columns=xtrain.columns, fill_value=0)

In [138]:
# Show the (rows, columns) of the encoded test, train and validation frames.
xtest.shape, xtrain.shape, xvalid.shape

((1459, 285), (1168, 285), (292, 285))

In [69]:
# Display the first row of test_data.
test_data.head(1)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal


## Modeling
Run the following lines of code to build our model

In [139]:
# Test-set predictions from each fold's model, collected for averaging later.
final_predictions = []

# One-hot encoding the test set does not depend on the fold, so it is done once
# here; only the column alignment to each fold's training frame happens in the loop.
xtest_encoded = pd.get_dummies(test_data)

for fold in range(5):
    # Rows labelled with the current fold are held out for validation;
    # all other rows are used for training.
    xtrain = data[data.kfold != fold].reset_index(drop=True)
    xvalid = data[data.kfold == fold].reset_index(drop=True)

    # Targets are taken before the frames are narrowed to the feature columns.
    ytrain = xtrain.SalePrice
    yvalid = xvalid.SalePrice

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # One-hot encode the training and validation features.
    xtrain = pd.get_dummies(xtrain)
    xvalid = pd.get_dummies(xvalid)

    # Align validation and test columns with the training columns: dummy
    # columns missing from a split are added filled with 0, and columns the
    # training split does not have are dropped.
    xvalid = xvalid.reindex(columns=xtrain.columns, fill_value=0)
    xtest = xtest_encoded.reindex(columns=xtrain.columns, fill_value=0)

    # Model training; each fold's model is seeded with its fold number.
    model = XGBRegressor(random_state=fold, n_jobs=5, learning_rate=0.1, subsample=0.8,
                         max_depth=5, min_child_weight=1, gamma=0, scale_pos_weight=1)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # Validation RMSE for this fold. The square root is taken explicitly
    # because the `squared=False` argument of mean_squared_error is deprecated
    # and has been removed in newer scikit-learn releases.
    print(fold, float(np.sqrt(mean_squared_error(yvalid, preds_valid))))

0 26347.049993725173
1 29996.115037179654
2 45282.17783459024
3 27723.739590178844
4 22498.84727409934


For each fold, we will encode the data and then train the model using XGBoost (Extreme Gradient Boosting), an ensemble learning technique to boost the performance of our model.

XGBoost is a regularized boosting technique that provides high predictive power and is faster than other boosting techniques. We will then evaluate each fold individually and print out the results of the model.

## Model Evaluation
After individually evaluating each fold, we now combine the five fold models by taking the mean of their predictions on our test data.

To do this, use the following code block:

In [140]:
# Arrange the per-fold prediction vectors as columns (one row per test sample,
# one column per fold) and average across the folds for each sample.
preds = np.stack(final_predictions, axis=1).mean(axis=1)

To see how our model performed, we will output the results of our model’s prediction using the following code:

In [141]:
# Store the averaged predictions in the SalePrice column. Bracket assignment is
# used because attribute-style assignment (submission.SalePrice = ...) only sets
# a Python attribute, not a column, if the column does not already exist.
submission["SalePrice"] = preds

# Write the submission file without the row index.
submission.to_csv("submission2.csv", index=False)

In [142]:
# Read the written submission file back in and display it as a check.
sub = pd.read_csv("submission2.csv")
sub

Unnamed: 0,Id,SalePrice
0,1461,127176.270
1,1462,160065.520
2,1463,180068.780
3,1464,189737.550
4,1465,190466.900
...,...,...
1454,2915,83397.450
1455,2916,83024.914
1456,2917,165247.580
1457,2918,120965.670


### Hyperparameter optimization

In this process, we’ll fine-tune and optimize our model’s algorithm parameters until we achieve the desired result.

A few common XGBoost parameters with a large effect on the model performance include: n_jobs, max_depth, learning_rate, n_estimators, colsample_bytree, and subsample.

To fine-tune our model, add the following changes to the XGBoost regressor:

In [None]:
# XGBoost regressor configuration, one keyword argument per line.
# `fold` must already be defined when this runs; it is used as the random seed.
model = XGBRegressor(
    random_state=fold,
    n_jobs=5,
    learning_rate=0.1,
    subsample=0.8,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    scale_pos_weight=1,
)