In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To Plot matplotlib figures inline on the notebook
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV, Ridge
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold, RepeatedKFold,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import pickle

In [2]:
# ignore FutureWarning messages in scikit-learn

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Final Test Prediction

In [3]:
df_train = pd.read_csv('train.csv')

df_train = df_train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

df_train['LotFrontage'].fillna((df_train['LotFrontage'].median()), inplace=True)

cat_ = [x for x in df_train.select_dtypes(exclude=['int64','float64']).columns]

# df_train = pd.get_dummies(df_train, columns=cat_)
# X_final = df_train

In [78]:
df_test = pd.read_csv('test.csv')

df_test = df_test.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

df_test['LotFrontage'].fillna((df_test['LotFrontage'].median()), inplace=True)

## Analyzing null values

In [5]:
na = df_test.isna().sum().reset_index().rename(columns={0:'test'})

ne = df_train.isna().sum().reset_index().rename(columns={0:'train'})

na = na.merge(ne)

na.sort_values(by='train', ascending=False)[:20]

Unnamed: 0,index,test,train
56,GarageType,76,81
57,GarageYrBlt,78,81
58,GarageFinish,78,81
61,GarageQual,78,81
62,GarageCond,78,81
34,BsmtFinType2,42,38
31,BsmtExposure,44,38
30,BsmtCond,45,37
29,BsmtQual,44,37
32,BsmtFinType1,42,37


The same number of null values occur for features related wiht Garage, and the same thing applies to features related to Bsmt.

Making an arbitrary assumption, I assume that this means that those houses don't have garage or basement, so I will convert those values to NO

In [6]:
df_train['GarageType'].fillna('NO', inplace = True) 
df_train['GarageQual'].fillna('NO', inplace = True) 
df_train['GarageFinish'].fillna('NO', inplace = True)
df_train['GarageCond'].fillna('NO', inplace = True)
df_train['BsmtFinType2'].fillna('NO', inplace = True)
df_train['BsmtExposure'].fillna('NO', inplace = True)
df_train['BsmtCond'].fillna('NO', inplace = True)
df_train['BsmtQual'].fillna('NO', inplace = True)
df_train['BsmtFinType1'].fillna('NO', inplace = True)
df_train['MasVnrType'].fillna('None', inplace = True)

In [81]:
df_test['GarageType'].fillna('NO', inplace = True) 
df_test['GarageQual'].fillna('NO', inplace = True) 
df_test['GarageFinish'].fillna('NO', inplace = True)
df_test['GarageCond'].fillna('NO', inplace = True)
df_test['BsmtFinType2'].fillna('NO', inplace = True)
df_test['BsmtExposure'].fillna('NO', inplace = True)
df_test['BsmtCond'].fillna('NO', inplace = True)
df_test['BsmtQual'].fillna('NO', inplace = True)
df_test['BsmtFinType1'].fillna('NO', inplace = True)
df_test['MasVnrType'].fillna('None', inplace = True)

Replace continuous features with median value 

In [8]:
df_train['GarageYrBlt'].fillna(df_train['GarageYrBlt'].median(), inplace = True)
df_train['MasVnrArea'].fillna(df_train['MasVnrArea'].median(), inplace = True)

In [83]:
df_test['GarageYrBlt'].fillna(df_test['GarageYrBlt'].median(), inplace = True)
df_test['MasVnrArea'].fillna(df_test['MasVnrArea'].median(), inplace = True)

In [10]:
na = df_test.isna().sum().reset_index().rename(columns={0:'test'})

ne = df_train.isna().sum().reset_index().rename(columns={0:'train'})

na = na.merge(ne)

na.sort_values(by='train', ascending=False)[:5]

Unnamed: 0,index,test,train
41,Electrical,0,1
0,Id,0,0
48,FullBath,0,0
54,Functional,2,0
53,TotRmsAbvGrd,0,0


In [11]:
na.sort_values(by='test', ascending=False)[:16]

Unnamed: 0,index,test,train
2,MSZoning,4,0
47,BsmtHalfBath,2,0
46,BsmtFullBath,2,0
54,Functional,2,0
8,Utilities,2,0
22,Exterior1st,1,0
52,KitchenQual,1,0
59,GarageCars,1,0
60,GarageArea,1,0
33,BsmtFinSF1,1,0


After those transformations, we  an see that there is only one null value in df_train, and we will drop that row

In [12]:
df_train = df_train.dropna()

As we have to predict for all rows in df_test, let's see what we can do

In [13]:
tt = na.sort_values(by='test', ascending=False)[:16]

tt = list(tt['index'])

na_cat = [x for x in cat_ if x in tt]

na_cat

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'KitchenQual',
 'Functional',
 'SaleType']

In [14]:
na_num = [x for x in tt if x not in cat_]

na_num

['BsmtHalfBath',
 'BsmtFullBath',
 'GarageCars',
 'GarageArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'Fireplaces']

In [85]:
df_test['Fireplaces'].mean(), df_test['Fireplaces'].median()

df_test['Fireplaces'].fillna(df_test['Fireplaces'].median(), inplace = True)

In [87]:
df_test['BsmtUnfSF'].nunique(), df_test['BsmtUnfSF'].mean(), df_test['BsmtUnfSF'].median()

df_test['BsmtUnfSF'].fillna(df_test['BsmtUnfSF'].median(), inplace = True)

In [89]:
df_test['BsmtFinSF2'].nunique(), df_test['BsmtFinSF2'].mean(), df_test['BsmtFinSF2'].median()

df_test['BsmtFinSF2'].fillna(df_test['BsmtFinSF2'].median(), inplace = True)

In [91]:
df_test['TotalBsmtSF'].nunique(), df_test['TotalBsmtSF'].mean(), df_test['TotalBsmtSF'].median()

df_test['TotalBsmtSF'].fillna(df_test['TotalBsmtSF'].median(), inplace = True)

In [93]:
df_test['BsmtFinSF1'].nunique(), df_test['BsmtFinSF1'].mean(), df_test['BsmtFinSF1'].median()

df_test['BsmtFinSF1'].fillna(df_test['BsmtFinSF1'].median(), inplace = True)

In [95]:
df_test['GarageArea'].nunique(), df_test['GarageArea'].mean(), df_test['GarageArea'].median()

df_test['GarageArea'].fillna(0.0, inplace = True)

In [97]:
df_test['GarageCars'].nunique(), df_test['GarageCars'].mean(), df_test['GarageCars'].median()

df_test['GarageCars'].fillna(0.0, inplace = True)

In [99]:
df_test['BsmtFullBath'].nunique(), df_test['BsmtFullBath'].mean(), df_test['BsmtFullBath'].median()

df_test['BsmtFullBath'].fillna(df_test['BsmtFullBath'].median(), inplace = True)

In [102]:
df_test['BsmtHalfBath'].nunique(), df_test['BsmtHalfBath'].mean(), df_test['BsmtHalfBath'].median()

df_test['BsmtHalfBath'].fillna(df_test['BsmtHalfBath'].median(), inplace = True)

In [24]:
test_col = [x for x in df_test.columns]

final_col = [x for x in df_train.columns]

ff = [x for x in final_col if x not in test_col]
ff

['SalePrice']

In [25]:
train_cat = df_test.select_dtypes(exclude=['int64','float64']).columns
pd.set_option('display.max_rows', 300)
df_output = pd.DataFrame()
for i in range(len(train_cat)):
    c = train_cat[i]
    dfx = pd.DataFrame({'Variable':[c]*len(df_test[c].unique()),
                       'Level':df_test[c].unique(),
                       'Count':df_test[c].value_counts(dropna = False)})
    dfx['Percentage'] = 100 * dfx['Count']  / dfx['Count'].sum()
    df_output = df_output.append(dfx, ignore_index = True)

In [26]:
cat = [x for x in train_cat]

In [27]:
cat.sort()
cat


['BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'Neighborhood',
 'PavedDrive',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities']

In [28]:
# df_output

In [110]:
df_output[df_output['Variable'] == 'Utilities']

Unnamed: 0,Count,Level,Variable,Percentage
16,1457,AllPub,Utilities,99.86292
17,2,,Utilities,0.13708


In [107]:
# df_train = df_train[df_train.Utilities == 'AllPub']
# df_test = df_test[df_test.Utilities == 'AllPub']

In [111]:
df_test['Utilities'].fillna('AllPub', inplace = True)

In [114]:
def ExterCond(row):
    if row['ExterCond'] not in  ['TA','Fa','Gd']:
        return 'other'
    else:
        return row['ExterCond']
    
df_test['ExterCond'] = df_test.apply(ExterCond, axis=1)
df_train['ExterCond'] = df_train.apply(ExterCond, axis=1)

In [115]:
def GarageQual(row):
    if row['GarageQual'] not in  ['TA','Fa','Gd','NO']:
        return 'other'
    else:
        return row['GarageQual']
    
df_test['GarageQual'] = df_test.apply(GarageQual, axis=1)
df_train['GarageQual'] = df_train.apply(GarageQual, axis=1)

In [116]:
def Functional(row):
    if row['Functional'] not in  ['Typ','Min1','Maj1']:
        return 'other'
    else:
        return row['Functional']
    
df_test['Functional'] = df_test.apply(Functional, axis=1)
df_train['Functional'] = df_train.apply(Functional, axis=1)

In [117]:
def Electrical(row):
    if row['Electrical'] not in  ['SBrkr','FuseF']:
        return 'other'
    else:
        return row['Electrical']
    
df_test['Electrical'] = df_test.apply(Electrical, axis=1)
df_train['Electrical'] = df_train.apply(Electrical, axis=1)

In [118]:
def Heating(row):
    if row['Heating'] != 'GasA' or row['Heating'] != 'GasW':
        return 'other'
    else:
        return row['Heating']
    
df_test['Heating'] = df_test.apply(Heating, axis=1)
df_train['Heating'] = df_train.apply(Heating, axis=1)

In [119]:
def Exterior2nd(row):
    if row['Exterior2nd'] not in  ['VinylSd','MetalSd','Wd Shng','HdBoard','Plywood','Wd Sdng','CmentBd']:
        return 'other'
    else:
        return row['Exterior2nd']
    
df_test['Exterior2nd'] = df_test.apply(Exterior2nd, axis=1)
df_train['Exterior2nd'] = df_train.apply(Exterior2nd, axis=1)

In [120]:
def Exterior1st(row):
    if row['Exterior1st'] not in ['VinylSd','MetalSd','Wd Sdng','HdBoard','BrkFace','WdShing','CemntBd']:
        return 'other'
    else:
        return row['Exterior1st']

df_test['Exterior1st'] = df_test.apply(Exterior1st, axis=1)
df_train['Exterior1st'] = df_train.apply(Exterior1st, axis=1)

In [121]:
def RoofMatl(row):
    if row['RoofMatl'] != 'CompShg':
        return 'other'
    else:
        return row['RoofMatl']
df_test['RoofMatl'] = df_test.apply(RoofMatl, axis=1)
df_train['RoofMatl'] = df_train.apply(RoofMatl, axis=1)

In [122]:
def HouseStyle(row):
    if row['HouseStyle'] != ['2Story','1Story','1.5Fin','1.5Unf','SFoyer']:
        return 'other'
    else:
        return row['HouseStyle']
df_test['HouseStyle'] = df_test.apply(HouseStyle, axis=1)
df_train['HouseStyle'] = df_train.apply(HouseStyle, axis=1)

In [123]:
def Condition2(row):
    if row['Condition2'] != 'Norm':
        return 'other'
    else:
        return row['Condition2']
df_test['Condition2'] = df_test.apply(Condition2, axis=1)
df_train['Condition2'] = df_train.apply(Condition2, axis=1)

In [124]:
def MSZoning(row):
    if row['MSZoning'] == 'FV' or row['MSZoning'] == 'RH':
        return 'other'
    else:
        return row['MSZoning']
df_test['MSZoning'] = df_test.apply(MSZoning, axis=1)
df_train['MSZoning'] = df_train.apply(MSZoning, axis=1)

In [125]:
def Foundation(row):
    if row['Foundation'] not in ['CBlock','PConc','BrkTil','Stone']:
        return 'other'
    else:
        return row['Foundation']
df_test['Foundation'] = df_test.apply(Foundation, axis=1)
df_train['Foundation'] = df_train.apply(Foundation, axis=1)

In [126]:
df_final = pd.get_dummies(df_train, columns=train_cat)

df_test_final = pd.get_dummies(df_test, columns=train_cat)

In [44]:
test_col = [x for x in df_test_final.columns]

final_col = [x for x in df_final.columns]

ff = [x for x in test_col if x not in final_col]
ff

[]

In [45]:
ff = [x for x in final_col if x not in test_col]
ff

['SalePrice']

# Train

## Train/Test Split

In [46]:
X = df_final.drop(columns=['SalePrice'])

# Select our target/dependent variable
y = df_final['SalePrice']

In [47]:
# Split the data into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

## Gradient Boosting

In [48]:
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":np.arange(2,15,3),
    "n_estimators":[10]
    }

xgb = GridSearchCV(GradientBoostingRegressor(max_depth=5), cv=10, verbose=1,
                  param_grid=parameters)

xgb.fit(X_train.drop(columns=['Id']), y_train)

y_pred = xgb.predict(X_test.drop(columns=['Id']))
print(xgb.best_params_);

# model evaluation
print('GradientBoost absolute error: ', mean_absolute_error(y_test, y_pred))
print('GradientBoost squared error: ',mean_squared_error(y_test, y_pred))
print('GradientBoost Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))

print(xgb.best_score_)



Fitting 10 folds for each of 5040 candidates, totalling 50400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 50400 out of 50400 | elapsed: 29.2min finished


{'min_samples_split': 0.1, 'n_estimators': 10, 'learning_rate': 0.2, 'max_depth': 5, 'min_samples_leaf': 0.1}
GradientBoost absolute error:  23318.308528060246
GradientBoost squared error:  1960817736.9159303
GradientBoost Root Mean Squared Error: 44281.12167635244
0.7587703987823674


In [49]:
Y = y_test.reset_index()

Y['log_value'] = np.log(Y['SalePrice'])

Y['y_pred_xgb_tune'] = y_pred

Y['log_pred'] = np.log(Y['y_pred_xgb_tune'])

print('GradientBoost Root Mean Squared Error:', np.sqrt(mean_squared_error(Y['log_value'], Y['log_pred'])))

GradientBoost Root Mean Squared Error: 0.1979919489406803


In [50]:
xgb.best_score_


0.7587703987823674

In [51]:
pickle.dump(xgb, open('xgb_15_5_nonull.pkl','wb'))

In [52]:
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_train.drop(columns=['Id']), xgb.best_estimator_.feature_importances_):
    feats[feature] = importance #add the name/value pair 
    
    

# Final

In [53]:
na = df_test.isna().sum().reset_index().rename(columns={0:'test'})

ne = df_train.isna().sum().reset_index().rename(columns={0:'train'})

na = na.merge(ne)

na.sort_values(by='train', ascending=False)

Unnamed: 0,index,test,train
0,Id,0,0
56,GarageType,0,0
54,Functional,0,0
53,TotRmsAbvGrd,0,0
52,KitchenQual,1,0
51,KitchenAbvGr,0,0
50,BedroomAbvGr,0,0
49,HalfBath,0,0
48,FullBath,0,0
47,BsmtHalfBath,0,0


In [54]:
df_test.isna().sum()

Id               0
MSSubClass       0
MSZoning         3
LotFrontage      0
LotArea          0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 

In [128]:
y_pred_final = xgb.predict(df_test_final.drop(columns=['Id']))
# print(xgb.best_params_);

In [129]:
len(df_test)

1459

In [130]:
len(y_pred_final)

1459

In [131]:
df_test['pred'] = y_pred_final

In [132]:
fin = df_test[['Id','pred']]

In [133]:
fin = fin.rename(columns={'pred':'SalePrice'})

In [136]:
fin.to_csv('../KAG_HOUSE/fina.csv', index = False)