In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Regular Imports
#import os
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from tabulate import tabulate
import missingno as msno 
import warnings
from joblib import dump, load
warnings.filterwarnings("ignore")

#!pip install -U scikit-learn==0.24.1

#import sklearn
#sklearn.__version__
from sklearn.preprocessing import OneHotEncoder

# Set Color Palettes for the notebook
custom_colors = ['#74a09e','#86c1b2','#98e2c6','#f3c969','#f2a553', '#d96548', '#c14953']
sns.palplot(sns.color_palette(custom_colors))

# Set Style
sns.set_style("whitegrid",{"grid.linestyle":"--"})
sns.despine(left=True, bottom=True)
mpl.rcParams['figure.dpi'] = 250
mpl.rc('axes', labelsize=10)
plt.rc('xtick',labelsize=10)
plt.rc('ytick',labelsize=10)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'


# EDA 

In [None]:
# Load the dataset; the 'date' column is parsed into datetimes on read.
house_df = pd.read_csv(
    'kc_cleaned_after_removingoutliers.csv',
    parse_dates=['date'],
)

In [None]:
# Number of missing values in each column

house_df.isnull().sum()

In [None]:
# The same missing-value check, shown graphically:

msno.matrix(
    house_df,
    figsize=(12.5, 5),
    fontsize=10,
    color=(0.8, 0.25, 0.25),
);


In [None]:
# Cardinality of every column (a missing value counts as one distinct value)

for column, n_unique in house_df.nunique(dropna=False).items():
    print(f'Unique values for {column}: {n_unique}')

In [None]:
# "id" holds an (almost) unique value per transaction and only identifies the row,
# so it is not informative and is removed from the dataset.

house_df.drop(columns=['id'], inplace=True)

In [None]:
# Column dtypes and non-null counts after dropping 'id'
house_df.info()

In [None]:
# Pearson correlation matrix

sns.set(style="whitegrid", font_scale=1)

# Correlate numeric/boolean columns only. house_df still contains the datetime
# 'date' column here, and the default of DataFrame.corr(numeric_only=...) changed
# in pandas 2.0, so selecting the columns explicitly gives the same matrix on
# every pandas version.
corr_matrix = house_df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(13,13))
plt.title('Pearson Correlation Matrix',fontsize=25)
sns.heatmap(corr_matrix,linewidths=0.45,vmax=0.7,square=True,cmap="GnBu",linecolor='w',
            annot=True, annot_kws={"size":7}, cbar_kws={"shrink": .8});

# sns.set() above resets the style, so restore the notebook's dashed grid.
sns.set_style("whitegrid",{"grid.linestyle":"--"})

In [None]:
# Correlation of every numeric feature with price, strongest positive first.
# Numeric/boolean columns are selected explicitly because house_df still holds the
# datetime 'date' column and the numeric_only default of corr() changed in pandas 2.0.
price_corr = (
    house_df.select_dtypes(include=['number', 'bool'])
    .corr()['price']
    .sort_values(ascending=False)
)
print(price_corr)

In [None]:
# ZipCode
# Taken at face value, the zipcode does not appear to capture much information about house prices:
# its correlation with price is only -0.05. However, this is highly misleading. All else being equal, zipcodes in affluent,
# well-off areas identify properties with higher prices or values.
# In total there are 70 zipcodes in King County:

In [None]:
# Number of distinct zipcodes (a missing value would count as one):
house_df['zipcode'].nunique(dropna=False)

In [None]:
# Number of sales per zipcode, split by the waterfront flag.
fig, ax = plt.subplots(figsize=(13,6))

g = sns.countplot(x='zipcode', hue='waterfront', data=house_df, ax=ax)
# Only restyle the tick labels seaborn generated. Overwriting them with
# house_df['zipcode'].unique() (appearance order) would not match the order in
# which seaborn draws the categories, attaching wrong zipcodes to the bars.
g.tick_params(axis='x', labelrotation=90, labelsize=10)
g.grid(linestyle='--')

In [None]:
# Price distribution within each zipcode.
fig,ax = plt.subplots(figsize=(15,6))
sns.boxplot(x='zipcode',y='price',data=house_df,ax=ax, palette='Reds');
# Keep seaborn's own tick labels (they follow the plotted category order) and only
# rotate/resize them; relabelling with .unique() would mislabel the boxes.
ax.tick_params(axis='x', labelrotation=90, labelsize=10)
ax.set_title('Boxplot: Price Distribution by Zipcodes');

In [None]:
# Feature analysis: price distribution by bedrooms, floors and bathrooms.

# Top figure: bedrooms (left panel) and floors (right panel).
f, axes = plt.subplots(1, 2, figsize=(15,5))
for panel, feature in zip(axes, ('bedrooms', 'floors')):
    sns.boxplot(x=house_df[feature], y=house_df['price'], ax=panel, palette='autumn_r')
sns.despine(left=True, bottom=True)
axes[0].yaxis.tick_left()
axes[0].set(xlabel='Bedrooms', ylabel='Price')
# The right-hand panel carries its y ticks and label on the right side.
axes[1].yaxis.tick_right()
axes[1].yaxis.set_label_position("right")
axes[1].set(xlabel='Floors', ylabel='Price')

# Bottom figure: bathrooms on a single full-width panel.
f, axe = plt.subplots(1, 1, figsize=(15,5))
sns.despine(left=True, bottom=True)
sns.boxplot(x=house_df['bathrooms'], y=house_df['price'], ax=axe, palette='autumn_r')
axe.yaxis.tick_left()
axe.set(xlabel='Bathrooms', ylabel='Price');

In [None]:
# Feature analysis: price distribution by waterfront, view and grade.

# Top figure: waterfront (left panel) and view (right panel).
f, axes = plt.subplots(1, 2, figsize=(15,5))
for panel, feature in zip(axes, ('waterfront', 'view')):
    sns.boxplot(x=house_df[feature], y=house_df['price'], ax=panel, palette='viridis')
sns.despine(left=True, bottom=True)
axes[0].yaxis.tick_left()
axes[0].set(xlabel='Waterfront', ylabel='Price')
# The right-hand panel carries its y ticks and label on the right side.
axes[1].yaxis.tick_right()
axes[1].yaxis.set_label_position("right")
axes[1].set(xlabel='View', ylabel='Price')


# Bottom figure: grade on a single full-width panel.
f, axe = plt.subplots(1, 1, figsize=(15,5))
sns.boxplot(x=house_df['grade'], y=house_df['price'], ax=axe, palette='viridis')
sns.despine(left=True, bottom=True)
axe.yaxis.tick_left()
axe.set(xlabel='Grade', ylabel='Price');

In [None]:
# Construction year and renovations: derive sale date parts and ages

# Year and month of the sale, taken from the date column
sale_dates = pd.DatetimeIndex(house_df['date'])
house_df['sales_yr'] = sale_dates.year
house_df['sales_mth'] = sale_dates.month

# Age of the building at the time of sale
house_df['age'] = house_df['sales_yr'] - house_df['yr_built']

# Years since the last renovation at the time of sale. yr_renovated == 0 marks a
# house that was never renovated; for those the building age is used instead.
# Series.where does this in one aligned step, replacing the chained assignment
# house_df['age_rnv'][mask] = ..., which pandas does not guarantee to write back
# to house_df (and which is a no-op under copy-on-write).
was_renovated = house_df['yr_renovated'] != 0
house_df['age_rnv'] = (house_df['sales_yr'] - house_df['yr_renovated']).where(
    was_renovated, house_df['age']
)

In [None]:
# Partition the building age into bins.
# pd.cut builds one interval per consecutive pair of edges in bins_age
# (right-closed by default) and tags each interval with the label at the same position.
bins_age = [-2,1,5,10,20,30,60,100,100000]
labels = [0,5,10,20,30,60,80,100]
house_df['age_binned'] = pd.cut(house_df['age'], bins=bins_age, labels=labels)

In [None]:
# Partition the years-since-renovation (age_rnv) into bins.
# Same scheme as for age: one interval per consecutive pair of edges in bins_ren,
# each tagged with the label at the same position. Note that `labels` is rebound here.
bins_ren = [-2,1,5,10,20,30,50,100000]
labels = [0,5,10,20,30,60,100]
house_df['age_rnv_binned'] = pd.cut(house_df['age_rnv'], bins=bins_ren, labels=labels)

In [None]:
# Number of sales in each construction-age bin (left) and renovation-age bin (right).
fig, ax = plt.subplots(1,2, figsize=(15,5))

# The column is passed as x= explicitly: newer seaborn releases treat the first
# positional argument of countplot as `data`, not as the variable to count.
sns.countplot(x=house_df['age_binned'], palette='Reds', ax=ax[0], alpha=0.85);
sns.countplot(x=house_df['age_rnv_binned'], palette='Blues', ax=ax[1], alpha=0.85)
ax[0].set_title('Years since Construction')
ax[1].set_title('Years since Renovation');

In [None]:
# Year and month of transaction: mean and median sale price per (year, month)

monthly_price = house_df.groupby(["sales_yr", "sales_mth"])["price"].agg(['mean', 'median'])
price_ax = monthly_price.plot(figsize=(15,6), marker='*', markersize=12)
price_ax.set_title('Price Evolution over Time', fontsize=17);

In [None]:
# The raw date and year columns are no longer needed once sales_yr, sales_mth,
# age and age_rnv have been derived from them.
house_df.drop(columns=['date', 'yr_built', 'yr_renovated'], inplace=True)



In [None]:
# Modelling frame: the target, the raw features and the binned age columns.
# .copy() makes it independent of house_df, so the column assignments and in-place
# drops performed on house_df_bin later never act on a view of the original frame
# (and do not trigger SettingWithCopyWarning).
house_df_bin = house_df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                         'waterfront', 'view', 'condition', 'grade', 'sqft_above','sqft_basement',
                         'zipcode', 'lat', 'long', 'sqft_living15','sqft_lot15', 'sales_yr',
                         'sales_mth','age_binned','age_rnv_binned']].copy()

## Features to Normalize

In [None]:
# Price: distribution of the raw sale price

fig, ax = plt.subplots(figsize=(10,6))
sns.distplot(house_df_bin["price"], color='r', ax=ax);

In [None]:
# Natural log of the price, and its distribution.
house_df_bin["log_price"] = np.log(house_df_bin["price"])

fig, ax = plt.subplots(figsize=(10,6))
sns.distplot(house_df_bin["log_price"], color='r', ax=ax);

In [None]:
# From here on only log_price is kept as the target column.
house_df_bin.drop(columns=['price'], inplace=True)

In [None]:
# Additional features to normalize: summary statistics of the square-footage columns

cols = [
    "sqft_living",
    "sqft_lot",
    "sqft_basement",
    "sqft_living15",
    "sqft_lot15",
]
house_df_bin[cols].describe()

In [None]:
# Log-transform the square-footage columns into new '<name>_log' columns.
# sqft_basement gets +1 before the log; it is created first so the column order
# stays the same as before.
house_df_bin['sqft_basement_log'] = np.log(house_df_bin['sqft_basement'] + 1)
for sqft_col in ('sqft_living', 'sqft_lot', 'sqft_living15', 'sqft_lot15'):
    house_df_bin[f'{sqft_col}_log'] = np.log(house_df_bin[sqft_col])

In [None]:
# Preview the first rows, now including the new *_log columns
house_df_bin.head()

In [None]:
log_cols = [
    "sqft_living_log",
    "sqft_lot_log",
    "sqft_basement_log",
    "sqft_living15_log",
    "sqft_lot15_log",
]

# 2 x 5 grid: top row shows each raw feature, bottom row its log-transformed
# counterpart, with the same colour for both panels of a column.
fig, axes = plt.subplots(2, 5, figsize=(13,5))
axes = np.ravel(axes)
for idx, (raw_col, log_col) in enumerate(zip(cols, log_cols)):
    sns.distplot(house_df_bin[raw_col], ax=axes[idx], color=custom_colors[idx])
    sns.distplot(house_df_bin[log_col], ax=axes[idx + 5], color=custom_colors[idx])

plt.tight_layout()

In [None]:
# The raw square-footage columns are replaced by their *_log versions.
house_df_bin.drop(columns=cols, inplace=True)

In [None]:
# Columns remaining in the modelling frame
house_df_bin.columns

In [None]:
# Data preprocessing / feature engineering: scaler and one-hot encoder setup

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# The binned ages are categoricals produced by pd.cut; convert them to int64 codes
# (their numeric labels) so they can be scaled like the other numeric features.
for binned_col in ('age_binned', 'age_rnv_binned'):
    house_df_bin[binned_col] = house_df_bin[binned_col].astype('int64')

# Every column except the target and zipcode is min-max scaled.
numerical_columns = house_df_bin.columns.drop(['log_price', 'zipcode'])
scaler = MinMaxScaler()

# zipcode is one-hot encoded, dropping the first level.
categorical_columns = ['zipcode']
ohe = OneHotEncoder(handle_unknown='error', drop='first', sparse=False)

In [None]:
# Natural logarithm and back to the original price scale (log_price was built with np.log)

#house_df_bin["SalePrice"] = np.exp(house_df_bin["log_price"])

In [None]:
# Target is the log price; the features are every other column.
y = house_df_bin['log_price']
X_bin = house_df_bin.drop(columns=['log_price'])

In [None]:
# Feature count, followed by the feature names
print(f'Total number of Features: {X_bin.shape[1]}')
X_bin.columns

In [None]:
# Train / validation / test split

from sklearn.model_selection import train_test_split, GridSearchCV

split_seed = 170378

# First hold out 15% as the test set, then 18% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X_bin, y, test_size=.15, random_state=split_seed
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=.18, random_state=split_seed
)

for split_name, split_X in (("Train", X_train), ("Valid", X_valid), ("Test", X_test)):
    print(f"{split_name} Data Shape: {split_X.shape}")

In [None]:
# Targets back on the price scale (inverse of the natural log)
y_train_lev, y_valid_lev, y_test_lev = (
    np.exp(target) for target in (y_train, y_valid, y_test)
)

In [None]:
# ColumnTransformer: scale the numeric columns, one-hot encode zipcode, drop anything else

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_columns),
        ('cat', ohe, categorical_columns),
    ],
    remainder='drop',
)

In [None]:
# Modeling: imports and display configuration

from sklearn import set_config
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Show estimators and pipelines as diagrams when displayed in the notebook.
set_config(display='diagram')

In [None]:
# Empty result tables (one row per model, one column per metric):
# one for the log target and one for the price level.

index = ['XGBRegressor']
col = ['R2 Train', 'RMSE Train', 'R2 Valid', 'RMSE Valid']

results_df_log, results_df_lev = (
    pd.DataFrame(index=index, columns=col) for _ in range(2)
)

In [None]:
#XGBRegressor

# hyper-parameters to tune

#xgb1 = XGBRegressor(nthread=4,subsample=0.9,colsample_bytree=0.7,min_child_weight=4,silent=1,objective='reg:squarederror'
#                    ,verbosity=0)

#xg_param = {'learning_rate': [0.01, 0.03, 0.05, 0.1],
#            'max_depth': [7, 8, 9, 10],
#              'n_estimators': [200, 300, 500, 700, 900]}

#Xb_gridsearch = GridSearchCV(estimator=xgb1,
#                          param_grid=xg_param,
#                          cv=5,
#                          return_train_score=True) 

In [None]:
#Create pipeline

#xg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                              ('m', Xb_gridsearch)])

In [None]:
#%%time
#xg_best_model = xg_pipeline.fit(X_train, y_train)
#xg_best_model

In [None]:
#xg_best_model['m'].best_params_

In [None]:
# Fixed XGBoost hyper-parameters. The trailing comments show the grid-search
# lookups these literals stand in for (the grid search itself is commented out above);
# presumably these are the values it selected -- TODO confirm.
learn_rate = 0.03 #xg_best_model['m'].best_params_.get('learning_rate')
n_est = 700 #xg_best_model['m'].best_params_.get('n_estimators')
tree_md = 8 #xg_best_model['m'].best_params_.get('max_depth')

In [None]:
%%time
from sklearn.model_selection import cross_val_score
# Various hyper-parameters to tune
xgb_opt = XGBRegressor(learning_rate=learn_rate,
                       n_estimators=n_est,
                       max_depth=tree_md,
                       nthread=4,
                       subsample=0.9,
                       colsample_bytree=0.7,
                       min_child_weight=4,
                       objective='reg:squarederror')

best_xg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('m', xgb_opt)])

print(cross_val_score(best_xg_pipeline,X_train, y_train,cv=5))

best_xg_model = best_xg_pipeline.fit(X_train, y_train)
best_xg_model

In [None]:
# Pipeline.score delegates to the final estimator's score method;
# both scores are computed on the log-price target.

#Train Score:
print(f'Score on Training set: {best_xg_model.score(X_train, y_train)}')

#Validation Score (label fixed: this is the validation set, not a "valuation" set):
print(f'Score on Validation set: {best_xg_model.score(X_valid, y_valid)}')

In [None]:
# Predictions on the log scale ...
y_hat_train, y_hat_valid = (
    best_xg_model.predict(features) for features in (X_train, X_valid)
)

# ... and converted back to the price scale
y_hat_train_lev, y_hat_valid_lev = np.exp(y_hat_train), np.exp(y_hat_valid)

In [None]:
# Predicted (x) vs. actual (y) log price for the training and validation sets
fig, ax = plt.subplots(1, 2, figsize=(11,5), sharey=True, sharex=True)

for panel, predicted, actual, panel_title, scatter_kw in (
    (ax[0], y_hat_train, y_train, 'Train Dataset', {}),
    (ax[1], y_hat_valid, y_valid, 'Validation Dataset', {'c': 'r'}),
):
    panel.scatter(predicted, actual, **scatter_kw)
    panel.set_title(panel_title)

plt.suptitle('XGBRegressor');

In [None]:
#y_hat_train_lev and y_hat_valid_lev 

#fig, ax = plt.subplots(1,2,figsize=(11,5), sharey=True, sharex=True)

#ax[0].scatter(y_hat_train_lev,y_train_lev)
#ax[1].scatter(y_hat_valid_lev,y_valid_lev, c='r')
#ax[0].set_title('Train Dataset')
#ax[1].set_title('Validation Dataset');

#plt.suptitle('XGBRegressor');

In [None]:
# Error metrics on the log-price target.
# squared=False makes mean_squared_error return the square root of the MSE, so the
# values stored in the mse_* variables are RMSEs (the names are kept because other
# cells read them); the printed labels now say so.
mse_train_xgb = mean_squared_error(y_train, y_hat_train, squared=False)
mse_valid_xgb = mean_squared_error(y_valid, y_hat_valid, squared=False)

r2_train_xgb = r2_score(y_train, y_hat_train)
r2_valid_xgb = r2_score(y_valid, y_hat_valid)

print(f'RMSE Score on Training set: {mse_train_xgb}')
print(f'RMSE Score on Validation set: {mse_valid_xgb}')
print('\n')
print(f'R2 Score on Training set: {r2_train_xgb}')
# This line used to be labelled "Training set" although it prints the validation R2.
print(f'R2 Score on Validation set: {r2_valid_xgb}')

In [None]:
# Record the log-target metrics in the XGBRegressor row of the results table
xgb_metrics = {
    'R2 Train': r2_train_xgb,
    'R2 Valid': r2_valid_xgb,
    'RMSE Train': mse_train_xgb,
    'RMSE Valid': mse_valid_xgb,
}
for metric_name, metric_value in xgb_metrics.items():
    results_df_log.loc['XGBRegressor', metric_name] = metric_value

In [None]:
#XGBRegressor is the model delivering the best results on the validation dataset.
#The table below summarizes the overall results using the target feature in log:

results_df_log

In [None]:
#The model is now tested on the test dataset

# Predictions on the log scale, then mapped back to the price scale with exp
y_hat_test = best_xg_model.predict(X_test)
y_hat_test_lev = np.exp(y_hat_test)

In [None]:
# Test-set metrics on the log-price target.
# squared=False returns the root of the MSE, so the value printed is an RMSE.
mse_test_xgb = mean_squared_error(y_test, y_hat_test, squared=False)
r2_test_xgb = r2_score(y_test, y_hat_test)

print(f'RMSE Score on Test set: {mse_test_xgb}')
print('\n')
print(f'R2 Score on Test set: {r2_test_xgb}')

In [None]:
# Test-set metrics on the price level (exp of the log target and predictions).
# NOTE: this rebinds mse_test_xgb / r2_test_xgb, replacing the log-scale values
# computed in the previous cell. squared=False means the error is an RMSE.
mse_test_xgb = mean_squared_error(y_test_lev, y_hat_test_lev, squared=False)
r2_test_xgb = r2_score(y_test_lev, y_hat_test_lev)

print(f'RMSE Score on Test set (price level): {mse_test_xgb}')
print('\n')
print(f'R2 Score on Test set (price level): {r2_test_xgb}')

In [None]:
# Predicted (x) vs. actual (y) price on the test set
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6,5), sharey=True, sharex=True)

ax.scatter(x=y_hat_test_lev, y=y_test_lev);

In [None]:
# Actual vs. predicted sale price on the test set (price level).
# The series are passed to scatter directly instead of through module-level `x`/`y`:
# rebinding `y` here would overwrite the log-price target that the train/test split
# reads, so re-running the split after this cell would use the wrong target.
# pyplot is already imported at the top of the notebook, so no re-import is needed.
fig, ax = plt.subplots()
ax.scatter(y_hat_test_lev, y_test_lev)
ax.set_title("Real  Vs Prediction")
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Real Price")
plt.show()

In [None]:
# Actual price, predicted price and absolute error per test row, rounded to whole units
prediction = pd.DataFrame(
    {
        'Real Value': np.round(y_test_lev, 0),
        'Prediction': np.round(y_hat_test_lev, 0),
        'Difference': np.round(abs(y_test_lev - y_hat_test_lev), 0),
    },
    index=y_test.index,
)

# Write the table to a CSV file (without the index column), using a path
# relative to the current working directory.
filename = 'King County House Prediction.csv'
prediction.to_csv(filename, index=False)

print(f'Saved file: {filename}')

In [None]:
# Same table with snake_case column names; show the last ten rows
price_pred, price_diff = prediction['Prediction'], prediction['Difference']
df_test = prediction[['Real Value', 'Prediction', 'Difference']].rename(
    columns={
        'Real Value': 'price_actual',
        'Prediction': 'price_predicted',
        'Difference': 'difference',
    }
)
df_test.tail(10)

In [None]:
# Columns of the full (non-modelling) frame, for reference
house_df.columns

In [None]:
import pickle

# Persist the fitted pipeline (preprocessing + XGBoost model) to disk.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
with open('gboostmodel_1.pickle', 'wb') as model_file:
    pickle.dump(best_xg_model, model_file)