<a href="https://colab.research.google.com/github/lightsixer/P1regression/blob/dev/notebooks/House_Price_Prediction_using_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [None]:
# Load the CSV found at `path` into a DataFrame.
# NOTE(review): this is an absolute path on the runtime host; adjust it when
# running elsewhere.
path = '/home/preptrain.csv'
data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,GrLivArea,LotArea,TotalBsmtSF,BsmtUnfSF,GarageArea,YearBuilt,LotFrontage,YearRemodAdd,BsmtFinSF1,OpenPorchSF,SalePrice
0,1710,8450,856,150,548,2003,65.0,2003,706,61,208500
1,1262,9600,1262,284,460,1976,80.0,1976,978,0,181500
2,1786,11250,920,434,608,2001,68.0,2002,486,42,223500
3,1717,9550,756,540,642,1915,60.0,1970,216,35,140000
4,2198,14260,1145,490,836,2000,84.0,2000,655,84,250000


In [None]:
# Separate the regression target (SalePrice) from the feature columns.
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

# A notebook cell only renders its final expression automatically, so the
# preview and the summary statistics are passed to display() explicitly;
# as bare expressions in the middle of the cell they were silently discarded.
display(X.head())
display(X.describe())

# info() prints column dtypes and non-null counts itself.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   GrLivArea     1460 non-null   int64  
 1   LotArea       1460 non-null   int64  
 2   TotalBsmtSF   1460 non-null   int64  
 3   BsmtUnfSF     1460 non-null   int64  
 4   GarageArea    1460 non-null   int64  
 5   YearBuilt     1460 non-null   int64  
 6   LotFrontage   1460 non-null   float64
 7   YearRemodAdd  1460 non-null   int64  
 8   BsmtFinSF1    1460 non-null   int64  
 9   OpenPorchSF   1460 non-null   int64  
dtypes: float64(1), int64(9)
memory usage: 114.2 KB


In [None]:
# Hold out 20% of the rows for validation; random_state=0 pins the shuffle so
# the split is reproducible.
# Note: y_test holds the targets that belong to X_valid_full (the validation
# rows), despite the "test" suffix.
(X_train_full, X_valid_full, y_train, y_test) = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Object-typed columns with fewer than 10 distinct values are treated as
# categorical features.
categorical_col = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == 'O' and X_train_full[name].nunique() < 10
]
print(categorical_col)

# Columns stored as float64 or int64 are treated as numerical features.
numerical_col = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['float64', 'int64']
]
print(numerical_col)

# Keep only the selected columns (categorical first, then numerical) and take
# copies so later edits never touch the full split frames.
my_cols = [*categorical_col, *numerical_col]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

[]
['GrLivArea', 'LotArea', 'TotalBsmtSF', 'BsmtUnfSF', 'GarageArea', 'YearBuilt', 'LotFrontage', 'YearRemodAdd', 'BsmtFinSF1', 'OpenPorchSF']


In [None]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

Unnamed: 0,GrLivArea,LotArea,TotalBsmtSF,BsmtUnfSF,GarageArea,YearBuilt,LotFrontage,YearRemodAdd,BsmtFinSF1,OpenPorchSF
618,1828,11694,1822,1774,774,2007,90.0,2007,48,108
870,894,6600,894,894,308,1962,60.0,1962,0,0
92,964,13360,876,163,432,1921,80.0,2006,713,0
817,1689,13265,1568,350,857,2002,70.049958,2002,1218,59
302,1541,13704,1541,1541,843,2001,118.0,2002,0,81


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_col),
        ('cat', categorical_transformer, categorical_col),
    ]
)

In [None]:
# Gradient-boosted regressor: at most 1000 boosting rounds, learning rate
# 0.05, and 4 parallel jobs.
my_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
)

In [None]:
# Wrap the preprocessor in its own pipeline so the transform fitted on the
# training features can be re-applied to other feature frames.
eval_set_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training features and transform them in one
# step. The result is assigned directly: the previous X_train.copy() stored
# under this name was overwritten immediately and never used.
X_train_eval = eval_set_pipe.fit_transform(X_train)

# Display the transformed values with the original row labels re-attached.
pd.DataFrame(X_train_eval, index=X_train.index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
618,1828.0,11694.0,1822.0,1774.0,774.0,2007.0,90.000000,2007.0,48.0,108.0
870,894.0,6600.0,894.0,894.0,308.0,1962.0,60.000000,1962.0,0.0,0.0
92,964.0,13360.0,876.0,163.0,432.0,1921.0,80.000000,2006.0,713.0,0.0
817,1689.0,13265.0,1568.0,350.0,857.0,2002.0,70.049958,2002.0,1218.0,59.0
302,1541.0,13704.0,1541.0,1541.0,843.0,2001.0,118.000000,2002.0,0.0,81.0
...,...,...,...,...,...,...,...,...,...,...
763,2365.0,9430.0,1252.0,89.0,856.0,1999.0,82.000000,1999.0,1163.0,128.0
835,1067.0,9600.0,1067.0,625.0,436.0,1950.0,60.000000,1995.0,442.0,0.0
1216,1902.0,8930.0,0.0,0.0,539.0,1978.0,68.000000,1978.0,0.0,0.0
559,1557.0,3196.0,1374.0,1374.0,420.0,2003.0,70.049958,2004.0,0.0,20.0


In [None]:

# Apply the preprocessing pipeline to the validation features. Only
# transform() is called here, so nothing is refitted on validation data.
# A copy is handed to the pipeline so X_valid itself is left untouched.
X_valid_eval = eval_set_pipe.transform(X_valid.copy())

# Display the transformed values with the original row labels re-attached.
pd.DataFrame(data=X_valid_eval, index=X_valid.index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
529,2515.0,32668.0,2035.0,816.0,484.0,1957.0,70.049958,1975.0,1219.0,0.0
491,1578.0,9490.0,806.0,238.0,240.0,1941.0,79.000000,1950.0,403.0,0.0
459,1203.0,7015.0,709.0,524.0,352.0,1950.0,70.049958,1950.0,185.0,0.0
279,2022.0,10005.0,1160.0,768.0,505.0,1977.0,83.000000,1977.0,392.0,117.0
655,1092.0,1680.0,525.0,525.0,264.0,1971.0,21.000000,1971.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
326,1719.0,10846.0,1719.0,100.0,473.0,1993.0,32.000000,1993.0,1619.0,30.0
440,2402.0,15431.0,3094.0,788.0,672.0,2008.0,105.000000,2008.0,1767.0,72.0
1387,2526.0,8520.0,714.0,0.0,216.0,1916.0,60.000000,1950.0,168.0,15.0
1323,708.0,5330.0,420.0,140.0,0.0,1940.0,50.000000,1950.0,280.0,0.0


In [None]:
import xgboost

# Train with early stopping: boosting halts once the validation MAE has not
# improved for 20 consecutive rounds. The validation pair is
# (X_valid_eval, y_test); y_test holds the validation targets.
#
# Passing early_stopping_rounds / eval_metric to fit() has been deprecated
# since XGBoost 1.6 and is rejected with a TypeError by recent releases, so
# they are set on the estimator when the installed version supports that.
# Older versions keep the legacy fit() keywords.
_xgb_major_minor = tuple(int(part) for part in xgboost.__version__.split(".")[:2])
if _xgb_major_minor >= (1, 6):
    my_model.set_params(early_stopping_rounds=20, eval_metric="mae")
    my_model.fit(X_train_eval, y_train, eval_set=[(X_valid_eval, y_test)])
else:
    my_model.fit(
        X_train_eval,
        y_train,
        early_stopping_rounds=20,
        eval_metric="mae",
        eval_set=[(X_valid_eval, y_test)],
    )

# Keep the fitted estimator as the cell's displayed result, as before.
my_model

[0]	validation_0-mae:172724.89330
[1]	validation_0-mae:164479.10580
[2]	validation_0-mae:156620.85147
[3]	validation_0-mae:149130.27496
[4]	validation_0-mae:141967.23223
[5]	validation_0-mae:135129.99664
[6]	validation_0-mae:128648.18203
[7]	validation_0-mae:122467.15481
[8]	validation_0-mae:116542.54843
[9]	validation_0-mae:111105.48828
[10]	validation_0-mae:105853.58788
[11]	validation_0-mae:100892.56562
[12]	validation_0-mae:96176.80148
[13]	validation_0-mae:91630.22878
[14]	validation_0-mae:87472.89632
[15]	validation_0-mae:83379.94902
[16]	validation_0-mae:79542.51262
[17]	validation_0-mae:75939.61226
[18]	validation_0-mae:72506.74314
[19]	validation_0-mae:69203.04815




[20]	validation_0-mae:66146.91251
[21]	validation_0-mae:63156.61973
[22]	validation_0-mae:60323.12285
[23]	validation_0-mae:57774.99591
[24]	validation_0-mae:55195.03588
[25]	validation_0-mae:52823.92529
[26]	validation_0-mae:50573.82172
[27]	validation_0-mae:48560.00040
[28]	validation_0-mae:46625.24086
[29]	validation_0-mae:44730.39513
[30]	validation_0-mae:42907.28145
[31]	validation_0-mae:41223.47985
[32]	validation_0-mae:39635.93318
[33]	validation_0-mae:38216.55920
[34]	validation_0-mae:36827.72357
[35]	validation_0-mae:35555.82037
[36]	validation_0-mae:34365.42479
[37]	validation_0-mae:33228.77686
[38]	validation_0-mae:32164.91425
[39]	validation_0-mae:31158.74995
[40]	validation_0-mae:30210.34886
[41]	validation_0-mae:29327.83734
[42]	validation_0-mae:28553.60604
[43]	validation_0-mae:27895.07916
[44]	validation_0-mae:27270.33772
[45]	validation_0-mae:26624.24569
[46]	validation_0-mae:26086.50102
[47]	validation_0-mae:25568.65616
[48]	validation_0-mae:25094.32653
[49]	validatio

In [None]:
# NOTE(review): accuracy_score is imported but not used in this cell.
from sklearn.metrics import accuracy_score, mean_absolute_error

# Predict sale prices for the preprocessed validation features.
preds = my_model.predict(X_valid_eval)

# Mean absolute error between the held-out targets and the predictions.
score = mean_absolute_error(y_true=y_test, y_pred=preds)
print('MAE:', score)



MAE: 19265.368739297945


In [None]:
import pickle
import os

# Persist the fitted regressor to ./house-prices/XGBOOST.pkl.
# The previous code pickled an undefined name (the notebook's title), which
# raised NameError; the object to save is the trained estimator, my_model.
# Only the estimator is written here. The preprocessing pipeline is not part
# of the pickle, so consumers must apply the same transform before predicting.
model_dir = './house-prices'

# Create the target directory if needed so open() cannot fail on a missing path.
os.makedirs(model_dir, exist_ok=True)

# The context manager guarantees the file handle is flushed and closed.
with open(os.path.join(model_dir, 'XGBOOST.pkl'), 'wb') as model_file:
    pickle.dump(my_model, model_file, protocol=4)

NameError: ignored