# LIBRARY

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# DATA EXPLORATION

In [2]:
training_data = pd.read_csv('train.csv')
testing_data = pd.read_csv('test.csv')

In [3]:
training_data.shape

(1460, 81)

In [4]:
training_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
training_data.isnull().any()

Id               False
MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 81, dtype: bool

# DATA PROCESSING

In [7]:
# Keep NA for categorical variables. For numeric variables,
# NaN is replaced with the median value
catCols = training_data.select_dtypes("object").columns
catCols = set(catCols)
for c in training_data.columns:
    if c in catCols:
        training_data[c].fillna("Undefined", inplace = True)
    else:
        training_data[c].fillna(training_data[c].median(), inplace = True)
for c in testing_data.columns:
    if c in catCols:
        testing_data[c].fillna("Undefined", inplace = True)
    else:
        testing_data[c].fillna(testing_data[c].median(), inplace = True)

In [7]:
# run training model over all features
Ttrain = training_data["SalePrice"]
Xtrain = training_data.drop("SalePrice", axis = 1)
Xtest = testing_data.copy() # to avoid mutating the original data

In [8]:
# Now I want to turn category into number to work with sklearn model
# using one hot encoding
# https://www.kaggle.com/albertespin/pre-processing-gradient-boosting-top-5
encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
encoder.fit(Xtrain[catCols])
one_hot_columns = encoder.get_feature_names(list(catCols))
train_one_hot = pd.DataFrame(encoder.transform(Xtrain[catCols]),
                            columns = one_hot_columns)
test_one_hot = pd.DataFrame(encoder.transform(Xtest[catCols]),
                           columns = one_hot_columns)
Xtrain.drop(columns = catCols, axis = 1, inplace = True)
Xtest.drop(columns = catCols, axis = 1, inplace = True)
# since we have new columns for each category the original one is not used anymore
Xtrain[train_one_hot.columns] = train_one_hot
Xtest[train_one_hot.columns] = test_one_hot

In [9]:
# standarization so all feature have similar influence on the model
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain = pd.DataFrame(scaler.transform(Xtrain), columns = Xtrain.columns)
Xtest = pd.DataFrame(scaler.transform(Xtest), columns = Xtest.columns)

In [10]:
# divide training data into training data and validation data 
# to work on Random Forest, Neural Network. and Gradient Boosting Tree
# and calculate error to see which method works best
train_x, val_x, train_y, val_y = train_test_split(Xtrain, Ttrain, random_state = 0)


In [11]:
# Random Forest Approach
forest_model = RandomForestRegressor(random_state = 1)
forest_model.fit(train_x, train_y)
forest_pred = forest_model.predict(val_x)
print(mean_absolute_error(val_y, forest_pred))

17270.212849315067


In [37]:
# Gradient Boosting Approach
gb_model = GradientBoostingRegressor(random_state = 1,
                                    n_estimators = 300)
gb_model.fit(train_x, train_y)
gb_pred = gb_model.predict(val_x)
print(mean_absolute_error(val_y, gb_pred))

16275.16764929074


In [39]:
val_y_pd = pd.DataFrame(val_y)
print(mean_absolute_error(val_y, gb_pred)/(val_y_pd.var(axis = 0)))

SalePrice    0.000002
dtype: float64


In [13]:
nn_model = MLPRegressor(random_state = 1)
nn_model.fit(train_x, train_y)
nn_pred = nn_model.predict(val_x)
print(mean_absolute_error(val_y, nn_pred))

175899.43473730152




In [16]:
# Since Gradient Boosting yielded lowest mean error,
#I applied the model to full training data and predict test data
gb_model.fit(Xtrain, Ttrain)
test_pred = gb_model.predict(Xtest)
test_pred

array([117801.24680246, 163547.66598632, 183674.40004204, ...,
       160431.67565927, 117203.42502095, 227554.76048316])

In [17]:
output = pd.DataFrame({'Id':testing_data.Id, 'SalePrice':test_pred})
output.to_csv('submission.csv', index = False)