In [41]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from statsmodels.api import OLS
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [42]:
training_data = pd.read_csv("house-prices-advanced-regression-techniques/train.csv", index_col="Id")
testing_data = pd.read_csv("house-prices-advanced-regression-techniques/test.csv", index_col="Id")

In [43]:
all_data = pd.concat((training_data, testing_data)).reset_index(drop=True)
all_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500.0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500.0
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2006,WD,Normal,
2915,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Abnorml,
2916,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Abnorml,
2917,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [44]:
def ConvertStringToInt(df):
  columns_to_encode = list(df.select_dtypes(include=['category','object'])) #gets all columns with non-numeric values
  le = LabelEncoder()
  for feature in columns_to_encode:
      try:
          df[feature] = le.fit_transform(df[feature])
      except:
          print('Error encoding '+feature)
  return df

In [45]:
all_data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=1, inplace=True)
columns_has_null = all_data.isna().sum().sort_values(ascending=False)
columns_has_null[columns_has_null > 0]

SalePrice       1459
LotFrontage      486
GarageCond       159
GarageYrBlt      159
GarageFinish     159
GarageQual       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
BsmtFullBath       2
Utilities          2
Functional         2
BsmtFinSF1         1
SaleType           1
TotalBsmtSF        1
Electrical         1
KitchenQual        1
BsmtFinSF2         1
Exterior2nd        1
Exterior1st        1
GarageCars         1
GarageArea         1
BsmtUnfSF          1
dtype: int64

In [46]:
all_data = ConvertStringToInt(all_data)
all_data = all_data.replace({np.NaN:0})

# Forest regression starts here

In [47]:
training_data = all_data.iloc[:1460, :]
testing_data = all_data.iloc[-1459:, :]
training_data.shape, testing_data.shape

((1460, 75), (1459, 75))

In [48]:
X = all_data.iloc[:, :-1]
y = all_data.iloc[:, -1:]
X_train, X_test = training_data.iloc[:,:-1], testing_data.iloc[:,:-1]
#X_test
y_train, y_test = training_data.iloc[:,-1:], testing_data.iloc[:,-1:]
#y_test.drop(['SalePrice'], axis=1, inplace=True)
X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1460,20,2,80.0,11622,1,3,3,0,4,0,...,0,0,0,120,0,0,6,2010,8,4
1461,20,3,81.0,14267,1,0,3,0,0,0,...,36,0,0,0,0,12500,6,2010,8,4
1462,60,3,74.0,13830,1,0,3,0,4,0,...,34,0,0,0,0,0,3,2010,8,4
1463,60,3,78.0,9978,1,0,3,0,4,0,...,36,0,0,0,0,0,6,2010,8,4
1464,120,3,43.0,5005,1,0,1,0,4,0,...,82,0,0,144,0,0,1,2010,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,4,21.0,1936,1,3,3,0,4,0,...,0,0,0,0,0,0,6,2006,8,4
2915,160,4,21.0,1894,1,3,3,0,4,0,...,24,0,0,0,0,0,4,2006,8,0
2916,20,3,160.0,20000,1,3,3,0,4,0,...,0,0,0,0,0,0,9,2006,8,0
2917,85,3,62.0,10441,1,3,3,0,4,0,...,32,0,0,0,0,700,7,2006,8,4


In [49]:
forest = RandomForestRegressor()
forest.fit(X_train, y_train)

  forest.fit(X_train, y_train)


# Random Forest Model Prediction for 50% of missing housing data

In [54]:
print("model score on training data:", forest.score(X_train, y_train))
#print("model score on test data:", forest.score(X_test, y_test)) # we do not need this as we do not yet have any prices to compare the random forest model for the missing data
y_predicted = forest.predict(X_test) # This becomes our predicted prices for the missing dataset using forest prediction
y_predicted

model score on training data: 0.8556527707780708


array([123283.8724, 155804.8366, 177116.2132, ..., 153259.5777,
       113740.7864, 226950.1196])

# Reverse Forest 50:50 split to verify that the prediction is correct

In [55]:
forest = RandomForestRegressor()
forest.fit(X_test, y_predicted)

In [56]:
print("model score on training data:", forest.score(X_test, y_predicted))
print("model score on test data:", forest.score(X_train, y_train))

model score on training data: 0.998400692296273
model score on test data: 0.8430246080111112


In [60]:
y_predicted = pd.DataFrame(y_predicted, columns = ['SalePrice'])
y_all_data = pd.concat((y_train, y_predicted)).reset_index(drop=True)

testing_data = pd.read_csv("house-prices-advanced-regression-techniques/test.csv", index_col="Id")
testing_data["SalePrice"] = y_predicted
testing_data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,6,2010,WD,Normal,
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Gar2,12500,6,2010,WD,Normal,
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,,0,3,2010,WD,Normal,
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,6,2010,WD,Normal,
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,1,2010,WD,Normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2006,WD,Normal,
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Abnorml,
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Abnorml,
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [None]:
X_all_data = pd.concat((X_train, X_test)).reset_index(drop=True)
X_all_data