### Baseline Model

In [66]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [67]:
train_data = pd.read_csv("../dataset/house_prices/train.csv")
test_data = pd.read_csv("../dataset/house_prices/test.csv")

In [68]:
train_data.shape

(1460, 81)

In [69]:
train_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [70]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
train_data_le = train_data.copy()
train_data_oh = train_data.copy()

In [71]:
le = LabelEncoder()
for col in train_data_le:
    train_data_le[col] = le.fit_transform(train_data_le[col].astype(str).values)

In [72]:
oh = OneHotEncoder()
train_data_oh = oh.fit_transform(train_data_oh)

In [73]:
train_data_le

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,9,3,75,859,1,2,3,3,0,...,0,3,4,4,0,4,2,8,4,343
1,572,4,3,90,1030,1,2,3,3,0,...,0,3,4,4,0,7,1,8,4,270
2,683,9,3,78,161,1,2,0,3,0,...,0,3,4,4,0,11,2,8,4,373
3,794,10,3,70,1021,1,2,0,3,0,...,0,3,4,4,0,4,0,8,0,125
4,905,9,3,94,386,1,2,0,3,0,...,0,3,4,4,0,3,2,8,4,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,508,9,3,72,799,1,2,3,3,0,...,0,3,4,4,0,10,1,8,4,245
1456,509,4,3,95,327,1,2,3,3,0,...,0,3,2,4,0,4,4,8,4,346
1457,510,10,3,76,947,1,2,3,3,0,...,0,3,0,2,7,7,4,8,4,458
1458,511,4,3,78,1037,1,2,3,3,0,...,0,3,4,4,0,6,4,8,4,130


In [74]:
# Handling Missing Value code & other preprocessing code here
# train_data.dropna(axis = 1, inplace = True)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
imputed_train_data = imputer.fit_transform(train_data_oh)

In [75]:
train_data_le.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [76]:
X = train_data_le.drop(["SalePrice"], axis = 1)
y = train_data_le.SalePrice

In [77]:
X.shape, y.shape

((1460, 80), (1460,))

In [78]:
X = X.select_dtypes(exclude=["object"])

In [79]:
X.shape, y.shape

((1460, 80), (1460,))

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)

In [81]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [82]:
mean_absolute_error(y_test, preds)

76.74626712328767