### Baseline Model

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
train_data = pd.read_csv("../dataset/house_prices/train.csv")
test_data = pd.read_csv("../dataset/house_prices/test.csv")

In [4]:
train_data.shape

(1460, 81)

In [5]:
train_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
train_data_le = train_data.copy()
train_data_oh = train_data.copy()

In [7]:
le = LabelEncoder()
for col in train_data_le:
    train_data_le[col] = le.fit_transform(train_data_le[col].astype(str).values)

In [8]:
oh = OneHotEncoder()
train_data_oh = oh.fit_transform(train_data_oh)

In [9]:
train_data_le

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,9,3,75,859,1,2,3,3,0,4,0,5,2,2,0,5,7,4,104,53,1,1,12,13,1,84,2,4,2,2,3,3,2,499,5,0,173,624,1,...,1,4,647,343,0,422,1,0,2,1,3,1,2,10,6,0,5,1,89,1,2,234,4,4,2,0,166,0,0,0,0,3,4,4,0,4,2,8,4,343
1,572,4,3,90,1030,1,2,3,3,0,2,0,24,1,2,0,2,6,7,77,26,1,1,8,8,2,0,3,4,1,2,3,1,0,625,5,0,321,167,1,...,1,4,187,0,0,150,0,1,2,0,3,1,3,8,6,1,4,1,62,1,2,163,4,4,2,146,0,0,0,0,0,3,4,4,0,7,1,8,4,270
2,683,9,3,78,161,1,2,0,3,0,4,0,5,2,2,0,5,7,4,102,52,1,1,12,13,1,59,2,4,2,2,3,2,2,353,5,0,440,659,1,...,1,4,691,350,0,461,1,0,2,1,3,1,2,8,6,1,4,1,87,1,2,274,4,4,2,0,144,0,0,0,0,3,4,4,0,11,2,8,4,373
3,794,10,3,70,1021,1,2,0,3,0,0,0,6,2,2,0,5,7,4,19,20,1,1,13,15,2,0,3,4,0,3,1,3,0,185,5,0,509,562,1,...,1,4,721,291,0,427,1,0,1,0,3,1,2,9,6,1,2,5,84,2,3,298,4,4,2,0,134,78,0,0,0,3,4,4,0,4,0,8,0,125
4,905,9,3,94,386,1,2,0,3,0,2,0,15,2,2,0,5,8,4,101,50,1,1,12,13,1,185,2,4,2,2,3,0,2,466,5,0,480,96,1,...,1,4,107,15,0,625,1,0,2,1,4,1,2,11,6,1,4,1,86,1,3,394,4,4,2,74,188,0,0,0,0,3,4,4,0,3,2,8,4,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,508,9,3,72,799,1,2,3,3,0,4,0,8,2,2,0,5,6,4,100,50,1,1,12,13,2,0,3,4,2,2,3,3,5,0,5,0,755,683,1,...,1,4,714,254,0,380,0,0,2,1,3,1,3,9,6,1,4,1,85,1,2,163,4,4,2,0,140,0,0,0,0,3,4,4,0,10,1,8,4,245
1456,509,4,3,95,327,1,2,3,3,0,4,0,14,2,2,0,2,6,5,79,38,1,1,9,10,3,23,3,4,1,2,3,3,0,547,4,28,540,303,1,...,1,4,483,0,0,585,1,0,2,0,3,1,3,9,2,2,4,1,64,2,2,196,4,4,2,176,0,0,0,0,0,3,2,4,0,4,4,8,4,346
1457,510,10,3,76,947,1,2,3,3,0,4,0,6,2,2,0,5,7,8,44,56,1,1,5,5,2,0,0,2,4,3,1,3,2,219,5,0,715,101,1,...,1,4,138,50,0,658,0,0,2,0,4,1,2,11,6,2,2,1,29,1,1,38,4,4,2,0,165,0,0,0,0,3,0,2,7,7,4,8,4,458
1458,511,4,3,78,1037,1,2,3,3,0,4,0,12,2,2,0,2,5,5,51,46,3,1,8,8,2,0,3,4,1,3,3,2,2,355,4,1,0,51,1,...,1,0,56,0,0,38,1,0,1,0,2,1,2,7,6,0,5,1,36,2,1,33,4,4,2,188,0,5,0,0,0,3,4,4,0,6,4,8,4,130


In [10]:
# Handling Missing Value code & other preprocessing code here
# train_data.dropna(axis = 1, inplace = True)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
imputed_train_data = imputer.fit_transform(train_data_oh)

In [11]:
train_data_le.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [12]:
X = train_data_le.drop(["SalePrice"], axis = 1)
y = train_data_le.SalePrice

In [13]:
X.shape, y.shape

((1460, 80), (1460,))

In [14]:
X = X.select_dtypes(exclude=["object"])

In [15]:
X.shape, y.shape

((1460, 80), (1460,))

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)

In [45]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [18]:
mean_absolute_error(y_test, preds)

77.13106164383561

### Scaling 해보기

In [60]:
from mlxtend.preprocessing import minmax_scaling
import numpy as np

X_train_scaled = np.array(X_train)
X_test_scaled = np.array(X_test)
y_train_scaled = np.array(y_train)
y_test_scaled = np.array(y_test)
X_train_scaled = minmax_scaling(X_train_scaled, columns=[0])
y_train_scaled = minmax_scaling(y_train_scaled, columns=[0])
X_test_scaled = minmax_scaling(X_test_scaled, columns=[0])
y_test_scaled = minmax_scaling(y_test_scaled, columns=[0])
y_train_scaled = y_train_scaled.ravel()

In [61]:
newModel = RandomForestRegressor()
newModel.fit(X_train_scaled, y_train_scaled)

In [62]:
newPreds = newModel.predict(X_test_scaled)

In [63]:
mean_absolute_error(y_test_scaled, newPreds)

0.2575748419032395