In [3]:
import os

In [4]:
def link_dummy_dataset(dataset_dir="./dataset", names=("train.csv", "test.csv")):
    """Fall back to the bundled dummy CSVs when the real dataset is absent.

    The fallback is triggered only when ``train.csv`` is missing from
    ``dataset_dir``. Each file in ``names`` that is missing is then
    symlinked to its counterpart in ``dataset_dir/dummy``.

    Parameters
    ----------
    dataset_dir : str
        Directory that should contain the CSV files.
    names : iterable of str
        File names to link; the first decision is always made on ``train.csv``.
    """
    if os.path.exists(os.path.join(dataset_dir, "train.csv")):
        return

    for name in names:
        link_path = os.path.join(dataset_dir, name)
        if os.path.exists(link_path):
            # A usable file is already there; linking over it would raise.
            continue
        if os.path.islink(link_path):
            # Dangling link (exists() is False but the link entry is present):
            # remove it so the cell can be re-run without FileExistsError.
            os.remove(link_path)

        # A relative symlink target is resolved from the directory that holds
        # the link, not from the current working directory, so the target must
        # be written relative to dataset_dir.
        target = os.path.join("dummy", name)
        if os.path.exists(os.path.join(dataset_dir, target)):
            os.symlink(target, link_path)


link_dummy_dataset()

In [5]:
# Essential libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Directory holding the CSV files. Paths are built by plain string
# concatenation, so the trailing slash is required.
bast_path = "./dataset/"
train, test = (pd.read_csv(f"{bast_path}{split}.csv") for split in ("train", "test"))

In [7]:
# Columns used as model inputs.
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

# Feature matrix and regression target taken from the training CSV.
X = train[features]
y = train["SalePrice"]

In [8]:
# Same feature columns as X, taken from the test CSV; get_csv predicts on this frame.
test_X = test[features]

In [9]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [11]:
# Summary statistics for every feature column.
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [13]:
# Random forest models
from sklearn.ensemble import RandomForestRegressor


# Keyword arguments shared by every candidate so the comparison is seeded identically.
_forest_defaults = {"random_state": 42}

# Baseline: 100 trees, everything else left at the library default.
model = RandomForestRegressor(n_estimators=100, **_forest_defaults)
# Same size, but splits are chosen by absolute error.
model_abs = RandomForestRegressor(
    n_estimators=100, criterion="absolute_error", **_forest_defaults
)
# Same size, trees capped at depth 7.
model_max_7 = RandomForestRegressor(n_estimators=100, max_depth=7, **_forest_defaults)
# Half as many trees as the baseline.
model_50 = RandomForestRegressor(n_estimators=50, **_forest_defaults)
# Twice as many trees; note it also requires 20 samples before a node may split.
model_200 = RandomForestRegressor(
    n_estimators=200, min_samples_split=20, **_forest_defaults
)

# Candidates in the order they are evaluated.
models = [model, model_abs, model_max_7, model_50, model_200]

In [14]:
from sklearn.metrics import mean_absolute_error

In [15]:
def get_mae(model, train_X, val_X, train_y, val_y):
    """Fit ``model`` on the training split and score it on the validation split.

    The model is fitted in place, so the object passed in is trained when
    this function returns.

    Returns
    -------
    The mean absolute error between ``val_y`` and the predictions for ``val_X``.
    """
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    return mean_absolute_error(val_y, val_predictions)

In [33]:
def get_mae_loop(models, train_X, val_X, train_y, val_y):
    """Evaluate every candidate model and report the one with the lowest MAE.

    Each model is fitted on the training split via ``get_mae`` and its
    validation MAE is printed. A summary line names the best model.

    Parameters
    ----------
    models : iterable
        Estimators accepted by ``get_mae``.
    train_X, val_X, train_y, val_y
        Training / validation features and targets, passed through unchanged.

    Returns
    -------
    The model with the lowest validation MAE, or ``None`` when ``models`` is
    empty or no MAE was comparable (for example, all NaN).
    """
    # Infinity instead of a magic number: any finite MAE beats it, whatever
    # the scale of the target.
    lowest_mae = float("inf")
    best_model = None
    for model in models:
        mae = get_mae(model, train_X, val_X, train_y, val_y)
        print(f"Validation MAE of {model}: {mae:,.0f}")

        if mae < lowest_mae:
            lowest_mae = mae
            best_model = model

    if best_model is None:
        # Nothing was selected; do not print a summary built from the sentinel.
        print("\nNo model produced a comparable validation MAE.")
        return None

    print(f"\nBest validation MAE is {best_model}'s: {lowest_mae:,.0f}")
    return best_model

In [34]:
# Fit each candidate on the training split and print its validation MAE.
get_mae_loop(models, train_X, val_X, train_y, val_y)

Validation MAE of RandomForestRegressor(random_state=42): 22,538
Validation MAE of RandomForestRegressor(criterion='absolute_error', random_state=42): 22,570
Validation MAE of RandomForestRegressor(max_depth=7, random_state=42): 23,045
Validation MAE of RandomForestRegressor(n_estimators=50, random_state=42): 22,411
Validation MAE of RandomForestRegressor(min_samples_split=20, n_estimators=200, random_state=42): 22,720

Bset validation MAE is RandomForestRegressor(n_estimators=50, random_state=42)'s: 22,411


In [103]:
# Example submission file from the same dataset directory as train/test.
sample_submission = pd.read_csv(bast_path + "sample_submission.csv")

In [105]:
# Ids in the order given by the sample submission; get_csv writes them out as the "Id" column.
submission_id = sample_submission["Id"].values

In [106]:
def get_csv(best_model, X, y, test_features=None, ids=None):
    """Refit ``best_model`` on ``X``/``y`` and build the submission table.

    Parameters
    ----------
    best_model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X, y
        Features and target used for the final fit.
    test_features : optional
        Rows to predict on. Defaults to the notebook-level ``test_X``.
    ids : optional
        Identifiers written to the ``Id`` column, one per predicted row.
        Defaults to the notebook-level ``submission_id``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Id`` and ``SalePrice``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if test_features is None:
        test_features = test_X
    if ids is None:
        ids = submission_id

    best_model.fit(X, y)
    pred = best_model.predict(test_features)

    # Fail with a clear message instead of a generic constructor error when
    # the ids come from a file that does not match the predicted rows.
    if len(pred) != len(ids):
        raise ValueError(
            f"Got {len(pred)} predictions for {len(ids)} ids; "
            "the test features and ids must describe the same rows."
        )

    output = pd.DataFrame({
        "Id": ids,
        "SalePrice": pred
    })
    return output

In [107]:
# Destination of the submission file, next to the input CSVs.
submission_path = f"{bast_path}submission.csv"

# model_50 is hard-coded here as the model to refit on all training rows.
csv = get_csv(model_50, X, y)
csv.to_csv(submission_path, index=False)

In [108]:
# Read the written submission back from disk to inspect it.
pd.read_csv(bast_path + "submission.csv")

Unnamed: 0,Id,SalePrice
0,1461,122605.00
1,1462,154828.00
2,1463,185276.00
3,1464,178936.00
4,1465,185677.58
...,...,...
1454,2915,82768.00
1455,2916,87948.00
1456,2917,154687.28
1457,2918,123660.00
