In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Đọc và xử lý data

In [2]:
# Load both splits from disk.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Stack the feature columns (MSSubClass through SaleCondition, inclusive) of
# train and test into a single frame: train rows first, then test rows.
all_data = pd.concat(
    [split.loc[:, 'MSSubClass':'SaleCondition'] for split in (train, test)]
)

In [3]:
def correct(data):
    """Return a model-ready copy of ``data``.

    Steps, in order:

    1. Compute the skewness of every numeric column (missing values are
       dropped for the computation only).
    2. Replace each numeric column whose skewness exceeds 0.75 with its
       ``log1p`` transform.
    3. One-hot encode the remaining non-numeric columns with
       ``pd.get_dummies``.
    4. Fill any remaining missing values with the mean of their column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature table. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The transformed feature table.
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    data = data.copy()

    # Select numeric columns by dtype family rather than "anything that is not
    # object", so string/category/datetime columns are never sent to skew().
    numeric_cols = data.select_dtypes(include='number').columns

    # Skewness (not standard deviation) per numeric column, ignoring NaNs.
    skewness = data[numeric_cols].apply(lambda col: skew(col.dropna()))
    skewed_cols = skewness[skewness > 0.75].index

    # log1p compresses the long right tail of strongly skewed features.
    data[skewed_cols] = np.log1p(data[skewed_cols])

    data = pd.get_dummies(data)
    return data.fillna(data.mean())

In [4]:
# Build the processed feature matrix under a NEW name and from a copy of the
# raw frame, so re-running this cell always starts from the same input
# instead of transforming already-transformed data.
all_features = correct(all_data.copy())

# Log-transformed target. It is computed into `y` without writing back into
# `train`, so a second run of this cell cannot apply log1p twice.
y = np.log1p(train['SalePrice'])

# `all_data` was built with the train rows first, so the first n_train rows
# are the training features and the remainder are the test features.
n_train = train.shape[0]
X_train = all_features.iloc[:n_train]
X_test = all_features.iloc[n_train:]

# Grid Search

In [5]:
# Hyper-parameter grid for the random forest.
# NOTE(review): 'mse' / 'mae' are the criterion names of older scikit-learn
# releases; newer releases call them 'squared_error' / 'absolute_error'.
# Confirm against the installed version before upgrading.
parameter = {
    'criterion' : ['mse', 'mae'],
    'warm_start' : [True, False]
}

# A fixed random_state makes the search reproducible: without it every run
# grows different forests and the "best" parameters can change from run to
# run. cv=4 -> 4-fold cross-validation for each parameter combination.
clf = GridSearchCV(
    RandomForestRegressor(n_estimators=10, random_state=0),
    parameter,
    cv=4,
)
clf.fit(X_train, y)

# Best combination found; displayed as the cell output.
grs = clf.best_params_
grs

{'criterion': 'mse', 'warm_start': True}

# Model

In [6]:
# Final model: a 10-tree forest configured with the grid-search winners.
model = RandomForestRegressor(
    n_estimators=10,
    criterion=grs['criterion'],
    warm_start=grs['warm_start'],
)
# Fit on the full training set; the fitted estimator is the cell output.
model.fit(X_train, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=True)

In [7]:
# The model was trained on log1p(SalePrice), so predictions are in log space.
log_predictions = model.predict(X_test)

# expm1 is the exact inverse of log1p and runs vectorised over the whole
# array; `result` stays a plain list of prices.
result = np.expm1(log_predictions).tolist()

# Xuất File

In [8]:
# One row per prediction; Ids are consecutive integers starting at 1461.
n_rows = len(result)
submit = {'Id' : range(1461, 1461 + n_rows), 'SalePrice' : result}

# Write the submission file without the DataFrame index column.
pd.DataFrame(submit).to_csv('data/submission.csv', index=False)