In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local modules (e.g. EDA.py) take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from EDA import get_test, get_train, save_pred

In [3]:
# Load the data through the project's EDA helpers. These names are used as
# notebook-level globals by the model-fitting functions defined below.
# NOTE(review): presumably get_train() returns (features, target) and
# get_test() returns features with the same columns -- confirm in EDA.py.
X_train, y_train = get_train()
X_test = get_test()


In [4]:
# Sanity check: print the train and test shapes, then the test shape after
# dropping rows with missing values (an unchanged shape means no row had NaNs).
print(X_train.shape, X_test.shape, X_test.dropna().shape, sep="\n")

(19168, 19)
(6966, 19)
(6966, 19)


In [5]:
# Train a random forest to solve the regression problem.
def random_forest_result(param_grid):
    """Grid-search a RandomForestRegressor and predict on the test set.

    Fits on the notebook-level ``X_train`` / ``y_train`` and predicts on the
    notebook-level ``X_test``. Candidates are ranked by negative RMSE using
    GridSearchCV's default cross-validation settings.

    Parameters
    ----------
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    Predictions for ``X_test`` as returned by ``GridSearchCV.predict``
    (with the default ``refit=True`` this is the best estimator refit on
    the full training data).
    """
    estimator = RandomForestRegressor()
    model = GridSearchCV(estimator, param_grid, scoring="neg_root_mean_squared_error")
    model.fit(X_train, y_train)
    print('Best RandomForestRegressor regressor: {}'.format(model.best_params_))

    # Predict exactly once: the previous version ran predict() twice and
    # discarded the first result in an unused local.
    return model.predict(X_test)

In [10]:
# 
def ada_boost_result(params):
    """Grid-search an AdaBoostRegressor over decision trees and predict on the test set.

    Fits on the notebook-level ``X_train`` / ``y_train`` and predicts on the
    notebook-level ``X_test``. Candidates are ranked by negative RMSE.

    Parameters
    ----------
    params : dict or list of dict
        Hyper-parameter grid forwarded unchanged to ``GridSearchCV``. Keys
        prefixed with ``base_estimator__`` address the wrapped decision tree.

    Returns
    -------
    Predictions for ``X_test`` as returned by ``GridSearchCV.predict``.
    """
    # NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`
    # (old name removed in 1.4). Renaming it here also requires renaming the
    # `base_estimator__*` keys in every grid passed to this function.
    booster = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())
    search = GridSearchCV(booster, params, scoring="neg_root_mean_squared_error")
    search.fit(X_train, y_train)

    # Regression targets (not class labels) predicted by the best configuration.
    test_predictions = search.predict(X_test)

    print('Best AdaBoost (with DecisionTreeRegressor) regressor: {}'.format(search.best_params_))
    return test_predictions

In [8]:
# Hyper-parameter grid for the random forest search.
# NOTE(review): the saved output of this cell reports min_samples_leaf=100,
# which is not in this grid -- the output predates the grid; re-run the cell.
param_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
    # 1.0 (use all features) replaces the "auto" alias: for regressors it meant
    # the same thing, but it was deprecated in scikit-learn 1.1 and removed in
    # 1.3, where every "auto" candidate would fail to fit.
    'max_features': ["sqrt", "log2", 1.0],
    'min_samples_leaf': [5, 10, 20, 50],
    # Single-value entry: makes every candidate forest use all CPU cores.
    'n_jobs': [-1],
}
y_pred = random_forest_result(param_grid=param_grid)
print(y_pred.shape, y_pred)
out_file = "data/result_random_forest.csv"
# Hand the predictions to the project helper (presumably writes them to out_file).
save_pred(y_pred, out_file)


Best RandomForestRegressor regressor: {'max_features': 'log2', 'min_samples_leaf': 100, 'n_estimators': 20, 'n_jobs': -1}
(6966,) [1508805.30925693 1687553.24611076 4618456.52102155 ... 3243091.43905007
  652638.78175065 3264251.7328681 ]


In [12]:
# Hyper-parameter grid for the AdaBoost search; the `base_estimator__` prefix
# routes max_depth to the wrapped DecisionTreeRegressor.
params = {
    'base_estimator__max_depth': [10, 20, 50, 100, 200],
    "n_estimators": [5, 10, 20, 50, 100],
    "learning_rate": [0.01, 0.1, 0.5, 1],
}
y_pred_ada = ada_boost_result(params)
print(y_pred_ada.shape, y_pred_ada)
out_file = "data/result_ada_boost.csv"
# Hand the predictions to the project helper (presumably writes them to out_file).
save_pred(y_pred_ada, out_file)

Best AdaBoost (with DecisionTreeRegressor) regressor: {'base_estimator__max_depth': 10, 'learning_rate': 0.01, 'n_estimators': 50}
(6966,) [1212456.64739884 1667201.11111111 1593469.13580247 ... 3272106.18556701
  787500.         3646661.86440678]
