In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local modules (such as EDA) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from EDA import get_test, get_train, save_pred

In [4]:
# Load the training features/targets and the test features through the EDA helpers.
# NOTE(review): get_train/get_test live in the EDA module; whatever cleaning or
# encoding they apply is not visible from this notebook - confirm there.
X_train, y_train = get_train()
X_test = get_test()


In [5]:
# Sanity check: training shape, test shape, and test shape once rows with
# missing values are dropped (equal shapes mean no row contains a NaN).
print(X_train.shape, X_test.shape, X_test.dropna().shape, sep="\n")

(19168, 19)
(6966, 19)
(6966, 19)


In [6]:
# Train a random forest to solve the regression problem
def random_forest_result(param_grid, random_state=None):
    """Grid-search a RandomForestRegressor and predict the test set.

    Runs ``GridSearchCV`` over ``param_grid`` on the notebook-level
    ``X_train`` / ``y_train``, ranking candidates by negative root mean
    squared error, prints the winning parameter combination and returns the
    predictions of the selected model for the notebook-level ``X_test``.

    Parameters
    ----------
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded unchanged to ``GridSearchCV``.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestRegressor``. The default ``None``
        keeps the original (unseeded) behaviour; pass an integer to make
        repeated runs comparable.

    Returns
    -------
    Predictions for ``X_test`` as returned by ``GridSearchCV.predict``.
    """
    search = GridSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_grid,
        scoring="neg_root_mean_squared_error",
    )
    search.fit(X_train, y_train)
    print('Best RandomForestRegressor regressor: {}'.format(search.best_params_))

    # Predict exactly once: the earlier version ran predict() twice and threw
    # the first result away.
    return search.predict(X_test)

In [6]:
# Baseline: a random forest left at scikit-learn's default hyper-parameters.
regressor = RandomForestRegressor().fit(X_train, y_train)

# Show how much each input feature contributes to the fitted forest.
print(regressor.feature_importances_)

# Predict the test set and hand the result to save_pred together with the
# target path (presumably written out as CSV - see EDA.save_pred).
y_pred = regressor.predict(X_test)
out_file = "data/result_random_forest_1.csv"
save_pred(y_pred, out_file)


[1.11099384e-01 1.72903670e-01 3.07083624e-01 1.73388233e-01
 1.15772299e-03 1.90753659e-03 1.83548085e-01 1.29332064e-02
 2.81785173e-02 3.26130038e-03 3.08103985e-04 7.47658096e-04
 7.82782260e-04 8.50354415e-05 5.34736754e-04 1.63897114e-04
 5.35032265e-04 3.32465414e-04 1.04900912e-03]


In [7]:
# Train AdaBoost to solve the regression problem
def ada_boost_result(params):
    """Grid-search an AdaBoostRegressor over decision trees and predict the test set.

    Runs ``GridSearchCV`` over ``params`` on the notebook-level ``X_train`` /
    ``y_train``, ranking candidates by negative root mean squared error,
    prints the winning parameter combination and returns the predictions of
    the selected model for the notebook-level ``X_test``.

    scikit-learn 1.2 renamed AdaBoost's ``base_estimator`` argument to
    ``estimator`` and 1.4 removed the old name. The function detects which
    name the installed release accepts and rewrites grid keys such as
    ``base_estimator__max_depth`` / ``estimator__max_depth`` to match, so the
    same grid works on either side of the rename. On releases that only know
    ``base_estimator`` nothing is rewritten. On newer releases the printed
    best parameters use the ``estimator__`` spelling.

    Parameters
    ----------
    params : dict or list of dict
        Hyper-parameter grid for ``GridSearchCV``; nested tree parameters may
        be addressed with either the ``base_estimator__`` or ``estimator__``
        prefix.

    Returns
    -------
    Predictions for ``X_test`` as returned by ``GridSearchCV.predict``.
    """
    import inspect

    # Work out which keyword the installed AdaBoostRegressor understands.
    ctor_args = inspect.signature(AdaBoostRegressor).parameters
    if "estimator" in ctor_args:
        current, legacy = "estimator", "base_estimator"
    else:
        current, legacy = "base_estimator", "estimator"

    def _translate(grid):
        # Rename "<legacy>" and "<legacy>__*" keys to the supported spelling.
        renamed = {}
        for key, values in grid.items():
            if key == legacy or key.startswith(legacy + "__"):
                key = current + key[len(legacy):]
            renamed[key] = values
        return renamed

    # GridSearchCV accepts either one grid or a sequence of grids.
    if isinstance(params, dict):
        grid = _translate(params)
    else:
        grid = [_translate(single) for single in params]

    estimator = AdaBoostRegressor(**{current: DecisionTreeRegressor()})
    model = GridSearchCV(estimator, grid, scoring="neg_root_mean_squared_error")
    model.fit(X_train, y_train)
    # Store the parameters of the best model
    best_params = model.best_params_

    # Predict regression targets for the test data with the best found parameters
    y_pred = model.predict(X_test)

    print('Best AdaBoost (with DecisionTreeRegressor) regressor: {}'.format(best_params))
    return y_pred

In [8]:
# Search grid for the random forest: ensemble size crossed with maximum tree depth.
param_grid = {
    'n_estimators': [10, 20, 40, 50, 60, 80, 100, 120],
    'max_depth': [10, 20, 40, 50, 60, 80, 100],
}

# Tune, predict the test set, and show the size and a preview of the predictions.
y_pred = random_forest_result(param_grid)
print(y_pred.shape, y_pred)

# Hand the tuned-model predictions to save_pred with their own output path.
out_file = "data/result_random_forest.csv"
save_pred(y_pred, out_file)


Best RandomForestRegressor regressor: {'max_depth': 50, 'max_features': 3, 'n_estimators': 40, 'n_jobs': -1}
(6966,) [1426028.9246244  1716750.         1449072.5        ... 3267228.41666667
  627675.89285714 3825672.45375458]


In [10]:
# Search grid for AdaBoost: depth of the underlying trees, number of boosting
# rounds, and the learning rate.
params = {
    'base_estimator__max_depth': [5, 10, 20, 50, 100],
    "n_estimators": [5, 10, 20, 50, 100],
    "learning_rate": [0.01, 0.1, 0.5, 1],
}

# Tune, predict the test set, and show the size and a preview of the predictions.
y_pred_ada = ada_boost_result(params)
print(y_pred_ada.shape, y_pred_ada)

# Hand the AdaBoost predictions to save_pred with their own output path.
out_file = "data/result_ada_boost.csv"
save_pred(y_pred_ada, out_file)

dict_keys(['base_estimator__ccp_alpha', 'base_estimator__criterion', 'base_estimator__max_depth', 'base_estimator__max_features', 'base_estimator__max_leaf_nodes', 'base_estimator__min_impurity_decrease', 'base_estimator__min_impurity_split', 'base_estimator__min_samples_leaf', 'base_estimator__min_samples_split', 'base_estimator__min_weight_fraction_leaf', 'base_estimator__random_state', 'base_estimator__splitter', 'base_estimator', 'learning_rate', 'loss', 'n_estimators', 'random_state'])
Best AdaBoost (with DecisionTreeRegressor) regressor: {'base_estimator__max_depth': 10, 'learning_rate': 1, 'n_estimators': 20}
(6966,) [1591760.         1660307.31707317 1571892.68292683 ... 3259136.58536585
  608680.         3474980.37383178]
