# Random Forestで機械学習の結果を解釈する

In [None]:
%matplotlib inline
from math import sqrt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Read the raw CSV (row 0 is the header row) and preview the first rows.
day_csv_path = '../data/org/day.csv'
df_date_data = pd.read_csv(day_csv_path, header=0)
df_date_data.head()

In [None]:
# Restrict the frame to the columns used in this analysis.
used_columns = [
    'cnt',
    'season', 'yr', 'mnth', 'holiday',
    'weathersit', 'temp', 'hum', 'windspeed',
]
df = df_date_data[used_columns]
df.head()

In [None]:
# Create dummy (one-hot) variables for the listed columns; the remaining
# columns are passed through unchanged.
ohe_columns = ['season', 'mnth', 'weathersit']
df_ohe = pd.get_dummies(data=df, columns=ohe_columns)

In [None]:
# Separate the encoded frame into features (X) and target (y).
# The target is selected by name instead of by position, so the split stays
# correct even if the column order of df_ohe ever changes.
TARGET_COLUMN = 'cnt'
X = df_ohe.drop(columns=TARGET_COLUMN)
y = df_ohe[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)

X.head()

### Random Forest + Feature Importance

In [None]:
# Fit a 10-tree random forest regressor on the training split
# (random_state is fixed so the fitted model is reproducible).
rf_model = RandomForestRegressor(n_estimators=10, random_state=50)
rf_model.fit(train_X, train_y)

# Evaluate on the held-out split: RMSE is the square root of the MSE.
pred_y = rf_model.predict(test_X)
mse = mean_squared_error(test_y, pred_y)
score = sqrt(mse)

rmse_message = 'RMSE: {}'.format(score)
print(rmse_message)

In [None]:
# Feature importance of the fitted forest, drawn as a horizontal bar chart.
# Author's observation: temp has a considerably large influence.
#
# Feature names are taken from train_X, the frame the model was fitted on,
# so they line up with the entries of feature_importances_.
features = train_X.columns
importances = rf_model.feature_importances_
# argsort is ascending and barh draws position 0 at the bottom, so the most
# important feature ends up at the top of the chart.
indices = np.argsort(importances)

fig, ax = plt.subplots(figsize=(10, 10))
positions = range(len(indices))
ax.barh(positions, importances[indices], color='b', align='center')
ax.set_yticks(positions)
ax.set_yticklabels(features[indices])
ax.set_xlabel('Feature importance')
ax.set_title('Random forest feature importances')
plt.show()

### Random Forest + Permutation Importance

In [None]:
# !pip install eli5
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of rf_model, scored on the training split.
# NOTE(review): held-out data (test_X, test_y) is the usual choice when the
# goal is importance for generalization — confirm the training split is intended.
perm = PermutationImportance(rf_model, random_state=1)
perm.fit(train_X, train_y)
eli5.show_weights(perm, feature_names=list(train_X.columns))

### Random Forest + Partial Dependence Plot

In [None]:
# !pip install pdpbox
from pdpbox import pdp, get_dataset, info_plots

# Build the partial-dependence data for one feature of the fitted model.
features = train_X.columns.values
feature_to_plot = 'temp'
pdp_dataset = train_X[features]
model_feature_names = list(features)
pdp_goals = pdp.pdp_isolate(
    model=rf_model,
    dataset=pdp_dataset,
    model_features=model_feature_names,
    feature=feature_to_plot,
)

# Draw the partial dependence plot.
# Author's note: the data is already normalized, so concrete values cannot
# be read off here, but the overall trend is visible.
pdp.pdp_plot(pdp_goals, feature_to_plot)
plt.show()