In [1]:
import pandas as pd
import numpy as np

# Read the pre-split train and test CSVs from the working directory.
# The leading numbers in the file names (1727 / 191) are presumably the row
# counts of each split -- TODO confirm against the data.
train_csv = './1727-nonsquare-train-from-1918-nonsquare-spmm-over-3s.csv'
test_csv = './191-nonsquare-test-from-1918-nonsquare-spmm-over-3s.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [2]:
# Input columns, selected by name from the CSV. Columns such as 'lr*lc' are
# read as-is from the file; nothing is multiplied here.
feature_columns = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd',
]
# Name of the regression target column.
target_column = 'bz_smsm'

# Train + Valid
X_train, y_train = train[feature_columns], train[target_column]

# Test
X_test, y_test = test[feature_columns], test[target_column]

In [3]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both arguments are converted to NumPy arrays, so lists, arrays and
    pandas Series are compared element by element in positional order.

    Args:
        y_test: Ground-truth values (the denominator of the ratio).
        y_pred: Predicted values.

    Returns:
        mean(|y_test - y_pred| / |y_test|) * 100.

    Note:
        There is no guard for zeros in ``y_test``; such entries divide by
        zero and propagate inf/nan into the result.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Inputs are coerced to float NumPy arrays first, matching what
    ``mean_absolute_percentage_error`` does. This makes plain Python lists
    work, compares pandas Series positionally instead of aligning them on
    their index (which would inject NaN for non-matching labels), and avoids
    integer overflow when squaring large integer residuals.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like of the same length).

    Returns:
        sqrt(mean((y_pred - y_true) ** 2)) as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(y_pred - y_true)))

def custom_scoring(y_test, y_pred):
    """Score predictions by MAPE (in percent); lower is better.

    This is the callable handed to ``make_scorer(..., greater_is_better=False)``
    for the grid search, so it runs once per fold and candidate. The previous
    version also computed an RMSE on every call and then discarded it; that
    dead computation has been removed.

    Args:
        y_test: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The value of ``mean_absolute_percentage_error(y_test, y_pred)``.
    """
    return mean_absolute_percentage_error(y_test, y_pred)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer


# Hyper-parameter grid: 5 * 4 * 6 * 5 = 600 candidate combinations.
grid = {
    'max_depth' : [16,18,20,22,24],
    'min_samples_leaf' : [1,2,3,4],
    'min_samples_split' : [2,3,4,6,8,10],
    'n_estimators' : [200,400,600,800,1000]
}

# Seed the forest as well as the splitter. With only KFold seeded, every run
# trains differently randomised forests, so the CV scores (and therefore the
# selected parameters) change from one run to the next.
regressor = RandomForestRegressor(n_jobs=-1, random_state=30)

# 9-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=9,
           shuffle=True,
           random_state=30
          )

# custom_scoring returns MAPE (lower is better); greater_is_better=False makes
# the scorer negate it so that GridSearchCV's "maximise the score" rule picks
# the candidate with the smallest MAPE.
regressor_grid = GridSearchCV(regressor,
                              param_grid = grid,
                              scoring = make_scorer(custom_scoring, greater_is_better=False),
                              cv=kf,
                              n_jobs=-1,
                              verbose=3)

regressor_grid.fit(X_train, y_train)

print("최고의 파라미터 :", regressor_grid.best_params_)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1056 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1472 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1952 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2496 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  6.5min finished


최고의 파라미터 : {'criterion': 'mse', 'max_depth': 18, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [5]:
# Final model, trained on the full training set with hand-picked parameters.
# NOTE(review): these values are hard-coded rather than read from
# regressor_grid.best_params_ -- confirm they are the intended configuration.
#
# `criterion` is intentionally left at its default. The default is the
# squared-error criterion that the old literal 'mse' named, and passing 'mse'
# explicitly is rejected by scikit-learn >= 1.2, while the default works on
# every version.
regressor = RandomForestRegressor(
    max_depth=18,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=400,
    random_state=30,  # fixed seed so the fitted forest and its metrics are reproducible
)
regressor.fit(X_train, y_train)

# Predictions of the final model on both splits.
y_pred_train = regressor.predict(X_train)
y_pred = regressor.predict(X_test)

In [6]:
# Evaluate the fitted `regressor`. The earlier version of this cell called
# `best_model`, a name that is never assigned anywhere in the notebook, so it
# raised NameError on a fresh kernel. `regressor` is the model fitted just
# above and the one whose feature importances are plotted below.

# Predictions on the training data
y_train_pred = regressor.predict(X_train)
print("-------- 훈련데이터 예측 --------------------------")
print("rmse : {}".format(np.sqrt(mean_squared_error(y_train, y_train_pred))))
print("mape : {}%".format(mean_absolute_percentage_error(y_train, y_train_pred)))
print("\n")

# Predictions on the test data
y_pred = regressor.predict(X_test)
print("-------- 테스트데이터 예측 -------------------------")
print("rmse : {}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
print("mape : {}%".format(mean_absolute_percentage_error(y_test, y_pred)))
print("\n")

--------train 에러율--------
rmse : 29321.784605081888
mape : 27.040270422963737
--------test 에러율--------
rmse : 29321.784605081888
mape : 43.37778453651863


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of regressor.feature_importances_, one bar per column
# of X_train, labelled with the column names.
n_feature = X_train.shape[1]
index = np.arange(n_feature)

# Work on the current Axes explicitly instead of through the pyplot wrappers.
ax = plt.gca()
ax.barh(index, regressor.feature_importances_, align='center')

# One tick per feature, labelled with its column name.
ax.set_yticks(index)
ax.set_yticklabels(X_train.columns)

# Leave one bar-width of padding below the first and above the last bar.
ax.set_ylim(-1, n_feature)

ax.set_xlabel('feature importance', size=15)
ax.set_ylabel('feature', size=15)

plt.show()