# Public Score : 2.65071

# 環境
- Windows Server 2016 Standaard
- Python 3.6.7 (anaconda custom)
- Pandas 0.23.4
- joblib 0.13.2
- NumPy 1.15.4
- Scikit-Learn 0.20.2
- LightGBM 2.2.1
- XGBoost 0.80

In [1]:
import pandas as pd
import os
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# データの読み込み
csvのときは、**read_csv**だったが、tsvなので**read_table**を使用

In [2]:
df_train = pd.read_table("train.tsv").set_index("id")
df_test = pd.read_table("test.tsv").set_index("id")
target_col = "mpg"

## 欠損値の置き換え
"?"だと型変換が上手くできない。一般的な欠損値を表す値である、NaNへと置き換え

In [3]:
df_train.replace({"?": np.nan}, inplace=True)
df_test.replace({"?": np.nan}, inplace=True)

In [4]:
df_train["horsepower"] = df_train["horsepower"].astype(np.float32)
df_test["horsepower"] = df_test["horsepower"].astype(np.float32)

# 評価関数の作成
Scikit-LearnではRMSEはない為、MSEからルートを取り作成する

In [5]:
def rmse(y_true, predict):
    return np.sqrt(mean_squared_error(y_true, predict))
rmse_scorer = make_scorer(rmse)

# 車種の分類
車種に関する知識がない為、メーカー名でまとめる

In [6]:
makers = set([i[0] for i in df_train["car name"].str.split(" ")])

In [7]:
for maker in makers:
    df_train[maker] = df_train["car name"].str.contains(maker)
    df_test[maker] = df_test["car name"].str.contains(maker)

In [8]:
(df_train.drop([target_col], axis=1).columns == df_test.columns).any()

True

**True**なので、学習用データと評価用データの片方のみに含まれるメーカー名はない

In [9]:
df_train.drop(["car name"], axis=1, inplace=True)
df_test.drop(["car name"], axis=1, inplace=True)

# 前処理用のパイプラインを作成
欠損値をNaNへと置き換えたので、欠損値の補完をしなければならない<br>
とりあえず各列の中央値とした<br>
他の処理(スケーリングなど)を行う場合は、2行目のような形式で追加すれば、パイプラインを呼び出すことで自動で処理を施すことが出来る

In [10]:
preprocess = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])

In [11]:
X = df_train.drop([target_col], axis=1).values
y = df_train[target_col].values
X_test = df_test.values

X = preprocess.fit_transform(X)
X_test = preprocess.transform(X_test)

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
params = {
    "learning_rate": [0.1, 0.3, 0.5],
    "max_depth": [2,3,5,10],
#     "max_child_weight": np.arange(0.5, 1.5, 0.1),
#     "reg_lambda": np.arange(1, 5)
    "subsample": [0.5,0.8,0.9,1],
    "colsample_bytree": [0.5,1.0],
}

os.makedirs(".\\models\\", exist_ok=True)

print("XGBoost GridSearching...")
xgb_gs = GridSearchCV(XGBRegressor(), params, cv=5, scoring=rmse_scorer, iid=False).fit(X, y)
xgb_base = xgb_gs.best_estimator_
joblib.dump(xgb_gs, ".\\models\\gs_xgb.model")

print("LightGBM GridSearching...")
lgb_gs = GridSearchCV(LGBMRegressor(), params, cv=5, scoring=rmse_scorer, iid=False).fit(X, y)
lgb_base = lgb_gs.best_estimator_
joblib.dump(lgb_gs, ".\\models\\gs_lgb.model")

XGBoost GridSearching...
LightGBM GridSearching...


['.\\models\\gs_lgb.model']

In [14]:
lgb_path = ".\\lgb_models\\"
xgb_path = ".\\xgb_models\\"

os.makedirs(lgb_path, exist_ok=True)
os.makedirs(xgb_path, exist_ok=True)

In [15]:
xgb_cv = np.zeros(len(X))
lgb_cv = np.zeros(len(X))
mean_cv = np.zeros(len(X))

for cnt, idx in enumerate(kf.split(X)):
    print("\n", cnt+1, "Fold")
    X_train, y_train = X[idx[0]], y[idx[0]]
    X_valid, y_valid = X[idx[1]], y[idx[1]]
    
    X_train_, X_valid_ = X_train, X_valid
#     X_train_ = preprocess.fit_transform(X_train)
#     X_valid_ = preprocess.transform(X_valid)
    
    est_xgb = XGBRegressor(device="gpu",  min_child_weight=3, max_depth=7)
    est_lgb = LGBMRegressor(device="gpu", min_child_weight=4, max_depth=5)
#     est_xgb = xgb_base
#     est_lgb = lgb_base
    
    print("XGBoosting Fitting...")
    est_xgb.fit(X_train_, y_train)
    joblib.dump(est_xgb, "{}{}_{}Fold.mlmodel".format(xgb_path, "XGB", cnt+1))
    
    print("LightGBM Fitting...")
    est_lgb.fit(X_train_, y_train)
    joblib.dump(est_lgb, "{}{}_{}Fold.mlmodel".format(lgb_path, "LGB", cnt+1))
    
    xgb_cv[idx[1]] += est_xgb.predict(X_valid_)
    lgb_cv[idx[1]] += est_lgb.predict(X_valid_)


 1 Fold
XGBoosting Fitting...
LightGBM Fitting...

 2 Fold
XGBoosting Fitting...
LightGBM Fitting...

 3 Fold
XGBoosting Fitting...
LightGBM Fitting...

 4 Fold
XGBoosting Fitting...
LightGBM Fitting...

 5 Fold
XGBoosting Fitting...
LightGBM Fitting...


In [16]:
mean_cv = np.mean([xgb_cv, lgb_cv], axis=0)
rmse_mean = rmse(y, mean_cv)

In [17]:
rmse_xgb = rmse(y, xgb_cv)
rmse_lgb = rmse(y, lgb_cv)

In [18]:
rmse_lgb

3.1601745019168983

In [19]:
rmse_xgb

3.4235279937353322

In [20]:
rmse_mean

3.2176952547247417

In [21]:
lgb_models = sorted(os.listdir(lgb_path))
xgb_models = sorted(os.listdir(xgb_path))

In [22]:
lgb_predict = np.zeros(len(X_test))
xgb_predict = np.zeros(len(X_test))
mean_predict = np.zeros(len(X_test))

print("LightGBM Predicting...")
for model in lgb_models:
    est = joblib.load(os.path.join(lgb_path, model))
    lgb_predict += est.predict(X_test)

print("XGBoositng Predicting...")
for model in xgb_models:
    est = joblib.load(os.path.join(xgb_path, model))
    xgb_predict += est.predict(X_test)

LightGBM Predicting...
XGBoositng Predicting...


In [23]:
lgb_predict /= len(lgb_models)
xgb_predict /= len(xgb_models)
mean_predict = np.mean([lgb_predict, xgb_predict], axis=0)

In [24]:
pd.DataFrame(lgb_predict, index=df_test.index).to_csv("LGBM_submit.csv", header=False)
pd.DataFrame(xgb_predict, index=df_test.index).to_csv("XGB_submit.csv", header=False)
pd.DataFrame(mean_predict, index=df_test.index).to_csv("Mean_submit.csv", header=False)

In [25]:
xgb_base

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [26]:
est_lgb

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       device='gpu', importance_type='split', learning_rate=0.1,
       max_depth=5, min_child_samples=20, min_child_weight=4,
       min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
       objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)

# 相関に従い、特徴選択を行う
目的変数との相関が強い特徴変数を一つずつ追加する<br>
追加したことによりスコアが良くなるまで続ける

In [37]:
corr = df_train.corr().abs().sort_values(by=target_col, ascending=False)

In [38]:
corr_sort = list(corr.index[1:])

## XGBoost

In [39]:
est_xgb = XGBRegressor()
use_col = []
min_col = []
min_score = 100000000000000
for col in corr_sort:
    use_col.append(col)
    X_tmp = df_train[use_col]
    X_train, X_valid, y_train, y_valid = train_test_split(X_tmp, y)
    X_train = preprocess.fit_transform(X_train)
    X_valid = preprocess.transform(X_valid)
    est_xgb.fit(X_train, y_train)
    predict = est_xgb.predict(X_valid)
    score = rmse(y_valid, predict)
    if score > min_score:
        min_score = score
    min_col = use_col

In [40]:
min_col[:10]

['weight',
 'displacement',
 'horsepower',
 'cylinders',
 'model year',
 'origin',
 'acceleration',
 'amc',
 'vw',
 'datsun']

In [41]:
X_ = df_train[min_col]

In [42]:
est_xgb.fit(X_, y)
predict = est_xgb.predict(df_test[min_col])

In [43]:
pd.DataFrame(predict, index=df_test.index).to_csv("Submit_XGB.csv", header=False)

## LightGBM

In [45]:
est_lgb = LGBMRegressor()
use_col = []
min_col = []
min_score = 100000000000000
for col in corr_sort:
    use_col.append(col)
    X_tmp = df_train[use_col]
    X_train, X_valid, y_train, y_valid = train_test_split(X_tmp, y)
    X_train = preprocess.fit_transform(X_train)
    X_valid = preprocess.transform(X_valid)
    est_lgb.fit(X_train, y_train)
    predict = est_lgb.predict(X_valid)
    score = rmse(y_valid, predict)
    if score > min_score:
        min_score = score
    min_col = use_col

In [46]:
X_ = df_train[min_col]
est_lgb.fit(X_, y)
predict = est_lgb.predict(df_test[min_col])

In [47]:
pd.DataFrame(predict, index=df_test.index).to_csv("Submit_LGB.csv", header=False)