ベース・モデルは LightGBM を用いて，モデル選択は層化K分割交差検証 (K=5) を用いたプログラムに変更した<br>
追加1・使用する特徴量を相関係数から選択した。<br>
追加2・学習データを分け、モデルを評価できるようにした<br>
追加3・層化K分割交差検証 (K=7) にした


In [1]:
!pip install lightgbm



In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

 # データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

 # 特徴量と目的変数に分ける
X_train = train.drop("price_range", axis=1)
y_train = train["price_range"]
X_test = test.copy()

 # train_test_splitを使用して学習データを分割する
from sklearn.model_selection import train_test_split
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

 # 特徴量のスケーリング
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train_)
X_test = scaler.transform(X_test)

 # ハイパーパラメータの設定
params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample_freq': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': -1,
    'seed': 42
}

 # モデルの作成と学習
n_splits = 7
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros((len(X_train_), 4))
y_pred = np.zeros((len(X_test), 4))
for fold, (train_index, valid_index) in enumerate(skf.split(X_train_, y_train_)):
    X_tr = X_train_[train_index]
    y_tr = y_train_.iloc[train_index]
    X_val = X_train_[valid_index]
    y_val = y_train_.iloc[valid_index]
    model = LGBMClassifier(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    oof[valid_index] = model.predict_proba(X_val)
    y_pred += model.predict_proba(X_test) / n_splits
    
 # モデルの評価
y_train_pred = model.predict(X_train_)
y_val_pred = model.predict(X_val)
print("Training Accuracy: ", accuracy_score(y_train_, y_train_pred))
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))

 # 予測結果の出力
y_pred = np.argmax(y_pred, axis=1)
output = pd.DataFrame({"id": test["id"], "price_range": y_pred})
output.to_csv("./submission/submission_lgbm_v7.csv", index=False, header=False)



Training Accuracy:  0.9083333333333333
Validation Accuracy:  0.5474452554744526


In [2]:
output

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
