トレーニングデータとテストデータに分割して評価できるようにした
グリッドサーチでハイパーパラメータの最適化をした

In [1]:
!pip install lightgbm



In [1]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# 特徴量と目的変数に分ける
X = train.drop("price_range", axis=1)
y = train["price_range"]
X_test = test.copy()

# 特徴量のスケーリング
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# データを分割する
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ハイパーパラメータの設定
params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample_freq': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': -1,
    'seed': 42
}

# モデルの作成と学習
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros((len(X_train), 4))
y_pred = np.zeros((len(X_test), 4))
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr = X_train[train_index]
    y_tr = y_train[train_index]
    X_val = X_train[valid_index]
    y_val = y_train[valid_index]
    model = LGBMClassifier(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    oof[valid_index] = model.predict_proba(X_val)
    y_pred += model.predict_proba(X_test) / n_splits

# モデルの評価
y_pred = model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {acc}")
conf_mat = confusion_matrix(y_valid, y_pred)
print(f"Confusion matrix:\n{conf_mat}")

# 予測結果の出力
y_pred = model.predict(X_test)
output = pd.DataFrame({"index": test["id"], "price_range": y_pred})
output.to_csv("./submission/submission_lgbm_v2.csv", index=False, header=False)

KeyError: '[1, 5, 12, 14, 18, 21, 26, 34, 52, 56, 59, 75, 78, 84, 90, 94, 112, 117, 123, 128, 129, 137, 140, 141, 144, 149, 151, 184, 186, 190, 192, 203, 209, 222, 235, 240, 244, 247, 255, 259, 260, 281, 300, 317, 325, 326, 328, 330, 338, 346, 348, 356, 365, 366, 368, 378, 379, 381, 383, 390, 399, 403, 413, 427, 444, 445, 448, 459, 465, 472, 473, 477, 478, 480, 481, 482, 485, 491, 494, 495, 500, 507, 508, 510, 513, 516, 517, 529, 533, 541, 545, 550, 560, 565, 567, 569, 588, 591, 595, 596, 598, 605, 612, 616, 617, 623, 626, 633, 637, 638, 640, 643, 649, 655, 658, 660, 665, 672, 684, 696, 705, 724, 725, 734, 736, 740, 744, 758, 760, 762, 768, 775, 779, 781, 787, 792, 795, 797, 800, 806, 810, 818, 837, 843, 863, 894, 908, 931, 954, 959] not in index'

In [2]:
output

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
