特徴量の追加

StratifiedKFoldを使用して、クロスバリデーションによるモデルの評価を行った

In [1]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# データの形式を確認
print(train.shape) # (2000, 22)
print(test.shape) # (1000, 21)

# 特徴量の追加
train["CPU_performance"] = train["battery_power"] * train["clock_speed"]
train["screen_area"] = train["sc_h"] * train["sc_w"]
train["total_camera_pixels"] = train["pc"] + train["fc"] + train["m_dep"]
train["total_features"] = train["ram"] + train["sc_h"] + train["sc_w"] + train["battery_power"]

test["CPU_performance"] = test["battery_power"] * test["clock_speed"]
test["screen_area"] = test["sc_h"] * test["sc_w"]
test["total_camera_pixels"] = test["pc"] + test["fc"] + test["m_dep"]
test["total_features"] = test["ram"] + test["sc_h"] + test["sc_w"] + test["battery_power"]

# 特徴量と目的変数に分ける
X = train.drop("price_range", axis=1)
y = train["price_range"]
X_test = test.copy()

# 特徴量のスケーリング
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# クロスバリデーションでモデルの評価を行う
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
acc_list = []
conf_mat_list = []
y_pred_list = []
for train_index, valid_index in skf.split(X, y):
    # データを分割する
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # ハイパーパラメータの設定
    params = {
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_error',
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'min_child_samples': 20,
        'subsample_freq': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbosity': -1,
        'seed': 42
    }

    # ハイパーパラメータのチューニング
    lgbm = LGBMClassifier()
    param_grid = {
        'num_leaves': [16, 31, 64],
        'learning_rate': [0.01, 0.05, 0.1],
        'min_child_samples': [10, 20, 30],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0, 0.1, 0.5]
    }

gs = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)

gs.fit(X_train, y_train)
best_params = gs.best_params_

# 最適なハイパーパラメータでモデルを構築
params.update(best_params)
model = LGBMClassifier(**params)
model.fit(X_train, y_train)

# バリデーションセットでの評価
y_pred = model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
conf_mat = confusion_matrix(y_valid, y_pred)

acc_list.append(acc)
conf_mat_list.append(conf_mat)
y_pred_list.append(model.predict(X_test))

# モデルの評価値を出力
print(f"Accuracy: {np.mean(acc_list):.4f}")
print(f"Confusion Matrix:\n{np.mean(conf_mat_list, axis=0)}")

#5つのモデルの平均をとってテストデータの予測値を算出
y_pred_mean = np.mean(y_pred_list, axis=0)

#予測結果をファイルに書き込む
submission = pd.DataFrame({'id': test['id'], 'price_range': y_pred_mean})
submission.to_csv("./submission/submission_lgbm_v4.csv", index=False, header=False)

(1200, 22)
(800, 21)
Accuracy: 0.4417
Confusion Matrix:
[[17. 15.  8.  3.]
 [ 7. 22. 14. 17.]
 [ 5. 17. 49. 13.]
 [ 5. 23.  7. 18.]]


In [2]:
submission

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
