特徴量の追加

StratifiedKFoldを使用して、クロスバリデーションによるモデルの評価を行った

In [1]:
#エラーを修正したい

# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# 特徴量の追加
train["CPU_performance"] = train["battery_power"] * train["clock_speed"]
train["screen_area"] = train["sc_h"] * train["sc_w"]
train["total_camera_pixels"] = train["pc"] + train["fc"] + train["m_dep"]
train["total_features"] = train["ram"] + train["sc_h"] + train["sc_w"] + train["battery_power"]

test["CPU_performance"] = test["battery_power"] * test["clock_speed"]
test["screen_area"] = test["sc_h"] * test["sc_w"]
test["total_camera_pixels"] = test["pc"] + test["fc"] + test["m_dep"]
test["total_features"] = test["ram"] + test["sc_h"] + test["sc_w"] + test["battery_power"]

# 特徴量と目的変数に分ける
X = train.drop("price_range", axis=1)
y = train["price_range"]
X_test = test.copy()

# 特徴量のスケーリング
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# モデルの作成
params = {'n_estimators': 1000,
          'objective': 'multiclass',
          'num_class': 4,
          'learning_rate': 0.05,
          'max_depth': 8,
          'colsample_bytree': 0.8,
          'subsample': 0.8,
          'reg_alpha': 0.01,
          'reg_lambda': 10,
          'importance_type': 'gain',
          'n_jobs': -1,
          'random_state': 42}

importance = [750, 700, 600, 150, 100, 50]
feature_importance = {f'feature_{i}': importance[i] for i in range(len(importance))}
params['feature_importance'] = feature_importance

model = LGBMClassifier(**params)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_list = [] 
conf_mat_list = [] 
y_pred_list = [] 
for train_index, valid_index in skf.split(X_scaled, y): 
    #データを分割する 
    X_train, X_valid = X_scaled[train_index], X_scaled[valid_index] 
    y_train, y_valid = y[train_index], y[valid_index]

    # ハイパーパラメータの探索
    params = {'n_estimators': [1000, 2000],
              'max_depth': [6, 8],
              'colsample_bytree': [0.7, 0.8, 0.9],
              'subsample': [0.7, 0.8],
              'reg_alpha': [0.01, 0.1],
              'reg_lambda': [1, 10]}

    grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

# モデルの作成
model = LGBMClassifier(**params, **best_params, subsample=0.7, reg_lambda=1)


# モデルの学習
model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50, verbose=0)

# テストデータの予測
y_pred = model.predict(X_test_scaled)
y_pred_list.append(y_pred)

# 評価
acc = accuracy_score(y_valid, model.predict(X_valid))
conf_mat = confusion_matrix(y_valid, model.predict(X_valid))
acc_list.append(acc)
conf_mat_list.append(conf_mat)

#予測
model.fit(X, y) 
y_pred_test = model.predict(X_test)

#出力
submission = pd.DataFrame({"id": test["id"], "price_range": y_pred_test}) 
submission.to_csv("./submission/submission_lgbm_v4.csv", index=False)


In [2]:
submission

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
