特徴量の追加

In [1]:
!pip install lightgbm



In [3]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")

# データの形式を確認
print(train.shape) # (2000, 22)
print(test.shape) # (1000, 21)

#特徴量の追加
train["CPU_performance"] = train["battery_power"] * train["clock_speed"]
train["screen_area"] = train["sc_h"] * train["sc_w"]
train["total_camera_pixels"] = train["pc"] + train["fc"] + train["m_dep"]
train["total_features"] = train["ram"] + train["sc_h"] + train["sc_w"] + train["battery_power"]

test["CPU_performance"] = test["battery_power"] * test["clock_speed"]
test["screen_area"] = test["sc_h"] * test["sc_w"]
test["total_camera_pixels"] = test["pc"] + test["fc"] + test["m_dep"]
test["total_features"] = test["ram"] + test["sc_h"] + test["sc_w"] + test["battery_power"]

# 特徴量と目的変数に分ける
X = train.drop("price_range", axis=1)
y = train["price_range"]
X_test = test.copy()

# 特徴量のスケーリング
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# データを分割する
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ハイパーパラメータの設定
params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample_freq': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': -1,
    'seed': 42
}

# ハイパーパラメータのチューニング
lgbm = LGBMClassifier()
param_grid = {
    'num_leaves': [16, 31, 64],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1.0],
    'reg_lambda': [0, 0.1, 1.0],
}
grid_search = GridSearchCV(lgbm, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


# 最適なハイパーパラメータを設定する
params = grid_search.best_params_

# モデルの作成と学習
model = LGBMClassifier(**params)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)

# モデルの評価
y_pred = model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {acc}")
conf_mat = confusion_matrix(y_valid, y_pred)
print(f"Confusion matrix:\n{conf_mat}")

# 予測結果の出力
y_pred = model.predict(X_test)
output = pd.DataFrame({"index": test["id"], "price_range": y_pred})
output.to_csv("./submission/submission_lgbm_v3.csv", index=False, header=False)

(1200, 22)
(800, 21)
Fitting 5 folds for each of 2187 candidates, totalling 10935 fits




Accuracy: 0.49583333333333335
Confusion matrix:
[[12 13  7 10]
 [ 5 23 12 20]
 [ 7 12 61  4]
 [11 16  4 23]]


In [2]:
output

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
