ベース・モデルは LightGBM を用いて，モデル選択は層化K分割交差検証 (K=5) を用いたプログラムに変更した<br>
追加１・使用する特徴量を相関係数から選択した。<br>
追加２・学習データを分け、モデルを評価できるようにした

One-hotエンコーディング

In [1]:
!pip install lightgbm



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# データの読み込み
train = pd.read_csv("./original/train.csv")
test = pd.read_csv("./original/test.csv")
# One-hot encoding
cols_to_encode = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
print(train.head())
train = pd.get_dummies(train, columns=cols_to_encode)
test = pd.get_dummies(test, columns=cols_to_encode)
print(train.head())
 # 特徴量と目的変数に分ける
X_train = train.drop("price_range", axis=1)
y_train = train["price_range"]
X_test = test.copy()
# train_test_splitを使用して学習データを分割する
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# 特徴量のスケーリング
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train_)
X_test = scaler.transform(X_test)
 # ハイパーパラメータの設定
params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_error',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'min_child_samples': 20,
    'subsample_freq': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'verbosity': -1,
    'seed': 42
}
# モデルの作成と学習
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
oof = np.zeros((len(X_train_), 4))
y_pred = np.zeros((len(X_test), 4))
for fold, (train_index, valid_index) in enumerate(skf.split(X_train_, y_train_)):
    X_tr = X_train_[train_index]
    y_tr = y_train_.iloc[train_index]
    X_val = X_train_[valid_index]
    y_val = y_train_.iloc[valid_index]
    model = LGBMClassifier(**params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    oof[valid_index] = model.predict_proba(X_val)
    y_pred += model.predict_proba(X_test) / n_splits
# モデルの評価
y_train_pred = model.predict(X_train_)
y_val_pred = model.predict(X_val)
print("Training Accuracy: ", accuracy_score(y_train_, y_train_pred))
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))
# 予測結果の出力
y_pred = np.argmax(y_pred, axis=1)
output = pd.DataFrame({"id": test["id"], "price_range": y_pred})
output.to_csv("./submission/submission_lgbm_v7.csv", index=False, header=False)

   id  battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory   
0   0           1203     0     0.680981         1   1       1          23  \
1   3           1203     1     2.602754         1   0       0           8   
2   4           1980     1     2.604065         1   0       0           6   
3   5           1185     1     2.669403         1   3       0          33   
4   7           1203     1     2.375453         0   4       1          12   

      m_dep  mobile_wt  ...  px_height  px_width   ram  sc_h  sc_w  talk_time   
0  0.402580        117  ...       1331       721  1970    15     1          4  \
1  0.415612        194  ...       1571      1262  1150    14    16         14   
2  0.858110        122  ...        364       721  1970    18    12         11   
3  0.027065        132  ...        447      1162  1950    15     3         19   
4  0.141485        135  ...        364      1917  1970    16     7          6   

   three_g  touch_screen  wifi  price_range  
0   



Training Accuracy:  0.8875
Validation Accuracy:  0.4791666666666667


In [2]:
output

Unnamed: 0,index,price_range
0,1,1
1,2,0
2,6,3
3,10,1
4,12,1
...,...,...
795,1978,3
796,1980,1
797,1982,3
798,1988,2
