In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE




In [7]:
# Load the training and test tables from the local dataset directory.
train_data = pd.read_csv("./dataset/train_Mod.csv")
test_data = pd.read_csv("./dataset/test.csv")

# Outlier removal: keep only rows strictly below the hand-picked cut-offs.
# A NaN in either column compares as False, so such rows are dropped here too.
train_data = train_data[train_data['knee_depth'] < 17.5]
train_data = train_data[train_data['jump_height'] < 80]

# Drop every remaining training row that has a missing value in any column.
train_data = train_data.dropna()

# Features: the five columns at positions 1-5 (column 0 is skipped --
# presumably an identifier column; confirm against the CSV header).
X = train_data.iloc[:, 1:6]
# Fix: test features must be taken from test_data. The original sliced
# train_data here, so "X_test" silently contained training rows.
# The test table is deliberately neither outlier-filtered nor dropna'd.
X_test = test_data.iloc[:, 1:6]
# Target: the last column of the training table.
y = train_data.iloc[:, -1]



In [8]:
# Hold out 20% of the rows for validation. The seed is fixed so that the split,
# and every score derived from it, is reproducible across kernel restarts
# (the original call left random_state unset, giving a different split each run).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Min-max scaling, fitted on the training split only and then applied to the
# validation and test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Degree-3 polynomial expansion (no bias column) of the scaled features,
# likewise fitted on the training split only.
# NOTE(review): the model cells visible in this notebook are fitted on the raw
# X_train / X_val, not on these *_scaled / *_poly matrices -- confirm whether
# they are still needed.
poly = PolynomialFeatures(degree=3, include_bias=False).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)


In [9]:
import lightgbm as lgb

# Hyper-parameter grid for the search: 3 x 3 x 3 = 27 candidate settings.
param = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[4, 5, 6],
    n_estimators=[50, 100, 150],
)

# Base LightGBM classifier, configured for a six-class multiclass objective.
lgbm = lgb.LGBMClassifier(objective='multiclass', num_class=6)

# Exhaustive grid search with 3-fold cross-validation on all available cores,
# fitted on the raw (unscaled) training features.
grid_search = GridSearchCV(estimator=lgbm, param_grid=param, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Keep the estimator that the search refitted with the best parameters.
best_lgbm = grid_search.best_estimator_

# Score that estimator on the held-out validation split
# (this is the validation set, not the test set).
y_pred = best_lgbm.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Valid accuracy: {accuracy}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 809
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 5
[LightGBM] [Info] Start training from score -1.691380
[LightGBM] [Info] Start training from score -1.773297
[LightGBM] [Info] Start training from score -1.823308
[LightGBM] [Info] Start training from score -1.488669
[LightGBM] [Info] Start training from score -1.990362
[LightGBM] [Info] Start training from score -2.102479
Best parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
Best score: 0.772256728778468
Valid accuracy: 0.8099173553719008


In [10]:
# XGBoost: grid search over a one-step pipeline.
from xgboost import XGBClassifier

# Encode the class labels as integers. The encoder is fitted on the training
# labels and then reused for the validation labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# Pipeline with a single step named 'classifier'; the grid keys below address
# that step through the 'classifier__' prefix.
pipeline = Pipeline(steps=[
    ('classifier', XGBClassifier(eval_metric='mlogloss')),
])

# Hyper-parameter grid: 3 values for each of 7 parameters (3**7 = 2187 candidates).
param = {
    f'classifier__{name}': values
    for name, values in {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 4, 5],
        'min_child_weight': [1, 2, 3],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }.items()
}

# 3-fold cross-validated search scored by accuracy, using all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
model.fit(X_train, y_train_encoded)

# Report the best parameter combination and its cross-validation score.
print("the best param is : ", model.best_params_)
print("the best score is : ", model.best_score_)

# Score on the held-out validation split, comparing encoded labels.
y_pred = model.predict(X_val)
print("valid accuracy_score : ", accuracy_score(y_val_encoded, y_pred))

# Keep the refitted best pipeline for the ensemble cell.
best_xgb = model.best_estimator_

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


the best param is :  {'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0.2, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}
the best score is :  0.7929606625258799
valid accuracy_score :  0.8181818181818182


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [11]:
# Ensemble learning: combine the two tuned models.

from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the tuned XGBoost pipeline and the tuned LightGBM
# model, fitted on the integer-encoded training labels.
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('lgb', best_lgbm),
    ],
    voting='soft',
).fit(X_train, y_train_encoded)

# Predict encoded classes for the validation split, then map them back to the
# original labels so they can be compared with y_val directly.
y_pred_encoded = ensemble_model.predict(X_val)
y_pred = le.inverse_transform(y_pred_encoded)

print("valid accuracy_score : ", accuracy_score(y_val, y_pred))

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 809
[LightGBM] [Info] Number of data points in the train set: 483, number of used features: 5
[LightGBM] [Info] Start training from score -1.691380
[LightGBM] [Info] Start training from score -1.773297
[LightGBM] [Info] Start training from score -1.823308
[LightGBM] [Info] Start training from score -1.488669
[LightGBM] [Info] Start training from score -1.990362
[LightGBM] [Info] Start training from score -2.102479
valid accuracy_score :  0.8264462809917356


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [13]:
# Build the submission CSV from the ensemble's predictions on the test table.
# (No evaluation happens in this cell; it only predicts and writes a file.)

# Test features: the same column positions (1-5) used for the training features.
# The original cell also had a bare `X_test` expression here; it displayed
# nothing because it was not the cell's last expression, so it was removed.
X_test = test_data.iloc[:, 1:6]

# The ensemble was fitted on encoded labels, so its predictions are encoded too.
y_pred = ensemble_model.predict(X_test)

# Decode the predictions back to the original label values.
y_pred = le.inverse_transform(y_pred)

# Write the submission file: one row per test id with its predicted label.
submit = pd.DataFrame({'id': test_data['id'], 'personal_id': y_pred})
submit.to_csv('submit9.csv', index=False)



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
