In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# 前処理

In [6]:
#データセットの読み込み
TRAIN_DATA_PATH: str = "dataset/train_Mod.csv"
TEST_DATA_PATH: str = "dataset/test.csv"
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

train_data.head(5)

Unnamed: 0,id,knee_depth,hip_depth,misalignment,width_of_hand,jump_height,personal_id
0,0,4.114675,20.081082,13.244238,-14.312221,44.388231,A
1,1,2.194313,16.056999,3.360566,-13.784517,45.88517,A
2,2,6.526472,15.633804,3.355818,-0.642056,41.962244,A
3,3,2.30796,17.281763,13.097795,-6.454952,45.614645,A
4,4,3.076618,16.892115,10.72325,-6.547873,44.228786,A


In [7]:
#欠損値の確認
train_data.isnull().sum()

id               0
knee_depth       2
hip_depth        4
misalignment     0
width_of_hand    0
jump_height      0
personal_id      0
dtype: int64

## XGBoost

In [8]:
#とりあえず、一番精度がでたモデル
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# 外れ値の除去
df_ = train_data[train_data['knee_depth'] < 17.5]
df_ = df_[df_['jump_height'] < 80]
df_dropped = df_.dropna()
train_data, valid_data = train_test_split(df_dropped, test_size=0.2, random_state=0)
X_train = train_data.drop("personal_id", axis=1)
y_train = train_data["personal_id"]
X_valid = valid_data.drop("personal_id", axis=1)
y_valid = valid_data["personal_id"]


param = {
    'classifier__colsample_bytree': [1.0], 
    'classifier__gamma': [0.2], 
    'classifier__learning_rate': [0.3], 
    'classifier__max_depth': [5], 
    'classifier__min_child_weight': [1], 
    'classifier__n_estimators': [50], 
    'classifier__subsample': [0.9]}


pipeline = Pipeline([
    ('classifier', XGBClassifier(eval_metric='mlogloss'))  # ステップ3: XGBoost
])

#yのラベリング
train_y = train_data["personal_id"]
valid_y = valid_data["personal_id"]
# LabelEncoderのインスタンスを作成
label_encoder = LabelEncoder()
# クラスラベルを数値に変換
y_train_encoded = label_encoder.fit_transform(train_y)
y_valid_encoded = label_encoder.fit_transform(valid_y)


model = GridSearchCV(pipeline, param, cv=5, scoring='accuracy', n_jobs=-1)
model.fit(X_train, y_train_encoded)

#最適なパラメータを表示
print("the best param is : ", model.best_params_)
print("the best score is : ", model.best_score_)
#検証データで評価
y_pred = model.predict(X_valid)
print("valid accuracy_score : ", accuracy_score(y_valid_encoded, y_pred))

best_xgb = model.best_estimator_

the best param is :  {'classifier__colsample_bytree': 1.0, 'classifier__gamma': 0.2, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 50, 'classifier__subsample': 0.9}
the best score is :  0.9710910652920962
valid accuracy_score :  0.9669421487603306


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


## LightGBM

In [9]:
import lightgbm as lgb

# 外れ値の除去
df_ = train_data[train_data['knee_depth'] < 17.5]
df_ = df_[df_['jump_height'] < 80]
df_dropped = df_.dropna()
train_data, valid_data = train_test_split(df_dropped, test_size=0.2, random_state=0)
X_train = train_data.drop("personal_id", axis=1)
y_train = train_data["personal_id"]
X_valid = valid_data.drop("personal_id", axis=1)
y_valid = valid_data["personal_id"]


# LightGBMのモデルを作成
lgbm = lgb.LGBMClassifier(objective='multiclass', num_class=6)

# GridSearchで探索するパラメータの設定
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

# GridSearchCVの設定
grid_search = GridSearchCV(lgbm, param_grid, cv=3)

# ハイパーパラメータ探索
grid_search.fit(X_train, y_train)

# 最適なパラメータとその時のスコアを表示
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# 最適なパラメータでモデルを訓練
best_lgbm = grid_search.best_estimator_

# テストデータで評価
y_pred = best_lgbm.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Valid accuracy: {accuracy}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 257, number of used features: 6
[LightGBM] [Info] Start training from score -1.742414
[LightGBM] [Info] Start training from score -1.787876
[LightGBM] [Info] Start training from score -1.787876
[LightGBM] [Info] Start training from score -1.454732
[LightGBM] [Info] Start training from score -2.083340
[LightGBM] [Info] Start training from score -2.022716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 517
[LightGBM] [Info] Number of data points in the train set: 257, number of used features: 6
[LightGBM] [Info] Start training from scor

# アンサンブル学習

In [10]:
from sklearn.ensemble import VotingClassifier

# 外れ値の除去
df_ = train_data[train_data['knee_depth'] < 17.5]
df_ = df_[df_['jump_height'] < 80]
df_dropped = df_.dropna()
train_data, valid_data = train_test_split(df_dropped, test_size=0.2, random_state=0)
X_train = train_data.drop("personal_id", axis=1)
y_train = train_data["personal_id"]
X_valid = valid_data.drop("personal_id", axis=1)
y_valid = valid_data["personal_id"]

# アンサンブルモデルの設定（Voting Classifier）
ensemble_model = VotingClassifier(estimators=[('xgb', best_xgb), ('lgb', best_lgbm)], voting='soft')
# モデルの訓練
ensemble_model.fit(X_train, y_train)

# テストデータで評価
y_pred = ensemble_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Valid accuracy: {accuracy}")


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 619
[LightGBM] [Info] Number of data points in the train set: 308, number of used features: 6
[LightGBM] [Info] Start training from score -1.798274
[LightGBM] [Info] Start training from score -1.669657
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.426035
[LightGBM] [Info] Start training from score -1.968900
[LightGBM] [Info] Start training from score -2.092514
Valid accuracy: 0.9230769230769231


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [11]:
#テストデータで評価
X_test = test_data.drop("personal_id", axis=1)

y_test_pred = ensemble_model.predict(X_test)

#提出用ファイルの作成
test_data.loc[:, 'personal_id'] = y_test_pred
#特徴量の列を削除してcsv化
columns_to_save = ['id', 'personal_id']
test_data.to_csv('submission.csv', columns=columns_to_save, index=True)




  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [12]:
# 岡野

# 外れ値を削除
df_ = df[df['knee_depth'] < 17.5]
df_ = df_[df_['jump_height'] < 80]
df_dropped = df_.dropna()

NameError: name 'df' is not defined