02で前処理をしたデータの読み込みとモデルの学習を行うためのnotebookです。  
ここで作成したモデルは **src/models/** フォルダに格納して推論の際に使うようにして下さい。

## 必要なライブラリのimport

In [None]:
# Install the packages used by this notebook into the active kernel environment.
%pip install pandas
%pip install sqlalchemy
%pip install scikit-learn
%pip install imblearn
# matplotlib, seaborn and lightgbm are imported in the next cell; these installs are
# disabled, presumably because the packages are already present -- uncomment if not.
# %pip install matplotlib
# %pip install seaborn
# %pip install lightgbm

In [34]:
import warnings
import time
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Silence FutureWarning messages and let pandas display up to 500 columns.
warnings.simplefilter('ignore', FutureWarning)
pd.options.display.max_columns = 500

In [35]:
# Load the preprocessed training and test tables.
train = pd.read_csv('../data/processed/processed20240626_2_train.csv')
test = pd.read_csv('../data/processed/processed20240626_2_test.csv')
# Split the objective variable off; `train` keeps only the remaining columns.
target = train.pop('target')

In [None]:
# LightGBM training parameters (regression objective, evaluated with RMSE).
param = dict(
    num_leaves=111,
    min_data_in_leaf=149,
    objective='regression',
    max_depth=9,
    learning_rate=0.005,
    boosting='gbdt',
    feature_fraction=0.7522,
    bagging_freq=1,
    bagging_fraction=0.7083,
    bagging_seed=11,
    metric='rmse',
    lambda_l1=0.2634,
    random_state=133,
    verbosity=-1,
)

In [None]:
# Feature metadata: every column of `train` except the two listed below is a model input.
non_feature_cols = ('card_id', 'first_active_month')
features = [col for col in train.columns if col not in non_feature_cols]
# Columns passed to LightGBM as categorical features.
categorical_feats = ['feature_2', 'feature_3']

In [None]:
import os

# Train one LightGBM model per fold of a shuffled 5-fold split, collect
# out-of-fold predictions and per-fold feature importances, and pickle each
# fold's model for later use at inference time.
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))          # out-of-fold predictions, filled fold by fold
predictions = np.zeros(len(test))   # placeholder for test predictions; not populated in this cell
start = time.time()                 # wall-clock start; not read in this cell
num_round = 10000                   # upper bound on boosting rounds; early stopping usually ends sooner

# Output directory for the trained models. It is relative to the notebook's
# working directory, like the '../data/processed/' input paths, so the
# notebook does not depend on one user's absolute path.
# NOTE(review): assumes the notebook runs from a sibling folder of `src/` -- confirm.
directory = '../src/models/'
os.makedirs(directory, exist_ok=True)

# Per-fold importance tables; concatenated once after the loop.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats
                          )
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_feats
                          )

    # Stop when the validation metric has not improved for 200 rounds;
    # log the metric every 100 rounds.
    clf = lgb.train(params=param,
                    train_set=trn_data,
                    num_boost_round=num_round,
                    valid_sets=[val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(100)])

    # Predict the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    })
    fold_importances.append(fold_importance_df)

    # Persist this fold's model as model_fold_<n>.pkl.
    model_file = os.path.join(directory, f'model_fold_{fold_}.pkl')
    with open(model_file, 'wb') as f:
        pickle.dump(clf, f)

feature_importance_df = pd.concat(fold_importances, axis=0)

# RMSE of the out-of-fold predictions over the whole training set.
print("CV score: {:<8.5f}".format(mean_squared_error(target, oof)**0.5))

In [None]:
# Sanity check: confirm the model output directory exists and list its contents.
directory_found = os.path.exists(directory)
print(f"Directory exists: {directory_found}")
saved_files = os.listdir(directory)
print(f"Files in directory: {saved_files}")

In [None]:
# Display the feature-importance table as the cell output.
feature_importance_df

In [None]:
# Visualise feature importance averaged over the folds.

# Feature names ranked by mean importance across folds (at most the top 1000).
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

# Keep the per-fold rows of the selected features; barplot aggregates them
# per feature when drawing.
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,50))
# Pass the mean-importance ranking as the explicit bar order. Sorting the
# per-fold rows instead would order bars by each feature's first appearance,
# which does not always match the averaged values being plotted.
sns.barplot(x="importance", y="feature", data=best_features, order=cols.tolist())

plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# plt.savefig('lgbm_importances.png')


In [None]:
# Debugging aid: show the type of `cols` as the cell output.
type(cols)

In [33]:
# Rank features by their mean importance (1 = most important) and export
# the ranking to CSV.
df_rank = (
    feature_importance_df[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
    .reset_index()
    .assign(rank=lambda ranked: ranked.index + 1)
)
df_rank.to_csv('importance_rank.csv', index=False)

In [None]:
# Demonstration of a ranked feature-importance bar chart using dummy data.
# Relies on the np / pd / sns / plt imports made at the top of the notebook.
#
# NOTE: this cell rebinds `features` and `feature_importance_df` to dummy
# values. Re-run the feature and training cells before using the real ones again.

# Build a dummy frame with random values standing in for importances.
np.random.seed(42)
features = ['feature_{}'.format(i) for i in range(1, 11)]
importance_values = np.random.rand(10)
feature_importance_df = pd.DataFrame({'feature': features, 'importance': importance_values})

# Sort by importance and assign ranks (1 = most important).
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
feature_importance_df['rank'] = range(1, len(feature_importance_df) + 1)

# Prepare the plot.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance_df, palette='viridis')

# Write each rank next to its bar. Bars sit at positions 0..n-1 in row order,
# so the y coordinate must be the row position. The DataFrame index label
# still holds the pre-sort order and would put labels beside the wrong bars.
for position, (importance, rank) in enumerate(
        zip(feature_importance_df['importance'], feature_importance_df['rank'])):
    plt.text(importance + 0.005, position, f'{rank}', va='center')

plt.title('Feature Importance Ranking')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

