## SONY ビールの離脱予測シーケンスモデル


## 課題内容

## データ
- デモグラフィック情報
    - 性別
    - 年代
    - 都道府県
    - 年収
    - 子供の有無
    - 結婚の有無
- シーケンス情報
    - シーケンスパターンのマルチホットベクトル



## コード内容

### verの変更点

### 0. 環境構築

In [None]:
# ====================================================
# Library
# ====================================================
from pathlib import Path
import os
import random
from tqdm import tqdm
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import datetime
import torch
from sklearn.decomposition import PCA

# Let pandas display up to 100 columns and 100 rows before truncating output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)

In [None]:

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings: run identity, I/O directories, CV and LightGBM options."""

    # --- Run identity ---
    VER = 1.0
    exp = 1.0
    AUTHOR = 'SHIMIZU'

    # --- Directories (relative to the notebook's working directory) ---
    INPUT_DATA_PATH = Path("../input/")
    OOF_DATA_PATH = Path("../oof/")
    MODEL_DATA_PATH = Path("../models")
    OUTPUT_DATA_PATH = Path('../output')

    # --- Experiment setup ---
    METHOD_LIST = ['lightgbm']
    seed = 42
    n_folds = 3
    target_col = "is_churn"
    metric = "auc"
    metric_maximize_flag = True

    # --- Boosting schedule / logging ---
    num_boost_round = 100
    early_stopping_round = 50
    verbose = 25

    # --- LightGBM parameters for the binary classification task ---
    # `seed` resolves to the class attribute defined above.
    classification_lgb_params = dict(
        objective='binary',
        metric='auc',
        boosting='gbdt',
        learning_rate=0.05,
        is_unbalance=True,
        seed=seed,
        verbose=-1,
    )

    # --- Per-method blending weights ---
    model_weight_dict = {'lightgbm': 1.0}

In [None]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed: int) -> None:
    """Seed the global random generators of ``random`` and NumPy.

    ``PYTHONHASHSEED`` is also written to the environment. That variable is
    read at interpreter start-up, so it only affects processes launched
    afterwards, not the hashing of the already-running interpreter.

    Args:
        seed: Seed value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Seed the random generators once, up front, with the seed from the config.
seed_everything(CFG.seed)

In [None]:
# ====================================================
# LightGBM Metric
# ====================================================
def lgb_rmsle(y_pred, data):
    """Custom LightGBM evaluation metric: root mean squared logarithmic error.

    Args:
        y_pred: Predicted values, one per row of ``data``.
        data: Dataset-like object whose ``get_label()`` returns the true
            target values (e.g. a ``lightgbm.Dataset``).

    Returns:
        tuple: ``('rmsle', score, False)``. The last element is the
        "higher is better" flag; RMSLE is an error, so lower is better.

    Raises:
        ValueError: If any true target is negative, because the logarithmic
            error is undefined for negative values.
    """
    y_true = np.asarray(data.get_label(), dtype=float)
    # Clip predictions at zero so that log1p is always defined for them.
    y_pred = np.maximum(0, np.asarray(y_pred, dtype=float))

    if np.any(y_true < 0):
        raise ValueError('RMSLE cannot be used when targets contain negative values.')

    # Computed directly with NumPy: sqrt(mean((log(1 + pred) - log(1 + true))^2)).
    squared_log_error = (np.log1p(y_pred) - np.log1p(y_true)) ** 2
    rmsle = float(np.sqrt(np.mean(squared_log_error)))
    return 'rmsle', rmsle, False


In [None]:
# ====================================================
# Load Dataset
# ====================================================
# Load the user sequence table (presumably seq2pat output, judging by the file name).
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# will not run elsewhere; consider building the path from CFG.INPUT_DATA_PATH.
df = pd.read_csv("/Users/hayatashimizu/Desktop/DSnotebook/seq2pat/seq2pat_user_sequences.csv")

In [None]:
# Placeholder target: every row gets a random 0/1 label.
# The column name comes from CFG.target_col so it stays in sync with the
# split, training and evaluation code that reads the same setting.
# NOTE(review): with random labels the model cannot learn anything meaningful;
# replace this with the real churn label before interpreting any score.
df[CFG.target_col] = [random.randint(0, 1) for _ in range(len(df))]

### 2. Train / Test の分割

In [None]:
# Hold out a stratified test set: only the first fold of a 5-fold stratified
# split is used, so roughly one fifth of the rows become the test set and the
# split is stratified on the target column.
skf_test = StratifiedKFold(n_splits = 5, shuffle = True, random_state = CFG.seed)

train_val_idx, test_idx = next(skf_test.split(df, y = df[CFG.target_col]))
# The splitter yields positional indices, hence .iloc.
df_train_val = df.iloc[train_val_idx]
df_test = df.iloc[test_idx]
# Report the sizes of both parts and the positive rate of the held-out set.
print(f"Train/Val: {len(df_train_val)} records, Test: {len(df_test)} records")
print(f"Test Churn Rate: {df_test[CFG.target_col].mean():.2%}")

### 3. 特徴量の作成

In [None]:
def Preprocessing(train_df: pd.DataFrame) -> pd.DataFrame:
    """Return an independent copy of the input frame.

    No transformation is applied yet; the copy means later in-place edits do
    not touch the caller's frame.

    Args:
        train_df: Frame to preprocess.

    Returns:
        A copy of ``train_df``.
    """
    return train_df.copy()

# Work on copies so that the frames produced by the split stay untouched.
train = Preprocessing(df_train_val)
test = Preprocessing(df_test)

In [None]:
# Columns treated as categorical (label-encoded and used as group-by keys).
categorical_features = ['gender', 'age', 'prefecture', 'income', 'has_child', 'is_married']
# Numeric columns that are aggregated per category.
numerical_features = ['total_cost']

# Initial feature list; note that `features` is reassigned later in this
# notebook from the columns of the training frame.
features = categorical_features + numerical_features

In [None]:
# ========================================
# Encoding features
# ========================================
# Fitted encoder per categorical column, kept for later reuse.
le_dict = {}
for categorical_feature in tqdm(categorical_features):
    le = LabelEncoder()
    # Fit on the values seen in train AND test so that transform() never
    # meets a label it has not been fitted on.
    all_values = list(train[categorical_feature].unique()) + list(test[categorical_feature].unique())
    le.fit(all_values)
    # Replace the raw categories with their integer codes in both frames.
    train[categorical_feature] = le.transform(train[categorical_feature])
    test[categorical_feature] = le.transform(test[categorical_feature])
    le_dict[categorical_feature] = le

In [None]:
def add_aggrigation_feats(train, test, num_cols, cat_cols, agg_cols=None):
    """Add per-category aggregate statistics of numeric columns as features.

    For every categorical column, the numeric columns of ``train`` are grouped
    by that column and aggregated; the resulting statistics are left-joined
    onto both ``train`` and ``test``. The statistics always come from
    ``train`` only. New columns are named ``{cat}_{num}_{agg}``.

    Args:
        train: Training frame; source of the statistics.
        test: Test frame; receives the statistics computed on ``train``.
        num_cols: Numeric column name, or list of names, to aggregate.
        cat_cols: Categorical column name, or list of names, to group by.
        agg_cols: Aggregation name, or list of names. Defaults to
            ``['min', 'max', 'mean', 'std']``.

    Returns:
        tuple: ``(train, test)`` as new frames with the aggregate columns
        appended. The inputs are not modified.
    """
    # A bare string would otherwise be iterated character by character
    # (cat_cols) or produce single-level columns that get joined per
    # character (num_cols / agg_cols), so normalise everything to lists.
    num_cols = [num_cols] if isinstance(num_cols, str) else list(num_cols)
    cat_cols = [cat_cols] if isinstance(cat_cols, str) else list(cat_cols)
    if agg_cols is None:
        # Created per call instead of using a shared mutable default argument.
        agg_cols = ['min', 'max', 'mean', 'std']
    elif isinstance(agg_cols, str):
        agg_cols = [agg_cols]
    else:
        agg_cols = list(agg_cols)

    for col in cat_cols:
        grp_df = train.groupby(col)[num_cols].agg(agg_cols)
        # Flatten the (numeric column, aggregation) MultiIndex into plain names.
        grp_df.columns = [f'{col}_' + '_'.join(c) for c in grp_df.columns]
        train = train.merge(grp_df, on=col, how='left')
        test = test.merge(grp_df, on=col, how='left')

    return train, test
# Append the per-category statistics of the numeric features to both frames.
train, test = add_aggrigation_feats(train, test, numerical_features, categorical_features)

### 4. トレーニング

In [None]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.Series, x_valid: pd.DataFrame, y_valid: pd.Series,
                      categorical_features: list):
    """Train one LightGBM model with early stopping and predict the validation rows.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features.
        y_valid: Validation target.
        categorical_features: Names of the columns LightGBM should treat as
            categorical.

    Returns:
        tuple: ``(model, valid_pred)`` - the trained booster and its
        predictions for ``x_valid``.
    """
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                # NOTE(review): CFG.num_boost_round (100) is not used here; the
                # value is left at 500 so the trained models do not change.
                # Decide which of the two is intended.
                num_boost_round = 500,
                valid_sets = [lgb_train, lgb_valid],
                # `verbose` is a boolean flag; the previous -1 was truthy and
                # therefore enabled the early-stopping log messages.
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=False)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame,
                                        features: list, categorical_features: list):
    """Train ``method`` with stratified K-fold CV and save models and OOF predictions.

    For each of ``CFG.n_folds`` folds a model is trained, pickled under
    ``CFG.MODEL_DATA_PATH`` and used to predict the fold's validation rows.
    The out-of-fold predictions are scored with average precision (PR-AUC)
    and written as CSV under ``CFG.OOF_DATA_PATH``.

    Args:
        method: Model family to train; only ``'lightgbm'`` is supported.
        train_df: Frame holding the feature columns and ``CFG.target_col``.
        features: Feature column names.
        categorical_features: Names of the categorical feature columns.

    Returns:
        pd.DataFrame: Target, out-of-fold prediction and 1-based fold number
        for every row of ``train_df``.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    if method != 'lightgbm':
        # Fail before any work is done instead of hitting an undefined
        # `model` inside the fold loop.
        raise ValueError(f"Unsupported method: {method!r} (expected 'lightgbm')")

    # Make sure the output directories exist before anything is written.
    CFG.MODEL_DATA_PATH.mkdir(parents=True, exist_ok=True)
    CFG.OOF_DATA_PATH.mkdir(parents=True, exist_ok=True)

    # Out-of-fold predictions and the fold each row was validated in.
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df), dtype=int)
    skf = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    y_true = train_df[CFG.target_col].values
    for fold, (train_index, valid_index) in enumerate(skf.split(train_df[features], train_df[CFG.target_col])):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)

        # Save the fold model; the context manager closes the file handle.
        model_path = CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        # Store this fold's predictions and fold number (1-based, matching
        # the model file name).
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute the out-of-fold metric (average precision, i.e. PR-AUC).
    score = average_precision_score(y_true, oof_predictions)

    print(f'{method} out of folds CV PR-AUC score is {score}')

    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col],
                           f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(CFG.OOF_DATA_PATH /
                  f'oof_{method}_seed{CFG.seed}_{CFG.AUTHOR}_ver{CFG.VER}.csv', index = False)
    return oof_df

def Learning(train_feats):
    """Run cross-validated training for every method in ``CFG.METHOD_LIST``.

    Args:
        train_feats: Feature column names, forwarded as ``features``.

    Note:
        The training frame and the categorical feature list are not
        parameters; they are read from the module-level ``train`` and
        ``categorical_features``.
    """
    for method in CFG.METHOD_LIST:
        print(method)
        gradient_boosting_model_cv_training(method, train, train_feats, categorical_features)

In [None]:
# Use every column of the training frame except the target as a feature.
# NOTE(review): this also picks up any other columns present in the loaded
# CSV (identifiers, raw text, ...); confirm that all of them are valid model inputs.
features = train.columns.to_list()
features.remove(CFG.target_col)

In [None]:
# Drop aggregate features whose importance turned out to be zero.
# TODO: the drop list is hard-coded (currently empty); derive it automatically.
drop_features = set()
# Filter in order instead of using a set difference: the iteration order of a
# set of strings depends on string hashing, which is randomised per
# interpreter start, so the feature order could change between runs.
# dict.fromkeys keeps the de-duplication the set conversion used to provide.
features = [feature for feature in dict.fromkeys(features) if feature not in drop_features]

In [None]:
# Train all configured methods with cross-validation on the selected features.
Learning(features)

### 5. 推論

In [None]:
def lightgbm_inference(x_test: pd.DataFrame):
    """Predict with every saved LightGBM fold model and average the results.

    Args:
        x_test: Feature frame to predict on.

    Returns:
        np.ndarray: Mean prediction over the ``CFG.n_folds`` fold models,
        one value per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code - only load model files
        # produced by this pipeline. The context manager closes the handle.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds
def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame,
                                      features: list, categorical_features: list):
    """Run inference for one model family on the selected feature columns.

    Args:
        method: Model family; only ``'lightgbm'`` is supported.
        test_df: Frame containing the feature columns.
        features: Feature column names to feed to the model.
        categorical_features: Unused here; kept so the signature stays the same.

    Returns:
        Predictions for the rows of ``test_df``.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    x_test = test_df[features]
    if method == 'lightgbm':
        return lightgbm_inference(x_test)
    # Previously an unknown method fell through to an unbound `test_pred`.
    raise ValueError(f"Unsupported method: {method!r} (expected 'lightgbm')")
def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Add per-method predictions and their weighted blend to a copy of ``input_df``.

    Args:
        input_df: Frame containing the feature columns.
        features: Feature column names passed to the models.
        categorical_features: Categorical feature names, forwarded to inference.

    Returns:
        Copy of ``input_df`` with one ``{method}_pred_prob`` column per method
        in ``CFG.METHOD_LIST`` and a ``pred_prob`` column holding the sum of
        those predictions weighted by ``CFG.model_weight_dict``.
    """
    output_df = input_df.copy()
    # Start the blend at zero and accumulate each method's weighted share.
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        method_column = f'{method}_pred_prob'
        output_df[method_column] = gradient_boosting_model_inference(
            method, input_df, features, categorical_features)
        method_weight = CFG.model_weight_dict[method]
        output_df['pred_prob'] = output_df['pred_prob'] + method_weight * output_df[method_column]
    return output_df

In [None]:
# Score the held-out test set with the blended fold models.
test_df = Predicting(test, features, categorical_features)

In [None]:
# Preview the first rows, including the added prediction columns.
test_df.head()

In [None]:
# Evaluate the blended test predictions with average precision (reported as PR-AUC).
score = average_precision_score(test_df[CFG.target_col],test_df["pred_prob"])
print(f"PR-AUC: {score}")

### 6. 特徴量重要度

In [None]:
# Load the fold-1 model; the context manager closes the file handle, which
# the bare open() call used to leak.
importance_model_path = CFG.MODEL_DATA_PATH / f'lightgbm_fold1_seed{CFG.seed}_ver{CFG.VER}.pkl'
with open(importance_model_path, 'rb') as model_file:
    model = pickle.load(model_file)
# One row per feature, using the booster's default importance type
# (presumably split counts - confirm against the LightGBM documentation).
importance_df = pd.DataFrame(model.feature_importance(), index=model.feature_name(), columns=['importance'])
# Normalise so that the importances sum to 1.
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
# Last expression of the cell: displays the features from most to least important.
importance_df.sort_values('importance', ascending=False)

### 7. 特徴量作成後の EDA