# exp064_th_opt

questionごとに閾値(th)を個別に調整し、CVスコア(macro F1)を最適化する

In [12]:
import os
import sys
import traceback
import gc
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import warnings
warnings.simplefilter('ignore')

In [13]:
@dataclass
class Cfg:
    """Settings for this experiment.

    Every attribute carries a type annotation so that ``@dataclass`` treats it
    as a real field: values can be overridden per instance via keyword
    arguments (e.g. ``Cfg(seed=0)``), and ``repr``/``==`` reflect the values.
    ``Cfg()`` with no arguments yields the same defaults as before.
    """

    # Run mode: "local_cv" or "kaggle_inf".
    mode: str = "local_cv"
    # Experiment name; used as the sub-directory name under output_dir.
    exp_name: str = "exp064"
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"
    seed: int = 42
    n_splits: int = 5
    # Fill in with the result obtained from local_cv.
    best_threshold: float = 0.630
# Single configuration instance used by the cells that follow.
cfg = Cfg()

if cfg.mode == "local_cv":
    # Ensure the experiment's output directory and its "cache" sub-directory
    # exist before anything is written there.
    exp_output_dir = os.path.join(cfg.output_dir, cfg.exp_name)
    for required_dir in (exp_output_dir, os.path.join(exp_output_dir, "cache")):
        os.makedirs(required_dir, exist_ok=True)
elif cfg.mode == "kaggle_inf":
    # Only imported in "kaggle_inf" mode.
    import jo_wilder_310

In [14]:
# Load this experiment's out-of-fold predictions. The path is built with
# os.path.join (as for the output directories above) so it stays valid whether
# or not cfg.output_dir ends with a trailing slash.
oof = pd.read_csv(os.path.join(cfg.output_dir, cfg.exp_name, "oof.csv.gz"))

In [15]:
import optuna

# Questions are numbered 1..N_QUESTIONS; question q is tuned via parameter "th<q>".
N_QUESTIONS = 18
# Search range shared by every per-question threshold.
TH_LOW, TH_HIGH = 0.4, 0.91


def objective(trial) -> float:
    """Score one candidate set of per-question thresholds on the OOF table.

    Suggests one threshold per question (parameters ``th1`` .. ``th18``, in
    that order), binarises ``oof["pred"]`` with the threshold belonging to
    each row's ``question``, and scores the labels against ``oof["correct"]``.

    Args:
        trial: Optuna trial used to sample the thresholds.

    Returns:
        Macro-averaged F1 over all rows of ``oof``.
    """
    thresholds = {
        question: trial.suggest_float(f"th{question}", TH_LOW, TH_HIGH)
        for question in range(1, N_QUESTIONS + 1)
    }

    # Look up each row's threshold once and compare in a single vectorised
    # step; this avoids copying the whole frame and re-scanning the question
    # column for every question on every trial.
    # A question outside 1..N_QUESTIONS maps to NaN, and `pred > NaN` is False,
    # so such rows are labelled 0.
    row_threshold = oof["question"].map(thresholds)
    pred_label = (oof["pred"] > row_threshold).astype(int)

    return f1_score(oof["correct"].values, pred_label.values, average="macro")
 
# Seed the sampler with cfg.seed so the threshold search is reproducible
# between runs; an unseeded sampler yields different thresholds each time.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=cfg.seed),
)
study.optimize(objective, n_trials=10000)

[32m[I 2023-05-30 15:07:45,534][0m A new study created in memory with name: no-name-61d820fa-e51e-483c-b599-2af2e474b25a[0m
[32m[I 2023-05-30 15:07:45,571][0m Trial 0 finished with value: 0.654407538168797 and parameters: {'th1': 0.8607488572875832, 'th2': 0.8107108125409008, 'th3': 0.9025121365105783, 'th4': 0.45609891997604374, 'th5': 0.5028970256387999, 'th6': 0.8607174163109461, 'th7': 0.48160585383205773, 'th8': 0.42053068638096985, 'th9': 0.428480477282191, 'th10': 0.8505084426341141, 'th11': 0.7655687965528131, 'th12': 0.7314763614638307, 'th13': 0.8058201410263361, 'th14': 0.6936351181786149, 'th15': 0.679536311823281, 'th16': 0.43899706490313767, 'th17': 0.8539072600527277, 'th18': 0.9098165162040066}. Best is trial 0 with value: 0.654407538168797.[0m
[32m[I 2023-05-30 15:07:45,603][0m Trial 1 finished with value: 0.6731816869944117 and parameters: {'th1': 0.6056778715749964, 'th2': 0.8494151064518185, 'th3': 0.4722880840894431, 'th4': 0.6133298218460799, 'th5': 0.8192

In [16]:
# Report the best objective value found and the thresholds that produced it.
best_trial = study.best_trial
print(best_trial.value)
print(best_trial.params)

0.7009483945128617
{'th1': 0.6369097213054733, 'th2': 0.5138287991120808, 'th3': 0.551275039877425, 'th4': 0.6334532078723507, 'th5': 0.5901541071850643, 'th6': 0.5899586901066654, 'th7': 0.6900838023005419, 'th8': 0.6660158448349756, 'th9': 0.5485405243397861, 'th10': 0.6643148039778702, 'th11': 0.6156735296909601, 'th12': 0.575236853666501, 'th13': 0.7488970791713223, 'th14': 0.6604480560317616, 'th15': 0.6252079728192952, 'th16': 0.5590918700451828, 'th17': 0.6196069280767185, 'th18': 0.6148444013220833}
