# exp030  
[Notion](https://www.notion.so/exp030-8569295ea6ef417b9aebcf6d903c4e13?pvs=4)  
scikit-learnを用いた評価指標に対する予測値の最適化  
exp029のraw predictionからvalidationに対して最適化  

In [1]:
import os
import random
import sys
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from typing import Tuple, Any, Dict
%matplotlib inline
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.multioutput import MultiOutputRegressor

# Move to the top of the repository so that project-relative paths and the
# `src` package imported just below resolve.
# Walk up one directory at a time until the working directory is named 'rsna-2023'.
while os.path.basename(os.getcwd()) != 'rsna-2023':
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    # At a filesystem root the parent is the directory itself ('/' on POSIX,
    # a drive root on Windows); stop there instead of looping forever.
    if parent_dir == current_dir:
        raise Exception('Could not find project root directory.')
    os.chdir(parent_dir)
from src.classification.dataset import load_df
from src.metrics import score, create_training_solution, normalize_probabilities_to_one

In [2]:
class CFG_INF:
    """Inference/evaluation settings.

    The label CSV and the series-meta CSV are read with pandas while the
    class body executes, so defining this class requires the files under
    ``base_dir`` to exist.
    """
    exp_name = 'exp_029'
    # 'train' when evaluating, 'test' when making a submission
    phase = 'train'
    base_dir = 'data/rsna-2023-abdominal-trauma-detection'
    image_dir = f'data/rsna-2023-abdominal-trauma-detection/{phase}_images'
    # The dataframe is held on this config so that it follows `phase`.
    # NOTE(review): for any phase other than 'train'/'test', `df` is never assigned.
    if phase == 'train':
        df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
    elif phase == 'test':
        df = pd.read_csv(os.path.join(base_dir, 'sample_submission.csv'))
    df_series_meta = pd.read_csv(os.path.join(base_dir, f'{phase}_series_meta.csv'))
    image_size = (512, 512)
    # Added because the sample submission can contain series with extremely few slices.
    min_slices = 10
    # Because of the inference time limit
    max_slices = 500
    max_series = 2
    model_save_dir = "outputs"
    seg_model_mode = 'final'
    lsk_model_mode = 'final'
    be_model_mode = 'final'
    # other config
    seed = 42

class CFG_LSK:
    """Settings of experiment exp_024, passed to ``load_df`` in this notebook.

    NOTE(review): "LSK" presumably stands for liver/spleen/kidney - confirm
    against the classification package.
    """
    # experiment identity
    exp_name = "exp_024"
    # fold setup
    n_fold, train_folds = 6, 1
    include_evaluation = False
    # locations
    image_dir = 'data/dataset002'
    model_save_dir = 'outputs'


In [3]:
# Organ lookup tables (for SEG and LSK models): index -> name and name -> index.
organ_index_dict_inv = dict(enumerate(('liver', 'spleen', 'kidney', 'bowel')))
organ_index_dict = {
    organ_name: organ_index
    for organ_index, organ_name in organ_index_dict_inv.items()
}

# Label lookup table (for BE models): index -> name.
label_index_dict_inv = dict(enumerate(('bowel', 'extravasation')))

# Table returned by load_df for the CFG_LSK settings; its "fold" and
# "patient_id" columns drive the split below.
df_solid_organ = load_df(CFG_LSK)
in_fold0 = df_solid_organ["fold"] == 0
not_fold0 = df_solid_organ["fold"] != 0

# Patient ids belonging to fold 0
pids = df_solid_organ.loc[in_fold0, "patient_id"].unique()

# Full label table, split by patient: fold 0 is validation, every other fold is training.
df_all = pd.read_csv(os.path.join(CFG_INF.base_dir, 'train.csv'))
train_pids = df_solid_organ.loc[not_fold0, "patient_id"].unique()
valid_pids = df_solid_organ.loc[in_fold0, "patient_id"].unique()
df_train = df_all.loc[df_all["patient_id"].isin(train_pids)].reset_index(drop=True)
df_valid = df_all.loc[df_all["patient_id"].isin(valid_pids)].reset_index(drop=True)

In [4]:
import pickle
# Location of the pickled `results`: <model_save_dir>/<exp_name>/results.pkl.
# The directory is created if missing (left over from the save step below).
dir_ = os.path.join(CFG_INF.model_save_dir, CFG_INF.exp_name)
os.makedirs(dir_, exist_ok=True)
path = os.path.join(dir_, "results.pkl")
# Save step, disabled in this notebook:
# with open(path, 'wb') as f:
#     pickle.dump(results, f)

# Load results
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a results.pkl that was produced by a trusted run.
with open(path, 'rb') as f:
   results = pickle.load(f)

In [5]:
# Column order follows the CSV held by CFG_INF.df, leaving out "any_injury".
order = [column for column in CFG_INF.df.columns.tolist() if column != "any_injury"]
# Arrange the loaded results as a table with exactly those columns, in that order.
submission = pd.DataFrame(results)[order]

In [6]:
submission

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high
0,10007,0.816518,0.183482,0.947678,0.052322,0.951286,0.029996,0.012964,0.905826,0.060036,0.033985,0.941805,0.036837,0.016900
1,10205,0.779929,0.220071,0.946795,0.053205,0.872198,0.078430,0.053869,0.919182,0.054013,0.024664,0.928935,0.046155,0.023084
2,10275,0.956565,0.043435,0.965257,0.034743,0.953950,0.034398,0.011485,0.944327,0.085028,0.038267,0.970852,0.041635,0.021986
3,10430,0.956799,0.043201,0.844909,0.155091,0.864793,0.135347,0.130747,0.858291,0.108673,0.066383,0.894300,0.077345,0.039788
4,10494,0.816477,0.183523,0.916104,0.083896,0.927674,0.054439,0.019471,0.942114,0.037052,0.006877,0.957192,0.028234,0.008788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,9537,0.917280,0.082720,0.993776,0.006224,0.965861,0.021696,0.007533,0.935847,0.050291,0.015325,0.923654,0.049562,0.024757
521,96,0.877808,0.122192,0.984195,0.015805,0.918090,0.050579,0.026869,0.916685,0.057559,0.027281,0.895422,0.067554,0.040966
522,9620,0.929224,0.070776,0.950181,0.049819,0.947871,0.034871,0.012524,0.854836,0.099105,0.050291,0.883313,0.073148,0.045738
523,9835,0.951448,0.048552,0.971621,0.028379,0.951497,0.034846,0.011203,0.928395,0.076779,0.034023,0.942346,0.063164,0.034978


In [18]:
# Every column of `submission` except the id is used as a model input, and the
# same names select the target columns.
use_train_columns = list(submission.columns)
use_train_columns.remove("patient_id")
y_columns = list(use_train_columns)

# LightGBM parameters.
# NOTE(review): 'binary' objective is used with lgb.LGBMRegressor in the
# training cell, and 'gpu' requires a GPU-enabled LightGBM build.
params = dict(
    objective='binary',
    bagging_seed=CFG_INF.seed,  # fix the random seed
    random_state=CFG_INF.seed,  # fix the random seed
    boosting_type='gbdt',
    learning_rate=0.001,
    verbosity=1,
    extra_trees=True,
    device='gpu',
)

In [19]:
# Weight assigned to each prediction column.
class_weight = {
    "bowel_healthy": 1.0,
    "bowel_injury": 2.0,
    "extravasation_healthy": 1.0,
    "extravasation_injury": 6.0,
}
# The three solid organs share one healthy/low/high weighting.
for _organ in ("liver", "spleen", "kidney"):
    class_weight.update({
        f"{_organ}_healthy": 1.0,
        f"{_organ}_low": 2.0,
        f"{_organ}_high": 4.0,
    })


def create_weight(df) -> pd.DataFrame:
    """Create a weight table from class_weight.

    Args:
        df: frame whose first column is skipped; every remaining column name
            must be a key of ``class_weight``.

    Returns:
        A frame with one row per row of ``df`` (fresh 0..n-1 index) and the
        columns ``df.columns[1:]``, each filled with that column's weight.

    Raises:
        KeyError: if a column after the first is missing from ``class_weight``.
    """
    target_columns = df.columns[1:]
    n_rows = len(df)
    # Build one constant column per target, keyed by position, then attach
    # the real labels in a single assignment.
    constant_columns = {
        position: [class_weight[name]] * n_rows
        for position, name in enumerate(target_columns)
    }
    weights = pd.DataFrame(constant_columns)
    weights.columns = target_columns
    return weights
# Weight table for `submission`: same number of rows, one column per
# submission column after the first.
df_weight = create_weight(submission)

In [20]:
models = []# defaultdict(list)
train_scores = []# defaultdict(list)
pred = None

# Inputs are the raw predictions in `submission`; targets are the labels in
# `df_valid`. The same rows are used for fitting and for scoring, so the score
# below is an in-sample score.
# NOTE(review): rows of `submission` and `df_valid` are paired by position -
# confirm both are in the same patient order.
tr_x, va_x = submission[use_train_columns], submission[use_train_columns]
tr_y, va_y = df_valid[y_columns], df_valid[y_columns]
# NOTE(review): `weight` is prepared here but is not passed to `model.fit` below.
weight = df_weight[y_columns]

# Run training: MultiOutputRegressor fits one LightGBM regressor per target column.
model = MultiOutputRegressor(lgb.LGBMRegressor(**params))
model.fit(tr_x, tr_y)

models.append(model)
va_pred = model.predict(va_x)
train_score = log_loss(va_y,va_pred)
train_scores.append(train_score)

# Predictions evaluated by the following cells. The model and `va_x` have not
# changed since `va_pred` was computed, so reuse it instead of predicting again.
y_pred_proba = va_pred

[LightGBM] [Info] Number of positive: 515, number of negative: 10
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2288
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 13
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (0.01 MB) transferred to GPU in 0.000926 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.980952 -> initscore=3.941582
[LightGBM] [Info] Start training from score 3.941582
[LightGBM] [Info] Number of positive: 10, number of negative: 515
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2288
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 13
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (0.01 MB) transferred to GPU in 0.000557 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscor

In [21]:
# Wrap the prediction array in a DataFrame labelled with the model's column names.
submission_pred = pd.DataFrame(y_pred_proba, columns=use_train_columns)

In [22]:
# Attach patient ids taken positionally from df_valid.
# NOTE(review): assumes the prediction rows are in the same order as df_valid - confirm.
submission_pred["patient_id"] = df_valid["patient_id"].values

In [23]:
submission_pred

Unnamed: 0,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,patient_id
0,0.981214,0.018786,0.939611,0.060389,0.937197,0.041527,0.020747,0.912696,0.074189,0.013923,0.902615,0.063069,0.033951,10007
1,0.981243,0.018757,0.938990,0.061010,0.923685,0.049832,0.024543,0.908983,0.075665,0.014782,0.901363,0.063982,0.035513,10205
2,0.982681,0.017319,0.939498,0.060502,0.938645,0.040868,0.020857,0.909887,0.075187,0.014071,0.904910,0.062143,0.032811,10275
3,0.981139,0.018861,0.935185,0.064815,0.930844,0.043795,0.026441,0.900270,0.087614,0.014645,0.898670,0.067785,0.035239,10430
4,0.982141,0.017859,0.931138,0.068862,0.930150,0.047459,0.022071,0.910809,0.075418,0.014204,0.904204,0.062396,0.033807,10494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,0.982658,0.017342,0.941788,0.058212,0.938598,0.041134,0.020806,0.911989,0.075050,0.014061,0.905604,0.061439,0.033003,9537
521,0.982396,0.017604,0.940676,0.059324,0.930401,0.049193,0.021213,0.908231,0.077680,0.014919,0.899940,0.066054,0.035466,96
522,0.981756,0.018244,0.937441,0.062559,0.937691,0.041336,0.021397,0.897456,0.084984,0.016782,0.898391,0.067151,0.035385,9620
523,0.982704,0.017296,0.940074,0.059926,0.937979,0.041342,0.020969,0.910039,0.074689,0.014392,0.902979,0.064187,0.033018,9835


In [24]:
# add weight
solution_train = create_training_solution(df_valid)

no_scale_score = score(solution_train.copy(),submission_pred.copy(),'patient_id')
print(f'Training score without scaling: {no_scale_score:.4f}')

bowel: 0.1566
extravasation: 0.8160
kidney: 0.5841
liver: 0.6370
spleen: 0.7582
any_injury: 1.5477
mean: 0.7499
Training score without scaling: 0.7499


In [25]:
# Prediction columns grouped by the factor they are multiplied with.
scale_by_1 = ['bowel_injury']
scale_by_2 = [f'{organ}_low' for organ in ('kidney', 'liver', 'spleen')]
scale_by_4 = [f'{organ}_high' for organ in ('kidney', 'liver', 'spleen')]
scale_by_6 = ['extravasation_injury']

# Scale factors based on the described metric. Note that the bowel_injury
# factor is 0.3, unlike the other groups whose factor equals the group suffix.
sf_1, sf_2, sf_4, sf_6 = 0.3, 2, 4, 6

# Start again from the unscaled predictions.
y_pred = submission_pred.copy()

# Multiply every group of target columns by its factor.
for _columns, _factor in (
    (scale_by_1, sf_1),
    (scale_by_2, sf_2),
    (scale_by_4, sf_4),
    (scale_by_6, sf_6),
):
    y_pred[_columns] *= _factor

weight_scale_score = score(solution_train.copy(), y_pred.copy(), 'patient_id')
print(f'Training score with weight scaling: {weight_scale_score:.4f}')

bowel: 0.1881
extravasation: 0.5801
kidney: 0.5235
liver: 0.5830
spleen: 0.6734
any_injury: 0.9411
mean: 0.5815
Training score with weight scaling: 0.5815
