# exp016  
[Notion](https://www.notion.so/exp016-84d461a1956642108bea01f424cec718?pvs=4)  
コンペmetricをcv上で再現、更にスコア向上のための予測値の後処理を試す。

In [1]:
import os
import random
import sys
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from typing import Tuple, Any, Dict

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Move up the directory tree until the repository top ('rsna-2023') is the cwd.
while True:
    if os.path.basename(os.getcwd()) == 'rsna-2023':
        break
    os.chdir('../')
    # Reaching the filesystem root means the repository was never found.
    if os.getcwd() == '/':
        raise Exception('Could not find project root directory.')
    
from src.inference import Inference
from src.classification.dataset import load_df
from src.metrics import score, create_training_solution

# Configs

In [2]:
class CFG_INF:
    """Inference-time settings handed to ``Inference``.

    The label dataframe (``df``) and the series metadata (``df_series_meta``)
    are read from ``base_dir`` while the class body executes, so changing
    ``phase`` is enough to switch between evaluation and submission.

    Raises:
        ValueError: if ``phase`` is neither ``'train'`` nor ``'test'``.
    """
    exp_name = 'exp_016'
    # 'train' for evaluation, 'test' for submission.
    phase = 'train'
    base_dir = 'data/rsna-2023-abdominal-trauma-detection'
    image_dir = f'data/rsna-2023-abdominal-trauma-detection/{phase}_images'
    # The dataframe is kept on this config so that `phase` alone selects it.
    if phase == 'train':
        df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
    elif phase == 'test':
        df = pd.read_csv(os.path.join(base_dir, 'sample_submission.csv'))
    else:
        # Fail fast with a clear message instead of leaving `df` undefined.
        raise ValueError(f"phase must be 'train' or 'test', got {phase!r}")
    df_series_meta = pd.read_csv(os.path.join(base_dir, f'{phase}_series_meta.csv'))
    image_size = (512, 512)
    # Added because the sample submission can contain series with extremely few slices.
    min_slices = 10
    # Because of the inference time limit.
    max_slices = 500
    max_series = 2
    model_save_dir = "outputs"

class CFG_SEG:
    """Settings of the SEG model experiment (exp_004).

    Passed to ``Inference`` together with the other config classes. How each
    attribute is consumed is defined in ``src`` and is not visible here.
    """
    exp_name = 'exp_004'
    # model config
    backbone = 'efficientnet-b3'
    n_ch = 1
    n_class = 4 # left/right kidneys are not distinguished during training, so 5 -> 4
    # hyper params
    init_lr = 1e-3
    min_lr = 1e-6
    weight_decay = 1e-4
    image_size = (512, 512)
    batch_size = 32
    amp = True
    n_epoch = 20
    iteration_per_epoch = 200
    pretrain = True
    freeze_epochs = 1
    noaug_epochs = 1
    # fold config
    n_fold = 6
    include_evaluation = False
    train_folds = 1
    # path
    image_dir = "data/dataset001/train_images"
    mask_dir = "data/dataset001/segmentations"
    model_save_dir = "outputs"
    # other config
    seed = 42
    num_workers = 0
    num_gpus = 2
    progress_bar = True
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CFG_LSK:
    """Settings of the LSK model experiment (exp_014).

    Passed to ``Inference`` and to ``load_df`` (which yields the dataframe
    carrying the ``fold`` column used in this notebook).
    """
    exp_name = 'exp_014'
    # model config
    # timm backbone
    # NOTE(review): 'efficientnet-b4' uses a hyphen while CFG_BE uses the
    # underscore form 'efficientnet_b4' -- confirm which library resolves this name.
    backbone = 'efficientnet-b4'
    n_ch = 1
    expand_ch_dim = True
    # n_class: healthy, low, high
    n_class = 3
    # hyper params
    init_lr = 1e-4
    min_lr = 1e-6
    weight_decay = 1e-4
    image_size = (128, 128, 128)
    batch_size = 64
    amp = True
    eps = 1e-6
    n_epoch = 20
    pretrain = True
    freeze_epochs = 1
    noaug_epochs = 1
    # fold config
    n_fold = 6
    include_evaluation = False
    # presumably the number of folds actually trained (the notebook text says
    # only fold 0 was trained) -- TODO confirm
    train_folds = 1
    # path
    image_dir = "data/dataset002"
    model_save_dir = "outputs"
    # other config
    seed = 42
    num_workers = 0
    num_gpus = 2
    progress_bar = True
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CFG_BE:
    """Settings of the BE model experiment (exp_011).

    Passed to ``Inference`` together with the other config classes.
    """
    exp_name = 'exp_011'
    # model config
    # timm backbone
    backbone = 'efficientnet_b4'
    # n_ch: number of slices along the z axis
    n_ch = 1 # support only 1
    expand_ch_dim = False
    # n_class: bowel_injury, extravasation
    n_class = 2
    label_smoothing = None # Optional[float]
    # hyper params
    init_lr = 5e-5
    min_lr = 1e-6
    weight_decay = 1e-4
    image_size = (512, 512)
    batch_size = 64
    amp = True
    n_epoch = 20
    iteration_per_epoch = 100
    pretrain = True
    freeze_epochs = 1
    noaug_epochs = 1
    # fold config
    n_fold = 6
    include_evaluation = False
    # presumably the number of folds actually trained (the notebook text says
    # only fold 0 was trained) -- TODO confirm
    train_folds = 1
    # path
    image_dir = "data/dataset001"
    model_save_dir = "outputs"
    # other config
    seed = 42
    num_workers = 0
    num_gpus = 2
    progress_bar = True
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# index -> organ name and organ name -> index lookups (for SEG and LSK models)
organ_index_dict_inv = dict(enumerate(('liver', 'spleen', 'kidney', 'bowel')))
organ_index_dict = {name: idx for idx, name in organ_index_dict_inv.items()}

# index -> label name lookup (for BE models)
label_index_dict_inv = dict(enumerate(('bowel', 'extravasation')))

# Dataset  
lskモデル及びbeモデルはfold0のみ学習を行っているため、これをvalidationとして予測を行い、予測結果のdataframeを取得する.  
※segはガッツリリークしているが、精度は十分高いことは既知であり、ここではあまり気にしない.  

In [4]:
df_solid_organ = load_df(CFG_LSK)
# Collect the patient_ids assigned to fold 0.
pids = df_solid_organ.loc[df_solid_organ["fold"] == 0, "patient_id"].unique()
# Restrict the inference dataframe to those fold-0 patients.
CFG_INF.df = CFG_INF.df.loc[CFG_INF.df["patient_id"].isin(pids)].reset_index(drop=True)
CFG_INF.df.head()

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,any_injury
0,10007,1,0,1,0,1,0,0,1,0,0,1,0,0,0
1,10205,1,0,1,0,1,0,0,1,0,0,1,0,0,0
2,10275,1,0,1,0,1,0,0,1,0,0,1,0,0,0
3,10430,1,0,1,0,1,0,0,0,1,0,1,0,0,1
4,10494,1,0,0,1,1,0,0,1,0,0,1,0,0,1


In [10]:
# Build the inference callable from the four configs; it is invoked once per patient_id.
inference_instance = Inference(CFG_INF, CFG_SEG, CFG_LSK, CFG_BE)

In [11]:
# Run inference for every patient in CFG_INF.df and collect one result per patient.
# A plain append loop keeps the entries computed so far in `results` if the
# (multi-hour) run is interrupted.
results = []
for pid in tqdm(CFG_INF.df['patient_id'].to_list()):
    result = inference_instance(pid)
    results.append(result)

100%|██████████| 525/525 [3:10:37<00:00, 21.79s/it]  


In [11]:
import pickle
# Save `results` (uncomment the two lines below after a fresh inference run).
path = os.path.join(CFG_INF.model_save_dir, "results.pkl")
# with open(path, 'wb') as f:
#     pickle.dump(results, f)

# Load `results` back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(path, 'rb') as f:
    results = pickle.load(f)

In [12]:
# Turn the per-patient results into a dataframe.
submission = pd.DataFrame(results)
# Column order follows CFG_INF.df, without the "any_injury" label when present.
order = list(CFG_INF.df.columns)
try:
    order.remove("any_injury")
except ValueError:
    pass
submission = submission.loc[:, order]

In [13]:
# Preview the first rows of the assembled predictions.
submission.head()

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high
0,10007,0.884326,0.115674,0.913491,0.086509,0.894338,0.067166,0.044069,0.892719,0.067127,0.042504,0.891519,0.069191,0.046914
1,10205,0.212204,0.787796,0.873215,0.126785,0.88656,0.071439,0.050034,0.889268,0.070586,0.044942,0.886773,0.070589,0.04818
2,10275,0.991846,0.008154,0.947898,0.052102,0.907574,0.058174,0.036592,0.946751,0.065587,0.033865,0.904147,0.070189,0.046747
3,10430,0.994514,0.005486,0.949412,0.050588,0.892506,0.075481,0.053364,0.903192,0.069976,0.045679,0.907305,0.063017,0.03915
4,10494,0.891527,0.108473,0.932255,0.067745,0.910356,0.056938,0.025118,0.873322,0.127193,0.051986,0.913574,0.052809,0.032255


In [14]:
# Build the solution dataframe from the labels (adds the sample weights,
# per the original note -- the helper lives in src.metrics).
solution_train = create_training_solution(CFG_INF.df)

# Baseline: score the raw predictions; copies are passed so the originals are not touched.
no_scale_score = score(solution_train.copy(),submission.copy(),'patient_id')
print(f'Training score without scaling: {no_scale_score:.4f}')

Training score without scaling: 0.6784


In [16]:
# Target columns grouped by the sample weight they are scaled with
scale_by_2 = ['bowel_injury', 'kidney_low', 'liver_low', 'spleen_low']
scale_by_4 = ['kidney_high', 'liver_high', 'spleen_high']
scale_by_6 = ['extravasation_injury']

# Scale factors based on described metric
sf_2, sf_4, sf_6 = 2, 4, 6

# The score function deletes the ID column so we remake it
solution_train = create_training_solution(CFG_INF.df)

# Start again from the unscaled predictions
y_pred = submission.copy()

# Multiply every column group by its factor
for _cols, _factor in ((scale_by_2, sf_2), (scale_by_4, sf_4), (scale_by_6, sf_6)):
    y_pred[_cols] *= _factor

weight_scale_score = score(solution_train.copy(), y_pred.copy(), 'patient_id')
print(f'Training score with weight scaling: {weight_scale_score:.4f}')

Training score with weight scaling: 0.6016228058260522


In [17]:
# Group by different sample weights
# (scale_by_2 and scale_by_4 are defined for parity with the previous cell but
# are not used below; only the extravasation column is scaled here.)
scale_by_2 = ['bowel_injury','kidney_low','liver_low','spleen_low']
scale_by_4 = ['kidney_high','liver_high','spleen_high']
scale_by_6 = ['extravasation_injury']

# Scale factor for the extravasation column.
# NOTE(review): 14 differs from the factor 6 used in the previous cell, so it
# appears to be a hand-tuned value rather than the metric weight -- confirm.
sf_6 = 14

# The score function deletes the ID column so we remake it
solution_train = create_training_solution(CFG_INF.df)

# Reset the prediction
y_pred = submission.copy()

# Scale only the extravasation target
y_pred[scale_by_6] *=sf_6

weight_scale_score = score(solution_train.copy(),y_pred.copy(),'patient_id')
print(f'Training score with better scaling: {weight_scale_score:.4f}')

Training score with better scaling: 0.611501921425843


In [18]:
# The scaling experiments above work on copies (y_pred), so this shows the unscaled predictions.
submission.head()

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high
0,10007,0.884326,0.115674,0.913491,0.086509,0.894338,0.067166,0.044069,0.892719,0.067127,0.042504,0.891519,0.069191,0.046914
1,10205,0.212204,0.787796,0.873215,0.126785,0.88656,0.071439,0.050034,0.889268,0.070586,0.044942,0.886773,0.070589,0.04818
2,10275,0.991846,0.008154,0.947898,0.052102,0.907574,0.058174,0.036592,0.946751,0.065587,0.033865,0.904147,0.070189,0.046747
3,10430,0.994514,0.005486,0.949412,0.050588,0.892506,0.075481,0.053364,0.903192,0.069976,0.045679,0.907305,0.063017,0.03915
4,10494,0.891527,0.108473,0.932255,0.067745,0.910356,0.056938,0.025118,0.873322,0.127193,0.051986,0.913574,0.052809,0.032255


In [19]:
# Full label table, split by fold: fold 0 -> validation, every other fold -> training.
df_all = pd.read_csv(os.path.join(CFG_INF.base_dir, 'train.csv'))
train_pids = df_solid_organ.loc[df_solid_organ["fold"] != 0, "patient_id"].unique()
valid_pids = df_solid_organ.loc[df_solid_organ["fold"] == 0, "patient_id"].unique()
df_train = df_all.loc[df_all["patient_id"].isin(train_pids)].reset_index(drop=True)
df_valid = df_all.loc[df_all["patient_id"].isin(valid_pids)].reset_index(drop=True)

In [22]:
def normalization(arr: np.ndarray, mean: float)-> np.ndarray:
    """Min-max scale ``arr`` to [0, 1], then multiply by ``mean``.

    The output therefore spans ``[0, mean]``. Note that its average is in
    general NOT ``mean``: ``mean`` acts as the upper bound of the result.

    Args:
        arr: Values to rescale; cast to float32 before any arithmetic.
        mean: Factor applied after the min-max scaling.

    Returns:
        The rescaled array. An empty input yields an empty float32 array and
        a constant input (including a single element) yields all zeros
        instead of NaN.
    """
    arr = arr.astype(np.float32)
    if arr.size == 0:
        # min()/max() are undefined for an empty array; nothing to scale.
        return arr
    shifted = arr - arr.min()
    peak = shifted.max()
    # When every value is identical the range is 0; dividing would give
    # 0/0 = NaN, so the (all-zero) shifted values are kept as they are.
    if peak > 0:
        shifted = shifted / peak
    return shifted * mean

def weighted_normalization(df_prediction: pd.DataFrame, df_train: pd.DataFrame)->pd.DataFrame:
    """Rescale every prediction column using the training-set target means.

    For each column except ``patient_id`` the mean of the same column in
    ``df_train`` is computed and handed to ``normalization`` together with the
    predicted values. The rescaling is intended to suit the sample-weighted
    log loss.

    Args:
        df_prediction: Predictions, one column per target plus ``patient_id``.
            It is not modified.
        df_train: Training labels containing every target column of
            ``df_prediction`` (the notebook passes the folds other than 0).

    Returns:
        A new dataframe with the rescaled predictions.

    Raises:
        KeyError: if a target column of ``df_prediction`` is missing from
            ``df_train``.
    """
    # Work on a copy so the caller's raw predictions are not overwritten.
    df_normalized = df_prediction.copy()
    for target in df_normalized.columns:
        if target == "patient_id":
            continue
        mean = df_train[target].mean()
        df_normalized[target] = normalization(df_normalized[target].values, mean)
    return df_normalized

In [23]:
# Pass a copy so the raw predictions in `submission` stay available for comparison.
submission_norm = weighted_normalization(submission.copy(), df_train)