* 複数スキャンがある場合に、どの値を予測させるのか？
 
 1スキャンは、そのまま
 
 2スキャンは、大きい方の値
 
 3スキャンは、中央値を用いる
 
** 安全側に考えると、最大値を予測するのが一番か？中央値を用いる場合に何か問題があるか？

In [1]:
import pandas as pd
import numpy as np
from remove_duplicate_left_max_ctdi import remove_ducplicate_left_ctdi

In [2]:
def preprocess_import_data(df):
    """Clean the imported scan data in place and rename its columns.

    The frame is modified in place (existing callers rely on that) and the
    same object is also returned, so the result can be assigned or chained.

    Steps, in order:
        1. Drop 'Unnamed: 0' and the columns that are not used as features.
        2. Rename the Japanese column names to English identifiers.
        3. Keep only the rows whose modality is "Revolution".
        4. Drop 'room' and 'modality'.
        5. Replace missing 'hospital_ward' values with '外来'.
        6. Renumber the index from 0 and drop 'kV', 'rotation_time' and
           'scan_series'.

    Params:
        df: DataFrame as loaded from the source file.

    Returns:
        The same DataFrame object after cleaning.

    Raises:
        KeyError: if one of the columns to be dropped is not present.
    """
    # Features that are not needed for prediction: examination date,
    # study_date, patient ID, preset name and DLP.
    unused_columns = ['Unnamed: 0', '実施検査日(YYYYMMDD)', 'study_date',
                      '患者ID', 'プリセット名称', 'DLP']
    df.drop(columns=unused_columns, inplace=True)

    # Rename the columns to English identifiers.
    df.rename(columns={'検査時年齢': 'age', '性別': 'gender', '身長（ｃｍ）': 'height_cm', '体重（ｋｇ）': 'weight_kg',
                       '依頼科名称': 'department', '入院病棟名称': 'hospital_ward', '実施検査室名称': 'room', '撮影機種': 'modality',
                       '部位名称': 'scan_area', '検査方法': 'scan_method', 'ACCESSIONNO': 'accession'}, inplace=True)

    # Scanner used for prediction.
    df.query('modality == "Revolution"', inplace=True)

    # Only a single room / modality is assumed for now, so drop both columns.
    df.drop(columns=['room', 'modality'], inplace=True)

    # A missing hospital_ward means '外来'.
    df.loc[df['hospital_ward'].isna(), 'hospital_ward'] = '外来'

    # drop=True discards the old index directly instead of materialising it
    # as an 'index' column that has to be removed again afterwards.
    df.reset_index(drop=True, inplace=True)
    df.drop(columns=['kV', 'rotation_time', 'scan_series'], inplace=True)

    return df
    

In [3]:
# Load the raw scan data from the Excel workbook.
df = pd.read_excel('./scan_data/202109_all_scan_data.xlsx')
# Cleans df in place; the call's return value is not used here.
preprocess_import_data(df)

In [6]:
# Extract the unique accession values.
# NOTE(review): remove_ducplicate_left_ctdi is imported from another module;
# presumably it keeps one row per accession - confirm against that module.
df = remove_ducplicate_left_ctdi(df)

正常に処理が行われました。


In [7]:
# Render df as the cell output.
df

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1185671720210924,44,M,177.0,66.0,成人,外科,８西,胸部〜骨盤CT,造影,5.7 P+CE Chest-Pelvis Routine,211.73,9.60
221,1185840120210924,27,M,164.0,65.3,成人,感染症内科,外来,頸部CT,造影,3.3 Neck Helical Routine,420.25,22.35
560,1186113320210927,67,M,176.0,64.5,成人,心血管外科,外来,胸部CT,造影,5.1 QQ Chest Routine,412.25,9.56
33,1186730120210928,66,M,168.2,64.6,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",190.31,13.04


In [158]:
# Rows whose accession appears more than once (every occurrence is kept).
df_duplicated = df.loc[df['accession'].duplicated(keep=False)]

In [159]:
# Rows whose accession appears exactly once.
df_not_duplicated = df.drop_duplicates(subset='accession', keep=False)
len(df_not_duplicated)

375

In [160]:
# Sanity check: distinct duplicated accessions + singleton rows.
df_duplicated['accession'].nunique(dropna=False) + len(df_not_duplicated)

457

In [161]:
# Number of distinct accession values in df.
df['accession'].nunique(dropna=False)

457

* ここからデータ重複の削除の処理

In [180]:
# Distinct accession values that have more than one row.
duplicated_accession_set = set(df_duplicated['accession'].tolist())

In [144]:
# Show every row of one duplicated accession as an example.
# A set does not support indexing (duplicated_accession_set[0] raises
# TypeError), so take an arbitrary element through an iterator instead.
df_duplicated[df_duplicated['accession'] == next(iter(duplicated_accession_set))]

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
523,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.0
524,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.05
525,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.95
526,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.03


In [181]:
# Taking the maximum is the simple choice and is considered to err on the
# safe side.
# For every duplicated accession keep the row with the largest CTDI.
result = []
for accession in duplicated_accession_set:
    scans = df_duplicated[df_duplicated['accession'] == accession]
    max_position = scans['CTDI'].argmax()
    result.append(scans.iloc[max_position, :])
# Singleton rows first, then one selected row per duplicated accession.
df_result = pd.concat([df_not_duplicated, pd.DataFrame(result)], axis=0)

In [182]:
# Render df_result as the cell output.
df_result

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1185671720210924,44,M,177.0,66.0,成人,外科,８西,胸部〜骨盤CT,造影,5.7 P+CE Chest-Pelvis Routine,211.73,9.60
221,1185840120210924,27,M,164.0,65.3,成人,感染症内科,外来,頸部CT,造影,3.3 Neck Helical Routine,420.25,22.35
560,1186113320210927,67,M,176.0,64.5,成人,心血管外科,外来,胸部CT,造影,5.1 QQ Chest Routine,412.25,9.56
33,1186730120210928,66,M,168.2,64.6,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",190.31,13.04


In [164]:
# Taking the maximum is the simple choice and is considered to err on the
# safe side.
# Scratch loop: builds a one-row frame per duplicated accession; the frame
# is neither stored nor displayed.
for accession in duplicated_accession_set:
    scans = df_duplicated[df_duplicated['accession'] == accession]
    max_row = scans.iloc[scans['CTDI'].argmax(), :]
    pd.DataFrame(max_row).T

In [173]:
# Render df_not_duplicated as the cell output.
df_not_duplicated

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,1187404320210930,26,M,164.0,75.0,成人,救急科,外来,脳CT,単純,1.4 QQ Brain non-Helical Routine,355.89,32.81
583,1176644720210823,56,F,157.0,42.8,成人,産婦人科,外来,胸部〜骨盤CT,Dual Energy,5.26 GSIX Chest-Pelvis CEonly,230.00,10.87
586,1157340320210616,79,M,157.1,58.0,成人,外科,外来,胸部〜骨盤CT,Dual Energy,5.26 GSIX Chest-Pelvis CEonly,400.00,15.51
587,1183653820210915,91,M,10.0,10.0,成人,救急科,外来,脳CT,単純,1.5 QQ Brain-Head Routine TFI-H,215.03,50.42


In [134]:
# Number of rows in df at the time this cell was run.
len(df)

388