* 複数スキャンがある場合に、どの値を予測させるのか？
 
 1スキャンは、そのまま
 
 2スキャンは、大きい方の値
 
 3スキャンは、中央値を用いる
 
** 安全側に考えると、最大値を予測するのが一番か？中央値を用いる場合、何か問題があるか？

In [1]:
import pandas as pd
import numpy as np
from remove_duplicate_left_max_ctdi import remove_ducplicate_left_ctdi

In [6]:
def preprocess_import_data(df):
    """Clean a freshly loaded scan-data DataFrame in place and return it.

    The frame passed in is modified directly (every step uses
    ``inplace=True``), so existing callers that ignore the return value
    keep working; the same object is also returned for convenience.

    Steps, in order:
      1. Drop the 'Unnamed: 0' column when it is present.
      2. Drop columns that are not used as features.
      3. Rename the Japanese column headers to English names.
      4. Keep only the rows whose modality is "Revolution".
      5. Drop 'room' and 'modality'.
      6. Replace missing 'hospital_ward' values with '外来'.
      7. Renumber the index from 0 and drop 'kV', 'rotation_time' and
         'scan_series'.

    params:
        df: DataFrame as loaded from the source file.

    Return:
        df: the same DataFrame object after cleaning.
    """
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    # Columns that are not needed for prediction: exam date, study_date,
    # patient ID, preset name and DLP.  The accession number is kept (it is
    # only renamed below).
    drop_list = ['実施検査日(YYYYMMDD)', 'study_date', '患者ID', 'プリセット名称', 'DLP']
    # axis=1 (columns); the previous axis=True only worked because True == 1.
    df.drop(drop_list, axis=1, inplace=True)

    # Rename the columns to English names.
    df.rename(columns={'検査時年齢': 'age', '性別': 'gender', '身長（ｃｍ）': 'height_cm', '体重（ｋｇ）': 'weight_kg',
                       '依頼科名称': 'department', '入院病棟名称': 'hospital_ward', '実施検査室名称': 'room', '撮影機種': 'modality', 
                       '部位名称': 'scan_area', '検査方法': 'scan_method', 'ACCESSIONNO': 'accession'}, inplace=True)

    # Scanner used for prediction.
    df.query('modality == "Revolution"', inplace=True)

    # Only a single room / modality is expected for now, so both are dropped.
    df.drop(['room', 'modality'], axis=1, inplace=True)

    # A missing hospital_ward means the patient is an outpatient ('外来').
    df.loc[df['hospital_ward'].isna(), 'hospital_ward'] = '外来'

    # Renumber the rows after filtering; drop=True avoids creating a
    # temporary 'index' column that would have to be removed again.
    df.reset_index(drop=True, inplace=True)
    df.drop(['kV', 'rotation_time', 'scan_series'], axis=1, inplace=True)

    return df
    

In [3]:
df = pd.read_excel('./scan_data/202109_all_scan_data.xlsx')
preprocess_import_data(df)

In [6]:
# accessionのユニークな値を取り出す
df = remove_ducplicate_left_ctdi(df)

正常に処理が行われました。


In [7]:
df

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1185671720210924,44,M,177.0,66.0,成人,外科,８西,胸部〜骨盤CT,造影,5.7 P+CE Chest-Pelvis Routine,211.73,9.60
221,1185840120210924,27,M,164.0,65.3,成人,感染症内科,外来,頸部CT,造影,3.3 Neck Helical Routine,420.25,22.35
560,1186113320210927,67,M,176.0,64.5,成人,心血管外科,外来,胸部CT,造影,5.1 QQ Chest Routine,412.25,9.56
33,1186730120210928,66,M,168.2,64.6,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",190.31,13.04


In [158]:
# 重複のデータの抽出
df_duplicated = df[df['accession'].duplicated(keep=False)]

In [159]:
# 重複なしのデータの集合
df_not_duplicated = df[~df['accession'].duplicated(keep=False)]
len(df_not_duplicated)

375

In [160]:
len(df_duplicated['accession'].unique()) + len(df_not_duplicated)

457

In [161]:
len(df['accession'].unique())

457

* ここからデータ重複の削除の処理

In [180]:
duplicated_accession_set = set(df_duplicated['accession'])

In [144]:
# duplicated_accession_set is a set, which cannot be indexed with [0];
# take an arbitrary accession from it to inspect one group of duplicates.
df_duplicated[df_duplicated['accession'] == next(iter(duplicated_accession_set))]

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
523,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.0
524,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.05
525,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.95
526,1163782220210708,79,M,160.0,65.0,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",468.2,13.03


In [181]:
# Taking the maximum is the simple choice and is thought to follow the
# safety principle, so for every duplicated accession keep only the scan
# with the largest CTDI, then append those rows to the non-duplicated data.
result = []
for accession in duplicated_accession_set:
    scans = df_duplicated[df_duplicated['accession'] == accession]
    max_position = scans['CTDI'].argmax()
    result.append(scans.iloc[max_position, :])
max_ctdi_rows = pd.DataFrame(result)
df_result = pd.concat([df_not_duplicated, max_ctdi_rows], axis=0)

In [182]:
df_result

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1185671720210924,44,M,177.0,66.0,成人,外科,８西,胸部〜骨盤CT,造影,5.7 P+CE Chest-Pelvis Routine,211.73,9.60
221,1185840120210924,27,M,164.0,65.3,成人,感染症内科,外来,頸部CT,造影,3.3 Neck Helical Routine,420.25,22.35
560,1186113320210927,67,M,176.0,64.5,成人,心血管外科,外来,胸部CT,造影,5.1 QQ Chest Routine,412.25,9.56
33,1186730120210928,66,M,168.2,64.6,成人,消化器内科,外来,腹部〜骨盤CT,造影,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",190.31,13.04


In [164]:
# Taking the maximum is the simple choice and is thought to follow the
# safety principle.
# For each duplicated accession this builds a one-row DataFrame holding the
# scan with the largest CTDI.
# NOTE(review): the frame built in each iteration is not assigned to
# anything, so this loop leaves no result behind.
for accession in duplicated_accession_set:
    pd.DataFrame(df_duplicated[df_duplicated['accession'] == accession].iloc[(df_duplicated[df_duplicated['accession'] == accession]['CTDI'].argmax()), :]).T

In [173]:
df_not_duplicated

Unnamed: 0,accession,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,scan_protocol,mA,CTDI
2,1184792820210921,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,5.7 P+CE Chest-Pelvis Routine,234.59,16.66
5,1183177120210914,81,F,160.0,49.1,成人,救急科,外来,脳CT,単純,1.7 Brain Head Routine TFI-H,170.25,26.10
6,1180474420210905,79,M,170.1,62.6,成人,救急科,外来,頸部〜骨盤CT,単純,5.3 QQ Chest - Pelvis Routine,192.71,8.72
7,1185946220210925,52,M,170.0,60.0,成人,救急科,外来,骨盤骨CT,単純,8.1 QQ Pelvis Rotine,666.06,21.63
8,1136484220210409,73,F,155.0,55.0,成人,呼吸器内科,外来,胸部CT,単純,5.1 QQ Chest Routine,298.35,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,1187404320210930,26,M,164.0,75.0,成人,救急科,外来,脳CT,単純,1.4 QQ Brain non-Helical Routine,355.89,32.81
583,1176644720210823,56,F,157.0,42.8,成人,産婦人科,外来,胸部〜骨盤CT,Dual Energy,5.26 GSIX Chest-Pelvis CEonly,230.00,10.87
586,1157340320210616,79,M,157.1,58.0,成人,外科,外来,胸部〜骨盤CT,Dual Energy,5.26 GSIX Chest-Pelvis CEonly,400.00,15.51
587,1183653820210915,91,M,10.0,10.0,成人,救急科,外来,脳CT,単純,1.5 QQ Brain-Head Routine TFI-H,215.03,50.42


### データについて整理し直す

In [240]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from preprocessing.initial_preprocessing import initial_preprocessing


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [290]:
import pandas as pd
import numpy as np

df = pd.read_excel('train_data.xlsx', index_col=False)
df.head(2)

Unnamed: 0,ACCESSIONNO,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,部位名称,プリセット名称,検査方法名称,検査責任者名称,検査/撮影情報01,検査/撮影情報02,撮影機種,検査方法,adult_child,study_date
0,1103165320210101,5.3 QQ Chest - Pelvis Routine,,Chest,862.18,100,594.11,700,1.38,80.0,...,胸部〜骨盤CT,256列 【枠外】胸〜骨盤CT（単純）,単純,加藤大貴,,2.0,Revolution,単純,成人,202101
1,1103165220210101,1.5 QQ Brain-Head Routine TFI-H,,Head,240.75,100,387.65,515,0.52,40.0,...,脳CT,256列 【枠外】脳・頭部CT（単純）,単純,加藤大貴,,2.0,Revolution,単純,成人,202101


In [291]:
initial_preprocessing(df)

In [248]:
df.columns

Index(['accession', 'scan protocol', 'scan series', 'target region',
       'scanning length', 'kV', 'mean mA', 'max mA', 'pitch factor',
       'nomial total collimation width', 'exposure time per rotation',
       'exposure time', 'CTDIw phantom type', 'Mean CTDIvol', 'DLP',
       'study_date', 'id', 'age', 'department', 'hospital_ward', 'gender',
       'height_cm', 'weight_kg', 'scan_area', 'preset_name', 'modality',
       'scan_method', 'adult_child'],
      dtype='object')

In [288]:
# 重複データを削除する
%load_ext autoreload
%autoreload 2
from preprocessing.remove_duplicate_ctdi import remove_duplicate_ctdi

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
len(df)

23707

In [88]:
# For every accession, get the index label of the row with the largest
# 'Mean CTDIvol'.
max_indices = df_duplicated.groupby('accession')['Mean CTDIvol'].idxmax()
# Select those rows and renumber them from 0.
result_df = df_duplicated.loc[max_indices].reset_index(drop=True)
len(result_df)

3306

In [85]:
len(df_duplicated)

9981

In [292]:
df = remove_duplicate_ctdi(df)
len(df)
df.reset_index(drop=True, inplace=True)

正常に処理が行われました。


In [295]:
from preprocessing.drop_emergency_suspicious_height_weight import drop_emergency_suspicious_height_weight

df_temp = drop_emergency_suspicious_height_weight(df)

In [251]:
# Next step: split the rows by whether 'scan series' is NaN or not.
# That boundary corresponds to the scanner's version upgrade, so the data on
# each side of it need to be compared.
scan_series_missing = df['scan series'].isna()
df_scan_series_nan = df[scan_series_missing]
df_scan_series_not_nan = df[~scan_series_missing]

In [296]:
len(df_temp)

8958

### [ToDo]: 身長と体重が怪しいデータを引き抜く（ここまでは共通の処理）

In [253]:
# Simply remove every emergency-department ('救急科') row and keep adults
# ('成人') only.
df_temp = df[(df['department'] != '救急科') & (df['adult_child'] == '成人')]

# Split the rows by whether 'scan series' is NaN or not.  That boundary
# corresponds to the scanner's version upgrade, so the data on each side of
# it need to be compared.
has_scan_series = df_temp['scan series'].notna()
df_scan_series_nan = df_temp[~has_scan_series]
df_scan_series_not_nan = df_temp[has_scan_series]

# The settings for head exams were changed, so rows whose target region is
# 'Head' are removed from the NaN 'scan series' subset.
df_scan_series_nan_not_head = df_scan_series_nan[df_scan_series_nan['target region'] != 'Head']

# Stack both subsets back together with a fresh 0..n-1 index.
df_temp = pd.concat([df_scan_series_nan_not_head, df_scan_series_not_nan], axis=0, ignore_index=True)
df_temp

Unnamed: 0,accession,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,department,hospital_ward,gender,height_cm,weight_kg,scan_area,preset_name,modality,scan_method,adult_child
0,1103161420210101,5.1 QQ Chest Routine,,Chest,382.14,100,350.24,640,1.38,80.0,...,感染症内科,外来,F,150.0,49.1,胸部CT,256列 【枠外】胸部CT（単純）,Revolution,単純,成人
1,1103161620210101,5.1 QQ Chest Routine,,Chest,442.20,100,442.93,700,1.38,80.0,...,感染症内科,外来,M,163.0,74.0,胸部CT,256列 【枠外】胸部CT（単純）,Revolution,単純,成人
2,1103339220210102,5.1 QQ Chest Routine,,Chest,467.06,100,306.59,700,1.38,80.0,...,感染症内科,外来,M,178.0,68.0,胸部CT,256列 【枠外】胸部CT（単純）,Revolution,単純,成人
3,1103327120210102,5.1 QQ Chest Routine,,Chest,472.12,100,433.42,700,1.38,80.0,...,感染症内科,外来,M,182.0,70.0,胸部CT,256列 【枠外】胸部CT（単純）,Revolution,単純,成人
4,1103326920210102,5.1 QQ Chest Routine,,Chest,427.24,100,367.87,684,1.38,80.0,...,感染症内科,外来,M,200.0,200.0,胸部CT,256列 【枠外】胸部CT（単純）,Revolution,単純,成人
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9025,1364925420230425,"5.12 Chest - Pelvis (40sec,80sec)Routine",CE,Chest,916.19,100,450.77,700,0.99,80.0,...,総合診療科,外来,M,166.0,71.0,胸部〜骨盤CT,256列 【枠外】胸〜骨盤CT（造影適宜）,Revolution,造影,成人
9026,1365245120230426,6.2 Abdomen-Pelvis Routine,Plain,Abdomen and Pelvis,626.27,100,414.06,720,0.99,80.0,...,消化器内科,外来,M,174.0,73.4,腹部〜骨盤CT,256列 【枠外】腹部骨盤CT（造影適宜）,Revolution,造影,成人
9027,1365256720230426,"5.36 GSIX Aorta CTA prep (CTA, +20s) Delay onl...",CTA,Chest,5.00,100,100.00,100,,5.0,...,整形外科,外来,M,171.0,65.0,左上肢ｼｬﾝﾄCT,撮影済のみ(ﾌﾟﾘｾｯﾄ無),Revolution,CTA,成人
9028,1365967020230428,"6.4 HCC 3Phase (40sec,70sec,180sec) Routine",CE,Abdomen,551.66,100,354.37,720,0.99,80.0,...,内科一般,外来,M,176.5,56.2,腹部〜骨盤CT,撮影済のみ(ﾌﾟﾘｾｯﾄ無),Revolution,造影,成人


In [254]:
# Keep only rows whose height is strictly between 100 and 200 and whose
# weight is strictly between 20 and 250.
plausible_height = (df_temp['height_cm'] > 100) & (df_temp['height_cm'] < 200)
plausible_weight = (df_temp['weight_kg'] > 20) & (df_temp['weight_kg'] < 250)
df_temp = df_temp[plausible_height & plausible_weight]

In [255]:
len(df_temp)

8958

In [260]:
index = df_temp['height_cm'].sort_values(ascending=True)[:30].index.to_list()

In [266]:
df_temp[['scanning length', 'height_cm', 'weight_kg', 'id', 'study_date', 'target region', 'Mean CTDIvol']].loc[index]

Unnamed: 0,scanning length,height_cm,weight_kg,id,study_date,target region,Mean CTDIvol
1219,796.77,101.0,39.8,75206,2021-06-18,Chest,6.7
100,626.53,101.0,39.8,75206,2021-01-14,Abdomen,12.78
2268,852.06,101.0,39.8,75206,2021-04-21,Chest,6.45
2390,621.51,101.0,39.8,75206,2021-06-02,Abdomen,17.1
1029,796.77,101.0,39.8,75206,2021-05-18,Chest,7.38
277,567.16,101.0,39.8,75206,2021-02-04,Chest,6.03
638,827.2,101.0,39.8,75206,2021-03-29,Chest,6.13
6499,506.63,117.7,76.4,9326611,2023-02-10,Bronchus,10.38
3568,531.81,117.7,76.4,9326611,2022-01-21,Bronchus,9.9
110,456.72,117.7,76.4,9326611,2021-01-15,Chest,9.14


In [270]:
df_temp[df_temp['id'] == 75206]

Unnamed: 0,accession,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,department,hospital_ward,gender,height_cm,weight_kg,scan_area,preset_name,modality,scan_method,adult_child
100,1107160820210114,6.2 Abdomen-Pelvis Routine,,Abdomen,626.53,100,459.39,720,0.99,80.0,...,内科一般,外来,M,101.0,39.8,腹部〜骨盤CT,256列 【枠外】腹部骨盤CT（単純）,Revolution,単純,成人
277,1112448720210128,5.1 QQ Chest Routine,,Chest,567.16,100,360.62,700,1.38,80.0,...,内科一般,外来,M,101.0,39.8,胸部CT,256列 胸部CT（単純）,Revolution,単純,成人
638,1127294720210315,5.3 QQ Chest - Pelvis Routine,,Chest,827.2,100,366.71,700,1.38,80.0,...,内科一般,外来,M,101.0,39.8,胸部〜骨盤CT,256列 胸〜骨盤CT（単純）,Revolution,単純,成人
1029,1149049920210518,5.6 P+CE Chest-Pelvis Routine,,Chest,796.77,100,318.55,700,0.99,80.0,...,内科一般,外来,M,101.0,39.8,胸部〜骨盤CT,256列 【枠外】胸〜骨盤CT（単純）,Revolution,単純,成人
1219,1157935420210618,5.6 P+CE Chest-Pelvis Routine,,Chest,796.77,100,289.25,671,0.99,80.0,...,内科一般,外来,M,101.0,39.8,胸部〜骨盤CT,256列 【枠外】胸〜骨盤CT（単純）,Revolution,単純,成人
2268,1141117020210421,5.6 P+CE Chest-Pelvis Routine,,Chest,852.06,100,385.56,700,1.38,80.0,...,内科一般,外来,M,101.0,39.8,胸部〜骨盤CT,256列 【枠外】胸〜骨盤CT（単純）,Revolution,単純,成人
2390,1153467220210602,6.19 GSIX Abdomen-Pelvi - CE only,,Abdomen,621.51,140,356.82,355,0.99,80.0,...,内科一般,外来,M,101.0,39.8,腹部〜骨盤CT,256列 【枠外】腹部骨盤CT（DE）,Revolution,Dual Energy,成人


In [276]:
# The remaining suspicious records were checked directly in the data; remove
# every row belonging to these ids and renumber the index from 0.
drop_ids = [75206, 9326611]
df_temp = df_temp[~df_temp['id'].isin(drop_ids)].reset_index(drop=True)

In [275]:
df_temp['weight_kg'].sort_values(ascending=True)

7866     24.2
5745     25.0
8641     25.8
5852     25.8
3613     26.7
        ...  
8963    151.8
7016    151.8
6914    151.8
8943    151.8
4387    172.2
Name: weight_kg, Length: 8948, dtype: float64

In [263]:
df_temp.loc[6499]

accession                                   1273950920220715
scan protocol                           5.1 QQ Chest Routine
scan series                                           Plain 
target region                                       Bronchus
scanning length                                       506.63
kV                                                       100
mean mA                                               442.85
max mA                                                   700
pitch factor                                            1.38
nomial total collimation width                          80.0
exposure time per rotation                               0.7
exposure time                                           3.22
CTDIw phantom type                IEC Body Dosimetry Phantom
Mean CTDIvol                                           10.38
DLP                                                   525.81
study_date                               2023-02-10 00:00:00
id                      

## SweetVizを使ってデータの比較をみる

In [36]:
import sweetviz as sv

In [37]:
report = sv.compare([df_scan_series_nan, 'Before Version up CT'], [df_scan_series_not_nan, 'After Version up CT'])
report.show_html('Compare_Report_CT_verison.html')

                                             |          | [  0%]   00:00 -> (? left)

  value_counts_without_nan = pd.Series()
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savef

Report Compare_Report_CT_verison.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### Pandas-Profiling

In [40]:
from pandas_profiling import ProfileReport

In [43]:
profile_nan_data = ProfileReport(df_scan_series_nan, title='Before Version up CT')
profile_nan_data.to_file('Before version up.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
profile_not_nan = ProfileReport(df_scan_series_not_nan, title='After Version up CT')
profile_not_nan.to_file('After version up.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
df.columns

Index(['accession', 'scan protocol', 'scan series', 'target region',
       'scanning length', 'kV', 'mean mA', 'max mA', 'pitch factor',
       'nomial total collimation width', 'exposure time per rotation',
       'exposure time', 'CTDIw phantom type', 'Mean CTDIvol', 'DLP',
       '実施検査日(YYYYMMDD)', 'age', 'department', 'hospital_ward', 'gender',
       'height_cm', 'weight_kg', 'scan_area', 'preset_name', 'modality',
       'scan_method', 'adult_child', 'study_date'],
      dtype='object')