In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from data_import import preprocess_import_data

In [2]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)
df.head(3)

Unnamed: 0,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,mA,CTDI
0,72,M,170.0,83.0,成人,救急科,外来,胸部〜骨盤CT,造影,366.41,16.64
1,72,M,170.0,83.0,成人,救急科,外来,胸部〜骨盤CT,造影,366.41,16.61
2,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,234.59,16.66


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor, XGBRFRegressor

In [5]:
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

df.dropna(inplace=True)

target = 'CTDI'
X = df.drop(target, axis=1)
X = df.drop('mA', axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
xgb_reg = XGBRegressor(learning_rate=0.01,
                       n_estimators=100,
                       max_depth=3)

xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_test)

In [9]:
print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

mean_absolte_error: 6.899913421860903
mean_absolute_percentage_error: 0.3518131979424789


In [11]:
df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']

df_predict

Unnamed: 0,y_test,y_pred,diff
898,18.68,11.714196,6.965804
1645,27.50,17.582987,9.917013
1918,19.13,11.810486,7.319514
104,15.44,9.795569,5.644431
2367,21.86,14.272161,7.587839
...,...,...,...
1841,34.38,21.707563,12.672437
538,10.22,6.577580,3.642420
2239,11.46,7.298831,4.161169
1888,6.60,4.350098,2.249902


In [12]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,19.181288,12.287225,6.894068
std,10.953031,6.795279,4.191184
min,2.14,3.129221,-0.989221
25%,9.9475,6.401962,3.545538
50%,15.91,10.182335,5.742096
75%,28.305,18.013206,10.291794
max,65.51,32.393623,33.116377


## mAを予測する

In [13]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)

# ラベルのエンコーディング Ordinal_encoder
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
# カテゴリカラムのみ抽出して、ordinal_encoder
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

# 今回はとりあえず、kVなどの線量情報が含まれてないものは単純にdropnaしてしまう
df.dropna(inplace=True)

# データをtargetとそれ以外に分割
target = 'mA'
X = df.drop(target, axis=1)
X = df.drop('CTDI', axis=1)
y = df[target]


# train testに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 本来はここでhyper-parameterチューニングをする
xgb_reg = XGBRegressor(learning_rate=0.01,
                       n_estimators=100,
                       max_depth=3)

xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_test)

print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']
df_predict

mean_absolte_error: 105.35262483136715
mean_absolute_percentage_error: 0.3614017752853843


Unnamed: 0,y_test,y_pred,diff
898,354.65,225.053452,129.596548
1645,298.31,186.329086,111.980914
1918,351.62,223.032333,128.587667
104,283.88,174.649918,109.230082
2367,190.09,121.406464,68.683536
...,...,...,...
1841,224.22,141.608871,82.611129
538,367.28,233.267639,134.012361
2239,365.60,231.663147,133.936853
1888,394.55,249.367325,145.182675


In [14]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,287.619895,182.267807,105.3521
std,124.060919,77.371857,46.861824
min,15.0,15.22391,-0.22391
25%,198.9225,124.684738,74.237762
50%,263.84,167.500671,96.339329
75%,365.3375,231.663147,133.674353
max,661.93,384.049194,277.880806


## XGBRFRegressor

In [17]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)

xgbrf = XGBRFRegressor(learning_rate=0.1,
                       n_estimators=100,
                       max_depth=3,
                       objective='reg:squarederror')

oe = OrdinalEncoder()
oe.set_output(transform='pandas')
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

df.dropna(inplace=True)

target = 'CTDI'
X = df.drop(target, axis=1)
X = df.drop('mA', axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
xgbrf.fit(X_train, y_train)

In [19]:
y_pred = xgbrf.predict(X_test)

In [20]:
print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

mean_absolte_error: 16.816236076863085
mean_absolute_percentage_error: 0.8633697460037525


In [21]:
df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']
df_predict

Unnamed: 0,y_test,y_pred,diff
898,18.68,2.289706,16.390294
1645,27.50,3.129825,24.370175
1918,19.13,2.305424,16.824576
104,15.44,1.894999,13.545001
2367,21.86,2.954233,18.905767
...,...,...,...
1841,34.38,3.578564,30.801436
538,10.22,1.444835,8.775165
2239,11.46,1.452803,10.007197
1888,6.60,1.036996,5.563004


In [23]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,19.181288,2.365056,16.816236
std,10.953031,1.062835,9.903756
min,2.14,1.033134,1.093227
25%,9.9475,1.437482,8.535946
50%,15.91,2.050832,13.846316
75%,28.305,3.365353,24.945777
max,65.51,5.31137,60.304014


## mAを予測する

In [24]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)

# ラベルのエンコーディング Ordinal_encoder
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
# カテゴリカラムのみ抽出して、ordinal_encoder
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

# 今回はとりあえず、kVなどの線量情報が含まれてないものは単純にdropnaしてしまう
df.dropna(inplace=True)

# データをtargetとそれ以外に分割
target = 'mA'
X = df.drop(target, axis=1)
X = df.drop('CTDI', axis=1)
y = df[target]


# train testに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 本来はここでhyper-parameterチューニングをする
xgbrf.fit(X_train, y_train)


y_pred = xgbrf.predict(X_test)

print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']
df_predict

mean_absolte_error: 258.3827632711866
mean_absolute_percentage_error: 0.8942276881410369


Unnamed: 0,y_test,y_pred,diff
898,354.65,36.229763,318.420237
1645,298.31,25.784649,272.525351
1918,351.62,36.229763,315.390237
104,283.88,26.964548,256.915452
2367,190.09,20.689497,169.400503
...,...,...,...
1841,224.22,21.111050,203.108950
538,367.28,35.521908,331.758092
2239,365.60,35.882832,329.717168
1888,394.55,40.091099,354.458901


In [25]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,287.619895,29.237122,258.382763
std,124.060919,11.868396,112.361116
min,15.0,3.273782,10.49643
25%,198.9225,20.75016,178.170324
50%,263.84,25.877586,237.989727
75%,365.3375,36.229763,329.695751
max,661.93,58.275639,604.797775
