In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
from data_import import preprocess_import_data

In [10]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)
df.head(3)

Unnamed: 0,age,gender,height_cm,weight_kg,adult_child,department,hospital_ward,scan_area,scan_method,mA,CTDI
0,72,M,170.0,83.0,成人,救急科,外来,胸部〜骨盤CT,造影,366.41,16.64
1,72,M,170.0,83.0,成人,救急科,外来,胸部〜骨盤CT,造影,366.41,16.61
2,85,M,171.0,58.9,成人,循環器内科,外来,胸部〜骨盤CT,単純,234.59,16.66


In [11]:
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

df.dropna(inplace=True)

target = 'CTDI'

X = df.drop(target, axis=1)
X = df.drop('mA', axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [12]:
lgb_reg = lgb.LGBMRegressor(learning_rate=0.01, random_state=0)
lgb_reg.fit(X_train, y_train)

In [13]:
y_pred = lgb_reg.predict(X_test)

In [14]:
print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

mean_absolte_error: 3.4538200603436713
mean_absolute_percentage_error: 0.27049076820325163


In [15]:
df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']

df_predict

Unnamed: 0,y_test,y_pred,diff
898,18.68,18.659247,0.020753
1645,27.50,24.230009,3.269991
1918,19.13,18.859951,0.270049
104,15.44,16.503613,-1.063613
2367,21.86,20.605301,1.254699
...,...,...,...
1841,34.38,28.559330,5.820670
538,10.22,13.284352,-3.064352
2239,11.46,14.025652,-2.565652
1888,6.60,11.005110,-4.405110


In [16]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,19.181288,18.951831,0.229457
std,10.953031,6.910224,4.063834
min,2.14,8.425702,-6.285702
25%,9.9475,13.08361,-3.125205
50%,15.91,16.788873,-0.947698
75%,28.305,24.724621,3.538777
max,65.51,40.319013,25.255833


### mAを予測する

In [19]:
df = pd.concat([pd.read_excel('./scan_data/202109_all_scan_data.xlsx'), 
                pd.read_excel('./scan_data/202110_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202111_all_scan_data.xlsx'),
                pd.read_excel('./scan_data/202112_all_scan_data.xlsx')])

preprocess_import_data(df)

# ラベルのエンコーディング Ordinal_encoder
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
# カテゴリカラムのみ抽出して、ordinal_encoder
cat_cols = df.select_dtypes(exclude=np.number).columns.to_list()
df[cat_cols] = oe.fit_transform(df[cat_cols])

# 今回はとりあえず、kVなどの線量情報が含まれてないものは単純にdropnaしてしまう
df.dropna(inplace=True)

# データをtargetとそれ以外に分割
target = 'mA'
X = df.drop(target, axis=1)
X = df.drop('CTDI', axis=1)
y = df[target]


# train testに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 本来はここでhyper-parameterチューニングをする
lgb_reg = lgb.LGBMRegressor(learning_rate=0.01, random_state=0)
lgb_reg.fit(X_train, y_train)

y_pred = lgb_reg.predict(X_test)

print(f'mean_absolte_error: {mean_absolute_error(y_test, y_pred)}')
print(f'mean_absolute_percentage_error: {mean_absolute_percentage_error(y_test, y_pred)}')

df_predict = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
df_predict['diff'] = df_predict['y_test'] - df_predict['y_pred']
df_predict

mean_absolte_error: 36.565732339369006
mean_absolute_percentage_error: 0.28919066402372906


Unnamed: 0,y_test,y_pred,diff
898,354.65,329.501973,25.148027
1645,298.31,294.857843,3.452157
1918,351.62,328.431736,23.188264
104,283.88,285.256213,-1.376213
2367,190.09,225.699293,-35.609293
...,...,...,...
1841,224.22,247.459184,-23.239184
538,367.28,337.072768,30.207232
2239,365.60,337.072768,28.527232
1888,394.55,355.377193,39.172807


In [20]:
df_predict.describe()

Unnamed: 0,y_test,y_pred,diff
count,854.0,854.0,854.0
mean,287.619895,287.595384,0.02451
std,124.060919,78.689304,45.401672
min,15.0,117.932081,-102.932081
25%,198.9225,231.20428,-32.282965
50%,263.84,272.517881,-8.924896
75%,365.3375,337.072768,28.756284
max,661.93,516.553456,145.376544
