### ライブラリのimport

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 解析用の元データを読み込み

In [26]:
# Load the preprocessed training table from the Excel file next to the notebook.
# NOTE(review): the saved output shows an 'Unnamed: 0' column, presumably an index
# that was written out when the file was saved -- confirm whether it is needed.
train_path = 'preprocessed_train_data.xlsx'
df = pd.read_excel(train_path)
df.head()

Unnamed: 0,accession,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,age,gender,height_cm,weight_kg,scan_area,scan_method,bmi,bmr,category_bmi,age_era
0,1103161420210101,5.1 QQ Chest Routine,,Chest,382.14,100,350.24,640,1.38,80,...,86,F,150.0,49.1,胸部CT,単純,21.82,1000.01,normal weight,80
1,1103161620210101,5.1 QQ Chest Routine,,Chest,442.2,100,442.93,700,1.38,80,...,61,M,163.0,74.0,胸部CT,単純,27.85,1487.4,obesity class1,60
2,1103339220210102,5.1 QQ Chest Routine,,Chest,467.06,100,306.59,700,1.38,80,...,46,M,178.0,68.0,胸部CT,単純,21.46,1581.27,normal weight,40
3,1103327120210102,5.1 QQ Chest Routine,,Chest,472.12,100,433.42,700,1.38,80,...,52,M,182.0,70.0,胸部CT,単純,21.13,1588.26,normal weight,50
4,1103481920210103,5.1 QQ Chest Routine,,Chest,462.22,100,691.26,700,1.38,80,...,52,M,175.0,112.0,胸部CT,単純,36.57,2130.73,obesity class3,50


## 新たな特徴量を作成

#### 体表面積を作成

体表面積（Du Bois式）:

$$\text{体表面積}\,(\mathrm{m}^2) = 0.007184 \times \text{身長(cm)}^{0.725} \times \text{体重(kg)}^{0.425}$$

In [27]:
# Body surface area helper
def calculate_bsa(height_cm, weight_kg):
    """Return body surface area as 0.007184 * height_cm**0.725 * weight_kg**0.425.

    Parameters
    ----------
    height_cm : scalar or array-like supporting ``**``
        Height; the name indicates centimetres.
    weight_kg : scalar or array-like supporting ``**``
        Weight; the name indicates kilograms.

    Returns
    -------
    Same kind as the inputs (a float for scalars, element-wise result for
    pandas/NumPy inputs).
    """
    height_term = height_cm ** 0.725
    weight_term = weight_kg ** 0.425
    return 0.007184 * height_term * weight_term

# Derive the body-surface-area feature for every row from the height and
# weight columns (element-wise on the two Series).
df['body_surface_area'] = calculate_bsa(
    height_cm=df['height_cm'],
    weight_kg=df['weight_kg'],
)

# Preview the frame with the new column appended
df.head()


Unnamed: 0,accession,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,gender,height_cm,weight_kg,scan_area,scan_method,bmi,bmr,category_bmi,age_era,body_surface_area
0,1103161420210101,5.1 QQ Chest Routine,,Chest,382.14,100,350.24,640,1.38,80,...,F,150.0,49.1,胸部CT,単純,21.82,1000.01,normal weight,80,1.421484
1,1103161620210101,5.1 QQ Chest Routine,,Chest,442.2,100,442.93,700,1.38,80,...,M,163.0,74.0,胸部CT,単純,27.85,1487.4,obesity class1,60,1.797322
2,1103339220210102,5.1 QQ Chest Routine,,Chest,467.06,100,306.59,700,1.38,80,...,M,178.0,68.0,胸部CT,単純,21.46,1581.27,normal weight,40,1.84815
3,1103327120210102,5.1 QQ Chest Routine,,Chest,472.12,100,433.42,700,1.38,80,...,M,182.0,70.0,胸部CT,単純,21.13,1588.26,normal weight,50,1.901449
4,1103481920210103,5.1 QQ Chest Routine,,Chest,462.22,100,691.26,700,1.38,80,...,M,175.0,112.0,胸部CT,単純,36.57,2130.73,obesity class3,50,2.256766


### scan_areaとscan_method(例：胸部CTx単純)をクロス集計

In [28]:
# Share of each scan_method within each scan_area (row-normalised crosstab)
cat_1 = 'scan_area'
cat_2 = 'scan_method'
rate_col = f'rate_{cat_1} x {cat_2}'
X_cross = pd.crosstab(df[cat_1], df[cat_2], normalize='index')
X_cross = X_cross.reset_index()

# Reshape the crosstab to long format: one row per (cat_1, cat_2) pair
X_tbl = pd.melt(X_cross, id_vars=cat_1, var_name=cat_2, value_name=rate_col)

# Attach the rate to every row. A copy of the column left by an earlier run of
# this cell is dropped first; otherwise re-running makes merge emit
# '<rate_col>_x' / '<rate_col>_y' duplicates.
df = pd.merge(
    df.drop(columns=rate_col, errors='ignore'),
    X_tbl,
    on=[cat_1, cat_2],
    how='left',
    validate='many_to_one',  # each (cat_1, cat_2) pair appears once in X_tbl
)

### category_bmiとgenderのクロス集計

In [29]:
# Share of each gender within each category_bmi (row-normalised crosstab)
cat_1 = 'category_bmi'
cat_2 = 'gender'
rate_col = f'rate_{cat_1} x {cat_2}'
X_cross = pd.crosstab(df[cat_1], df[cat_2], normalize='index')
X_cross = X_cross.reset_index()

# Reshape the crosstab to long format: one row per (cat_1, cat_2) pair
X_tbl = pd.melt(X_cross, id_vars=cat_1, var_name=cat_2, value_name=rate_col)

# Attach the rate to every row. A copy of the column left by an earlier run of
# this cell is dropped first; otherwise re-running makes merge emit
# '<rate_col>_x' / '<rate_col>_y' duplicates.
df = pd.merge(
    df.drop(columns=rate_col, errors='ignore'),
    X_tbl,
    on=[cat_1, cat_2],
    how='left',
    validate='many_to_one',  # each (cat_1, cat_2) pair appears once in X_tbl
)

### age_eraとcategory_bmiのクロス集計（age_eraごとのcategory_bmiの割合）

In [35]:
# Share of each category_bmi within each age_era (row-normalised crosstab)
cat_1 = 'age_era'
cat_2 = 'category_bmi'
rate_col = f'rate_{cat_1} x {cat_2}'
X_cross = pd.crosstab(df[cat_1], df[cat_2], normalize='index')
X_cross = X_cross.reset_index()

# Reshape the crosstab to long format: one row per (cat_1, cat_2) pair
X_tbl = pd.melt(X_cross, id_vars=cat_1, var_name=cat_2, value_name=rate_col)

# Attach the rate to every row. A copy of the column left by an earlier run of
# this cell is dropped first; otherwise re-running makes merge emit
# '<rate_col>_x' / '<rate_col>_y' duplicates.
df = pd.merge(
    df.drop(columns=rate_col, errors='ignore'),
    X_tbl,
    on=[cat_1, cat_2],
    how='left',
    validate='many_to_one',  # each (cat_1, cat_2) pair appears once in X_tbl
)

In [36]:
# Preview the frame after the rate features have been merged in.
# NOTE(review): the saved output of this cell lists 'rate_category_bmi x age_era'
# and 'rate_scan_area x CTDIw phantom type', which none of the cells visible above
# create, and the execution counts jump from 29 to 35 -- presumably leftovers from
# edited or deleted cells. Confirm with Restart Kernel -> Run All.
df.head()

Unnamed: 0,accession,scan protocol,scan series,target region,scanning length,kV,mean mA,max mA,pitch factor,nomial total collimation width,...,bmi,bmr,category_bmi,age_era,body_surface_area,rate_scan_area x scan_method,rate_category_bmi x gender,rate_category_bmi x age_era,rate_scan_area x CTDIw phantom type,rate_age_era x category_bmi
0,1103161420210101,5.1 QQ Chest Routine,,Chest,382.14,100,350.24,640,1.38,80,...,21.82,1000.01,normal weight,80,1.421484,0.965587,0.285877,0.135156,1.0,0.643761
1,1103161620210101,5.1 QQ Chest Routine,,Chest,442.2,100,442.93,700,1.38,80,...,27.85,1487.4,obesity class1,60,1.797322,0.965587,0.845044,0.12657,1.0,0.259048
2,1103339220210102,5.1 QQ Chest Routine,,Chest,467.06,100,306.59,700,1.38,80,...,21.46,1581.27,normal weight,40,1.84815,0.965587,0.714123,0.194761,1.0,0.614739
3,1103327120210102,5.1 QQ Chest Routine,,Chest,472.12,100,433.42,700,1.38,80,...,21.13,1588.26,normal weight,50,1.901449,0.965587,0.714123,0.200076,1.0,0.500475
4,1103481920210103,5.1 QQ Chest Routine,,Chest,462.22,100,691.26,700,1.38,80,...,36.57,2130.73,obesity class3,50,2.256766,0.965587,0.685714,0.442857,1.0,0.01472


### 各年代の性別ごとのBMIの中央値を作成し、それを新たな特徴量とする

In [42]:
# Median BMI for every (age_era, gender) combination, stored as a dict keyed by
# (age_era, gender) tuples.
bmi_by_era_gender = df.groupby(['age_era', 'gender'])['bmi']
median_bmi = bmi_by_era_gender.median().to_dict()

In [47]:
# Broadcast each (age_era, gender) group's median BMI back onto its member rows.
# groupby().transform('median') is vectorised, replacing the per-row Python
# lambda with a dict lookup, which raises KeyError for any row whose age_era or
# gender is missing (recent pandas versions yield NaN for such rows here).
df['age_era_gender_bmi'] = df.groupby(['age_era', 'gender'])['bmi'].transform('median')

### 数値カラムに対する変換

* 身長は何かしら変換を加えて正規分布に近づけてみるのも良いかも(Box-Cox変換)
* age_era_gender_bmiとbody_surface_areaもBox-Cox変換等の非線形処理をしても良い