In [38]:
# In this notebook, we oversample the minority (low bone density) class with ADASYN,
# then fill the missing values left in the resampled data with MissForest.

In [60]:
import numpy as np
import pandas as pd

from collections import Counter
from sklearn import utils

from imblearn.over_sampling import ADASYN
from missingpy import MissForest

In [66]:
# Load the cleaned dataset that the rest of the notebook resamples.
data = pd.read_csv(filepath_or_buffer='/home/youho/cp/cleaned_data/cp_data_final.csv')

In [67]:
# Designate categorical variables.
# Every column listed here is cast to the pandas 'category' dtype in one
# astype call. 'smoke' appears once: the previous version of this cell cast
# it twice, which was redundant.
CATEGORICAL_COLUMNS = ['sex', 'dm', 'htn', 'smoke', 'gmfcs', 'cp_type', 'dysphagia']
data = data.astype({name: 'category' for name in CATEGORICAL_COLUMNS})

In [68]:
# Show the dtype of every column to confirm the category casts above took effect.
print(data.dtypes)

age                       int64
sex                    category
dm                     category
htn                    category
height                  float64
weight                  float64
bmi                     float64
waist_circumference     float64
smoke                  category
gmfcs                  category
cp_type                category
dysphagia              category
bun                       int64
cr                      float64
gfr                     float64
chol                      int64
alp                       int64
got                       int64
gpt                       int64
tg                      float64
hdl                     float64
ldl                     float64
wbc                       int64
hb                      float64
plt                       int64
max_grip                float64
low_bone_density          int64
dtype: object


In [69]:
# (rows, columns) of the loaded frame.
data.shape

(95, 27)

In [70]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,age,sex,dm,htn,height,weight,bmi,waist_circumference,smoke,gmfcs,...,got,gpt,tg,hdl,ldl,wbc,hb,plt,max_grip,low_bone_density
0,44,1,0,0,130.0,59.6,35.266272,91.0,1,4,...,64,46,250.0,31.0,52.0,5550,14.2,212,6.80388,1
1,38,1,0,0,172.0,47.6,16.089778,73.0,1,4,...,15,15,141.0,40.0,115.0,6770,14.3,230,18.370476,0
2,37,1,0,0,169.1,48.6,16.996091,63.5,0,2,...,28,16,92.0,56.0,87.0,5920,14.1,249,29.392762,0
3,33,1,0,0,173.0,55.0,18.376825,67.0,1,4,...,17,14,104.0,54.0,98.0,4700,16.1,273,20.41164,0
4,35,1,0,0,168.0,58.0,20.549887,74.5,1,4,...,26,23,446.0,35.0,61.0,6460,16.1,266,29.48348,0


In [71]:
# Separate the target from the predictors: the last column of `data` is the
# label, and every column before it is a feature.
y = data.iloc[:, -1]   # labels
X = data.iloc[:, :-1]  # features

In [74]:
# Seed passed to ADASYN so the synthetic samples are reproducible.
SEED = 42

# Class counts before oversampling.
print('Original dataset shape %s' % Counter(y))

# Oversample with ADASYN using a sampling_strategy of 0.5.
# NOTE(review): the resampled frame displayed in the next cell has empty cells
# in some categorical columns of the appended rows; the MissForest step later
# in the notebook fills them in.
ada = ADASYN(random_state=SEED, sampling_strategy=0.5)
X_res, y_res = ada.fit_resample(X, y)

# Class counts after oversampling.
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 81, 1: 14})
Resampled dataset shape Counter({0: 81, 1: 40})


In [75]:
# Display the resampled feature frame (original rows followed by the synthetic ones).
X_res

Unnamed: 0,age,sex,dm,htn,height,weight,bmi,waist_circumference,smoke,gmfcs,...,alp,got,gpt,tg,hdl,ldl,wbc,hb,plt,max_grip
0,44,1,0,0,130.000000,59.600000,35.266272,91.000000,1,4,...,83,64,46,250.000000,31.000000,52.000000,5550,14.200000,212,6.803880
1,38,1,0,0,172.000000,47.600000,16.089778,73.000000,1,4,...,72,15,15,141.000000,40.000000,115.000000,6770,14.300000,230,18.370476
2,37,1,0,0,169.100000,48.600000,16.996091,63.500000,0,2,...,95,28,16,92.000000,56.000000,87.000000,5920,14.100000,249,29.392762
3,33,1,0,0,173.000000,55.000000,18.376825,67.000000,1,4,...,67,17,14,104.000000,54.000000,98.000000,4700,16.100000,273,20.411640
4,35,1,0,0,168.000000,58.000000,20.549887,74.500000,1,4,...,90,26,23,446.000000,35.000000,61.000000,6460,16.100000,266,29.483480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,40,1,0,,164.567273,53.993026,19.946309,77.956592,1,4,...,76,18,17,64.321546,61.610931,91.932477,6846,14.747588,221,9.564811
117,52,1,0,0,159.845502,51.407424,20.120010,77.975586,0,,...,59,16,11,59.828057,67.209064,90.134821,5864,14.479367,180,9.223444
118,40,1,0,0,162.304258,53.868754,20.448011,82.012144,,,...,70,14,9,80.914369,51.632233,98.093204,5677,14.954660,194,5.771745
119,48,2,0,0,151.811460,46.293900,20.073700,69.234750,0,4,...,78,19,12,60.868759,43.587800,99.140480,4489,11.991682,195,7.476303


In [76]:
# Put the resampled features and labels back together, column-wise, in one frame.
data_res = pd.concat(objs=[X_res, y_res], axis='columns')

In [77]:
# Preview the first five rows of the recombined, resampled frame.
data_res.head()

Unnamed: 0,age,sex,dm,htn,height,weight,bmi,waist_circumference,smoke,gmfcs,...,got,gpt,tg,hdl,ldl,wbc,hb,plt,max_grip,low_bone_density
0,44,1,0,0,130.0,59.6,35.266272,91.0,1,4,...,64,46,250.0,31.0,52.0,5550,14.2,212,6.80388,1
1,38,1,0,0,172.0,47.6,16.089778,73.0,1,4,...,15,15,141.0,40.0,115.0,6770,14.3,230,18.370476,0
2,37,1,0,0,169.1,48.6,16.996091,63.5,0,2,...,28,16,92.0,56.0,87.0,5920,14.1,249,29.392762,0
3,33,1,0,0,173.0,55.0,18.376825,67.0,1,4,...,17,14,104.0,54.0,98.0,4700,16.1,273,20.41164,0
4,35,1,0,0,168.0,58.0,20.549887,74.5,1,4,...,26,23,446.0,35.0,61.0,6460,16.1,266,29.48348,0


In [78]:
# Seed for the MissForest imputer.
RS = 100

# get indices of categorical features:
# names come from the 'category' columns of `data`, positions are looked up in `data_res`.
categorical_names = data.select_dtypes(['category']).columns
cat_cols = [data_res.columns.get_loc(name) for name in categorical_names]

# missForest imputation; the categorical column positions are passed as cat_vars.
imputer = MissForest(random_state=RS)
data_res_imputed = imputer.fit_transform(data_res, cat_vars=cat_cols)

Iteration: 0
Iteration: 1
Iteration: 2


In [79]:
# Wrap the imputer's output in a DataFrame that carries the original column labels.
data_res_imputed = pd.DataFrame(data_res_imputed, columns=data.columns.tolist())

# Restore the 'category' dtype on the categorical columns.
# This is done with a name-keyed astype rather than `df.iloc[:, col] = ...`:
# pandas 1.5 deprecated the iloc form, and pandas 2.x sets the values in place
# and keeps the existing float dtype, so the cast would silently be lost.
cat_names = data_res_imputed.columns[cat_cols]
data_res_imputed = data_res_imputed.astype({name: 'category' for name in cat_names})

# Summary of every column (numeric statistics and categorical counts).
data_res_imputed.describe(include='all')

Unnamed: 0,age,sex,dm,htn,height,weight,bmi,waist_circumference,smoke,gmfcs,...,got,gpt,tg,hdl,ldl,wbc,hb,plt,max_grip,low_bone_density
count,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,...,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0
unique,,2.0,2.0,2.0,,,,,2.0,5.0,...,,,,,,,,,,
top,,1.0,0.0,0.0,,,,,0.0,4.0,...,,,,,,,,,,
freq,,77.0,117.0,114.0,,,,,92.0,75.0,...,,,,,,,,,,
mean,41.826446,,,,159.40779,56.665269,22.347031,80.169266,,,...,21.297521,21.429752,131.748632,48.92498,101.323373,6477.669421,13.97609,267.099174,17.289598,0.330579
std,8.289228,,,,7.593873,10.676771,3.976196,13.840845,,,...,6.904883,11.5577,80.424189,11.785948,27.531006,1589.486371,1.855983,74.748178,8.849323,0.472377
min,22.0,,,,130.0,35.0,14.381986,0.0,,,...,13.0,8.0,30.0,28.0,45.0,3570.0,5.6,147.0,1.814368,0.0
25%,36.0,,,,154.35144,50.4393,20.145035,73.713517,,,...,17.0,14.0,75.0,41.0,84.0,5260.0,12.8,223.0,10.432616,0.0
50%,41.0,,,,159.845502,55.0,21.590362,79.162135,,,...,19.0,17.0,110.0,46.0,99.0,6200.0,14.2,250.0,16.726508,0.0
75%,46.0,,,,165.0,60.251,23.923609,87.0,,,...,24.0,26.0,167.0,57.0,119.909731,7560.0,15.2,295.0,22.6796,1.0


In [83]:
# Display the imputed frame to check that no empty cells remain.
data_res_imputed

Unnamed: 0,age,sex,dm,htn,height,weight,bmi,waist_circumference,smoke,gmfcs,...,got,gpt,tg,hdl,ldl,wbc,hb,plt,max_grip,low_bone_density
0,44.0,1.0,0.0,0.0,130.000000,59.600000,35.266272,91.000000,1.0,4.0,...,64.0,46.0,250.000000,31.000000,52.000000,5550.0,14.200000,212.0,6.803880,1.0
1,38.0,1.0,0.0,0.0,172.000000,47.600000,16.089778,73.000000,1.0,4.0,...,15.0,15.0,141.000000,40.000000,115.000000,6770.0,14.300000,230.0,18.370476,0.0
2,37.0,1.0,0.0,0.0,169.100000,48.600000,16.996091,63.500000,0.0,2.0,...,28.0,16.0,92.000000,56.000000,87.000000,5920.0,14.100000,249.0,29.392762,0.0
3,33.0,1.0,0.0,0.0,173.000000,55.000000,18.376825,67.000000,1.0,4.0,...,17.0,14.0,104.000000,54.000000,98.000000,4700.0,16.100000,273.0,20.411640,0.0
4,35.0,1.0,0.0,0.0,168.000000,58.000000,20.549887,74.500000,1.0,4.0,...,26.0,23.0,446.000000,35.000000,61.000000,6460.0,16.100000,266.0,29.483480,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,40.0,1.0,0.0,0.0,164.567273,53.993026,19.946309,77.956592,1.0,4.0,...,18.0,17.0,64.321546,61.610931,91.932477,6846.0,14.747588,221.0,9.564811,1.0
117,52.0,1.0,0.0,0.0,159.845502,51.407424,20.120010,77.975586,0.0,5.0,...,16.0,11.0,59.828057,67.209064,90.134821,5864.0,14.479367,180.0,9.223444,1.0
118,40.0,1.0,0.0,0.0,162.304258,53.868754,20.448011,82.012144,1.0,4.0,...,14.0,9.0,80.914369,51.632233,98.093204,5677.0,14.954660,194.0,5.771745,1.0
119,48.0,2.0,0.0,0.0,151.811460,46.293900,20.073700,69.234750,0.0,4.0,...,19.0,12.0,60.868759,43.587800,99.140480,4489.0,11.991682,195.0,7.476303,1.0


In [84]:
# Save the oversampled and imputed frame as a new CSV.
# NOTE(review): to_csv writes the row index as an extra, unnamed first column
# by default. The input CSV read at the top of the notebook does not appear to
# have such a column, so confirm downstream readers expect it (or pass index=False).
data_res_imputed.to_csv(path_or_buf=r'/home/youho/cp/cleaned_data/cp_data_final_adasyn.csv')