In [2]:
# 他克莫司剂量模型集成

In [1]:
import pandas as pd
import numpy as np

import sys
import re
import os
# Project root, taken from the kernel's current working directory; every data/ path
# below is built from it, so the notebook must be started from the project folder.
project_path = os.getcwd()

## load data

In [2]:
# Read the modelling table and discard the stray 'Unnamed: 0' column when the sheet has one
# (errors='ignore' makes the drop a no-op otherwise).
df_model = pd.read_excel(
    project_path +'/data/df_修改剂量_model_data_forward_模型集成.xlsx'
).drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
# Quick size check of the loaded table: (rows, columns).
df_model.shape

(150, 13)

In [4]:
# List the available columns: the target_dosage label plus the candidate features.
df_model.columns

Index(['target_dosage', 'gender', 'age', 'weight', 'EH', 'DM', 'ccb',
       'glucocorticoid', 'last_dosage', 'last_tdm', 'direct_bilirubin', 'urea',
       'lymphocyte_ratio'],
      dtype='object')

In [5]:
# Turn the dose into a class label and keep only the six doses listed below;
# rows with any other (or missing) dose are removed.
# NOTE: this cell overwrites target_dosage, so re-running it without reloading the data
# would re-encode the labels themselves.
DOSAGE_TO_LABEL = {1.0: 0, 1.5: 1, 2.0: 2, 2.5: 3, 3.0: 4, 4.0: 5}

# Round to one decimal first so that float noise (e.g. 1.4999) still matches a table key.
rounded_dosage = df_model['target_dosage'].apply(lambda x: round(float(x), 1))
df_model = (
    df_model
    .assign(target_dosage=rounded_dosage.map(DOSAGE_TO_LABEL))
    # .map() yields NaN for doses that are not in the table; dropping those rows
    # filters on the label itself rather than on a substring match of its text.
    .dropna(subset=['target_dosage'])
)
# Labels are stored as floats (0.0 ... 5.0).
df_model['target_dosage'] = df_model['target_dosage'].astype('float')
print(df_model.target_dosage.value_counts())

2.0    32
4.0    21
0.0    21
1.0    19
5.0    17
3.0    11
Name: target_dosage, dtype: int64


In [6]:
# Columns listed as discrete are left out of the scaling step; every remaining column
# except the label is treated as continuous.
discrete_col=['ccb','DM','EH','glucocorticoid','gender']
non_continuous = set(discrete_col) | {'target_dosage'}
continuous_col=[col for col in df_model.columns if col not in non_continuous]

## 归一化处理

In [7]:
# Bring the continuous features onto a comparable scale so that large-valued columns
# do not dominate: divide each column by its own maximum and keep 3 decimals.
# max_list holds the divisors in continuous_col order.
max_list = [df_model[col].max() for col in continuous_col]
for col, col_max in zip(continuous_col, max_list):
    df_model[col] = df_model[col].apply(lambda value: round(value / col_max, 3))

## 随机森林插补

In [8]:
# 使用随机森林对缺失值进行插补
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
def missing_value_interpolation(df):
    """Fill missing values column by column with a random-forest regressor.

    Columns that contain at least one null are processed in column order. For
    each of them a ``RandomForestRegressor`` (300 trees, ``random_state=3``) is
    fitted on the rows where the column is present and then predicts the rows
    where it is null.

    Only the *other* columns-with-nulls are used as predictors; columns that
    were complete from the start are never used as features. Predictors that
    are still waiting for their own imputation have their nulls replaced by 0,
    while predictors imputed in an earlier pass contribute their filled values.

    Args:
        df: DataFrame to impute. The work is done on a re-indexed copy
            (``reset_index(drop=True)``), which is what gets returned.

    Returns:
        The re-indexed DataFrame with the incomplete columns filled in.

    Raises:
        ValueError: if exactly one column contains nulls, because no predictor
            column would be left to fit the regressor on.
    """
    df = df.reset_index(drop=True)
    # Columns holding at least one null, in column order.
    missing_cols = [col for col in df.columns if df[col].isnull().any()]
    if len(missing_cols) == 1:
        raise ValueError(
            "Cannot impute %r: it is the only column with missing values, "
            "so there are no predictor columns." % (missing_cols[0],)
        )

    for position, name in enumerate(missing_cols):
        # Columns after the current one have not been imputed yet.
        pending = missing_cols[position + 1:]
        # Re-slice from df on every pass so earlier imputations are visible;
        # copy so the zero-filling below never writes through to df.
        features = df[missing_cols].copy()
        for col in pending:
            # Numeric zero-fill keeps the predictors numeric instead of object dtype.
            features[col] = features[col].fillna(0)

        target_missing = features[name].isnull()
        known = features[~target_missing]
        unknown = features[target_missing]

        rfr = RandomForestRegressor(n_estimators=300,
                                    random_state=3)
        rfr.fit(known.drop(columns=[name]), known[name])
        # Write the predictions into the null positions of the target column.
        df.loc[df[name].isnull(), name] = rfr.predict(unknown.drop(columns=[name]))
    return df

In [9]:
# Impute the modelling data; the result is a new, re-indexed frame.
df_model_cb=missing_value_interpolation(df_model)

## SMOTE

In [10]:
from imblearn.over_sampling import SMOTE,ADASYN 
# Split the imputed frame into the feature matrix and the class label.
tran_x = df_model_cb.drop(['target_dosage'],axis=1)
tran_y = df_model_cb['target_dosage']
# Oversample with SMOTE (fixed seed) to counter the class imbalance shown by value_counts above.
# NOTE(review): SMOTE synthesises samples by interpolating between neighbours, so the columns in
# discrete_col may receive non-0/1 values — confirm this is acceptable (SMOTENC handles categoricals).
# NOTE(review): the whole dataset is resampled; no hold-out split is visible in this notebook.
sm = SMOTE(random_state=0)
tran_x_sm,tran_y_sm = sm.fit_resample(tran_x,tran_y)

## model

In [11]:
import xgboost
# XGBoost model: multi-class classifier fitted on the SMOTE-resampled data.
# NOTE(review): both learning_rate=0.001 and eta=0.1 are passed. XGBoost documents `eta` as an
# alias of `learning_rate`, so the two values conflict — confirm which one is intended and drop the other.
# NOTE(review): scale_pos_weight=1 is the default and is documented for binary imbalance — presumably
# it has no effect on this multi-class model; verify.
xgb_model=xgboost.XGBClassifier(max_depth=5,
                        learning_rate=0.001,
                        n_estimators=500,
                        min_child_weight=0.5,
                        eta=0.1,
                        gamma=0.5,
                        reg_lambda=5,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        nthread=4,
                        scale_pos_weight=1,
                        random_state=3)
xgb_model.fit(tran_x_sm,tran_y_sm)

XGBClassifier(colsample_bytree=0.8, eta=0.1, gamma=0.5, learning_rate=0.001,
              max_depth=5, min_child_weight=0.5, n_estimators=500, nthread=4,
              objective='multi:softprob', random_state=3, reg_lambda=5,
              subsample=0.8)

## save model

In [14]:
import pickle
# Persist the fitted classifier. The context manager flushes and closes the file
# handle deterministically, even if pickling raises.
with open(project_path+'/data/result/xgb_model.pkl','wb') as model_file:
    pickle.dump(xgb_model, model_file)

## predict dosage

In [15]:
# Example patient used to demonstrate a single prediction.
# Continuous values are entered on their original scale; the next cell rescales them.
# NOTE(review): units and the 0/1 coding of the binary flags are not documented in this
# notebook — confirm against the data dictionary.
gender=1
age=32
weight=65
EH=0
DM=0
ccb=0
glucocorticoid=1
last_dosage=2
last_tdm=12.5
direct_bilirubin=1.8
urea=14.4
lymphocyte_ratio=29.7

In [16]:
# Scale the continuous inputs like the training data: divide by a per-feature maximum
# and keep 3 decimals.
# NOTE(review): the divisors are hard-coded; presumably they mirror max_list from the
# normalization cell — confirm they still match the training data.
# NOTE: the cell rescales the variables in place, so run it once per set of raw inputs.
age=round(age/73,3)
weight=round(weight/117.6,3)
last_dosage=round(last_dosage/6,3)
last_tdm=round(last_tdm/30,3)
direct_bilirubin=round(direct_bilirubin/8.8,3)
urea=round(urea/36.1,3)
# ndigits=3 is required here too: round() without it returns an integer, which would
# collapse this ratio to 0 or 1.
lymphocyte_ratio=round(lymphocyte_ratio/71.8,3)

In [17]:
# Single-row frame for prediction. The columns are listed in the same order as the
# feature columns of the training table.
patient_features = [
    ('gender', gender),
    ('age', age),
    ('weight', weight),
    ('EH', EH),
    ('DM', DM),
    ('ccb', ccb),
    ('glucocorticoid', glucocorticoid),
    ('last_dosage', last_dosage),
    ('last_tdm', last_tdm),
    ('direct_bilirubin', direct_bilirubin),
    ('urea', urea),
    ('lymphocyte_ratio', lymphocyte_ratio),
]
df_test=pd.DataFrame(data={column: [value] for column, value in patient_features})

In [18]:
# Reload the persisted prediction model.
# NOTE: pickle.load can execute arbitrary code from the file — only load model files
# that this project wrote itself.
import xgboost
with open(project_path+'/data/result/xgb_model.pkl','rb') as model_file:
    pre_model=pickle.load(model_file)

In [19]:
# Show the reloaded estimator's parameters to confirm they match the trained model.
pre_model

XGBClassifier(colsample_bytree=0.8, eta=0.1, gamma=0.5, learning_rate=0.001,
              max_depth=5, min_child_weight=0.5, missing=nan, n_estimators=500,
              nthread=4, objective='multi:softprob', random_state=3,
              reg_lambda=5, subsample=0.8)

In [20]:
# Predict the dosage class for the single-row patient frame.
pre=pre_model.predict(df_test)

In [21]:
# Inspect the raw prediction: a one-element array holding the class label, not yet a dose in mg.
pre

array([2.])

In [22]:
# Convert the predicted class label back into a daily dose.
# The table is the inverse of the label encoding applied to target_dosage before training.
LABEL_TO_DOSAGE = {
    0: '1mg',
    1: '1.5mg',
    2: '2mg',
    3: '2.5mg',
    4: '3mg',
    5: '4mg',
}
# A numeric label such as 2.0 hashes and compares equal to the int key 2, so the
# prediction can be looked up directly without truncating it.
dosage = LABEL_TO_DOSAGE.get(pre[0])
if dosage is None:
    # Fail loudly instead of leaving `dosage` undefined or stale from an earlier run.
    raise ValueError('Unexpected predicted label: {!r}'.format(pre[0]))

In [23]:
# Show the recommended daily dose string.
dosage

'2mg'