In [6]:
# _*_ coding: utf-8 _*_
# @Time: 2021/10/27 17:51
# @Author: yuyongsheng
# @Software: PyCharm
# @Description: 南方医院利伐沙班(1)：数据处理。以case_no为基础，多次入院数据；其他联合用药

In [7]:
# 导入程序包
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np
import os
project_path=os.getcwd()

In [8]:
# 导入预定义函数
# 字符串转换为时间格式
import datetime
def str_to_datetime(x):
    try:
        a = datetime.datetime.strptime(x, "%d/%m/%Y %H:%M:%S")
        return a
    except:
        return np.NaN

# 原始数据集预处理：调整时间格式，删除异常值（过大100倍）、文字

## 用药原始数据doctor_order处理

In [9]:
df_doctor_order=pd.read_csv(project_path+'/data/raw_data/2-doctor_order.csv')
print(df_doctor_order.shape)
print(df_doctor_order['patient_id'].nunique())
print(df_doctor_order['case_no'].nunique())
# 提取用药状态为停止的用药
df_doctor_order=df_doctor_order[df_doctor_order['statusdesc']=='停止']
print(df_doctor_order.shape)
print(df_doctor_order['patient_id'].nunique())
print(df_doctor_order['case_no'].nunique())
# 并删除服药方式为“取药用”的样本
df_doctor_order=df_doctor_order[df_doctor_order['medication_way']!='取药用']
print(df_doctor_order.shape)
print(df_doctor_order['patient_id'].nunique())
print(df_doctor_order['case_no'].nunique())
# 删除用药剂量为空的数据
df_doctor_order=df_doctor_order[(df_doctor_order['dosage'].astype('str').notnull()) & (df_doctor_order['dosage'].astype('str')!='nan')]
df_doctor_order=df_doctor_order.reset_index(drop=True)
print(df_doctor_order.shape)
print(df_doctor_order['patient_id'].nunique())
print(df_doctor_order['case_no'].nunique())

(3806167, 30)
7544
9350
(629287, 30)
7145
8732
(629198, 30)
7145
8732
(269955, 30)
6891
8358


In [10]:
# 提取doctor_order里面的有效字段
df_doctor_order=df_doctor_order[['patient_id','case_no','long_d_order','drug_name','amount','drug_spec','dosage','frequency','medication_way','start_datetime','end_datetime']]
# 调整doctor_order开始服药时间和结束服药时间格式
df_doctor_order['start_datetime']=df_doctor_order['start_datetime'].apply(str_to_datetime)
df_doctor_order['end_datetime']=df_doctor_order['end_datetime'].apply(str_to_datetime)
print(df_doctor_order)

        patient_id   case_no  long_d_order                 drug_name  amount  \
0          8403580  10000919             1        辅酶Q10软胶囊[10mg*24粒]     1.0   
1          8403580  10000919             1      曲美他嗪片(万爽力)[20mg*30片]     1.0   
2          8403580  10000919             1      0.9%氯化钠注射液[100ml*1袋]     1.0   
3          8403580  10000919             1        磷酸肌酸钠粉针(天成)[1g*1瓶]     1.0   
4          8403580  10000919             1            静脉输液(其它药物集中配置)     1.0   
...            ...       ...           ...                       ...     ...   
269950     3111059   9998450             1             叶酸片[5mg*100片]     1.0   
269951     3111059   9998450             1       甲泼尼龙片(美卓乐)[4mg*30片]     1.0   
269952     3111059   9998450             1  美托洛尔缓释片(倍他乐克)[47.5mg*7片]     1.0   
269953     3111059   9998450             1       唑吡坦片(思诺思)[10mg*20片]     1.0   
269954     3111059   9998450             1              患者自备药(胶囊、片剂)     1.0   

        drug_spec  dosage frequency med

In [11]:
# 保存预处理后的原始用药数据doctor_order
writer=pd.ExcelWriter(project_path+'/data/pre_processed_raw_data/df_doctor_order.xlsx')
df_doctor_order.to_excel(writer)
writer.save()

## 诊断原始数据diagnostic处理

In [12]:
df_diagnostic=pd.read_csv(project_path+'/data/raw_data/3-diagnostic_record.csv',dtype={'case_no':str})  # dtype可以防止某一列因为pandas读取导致数据类型改变
print(df_diagnostic.shape)
print(df_diagnostic['patient_id'].nunique())
print(df_diagnostic['case_no'].nunique())
print(df_diagnostic)

(162149, 17)
10199
9346
       diagnostic_record_id  patient_id  inp_record_id  outp_record_id  \
0               10000247||2     8320687            NaN             NaN   
1               10000918||2     8403580            NaN             NaN   
2               10000918||3     8403580            NaN             NaN   
3               10000918||4     8403580            NaN             NaN   
4              10001365||10     4740788            NaN             NaN   
...                     ...         ...            ...             ...   
162144           9998849||2     6065947            NaN             NaN   
162145           9999973||1     4551879            NaN             NaN   
162146           9999973||2     4551879            NaN             NaN   
162147           9999973||3     4551879            NaN             NaN   
162148           9999973||4     4551879            NaN             NaN   

               record_date diagnostic_type diagnostic_content  treatment_days  \
0     

In [13]:
# 删除诊断为空的数据
df_diagnostic=df_diagnostic[(df_diagnostic['diagnostic_content'].notnull())& (df_diagnostic['diagnostic_content'].astype('str')!='nan')]
print(df_diagnostic.shape)
print(df_diagnostic['patient_id'].nunique())
print(df_diagnostic['case_no'].nunique())
# 删除住院记录case_no为空的记录
df_diagnostic=df_diagnostic[(df_diagnostic['case_no'].notnull()) & (df_diagnostic['case_no'].astype('str')!='nan')]
df_diagnostic=df_diagnostic.reset_index(drop=True)
print(df_diagnostic.shape)
print(df_diagnostic['patient_id'].nunique())
print(df_diagnostic['case_no'].nunique())
print(df_diagnostic)

(162145, 17)
10199
9346
(126565, 17)
7542
9346
       diagnostic_record_id  patient_id  inp_record_id  outp_record_id  \
0               10000918||2     8403580            NaN             NaN   
1               10000918||3     8403580            NaN             NaN   
2               10000918||4     8403580            NaN             NaN   
3              10001365||10     4740788            NaN             NaN   
4               10001365||2     4740788            NaN             NaN   
...                     ...         ...            ...             ...   
126560          9998449||37     3111059            NaN             NaN   
126561          9998449||38     3111059            NaN             NaN   
126562          9998449||39     3111059            NaN             NaN   
126563           9998449||5     3111059            NaN             NaN   
126564           9998449||6     3111059            NaN             NaN   

               record_date diagnostic_type diagnostic_content  t

In [14]:
# 调整diagnostic里面的时间格式
df_diagnostic['record_date']=df_diagnostic['record_date'].astype('str').apply(str_to_datetime)
# 提取diagnostic里面的有效字段
df_diagnostic=df_diagnostic[['patient_id','case_no','record_date','diagnostic_type','diagnostic_content']]
print(df_diagnostic)

        patient_id   case_no         record_date diagnostic_type  \
0          8403580  10000919 2020-08-16 11:45:25            初步诊断   
1          8403580  10000919 2020-08-18 18:00:11            最后诊断   
2          8403580  10000919 2020-08-18 18:00:25            出院诊断   
3          4740788  10001366 2020-08-20 11:06:13            出院诊断   
4          4740788  10001366 2020-08-16 10:10:42            初步诊断   
...            ...       ...                 ...             ...   
126560     3111059   9998450 2020-08-24 11:48:56            初步诊断   
126561     3111059   9998450 2020-08-24 11:49:34            初步诊断   
126562     3111059   9998450 2020-08-24 11:49:34            初步诊断   
126563     3111059   9998450 2020-08-15 13:34:35            初步诊断   
126564     3111059   9998450 2020-08-15 13:34:35            初步诊断   

       diagnostic_content  
0                  频发室性早搏  
1                  频发室性早搏  
2                  频发室性早搏  
3                  高胆固醇血症  
4                   股骨颈骨折  
...            

In [15]:
# 保存预处理后的原始诊断数据diagnostic
writer=pd.ExcelWriter(project_path+'/data/pre_processed_raw_data/df_diagnostic.xlsx')
df_diagnostic.to_excel(writer)
writer.save()

## 检验原始数据test_record+test_result处理

In [16]:
# 提取df_test，它是由rest_record和test_result合并而成，十分重要！！包含：tdm和安全性指标。
# 检测记录test_record
df_test_record=pd.read_csv(project_path+'/data/raw_data/4-test_record.csv',dtype={'case_no':str})
df_test_record=df_test_record[['test_record_id','patient_id','case_no','test_date','clinical_diagnosis']]
print(df_test_record.shape)
print(df_test_record['patient_id'].nunique())
print(df_test_record['case_no'].nunique())
# 删除test_date为空的记录
df_test_record=df_test_record[df_test_record['test_date'].notnull()]
print(df_test_record.shape)
print(df_test_record['patient_id'].nunique())
print(df_test_record['case_no'].nunique())
# 删除住院号case_no为空的记录
df_test_record=df_test_record[df_test_record['case_no'].notnull()]
df_test_record=df_test_record.reset_index(drop=True)
print(df_test_record.shape)
print(df_test_record['patient_id'].nunique())
print(df_test_record['case_no'].nunique())
# 调整检测时间格式
df_test_record['test_date']=df_test_record['test_date'].astype('str').apply(str_to_datetime)
print(df_test_record)

  interactivity=interactivity, compiler=compiler, result=result)


(431568, 5)
8015
9275
(431543, 5)
8015
9275
(418672, 5)
7501
9275
               test_record_id  patient_id   case_no           test_date  \
0       800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
1       800001705269||B006||1     7156648   3698767 2018-06-05 11:04:00   
2       800001705269||B022||1     7156648   3698767 2018-06-05 11:04:00   
3       800001705269||B034||1     7156648   3698767 2018-06-05 11:04:00   
4       800001705270||B027||1     7156648   3698767 2018-06-05 09:25:00   
...                       ...         ...       ...                 ...   
418667  800010184743||A006||1     8450924  11424232 2021-01-07 20:37:00   
418668  800010184745||A083||1     8450924  11424232 2021-01-07 19:57:00   
418669  800010185440||B022||1     7870115  11221630 2021-01-07 18:57:00   
418670  800010185441||C222||1     7870115  11221630 2021-01-07 19:08:00   
418671  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   

          clinical_diagnosis  
0 

In [17]:
# 保存预处理后的test_record
writer=pd.ExcelWriter(project_path+'/data/pre_processed_raw_data/df_test_record.xlsx')
df_test_record.to_excel(writer)
writer.save()

In [18]:
# 检测结果test_result
df_test_result=pd.read_csv(project_path+'/data/raw_data/4-test_result.csv')
df_test_result=df_test_result[['test_record_id','project_name','test_result','refer_scope','synonym']]
print(df_test_result.shape)
# 删除检测项目project_name为空的数据
df_test_result=df_test_result[df_test_result['project_name'].notnull()]
print(df_test_result.shape)
# 删除test_result为空的数据
df_test_result=df_test_result[df_test_result['test_result'].notnull()]
df_test_result=df_test_result.reset_index(drop=True)
print(df_test_result.shape)
# 删除<>号
df_test_result['test_result']=df_test_result['test_result'].astype('str').apply(lambda x:x.replace('<',''))
df_test_result['test_result']=df_test_result['test_result'].astype('str').apply(lambda x:x.replace('>',''))
print(df_test_result)

(2778381, 5)
(2778381, 5)
(2778297, 5)
                test_record_id project_name test_result refer_scope synonym
0        800001186909||A083||1        白细胞计数        6.61   3.50-9.50     WBC
1        800001186909||A083||1       淋巴细胞总数        2.99   1.10-3.20     LYM
2        800001186909||A083||1      中性粒细胞总数        3.27   1.80-6.30     NEU
3        800001186909||A083||1       单核细胞总数        0.28   0.10-0.60    MONO
4        800001186909||A083||1     嗜酸性粒细胞总数        0.07   0.02-0.52     EOS
...                        ...          ...         ...         ...     ...
2778292  800010185442||A083||1         -IG%        0.30         NaN    -IG%
2778293  800010185442||A083||1       -HFLC%        0.10         NaN  -HFLC%
2778294  800010185442||A083||1        -NEWX      315.00         NaN   -NEWX
2778295  800010185442||A083||1       -NESFL       50.70         NaN  -NESFL
2778296  800010185442||A083||1         -LYZ       54.60         NaN    -LYZ

[2778297 rows x 5 columns]


In [19]:
# 保存预处理后的test_result，数据太大无法保存
# writer=pd.ExcelWriter(project_path+'/data/pre_processed_raw_data/df_test_result.xlsx')
# df_test_result.to_excel(writer)
# writer.save()

In [20]:
# 合并test_record和test_result
df_test=pd.merge(df_test_record,df_test_result,on=['test_record_id'],how='inner')
print(df_test)

                test_record_id  patient_id   case_no           test_date  \
0        800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
1        800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
2        800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
3        800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
4        800001705269||B005||1     7156648   3698767 2018-06-05 11:04:00   
...                        ...         ...       ...                 ...   
2677957  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   
2677958  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   
2677959  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   
2677960  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   
2677961  800010185442||A083||1     7870115  11221630 2021-01-07 18:26:00   

           clinical_diagnosis project_name test_result refer_scope  synonym  
0        

# 纳排：提取服用利伐沙班的非瓣膜房颤患者

## 纳入：提取服用利伐沙班的患者

In [21]:
# 1. 提取服用利伐沙班非瓣膜房颤患者
print('-------------------------1.提取提取服用利伐沙班的非瓣膜房颤患者------------------------------')
# 1.1 服用利伐沙班且出院记录中有房颤的患者
print('-------------------------提取服用利伐沙班的患者------------------------------')
# 提取服药利伐沙班的患者id
df_lfsb=df_doctor_order[df_doctor_order['drug_name'].str.contains('利伐沙班')]
df_lfsb=df_lfsb.reset_index(drop=True)
# 排序
df_lfsb=df_lfsb.sort_values(['patient_id','case_no','start_datetime'],ascending=[True,True,True])
df_lfsb=df_lfsb.reset_index(drop=True)
print(df_lfsb.shape)
print(df_lfsb['patient_id'].nunique())
print(df_lfsb['case_no'].nunique())
# print(df_lfsb)

-------------------------1.提取提取服用利伐沙班的非瓣膜房颤患者------------------------------
-------------------------提取服用利伐沙班的患者------------------------------
(8700, 11)
5458
6523


In [22]:
# 保存利伐沙班用药记录
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.1_利伐沙班用药记录.xlsx')
df_lfsb.to_excel(writer)
writer.save()

In [23]:
df_lfsb

Unnamed: 0,patient_id,case_no,long_d_order,drug_name,amount,drug_spec,dosage,frequency,medication_way,start_datetime,end_datetime
0,7777,5241451,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15mg,1/日,口服,2018-12-28 09:09:11,2018-12-29 08:31:00
1,7777,5700529,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15mg,1/日,口服,2019-02-24 16:59:12,2019-02-26 16:49:10
2,10014,8424199,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],2.0,10mg*5片,20mg,1/日,口服(餐时),2020-01-04 09:43:10,2020-01-10 08:05:00
3,18362,5000492,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],2.0,10mg*5片,20mg,1/日,口服,2018-11-20 18:06:21,2018-11-24 10:42:19
4,19290,4705828,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],1.0,10mg*5片,10mg,1/日,口服,2018-10-18 18:11:51,2018-10-24 10:36:00
...,...,...,...,...,...,...,...,...,...,...,...
8695,8668947,11407718,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],1.0,10mg*5片,20mg,1/日,口服(餐时),2021-01-07 09:45:10,2021-01-07 18:00:03
8696,8672712,11300809,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],1.0,10mg*5片,20mg,1/日,口服(餐时),2020-12-26 11:28:12,2020-12-26 12:59:47
8697,8681163,11350821,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15mg,1/日,口服(餐时),2020-12-31 11:18:02,2020-12-31 16:49:06
8698,8684878,11343138,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],1.0,10mg*5片,20mg,1/日,口服(餐时),2020-12-30 19:44:46,2020-12-30 22:05:15


## 纳入: 提取出院诊断房颤患者

In [24]:
# 1.2 根据郑-诊断.xlsx，提取出院诊断房颤患者case_no，已进行合并纳入
print('-------------------------提取出院诊断房颤患者------------------------------')
df_oup_fib=df_diagnostic[(df_diagnostic['diagnostic_type']=='出院诊断') & (df_diagnostic['diagnostic_content'].str.contains(
'房颤射消融术后|心房扑动射频消融术后|心房颤动|阵发性心房颤动|持续性心房颤动|阵发性房颤|频发房性早搏|阵发性心房扑动|心房扑动|持续性房颤|房颤伴快速心室率\
|房颤射频消融术后|射频消融术后|快慢综合征|左心耳封堵术后|阵发性心房纤颤|心房颤动伴快速心室率|房颤|心房颤动射频消融术后|射频消融+左心耳封堵术后|左心耳封闭术后\
|心房颤动射频消融术后+左心耳封堵术|动态心电图异常：阵发性房颤、偶发房性早搏、偶发室性早搏、T波间歇性异常改变|左心房房颤射频消融+左心耳切除术后|永久性房颤\
|阵发性房颤射频消融术后|冷冻射频消融术后|心房颤动药物复律后'))]
df_oup_fib=df_oup_fib.sort_values(by=['patient_id','case_no','record_date'],ascending=[True,True,True])
df_oup_fib=df_oup_fib.reset_index(drop=True)
print(df_oup_fib.shape)
print(df_oup_fib['patient_id'].nunique())
print(df_oup_fib['case_no'].nunique())
print(df_oup_fib)

-------------------------提取出院诊断房颤患者------------------------------
(1726, 5)
1313
1533
      patient_id   case_no         record_date diagnostic_type  \
0          18362   5000492 2018-11-26 08:25:19            出院诊断   
1          19987   4198297 2018-08-16 09:27:49            出院诊断   
2          88144  10995704 2020-12-01 17:51:26            出院诊断   
3          99735  10757478 2020-11-23 09:22:36            出院诊断   
4          99735   9787952 2020-08-06 08:44:12            出院诊断   
...          ...       ...                 ...             ...   
1721     8677315  11304005 2020-12-31 09:21:54            出院诊断   
1722     8681163  11350821 2021-01-07 17:37:56            出院诊断   
1723     8682774  11331263 2020-12-31 11:38:04            出院诊断   
1724     8683799  11336735 2021-01-01 11:27:29            出院诊断   
1725     8684878  11343138 2020-12-31 15:08:59            出院诊断   

     diagnostic_content  
0                  心房颤动  
1               阵发性心房纤颤  
2                 阵发性房颤  
3                

In [25]:
# 保存出院诊断房颤患者
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.2_出院诊断房颤患者记录.xlsx')
df_oup_fib.to_excel(writer)
writer.save()

In [26]:
print(type(df_lfsb.loc[0,'case_no']))
print(type(df_oup_fib.loc[0,'case_no']))

<class 'numpy.int64'>
<class 'str'>


## 合并利伐沙班用药和出院房颤诊断

In [27]:
#  调整利伐沙班用药的case_no格式
df_lfsb['case_no']=df_lfsb['case_no'].astype('str')
# 出院诊断
df_oup_fib=df_oup_fib.drop(['patient_id'],axis=1)

In [28]:
oup_fib_list=list(df_oup_fib['case_no'])
temp_list=[]
for i in np.unique(df_lfsb['case_no']):
    temp=df_lfsb[df_lfsb['case_no']==i]
    temp=temp.reset_index(drop=True)
    if i in oup_fib_list:
        temp_list.append(temp)
df_lfsb_oup=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_oup=pd.concat([df_lfsb_oup,temp_list[j]],axis=0)
df_lfsb_oup=df_lfsb_oup.reset_index(drop=True)
del temp_list

In [29]:
print(df_lfsb_oup.shape)
print(df_lfsb_oup['patient_id'].nunique())
print(df_lfsb_oup['case_no'].nunique())

(1906, 11)
1101
1271


In [30]:
print(df_lfsb_oup)

      patient_id   case_no  long_d_order                   drug_name  amount  \
0         743864  10024736             1      利伐沙班薄膜衣片(拜瑞妥)[10mg*5片]     3.0   
1        8184438  10029876             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
2        8407634  10030043             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
3        7488110  10034182             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
4        8412117  10048915             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
...          ...       ...           ...                         ...     ...   
1901     8390516   9960363             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
1902     8388370   9961564             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
1903     1416730   9979708             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
1904     1416730   9979708             1  利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片]     1.0   
1905     6668949   9989476             1      利伐沙班薄膜衣片(拜瑞妥)[10mg*5片]     2.0   

     drug_spec dosage frequency medicat

## 排除：膜瓣置换手术和瓣膜性房颤

In [31]:
# 1.3 提取瓣膜性房颤患者：手术中有膜瓣置换、诊断中为瓣膜性房颤。
print('-------------------------排除房颤相关的手术-----------------------------')
# 根据郑-手术.xlsx，排除膜瓣置换手术
df_surgical_record=pd.read_csv(project_path+'/data/raw_data/1-surgical_record.csv')
# df_surgical_valve=df_surgical_record[df_surgical_record['surgery_name'].str.contains('心脏病损腔内消融术|心脏病损腔内冷冻消融术|心电生理测定(EPS)|左心耳堵闭术|左心耳切除术|左心封堵术')]
df_surgical_valve=df_surgical_record[df_surgical_record['surgery_name'].str.contains('瓣膜置换')]
print(df_surgical_valve.shape)
print(df_surgical_valve['patient_id'].nunique())
print(df_surgical_valve['case_no'].nunique())
print(df_surgical_valve)

-------------------------排除房颤相关的手术-----------------------------
(29, 15)
20
20
     surgical_record_id  patient_id   case_no         surgery_date  \
286    10314882||4||3|3     8468331  10314882   24/9/2020 14:52:00   
426    10519687||2||1|1     8518643  10519687  20/10/2020 09:12:00   
427    10519687||2||2|2     8518643  10519687  20/10/2020 09:12:00   
456    10531917||3||1|1     4550400  10531917  22/10/2020 09:09:00   
1176    4227449||1||1|1     7481031   4227449                  NaN   
1178  4227449||1||21|21     7481031   4227449                  NaN   
1438    4697093||3||1|1     7543074   4697093                  NaN   
1439    4697093||3||2|2     7543074   4697093                  NaN   
1456    4726410||2||1|1     7554616   4726410                  NaN   
1457    4726410||2||2|2     7554616   4726410                  NaN   
1489    4753678||3||1|1     7559579   4753678                  NaN   
1490    4753678||3||5|5     7559579   4753678                  NaN   
1503    475

In [32]:
# 排除瓣膜置换手术的case_no
surgical_valve_list=list(df_surgical_record['case_no'])
temp_list=[]
for i in np.unique(df_lfsb_oup['case_no']):
    temp=df_lfsb_oup[df_lfsb_oup['case_no']==i]
    temp=temp.reset_index(drop=True)
    if i in surgical_valve_list:
        continue
    else:
        temp_list.append(temp)
df_lfsb_not_surgery=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_not_surgery=pd.concat([df_lfsb_not_surgery,temp_list[j]],axis=0)
df_lfsb_not_surgery=df_lfsb_not_surgery.reset_index(drop=True)
del temp_list

In [33]:
print(df_lfsb_not_surgery.shape)
print(df_lfsb_not_surgery['patient_id'].nunique())
print(df_lfsb_not_surgery['case_no'].nunique())

(1906, 11)
1101
1271


## 排除：诊断中瓣膜性房颤

In [34]:
# 排除临床诊断中瓣膜性房颤，包含：心脏瓣膜病和风湿性瓣膜病；不包括下肢静脉瓣膜病
print('-------------------------排除瓣膜性房颤患者-----------------------------')
# 删除临床诊断中的空值
df_clinical_diagnosis=df_test_record[df_test_record['clinical_diagnosis'].notnull()]  # 非空
df_heart_valve=df_clinical_diagnosis[df_clinical_diagnosis['clinical_diagnosis'].str.contains('瓣膜')]
df_heart_valve=df_heart_valve[df_heart_valve['clinical_diagnosis'].str.contains('心脏|风湿性')]
df_heart_valve['case_no']=df_heart_valve['case_no'].astype('str')

-------------------------排除瓣膜性房颤患者-----------------------------


In [35]:
print(df_heart_valve.shape)
print(df_heart_valve['patient_id'].nunique())
print(df_heart_valve['case_no'].nunique())

(779, 5)
16
17


In [36]:
# 排除瓣膜房颤的case_no
diagnosis_valve_list=list(df_heart_valve['case_no'])
temp_list=[]
for i in np.unique(df_lfsb_not_surgery['case_no']):
    temp=df_lfsb_not_surgery[df_lfsb_not_surgery['case_no']==i]
    temp=temp.reset_index(drop=True)
    if i in diagnosis_valve_list:
        continue
    else:
        temp_list.append(temp)
df_lfsb_not_valve=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_not_valve=pd.concat([df_lfsb_not_valve,temp_list[j]],axis=0)
df_lfsb_not_valve=df_lfsb_not_valve.reset_index(drop=True)
del temp_list

In [37]:
print(df_lfsb_not_valve.shape)
print(df_lfsb_not_valve['patient_id'].nunique())
print(df_lfsb_not_valve['case_no'].nunique())

(1891, 11)
1092
1260


In [38]:
# 保存利伐沙班非置换非瓣膜
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_temp_利伐沙班非置换非瓣膜.xlsx')
df_lfsb_not_valve.to_excel(writer)
writer.save()

## 计算利伐沙班用药日剂量

In [39]:
# 1.5计算利伐沙班用药日剂量
print('-------------------------计算出院时利伐沙班用药日剂量------------------------------')
print(np.unique(df_lfsb['frequency']))
# 一片利伐沙班10mg
df_lfsb_not_valve['dosage']=df_lfsb_not_valve['dosage'].apply(lambda x: x.replace('mg', '') if 'mg' in x else 10 if '片' in x else x)
third=['1/72小时']
half=['1/2日','1/隔日']
one=['1/午','1/单日','1/日','1/日(餐前)','1/早','1/晚','Qd','Qd(8am)']
two=['1/12小时','12/日','2/日']
three=['Tid']
df_lfsb_not_valve['frequency']=df_lfsb_not_valve['frequency'].apply(lambda x: 0.33 if x in third else
                                                        0.5 if x in half else
                                                        1 if x in one else
                                                        2 if x in two else
                                                        3 if x in three else x)

-------------------------计算出院时利伐沙班用药日剂量------------------------------
['1/12小时' '1/2日' '1/72小时' '1/午' '1/单日' '1/日' '1/日(餐前)' '1/早' '1/晚' '1/隔日'
 '12/日' '2/日' 'Qd' 'Qd(8am)' 'Tid']


In [40]:
# # print(df_lfsb_not_valve.to_string())
# writer=pd.ExcelWriter(project_path+'/data/processed_data/df_temp_利伐沙班frequency处理.xlsx')
# df_lfsb_not_valve.to_excel(writer)
# writer.save()

In [41]:
df_lfsb_not_valve['日剂量']=df_lfsb_not_valve['dosage'].astype('float') * df_lfsb_not_valve['frequency'].astype('float')

In [42]:
print(df_lfsb_not_valve.shape)
print(df_lfsb_not_valve['patient_id'].nunique())
print(df_lfsb_not_valve['case_no'].nunique())

(1891, 12)
1092
1260


In [43]:
df_lfsb_not_valve

Unnamed: 0,patient_id,case_no,long_d_order,drug_name,amount,drug_spec,dosage,frequency,medication_way,start_datetime,end_datetime,日剂量
0,743864,10024736,1,利伐沙班薄膜衣片(拜瑞妥)[10mg*5片],3.0,10mg*5片,2.5,2.0,口服(餐时),2020-08-18 11:46:35,2020-08-25 09:11:00,5.0
1,8184438,10029876,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-21 14:45:36,2020-08-22 08:58:00,15.0
2,8407634,10030043,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-23 08:05:52,2020-08-25 08:17:00,15.0
3,7488110,10034182,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-19 11:17:06,2020-08-26 10:48:00,15.0
4,8412117,10048915,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-20 16:59:49,2020-08-21 18:14:33,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1886,8390516,9960363,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-15 11:10:06,2020-08-22 08:53:00,15.0
1887,8388370,9961564,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,2.0,口服(餐时),2020-08-16 13:15:49,2020-08-21 10:27:00,30.0
1888,1416730,9979708,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-13 15:28:06,2020-08-16 17:53:29,15.0
1889,1416730,9979708,1,利伐沙班薄膜衣片(15mg拜瑞妥)[15mg*7片],1.0,15mg*7片,15,1.0,口服(餐时),2020-08-17 16:36:54,2020-08-20 10:11:00,15.0


In [44]:
# 合并同一case_no的多次用药数据，取最后一次日剂量作为最终日剂量
temp_list=[]
for i in np.unique(df_lfsb_not_valve['case_no']):
    temp=df_lfsb_not_valve[df_lfsb_not_valve['case_no']==i]
    temp=temp.reset_index(drop=True)
    if temp.shape[0]>1:
        temp.loc[0,'日剂量']=temp.loc[(temp.shape[0]-1),'日剂量']
        temp=temp.drop_duplicates(['case_no'],keep='first')
    temp_list.append(temp)
df_lfsb_drug=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_drug=pd.concat([df_lfsb_drug,temp_list[j]],axis=0)
del temp_list
df_lfsb_drug=df_lfsb_drug.reset_index(drop=True)
# 提取利伐沙班有效字段
df_lfsb_drug=df_lfsb_drug[['patient_id','case_no','start_datetime','end_datetime','日剂量']]

In [45]:
print(df_lfsb_drug.shape)
print(df_lfsb_drug['patient_id'].nunique())
print(df_lfsb_drug['case_no'].nunique())

(1260, 5)
1092
1260


In [46]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.3_计算出院事利伐沙班日剂量.xlsx')
df_lfsb_drug.to_excel(writer)
writer.save()

## 合并人口信息学数据

In [47]:
# 1.5 合并人口信息学数据
print('-------------------------合并人口信息学数据-----------------------------')
df_popu=pd.read_excel(project_path+'/data/raw_data/1.基本信息(诊断非瓣膜房颤用利伐沙班).xlsx')
if 'Unnamed: 0' in df_popu.columns:
    df_popu = df_popu.drop(['Unnamed: 0'], axis=1)
df_popu=df_popu[['case_no','gender','age','height','weight','BMI']]
# 删除人口信息学重复数据，只保留第一条
df_popu=df_popu.drop_duplicates(subset=['case_no'],keep='first')

-------------------------合并人口信息学数据-----------------------------


In [99]:
print(type(df_popu.loc[0,'case_no']))
print(type(df_lfsb_drug.loc[0,'case_no']))

<class 'str'>
<class 'str'>


In [100]:
# 将df_popu的case_no格式调整为str
df_popu['case_no']=df_popu['case_no'].astype('str')
df_lfsb_popu=pd.merge(df_lfsb_drug,df_popu,on=['case_no'],how='left')

In [101]:
print(df_lfsb_popu.shape)
print(df_lfsb_popu['patient_id'].nunique())
print(df_lfsb_popu['case_no'].nunique())

(1260, 10)
1092
1260


In [102]:
print(df_lfsb_popu)

      patient_id   case_no      start_datetime        end_datetime   日剂量  \
0         743864  10024736 2020-08-18 11:46:35 2020-08-25 09:11:00   5.0   
1        8184438  10029876 2020-08-21 14:45:36 2020-08-22 08:58:00  15.0   
2        8407634  10030043 2020-08-23 08:05:52 2020-08-25 08:17:00  15.0   
3        7488110  10034182 2020-08-19 11:17:06 2020-08-26 10:48:00  15.0   
4        8412117  10048915 2020-08-20 16:59:49 2020-08-21 18:14:33  15.0   
...          ...       ...                 ...                 ...   ...   
1255     1768454   9959214 2020-08-11 16:56:19 2020-08-20 08:48:00  10.0   
1256     8390516   9960363 2020-08-15 11:10:06 2020-08-22 08:53:00  15.0   
1257     8388370   9961564 2020-08-16 13:15:49 2020-08-21 10:27:00  30.0   
1258     1416730   9979708 2020-08-13 15:28:06 2020-08-16 17:53:29  15.0   
1259     6668949   9989476 2020-08-14 17:29:05 2020-08-22 10:04:00  10.0   

      gender   age  height  weight        BMI  
0        1.0  70.0   174.0    65.0  21.

In [103]:
# 补充缺失的性别、年龄、身高信息
# 读取patient_info-包含性别和年龄；patient_sign_record-包含身高
df_patient_info=pd.read_csv(project_path+'/data/raw_data/1-patient_info.csv')
df_patient_info = df_patient_info.set_index('patient_id')
df_patient_sign_record=pd.read_csv(project_path+'/data/raw_data/1-patient_sign_record.csv')
df_height = df_patient_sign_record[df_patient_sign_record['sign_type'] == '身高(cm)']

In [104]:
aaa=df_lfsb_popu[df_lfsb_popu['gender'].isnull()]
bbb=df_lfsb_popu[df_lfsb_popu['gender'].notnull()]
aaa_list=[]
for i in np.unique(aaa['patient_id']):
    # print(i)
    temp=aaa[aaa['patient_id']==i]
    temp=temp.reset_index(drop=True)
    # 提取缺失的性别数据
    gender=df_patient_info.loc[i,'gender']
    if gender =='男':
        gender_value=1
    else:
        gender_value=0
    temp['gender']=gender_value
    # 提取缺失的年龄数据
    age=df_patient_info.loc[i,'birth_year']
    age_year=age.split('-')[0]
    start_datetime=temp.loc[0,'start_datetime']
    start_year=str(start_datetime).split('-')[0]
    # start_year=start_time[0:3]
    age_value=int(start_year)-int(age_year)
    temp['age']=age_value
    # 提取身高信息
    height= df_height[df_height['patient_id']==i]
    height=height.reset_index(drop=True)
    height=height.loc[0,'record_content']
    if height=='卧床' or height=='轮椅':
        temp['height']=np.nan
    else:
        temp['height']=height
    aaa_list.append(temp)
aaa=aaa_list[0]
for j in range(1,len(aaa_list)):
    aaa=pd.concat([aaa,aaa_list[j]],axis=0)
df_lfsb_popu=pd.concat([aaa,bbb],axis=0)
df_lfsb_popu=df_lfsb_popu.sort_values(['patient_id'])
df_lfsb_popu=df_lfsb_popu.reset_index(drop=True)

In [106]:
print(df_lfsb_popu.shape)
print(df_lfsb_popu['patient_id'].nunique())
print(df_lfsb_popu['case_no'].nunique())

(1260, 10)
1092
1260


In [107]:
df_lfsb_popu

Unnamed: 0,patient_id,case_no,start_datetime,end_datetime,日剂量,gender,age,height,weight,BMI
0,18362,5000492,2018-11-20 18:06:21,2018-11-24 10:42:19,20.0,1.0,76.0,172,,
1,19987,4198297,2018-08-13 10:40:32,2018-08-13 23:01:00,15.0,1.0,65.0,178,69.500000,21.935362
2,99735,9787952,2020-07-24 10:59:17,2020-08-07 08:18:00,10.0,1.0,60.0,171,60.333333,20.633129
3,133803,8022113,2019-11-15 18:44:33,2019-11-16 09:31:06,15.0,0.0,58.0,156,58.000000,23.833005
4,154159,5408239,2019-01-10 16:46:42,2019-01-16 08:58:53,15.0,1.0,76.0,165,67.000000,24.609734
...,...,...,...,...,...,...,...,...,...,...
1255,8628569,11061006,2020-12-02 17:23:52,2020-12-02 18:32:15,20.0,1.0,48.0,165,,
1256,8643418,11146976,2020-12-11 11:42:16,2020-12-11 12:53:58,20.0,1.0,67.0,168.344,66.314689,23.400016
1257,8672712,11300809,2020-12-26 11:28:12,2020-12-26 12:59:47,20.0,0.0,54.0,156.675,62.000000,25.257702
1258,8681163,11350821,2020-12-31 11:18:02,2020-12-31 16:49:06,15.0,1.0,52.0,168,,


In [108]:
# 统计年龄分布
df_age_stats=df_lfsb_popu.drop_duplicates(subset=['patient_id'],keep='first')
print(df_age_stats['age'].describe())

count    1092.000000
mean       66.400183
std        12.365875
min        21.000000
25%        59.000000
50%        67.000000
75%        75.000000
max        95.000000
Name: age, dtype: float64


In [109]:
# 保存人口学特征
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.4_合并人口信息学特征的非瓣膜房颤患者.xlsx')
df_lfsb_popu.to_excel(writer)
writer.save()

## 排除：血肌酐清除率<15ml

In [110]:
# 过滤异常值
def filter_exce_value(df,feature):
    # 过滤文字!!!!!!!!!!!!!!!!!!!!!!!!!!!
    df=df[df[feature].str.contains('\d')]
    # 过滤异常大值!!!!!!!!!!!!!!!!!!!!!!!!!!
    median_value=df[feature].median()
    df[feature]=df[feature].apply(lambda x: x if abs(float(x)) < (100 * abs(median_value)) else np.nan)
    df=df[df[feature].notnull()]
    return df

In [111]:
# 使用随机森林对缺失值进行插补
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
def missing_value_interpolation(df):
    df = df.reset_index(drop=True)
    # 提取存在缺失值的列名
    missing_list = []
    for i in df.columns:
        if df[i].isnull().sum() > 0:
            missing_list.append(i)
    missing_list_copy = missing_list.copy()
    # 用该列未缺失的值训练随机森林，然后用训练好的rf预测缺失值
    for i in range(len(missing_list)):
        name=missing_list[0]
        df_missing = df[missing_list_copy]
        # 将其他列的缺失值用0表示。
        missing_list.remove(name)
        for j in missing_list:
            df_missing[j]=df_missing[j].astype('str').apply(lambda x: 0 if x=='nan' else x)
        df_missing_is = df_missing[df_missing[name].isnull()]
        df_missing_not = df_missing[df_missing[name].notnull()]
        y = df_missing_not[name]
        x = df_missing_not.drop([name],axis=1)
        # 列出参数列表
        tree_grid_parameter = {'n_estimators': list((10, 50, 100, 150, 200))}
        # 进行参数的搜索组合
        grid = GridSearchCV(RandomForestRegressor(),param_grid=tree_grid_parameter,cv=3)
        #rfr=RandomForestRegressor(random_state=0,n_estimators=100,n_jobs=-1)
        #根据已有数据去拟合随机森林模型
        grid.fit(x, y)
        rfr = RandomForestRegressor(n_estimators=grid.best_params_['n_estimators'])
        rfr.fit(x, y)
        #预测缺失值
        predict = rfr.predict(df_missing_is.drop([name],axis=1))
        #填补缺失值
        df.loc[df[name].isnull(),name] = predict
    return df

In [112]:
# 提取血清肌酐值
df_cr = df_test[(df_test['project_name'].str.contains('肌酐')) & (df_test['synonym']=='CR')]
df_cr = filter_exce_value(df_cr,'test_result')
df_cr=df_cr.reset_index(drop=True)
print(df_cr.shape)
print(df_cr['patient_id'].nunique())
print(df_cr['case_no'].nunique())

(28023, 9)
7378
9095


In [113]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_temp_提取血肌酐记录.xlsx')
df_cr.to_excel(writer)
writer.save()

In [114]:
df_cr

Unnamed: 0,test_record_id,patient_id,case_no,test_date,clinical_diagnosis,project_name,test_result,refer_scope,synonym
0,800001705269||B034||1,7156648,3698767,2018-06-05 11:04:00,上消化道出血(食管胃底静脉曲张破裂出血),肌酐,65,53-123,CR
1,800001719657||B001||1,7398692,3724121,2018-06-07 20:44:00,肝炎后肝硬化失代偿期,肌酐,76,53-123,CR
2,800001720954||B034||1,7156648,3698767,2018-06-09 09:36:00,上消化道出血(食管胃底静脉曲张破裂出血),肌酐,76,53-123,CR
3,800001723611||B001||1,7398692,3724121,2018-06-09 09:36:00,肝炎后肝硬化失代偿期,肌酐,76,53-123,CR
4,800001728592||B001||1,7156648,3698767,2018-06-12 10:10:00,上消化道出血(食管胃底静脉曲张破裂出血),肌酐,87,53-123,CR
...,...,...,...,...,...,...,...,...,...
28018,800010171416||B034||1,8668947,11407718,2021-01-06 19:28:00,阵发性心房颤动,肌酐,113,53-123,CR
28019,800010172233||B001||1,8571297,11000596,2021-01-07 08:02:00,多个部位烧伤，述及的烧伤至少有一处三度烧伤(火焰烧伤70%TBSA（II°~III°）头面颈...,肌酐,42,53-123,CR
28020,800010172607||B001||1,7839643,11236808,2021-01-07 12:23:00,原发性胆汁性肝硬化伴食管胃底静脉曲张(失代偿期),肌酐,74,44-106,CR
28021,800010181517||B034||1,8621976,11382263,2021-01-07 16:26:00,股骨头坏死，无菌性(双侧股骨头坏死),肌酐,49,44-106,CR


In [115]:
# 提取df_cr有效字段
df_cr=df_cr[['case_no','test_result']]
df_cr=df_cr.rename(columns={'test_result':'血肌酐值'})

In [116]:
df_cr

Unnamed: 0,case_no,血肌酐值
0,3698767,65
1,3724121,76
2,3698767,76
3,3724121,76
4,3698767,87
...,...,...
28018,11407718,113
28019,11000596,42
28020,11236808,74
28021,11382263,49


In [117]:
# 删除重复数据
df_cr=df_cr.drop_duplicates(['case_no'],keep='first')
# 将血清肌酐值并入df_lfsb
df_lfsb_cr=pd.merge(df_lfsb_popu,df_cr,on=['case_no'],how='left')

In [118]:
print(df_lfsb_cr.to_string())

      patient_id   case_no      start_datetime        end_datetime   日剂量  gender   age   height      weight        BMI  血肌酐值
0          18362   5000492 2018-11-20 18:06:21 2018-11-24 10:42:19  20.0     1.0  76.0      172         NaN        NaN    91
1          19987   4198297 2018-08-13 10:40:32 2018-08-13 23:01:00  15.0     1.0  65.0      178   69.500000  21.935362   846
2          99735   9787952 2020-07-24 10:59:17 2020-08-07 08:18:00  10.0     1.0  60.0      171   60.333333  20.633129    77
3         133803   8022113 2019-11-15 18:44:33 2019-11-16 09:31:06  15.0     0.0  58.0      156   58.000000  23.833005    59
4         154159   5408239 2019-01-10 16:46:42 2019-01-16 08:58:53  15.0     1.0  76.0      165   67.000000  24.609734   116
5         154159   6304125 2019-05-06 10:50:51 2019-05-07 08:43:39   5.0     1.0  76.0      162   65.000000  24.767566   108
6         201048   5209138 2018-12-17 10:49:04 2018-12-19 15:18:00  10.0     1.0  77.0      164   60.000000  22.308150   249


In [119]:
# 对缺失值进行随机森林插补
df_lfsb_cr=missing_value_interpolation(df_lfsb_cr)

In [120]:
df_lfsb_cr

Unnamed: 0,patient_id,case_no,start_datetime,end_datetime,日剂量,gender,age,height,weight,BMI,血肌酐值
0,18362,5000492,2018-11-20 18:06:21,2018-11-24 10:42:19,20.0,1.0,76.0,172,46.790000,17.389291,91
1,19987,4198297,2018-08-13 10:40:32,2018-08-13 23:01:00,15.0,1.0,65.0,178,69.500000,21.935362,846
2,99735,9787952,2020-07-24 10:59:17,2020-08-07 08:18:00,10.0,1.0,60.0,171,60.333333,20.633129,77
3,133803,8022113,2019-11-15 18:44:33,2019-11-16 09:31:06,15.0,0.0,58.0,156,58.000000,23.833005,59
4,154159,5408239,2019-01-10 16:46:42,2019-01-16 08:58:53,15.0,1.0,76.0,165,67.000000,24.609734,116
...,...,...,...,...,...,...,...,...,...,...,...
1255,8628569,11061006,2020-12-02 17:23:52,2020-12-02 18:32:15,20.0,1.0,48.0,165,42.693750,16.879974,121
1256,8643418,11146976,2020-12-11 11:42:16,2020-12-11 12:53:58,20.0,1.0,67.0,168.344,66.314689,23.400016,246
1257,8672712,11300809,2020-12-26 11:28:12,2020-12-26 12:59:47,20.0,0.0,54.0,156.675,62.000000,25.257702,68
1258,8681163,11350821,2020-12-31 11:18:02,2020-12-31 16:49:06,15.0,1.0,52.0,168,44.334000,15.826622,52


In [121]:
# 计算血肌酐清除率
df_lfsb_cr=df_lfsb_cr.reset_index(drop=True)
for i in range(0,df_lfsb_cr.shape[0]):
    if df_lfsb_cr.loc[i,'gender']==1:
        g=1
    else:
        g=0.85
    age=float(140-df_lfsb_cr.loc[i,'age'])
#     print(age)
    weight=float(df_lfsb_cr.loc[i,'weight'])/(0.818* float(df_lfsb_cr.loc[i,'血肌酐值']))
#     print(weight)
    df_lfsb_cr.loc[i,'血肌酐清除率']=age * weight *float(g)
    

In [122]:
print(df_lfsb_cr.shape)
print(df_lfsb_cr['patient_id'].nunique())
print(df_lfsb_cr['case_no'].nunique())

(1260, 12)
1092
1260


In [123]:
# 保存血肌酐清除率
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_temp_血肌酐清除率.xlsx')
df_lfsb_cr.to_excel(writer)
writer.save()

In [124]:
# 排除<15的血肌清除率
df_lfsb_cr=df_lfsb_cr[df_lfsb_cr['血肌酐清除率'] >15]
df_lfsb_cr=df_lfsb_cr.reset_index(drop=True)

In [125]:
print(df_lfsb_cr.shape)
print(df_lfsb_cr['patient_id'].nunique())
print(df_lfsb_cr['case_no'].nunique())

(1222, 12)
1059
1222


## 保存服用利伐沙班的非瓣膜患者

In [126]:
# 1.4服用利伐沙班的非瓣膜患者。利伐沙班用药患者&出院房颤诊断-瓣膜房颤
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.5_排除血肌酐的非瓣膜房颤患者.xlsx')
df_lfsb_cr.to_excel(writer)
writer.save()

## 统计血小板计数和凝血时间

In [127]:
# 统计血小板分布
df_platelet=df_test[df_test['project_name']=='血小板计数']
df_platelet=df_platelet[df_platelet['test_result'].astype('int')<10]
df_platelet=df_platelet.drop_duplicates(subset=['patient_id'],keep='first')
print(df_platelet['test_result'].describe())
# 统计凝血时间异常·

count     117
unique      9
top         8
freq       26
Name: test_result, dtype: object


In [128]:
df_platelet

Unnamed: 0,test_record_id,patient_id,case_no,test_date,clinical_diagnosis,project_name,test_result,refer_scope,synonym
4860,800001849390||A083||1,7313864,3930555,2018-07-08 08:20:00,化疗后骨髓抑制,血小板计数,8,125-350,PLT
15784,800001911248||A083||1,7440449,3979265,2018-07-15 12:45:00,肠系膜静脉血栓形成伴肠坏死(空肠坏死),血小板计数,9,125-350,PLT
19534,800001928232||A083||1,7380931,3926302,2018-07-18 10:27:00,急性髓系白血病(del11q23),血小板计数,1,125-350,PLT
59968,800002087953||A083||1,7455818,4082340,2018-08-05 09:50:00,急性髓系白血病(C92.0)(伴FLT3、KMT2A突变),血小板计数,8,125-350,PLT
62908,800002098099||A083||1,7067321,4054379,2018-08-07 08:59:00,急性淋巴细胞白血病(BCP-NOS 高危组 难治性-CR1，CNS-L高危组),血小板计数,7,125-350,PLT
...,...,...,...,...,...,...,...,...,...
2511398,800009633050||A083||1,8335774,10695106,2020-11-21 08:32:00,急性髓系白血病(NPM1、FLT3-ITD阳性),血小板计数,8,125-350,PLT
2543234,800009729195||A083||1,7892051,10777640,2020-11-28 21:57:00,肺恶性肿瘤,血小板计数,9,125-350,PLT
2606102,800009911593||A083||1,8316242,10901751,2020-12-15 09:31:00,急性单核细胞白血病(FLT3-ITD阳性),血小板计数,4,125-350,PLT
2616202,800009944883||A083||1,8473271,10999888,2020-12-18 10:19:00,"急性髓系白血病(高危组,-7/7q-阳性)",血小板计数,7,125-350,PLT


## 增加联合用药

In [129]:
# 1.6 增加联合用药
print('---------------------添加联合用药和其他检验信息------------------------------')
# 添加高血压、糖尿病特征，根据diagnostic_record文件和梦璇文件总数据20210607.xlsx
df_mx=pd.read_excel(project_path+'/data/raw_data/总数据20210607.xlsx',dtype={'case_no':str})
if 'Unnamed: 0' in df_mx.columns:
    df_mx = df_mx.drop(['Unnamed: 0'], axis=1)
temp_list=[]
for i in np.unique(df_lfsb_cr['case_no']):
    temp=df_lfsb_cr[df_lfsb_cr['case_no']==i]
    for j in np.unique(df_mx.columns[12:]):
        temp[j]=df_mx[j]
    temp_list.append(temp)
df_lfsb_merge_other=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_merge_other=pd.concat([df_lfsb_merge_other,temp_list[j]],axis=0)
df_lfsb_merge_other=df_lfsb_merge_other.reset_index(drop=True)
del temp_list

---------------------添加联合用药和其他检验信息------------------------------


In [130]:
print(df_lfsb_merge_other.shape)
print(df_lfsb_merge_other['patient_id'].nunique())
print(df_lfsb_merge_other['case_no'].nunique())

(1222, 65)
1059
1222


In [131]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.6_合并用药和检验信息.xlsx')
df_lfsb_merge_other.to_excel(writer)
writer.save()

## 增加糖尿病病史

In [132]:
# 其他糖尿病的诊断
df_diagnostic_dm=df_diagnostic[df_diagnostic['diagnostic_content'].str.contains('糖尿病')]
# 删除重复诊断的case_no
df_diagnostic_dm=df_diagnostic_dm.drop_duplicates(['case_no'],keep='first')
df_diagnostic_dm=df_diagnostic_dm.reset_index(drop=True)

In [133]:
print(df_diagnostic_dm.shape)
print(df_diagnostic_dm['patient_id'].nunique())
print(df_diagnostic_dm['case_no'].nunique())

(1515, 5)
1232
1515


In [134]:
df_diagnostic_dm

Unnamed: 0,patient_id,case_no,record_date,diagnostic_type,diagnostic_content
0,8404129,10003784,2020-08-23 09:39:09,最后诊断,2型糖尿病
1,8394639,10010350,2020-09-10 18:49:17,出院诊断,非胰岛素依赖型糖尿病伴有眼的并发症
2,7795384,10026705,2020-08-18 15:49:26,初步诊断,2型糖尿病
3,8409596,10027793,2020-08-18 15:17:30,初步诊断,2型糖尿病足病
4,8184438,10029876,2020-08-19 15:56:28,初步诊断,2型糖尿病不伴有并发症
...,...,...,...,...,...
1510,7700501,9983404,2020-08-13 16:47:57,初步诊断,2型糖尿病
1511,8382865,9990371,2020-08-14 11:55:06,初步诊断,2型糖尿病足病
1512,8348168,9991793,2020-09-20 20:51:32,初步诊断,糖尿病
1513,8401456,9994476,2020-09-04 11:05:41,出院诊断,2型糖尿病


In [135]:
# 提取糖尿病患者case_no列表
dm_list=list(df_diagnostic_dm['case_no'])
print(dm_list[0])
print(type(dm_list[0]))

10003784
<class 'str'>


In [136]:
# 并入纳排数据中
temp_list=[]
for i in np.unique(df_lfsb_merge_other['case_no']):
    temp=df_lfsb_merge_other[df_lfsb_merge_other['case_no']==i]
    temp=temp.reset_index(drop=True)
    if i in dm_list:
        temp['糖尿病']=1
    else:
        temp['糖尿病']=0
    temp_list.append(temp)

In [137]:
df_lfsb_merge_dm=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_merge_dm=pd.concat([df_lfsb_merge_dm,temp_list[j]],axis=0)
df_lfsb_merge_dm=df_lfsb_merge_dm.sort_values(by=['patient_id','case_no','start_datetime'])
df_lfsb_merge_dm=df_lfsb_merge_dm.reset_index(drop=True)
del temp_list

In [138]:
print(df_lfsb_merge_dm.shape)
print(df_lfsb_merge_dm['patient_id'].nunique())
print(df_lfsb_merge_dm['case_no'].nunique())

(1222, 65)
1059
1222


In [139]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.7_增加糖尿病检验信息.xlsx')
df_lfsb_merge_dm.to_excel(writer)
writer.save()

## 删除缺失过多（>50%）的列

In [140]:
# 删除列超过50%的其他指标
for i in np.unique(df_lfsb_merge_dm.columns):
    other_up = df_lfsb_merge_dm[i].isnull().sum()
    other_down = df_lfsb_merge_dm[i].shape[0]
    if df_lfsb_merge_dm[i].isnull().sum()/df_lfsb_merge_dm[i].shape[0] >= 0.5:
        del df_lfsb_merge_dm[i]

In [141]:
print(df_lfsb_merge_dm.shape)
print(df_lfsb_merge_dm['patient_id'].nunique())
print(df_lfsb_merge_dm['case_no'].nunique())

(1222, 54)
1059
1222


In [142]:
# 保存删除缺失值过大的数据
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.8_删除缺失过多的列.xlsx')
df_lfsb_merge_dm.to_excel(writer)
writer.save()

## 按梦璇(儿子)总数据20210607提取有效case_no

In [143]:
# 梦璇说-(儿子)总数据20210607是删除瓣膜置换手术、瓣膜性房颤后的
# 这个数据有问题，不能patient_id = case_no，这样就没有再次入院的了！！
mx_case_list=list(df_mx['case_no'])

In [144]:
# mx_case_list
print(type(mx_case_list[0]))

<class 'str'>


In [145]:
# temp_list=[]
# for i in np.unique(df_lfsb_merge_dm['case_no']):
#     temp=df_lfsb_merge_dm[df_lfsb_merge_dm['case_no']==i]
#     temp=temp.reset_index(drop=True)
#     if i in mx_case_list:
#         temp_list.append(temp)
# df_lfsb_inner_mx=temp_list[0]
# for j in range(1,len(temp_list)):
#     df_lfsb_inner_mx=pd.concat([df_lfsb_inner_mx,temp_list[j]],axis=0)
# df_lfsb_inner_mx=df_lfsb_inner_mx.reset_index(drop=True)
# del temp_list

In [146]:
# print(df_lfsb_inner_mx.shape)
# print(df_lfsb_inner_mx['patient_id'].nunique())
# print(df_lfsb_inner_mx['case_no'].nunique())

In [147]:
# # 按梦璇数据的case_no提取
# writer=pd.ExcelWriter(project_path+'/data/processed_data/df_1.9_梦璇0607数据的case_no提取.xlsx')
# df_lfsb_inner_mx.to_excel(writer)
# writer.save()

# 计算多次出入院

## 提取入院诊断

In [148]:
# 入院诊断: 补充诊断、初步诊断、门诊诊断、修正诊断、最后诊断
df_diagnostic_inp=df_diagnostic[df_diagnostic['diagnostic_type'].str.contains('补充诊断|初步诊断|门诊诊断|修正诊断|最后诊断|出院诊断')]
# 删除空值
df_diagnostic_inp=df_diagnostic_inp[df_diagnostic_inp['case_no'].notnull()]
# 入院诊断case_no格式调整：由float转为str
df_diagnostic_inp['case_no']=df_diagnostic_inp['case_no'].astype('int').astype('str')
df_diagnostic_inp=df_diagnostic[['patient_id','case_no','record_date','diagnostic_type','diagnostic_content']]

In [149]:
print(df_diagnostic_inp.shape)

(126565, 5)


In [150]:
# 合并同一case_no的入院诊断
temp_list=[]
for i in np.unique(df_diagnostic_inp['case_no']):
    temp=df_diagnostic_inp[df_diagnostic_inp['case_no']==i]
    temp=temp.reset_index(drop=True)
    temp_diagnostic_list=list(temp['diagnostic_content'])
    temp=temp.drop_duplicates(subset=['case_no'],keep='first')
    temp_diagnostic_str=';'.join(temp_diagnostic_list)
    temp['diagnostic_content']=temp_diagnostic_str
#     print(temp)
    temp_list.append(temp)
#     j=0
#     while j < temp.shape[0]-1:
#         # print(i)
#         temp.loc[j+1,'diagnostic_content']=temp.loc[j,'diagnostic_content'] +';'+temp.loc[j+1,'diagnostic_content']
#         temp=temp.drop(index=[j],axis=0)
#         temp=temp.reset_index(drop=True)
#         temp=temp.drop_duplicates(subset=['case_no'],keep='last')
#         j+=1
#     temp_list.append(temp)

In [151]:
df_diagnostic_inp_merge=temp_list[0]
for j in range(1,len(temp_list)):
    df_diagnostic_inp_merge=pd.concat([df_diagnostic_inp_merge,temp_list[j]],axis=0)
del temp_list
df_diagnostic_inp_merge=df_diagnostic_inp_merge.reset_index(drop=True)

In [152]:
df_diagnostic_inp_merge

Unnamed: 0,patient_id,case_no,record_date,diagnostic_type,diagnostic_content
0,8403580,10000919,2020-08-16 11:45:25,初步诊断,频发室性早搏;频发室性早搏;频发室性早搏
1,4740788,10001366,2020-08-20 11:06:13,出院诊断,高胆固醇血症;股骨颈骨折;手术后状态，其他特指的;股骨颈骨折;手术后状态，其他特指的;下肢动...
2,8404129,10003784,2020-08-23 09:39:09,最后诊断,2型糖尿病;主动脉硬化;脂肪肝;肾结石;宫颈鳞状细胞癌;肝功能异常;主动脉硬化;脂肪肝;肾结...
3,8301602,10004367,2020-08-16 10:22:08,初步诊断,恶性肿瘤维持性化学治疗;高级别粘浆型乳头状囊腺癌IIb术后;盆腔肿物;轻度贫血;恶性肿瘤维持...
4,7330307,10005446,2020-08-16 15:41:22,初步诊断,"左膝关节骨性关节炎;膝内翻畸形,后天性;颈动脉斑块形成;主动脉硬化;胸腰椎退行性变;高胆固醇..."
...,...,...,...,...,...
9341,7598728,9994805,2020-08-14 18:37:07,初步诊断,频发室性早搏;二尖瓣返流;肺动脉高压中度;高血压2级;2型糖尿病不伴有并发症;冠状动脉粥样硬...
9342,8400971,9995012,2020-08-14 18:45:11,初步诊断,剖宫产术后;髋关节痛;髋关节发育不良;髋关节发育不良
9343,8402546,9996417,2020-08-14 22:35:20,初步诊断,股动脉假性动脉瘤;股动脉假性动脉瘤
9344,5820076,9997345,2020-08-15 11:21:37,初步诊断,后天性胫骨畸形;后天性胫骨畸形;左膝关节创伤性关节炎;膝关节半脱位;后天性膝关节畸形;左腓总...


In [153]:
print(df_diagnostic_inp_merge.shape)
print(df_diagnostic_inp_merge['patient_id'].nunique())
print(df_diagnostic_inp_merge['case_no'].nunique())

(9346, 5)
7542
9346


In [154]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.1_提取入院诊断.xlsx')
df_diagnostic_inp_merge.to_excel(writer)
writer.save()

## 提取出入院时间

In [204]:
# 2.计算多次出入院时间，case_no
df_inp_record=pd.read_csv(project_path+'/data/raw_data/1-inp_record.csv',dtype={'case_no':str})

In [205]:
# 删除空值数据
df_inp_record=df_inp_record[df_inp_record['adm_date'].notnull() & df_inp_record['dis_date'].notnull()]

In [206]:
print(df_inp_record.shape)
print(df_inp_record['patient_id'].nunique())
print(df_inp_record['case_no'].nunique())

(9149, 27)
7410
9149


In [207]:
# 调整出入院时间格式
df_inp_record['adm_date']=df_inp_record['adm_date'].astype('str').apply(str_to_datetime)
df_inp_record['dis_date']=df_inp_record['dis_date'].astype('str').apply(str_to_datetime)

In [208]:
# 提取出入院时间有效字段
df_inp_record=df_inp_record[['patient_id','case_no','adm_date','care_area','dis_date']]
df_inp_record=df_inp_record.sort_values(by=['patient_id','case_no','adm_date'])
df_inp_record=df_inp_record.reset_index(drop=True)

In [209]:
print(df_inp_record.shape)
print(df_inp_record['patient_id'].nunique())
print(df_inp_record['case_no'].nunique())

(9149, 5)
7410
9149


In [211]:
# 保存多次出入院时间
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_temp_保存多次出入院时间.xlsx')
df_inp_record.to_excel(writer)
writer.save()

## 剂量分组，统计再次入院

In [162]:
# 3.按剂量10、15、20分组，计算再次入院率
df_lfsb_merge_dm['剂量分组']=df_lfsb_merge_dm['日剂量'].apply(lambda x: 0 if x==10 else 1 if x==15 else 2 if x==20 else np.nan)

In [163]:
print(df_lfsb_merge_dm.shape)
print(df_lfsb_merge_dm['patient_id'].nunique())
print(df_lfsb_merge_dm['case_no'].nunique())

(1222, 55)
1059
1222


In [164]:
# 提取分组数据
df_lfsb_group=df_lfsb_merge_dm[df_lfsb_merge_dm['剂量分组'].notnull()]

In [165]:
print(df_lfsb_group.shape)
print(df_lfsb_group['patient_id'].nunique())
print(df_lfsb_group['case_no'].nunique())

(1145, 55)
1003
1145


In [166]:
# 保存分组数据
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.2_保存分组数据.xlsx')
df_lfsb_group.to_excel(writer)
writer.save()

In [167]:
# 提取单个剂量分组
df_lfsb_10=df_lfsb_merge_dm[df_lfsb_merge_dm['剂量分组']==0]
df_lfsb_15=df_lfsb_merge_dm[df_lfsb_merge_dm['剂量分组']==1]
df_lfsb_20=df_lfsb_merge_dm[df_lfsb_merge_dm['剂量分组']==2]

In [169]:
# 统计分组数
num_10_patient=df_lfsb_10['patient_id'].nunique()
num_10_case=df_lfsb_10['case_no'].nunique()
num_15_patient=df_lfsb_15['patient_id'].nunique()
num_15_case=df_lfsb_15['case_no'].nunique()
num_20_patient=df_lfsb_20['patient_id'].nunique()
num_20_case=df_lfsb_20['case_no'].nunique()

print('分组patient人数',num_10_patient,num_15_patient,num_20_patient)
print('分组case记录',num_10_case,num_15_case,num_20_case)

分组patient人数 225 511 308
分组case记录 246 567 332


### 统计10mg组再次入院

In [171]:
#统计10mg组再次入院人数
count_10=0
list_10_again=[]
for i in np.unique(df_lfsb_10['patient_id']):
    temp=df_lfsb_10[df_lfsb_10['patient_id']==i]
    if temp.shape[0]>1:
        count_10 +=1
        list_10_again.append(i)
print('10mg再次入院',count_10,count_10/num_10_patient)
print(list_10_again)

10mg再次入院 19 0.08444444444444445
[247563, 293857, 927121, 1022731, 1807633, 4556512, 4571745, 5679203, 6668949, 7098263, 7241855, 7306079, 7308961, 7325436, 7441706, 7513876, 7582093, 7659502, 8086635]


### 统计15mg组再次入院

In [173]:
# 统计15mg组再次入院人数
count_15=0
list_15_again=[]
for i in np.unique(df_lfsb_15['patient_id']):
    temp=df_lfsb_15[df_lfsb_15['patient_id']==i]
    temp=temp.reset_index(drop=True)
    if temp.shape[0]>1:
        count_15 +=1
        list_15_again.append(i)
print('15mg再次入院',count_15,count_15/num_15_patient)
print(list_15_again)

15mg再次入院 43 0.08414872798434442
[496817, 1195929, 2212555, 2886270, 2932241, 3478182, 4546386, 4695568, 5702732, 5823843, 5846872, 6390354, 6686954, 6777331, 7142119, 7206592, 7308961, 7562261, 7612108, 7633561, 7646775, 7658907, 7664380, 7672856, 7678373, 7687507, 7707115, 7718754, 7734291, 7740314, 7762988, 7781201, 7804785, 7822780, 7841442, 7869054, 7923610, 7978971, 8034888, 8074997, 8087765, 8141611, 8148588]


### 统计20mg组再次入院人数

In [175]:
# 统计20mg组再次入院人数
count_20=0
list_20_again=[]
for i in np.unique(df_lfsb_20['patient_id']):
    temp=df_lfsb_20[df_lfsb_20['patient_id']==i]
    temp=temp.reset_index(drop=True)
    if temp.shape[0]>1:
        count_20 +=1
        list_20_again.append(i)
print('20mg再次入院',count_20,count_20/num_20_patient)
print(list_20_again)

20mg再次入院 23 0.07467532467532467
[332270, 550739, 991451, 2210831, 2607927, 5642110, 5850383, 5896137, 5903669, 6530206, 7162315, 7399973, 7445852, 7456009, 7461014, 7481180, 7635420, 7756138, 7797303, 7806691, 7871805, 8089850, 8148762]


### 提取各组再次入院的记录

In [176]:
# 再次入院patient_id列表
list_again=list_10_again + list_15_again + list_20_again
print(type(list_again))
print(list_again)
df_lfsb_group_again=df_lfsb_group[df_lfsb_group['patient_id'].isin(list_again)]
df_lfsb_group_again=df_lfsb_group_again.reset_index(drop=True)

<class 'list'>
[247563, 293857, 927121, 1022731, 1807633, 4556512, 4571745, 5679203, 6668949, 7098263, 7241855, 7306079, 7308961, 7325436, 7441706, 7513876, 7582093, 7659502, 8086635, 496817, 1195929, 2212555, 2886270, 2932241, 3478182, 4546386, 4695568, 5702732, 5823843, 5846872, 6390354, 6686954, 6777331, 7142119, 7206592, 7308961, 7562261, 7612108, 7633561, 7646775, 7658907, 7664380, 7672856, 7678373, 7687507, 7707115, 7718754, 7734291, 7740314, 7762988, 7781201, 7804785, 7822780, 7841442, 7869054, 7923610, 7978971, 8034888, 8074997, 8087765, 8141611, 8148588, 332270, 550739, 991451, 2210831, 2607927, 5642110, 5850383, 5896137, 5903669, 6530206, 7162315, 7399973, 7445852, 7456009, 7461014, 7481180, 7635420, 7756138, 7797303, 7806691, 7871805, 8089850, 8148762]


In [177]:
print(df_lfsb_group_again.shape)
print(df_lfsb_group_again['patient_id'].nunique())
print(df_lfsb_group_again['case_no'].nunique())

(201, 55)
84
201


In [178]:
# 保存再次入院的分组数据
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.3_保存再次入院的分组数据.xlsx')
df_lfsb_group_again.to_excel(writer)
writer.save()

## 提取部分基础特征，做PSM分析

In [179]:
# 提取部分基础特征，做PSM分析
df_lfsb_group_PSM=df_lfsb_group_again[['patient_id','case_no','日剂量','gender','age','height','weight','BMI','血肌酐清除率','大便常规-隐血','尿常规-隐血','糖尿病','高血压','剂量分组']]

In [180]:
# 再次入院分组数据做PSM分析
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.4_再次入院分组数据做PSM分析.xlsx')
df_lfsb_group_PSM.to_excel(writer)
writer.save()

## 并入出入院时间和诊断

In [223]:
df_inp_record
print(type(df_inp_record.loc[0,'case_no']))

<class 'str'>


In [269]:
print('-------------------------计算多次出入院时间-----------------------------')
temp_list=[]
for i in np.unique(df_lfsb_group_PSM['case_no']):
    print(i)
#     print(type(i))
    temp=df_lfsb_group_PSM[df_lfsb_group_PSM['case_no']==i]
    temp_inp_time=df_inp_record[df_inp_record['case_no']==i]
    temp_inp_time=temp_inp_time.reset_index(drop=True)
#     print(temp_inp_time)
#     print(temp_inp_time.loc[0,'adm_date'])
    temp_inp_diagnostic=df_diagnostic_inp_merge[df_diagnostic_inp_merge['case_no']==i]
    temp_inp_diagnostic=temp_inp_diagnostic.reset_index(drop=True)
#     print(temp_inp_diagnostic)
    # 并入出入院时间
    temp['adm_date']=temp_inp_time.loc[0,'adm_date']
    temp['dis_date']=temp_inp_time.loc[0,'dis_date']
#     print(temp)
    # 并入入院诊断
    temp['diagnostic_content']=temp_inp_diagnostic.loc[0,'diagnostic_content']
    print(temp)
    temp_list.append(temp)

-------------------------计算多次出入院时间-----------------------------
11316003
     patient_id   case_no   日剂量  gender   age height  weight        BMI  \
193     8089850  11316003  20.0     1.0  67.0    169    63.0  22.058051   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
193  75.976343      NaN     NaN    0  NaN   2.0 2020-12-28 09:59:32   

               dis_date         diagnostic_content  
193 2020-12-31 15:10:00  阵发性房颤;阵发性房颤;主动脉硬化;胸腰椎退行性变  
4003605
   patient_id  case_no   日剂量  gender   age height  weight       BMI  \
4      332270  4003605  20.0     1.0  56.0    160  40.548  16.91944   

      血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
4  33.579462      0.0     0.0    0  1.0   2.0 2018-07-15 09:41:22   

             dis_date                                 diagnostic_content  
4 2018-07-18 10:18:00  心力衰竭;高脂血症;痛风性关节炎;频发性房性期外收缩;心功能Ⅰ级;慢性肾脏病2期;心律失常;...  
4043176
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
107  

   patient_id  case_no   日剂量  gender   age height  weight        BMI  \
9      550739  4760051  20.0     0.0  65.0    154    57.0  24.034407   

      血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
9  87.102689      0.0     0.0    1  0.0   2.0 2018-10-22 09:41:04   

             dis_date                                 diagnostic_content  
9 2018-10-29 09:07:00  心房颤动;原有的非胰岛素依赖型糖尿病;高脂血症;脑梗死;乳腺纤维瘤切除术后;子宫全切术后;左...  
4803597
     patient_id  case_no   日剂量  gender   age height  weight      BMI  \
117     7562261  4803597  15.0     1.0  61.0    160    60.0  23.4375   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
117  57.372485      0.0     0.0    0  1.0   1.0 2018-10-26 16:26:26   

               dis_date                                 diagnostic_content  
117 2018-11-03 10:28:00  冠状动脉粥样硬化性心脏病;冠状动脉粥样硬化性心脏病;冠状动脉支架植入后状态;乙型病毒性肝炎;...  
4826084
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
113     7481180  4826084  20.0    

84 2019-01-19 10:17:00  心房颤动;心房颤动;肥厚型梗阻性心肌病;高血压2级;2型糖尿病不伴有并发症;高脂血症;心功能...  
5374697
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
110     7456009  5374697  20.0     1.0  51.0    168    69.0  24.447279   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
110  80.724032      0.0     0.0    0  0.0   2.0 2019-01-07 10:10:08   

               dis_date                                 diagnostic_content  
110 2019-01-10 08:10:00  心房颤动;短阵房性心动过速;亚临床甲状腺功能亢进症;高脂血症;高尿酸血症;心律失常;射频消融...  
5407920
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
129     7646775  5407920  15.0     0.0  65.0    154    78.0  32.889189   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
129  83.27193      NaN     NaN    0  NaN   1.0 2019-01-10 11:02:38   

               dis_date                         diagnostic_content  
129 2019-01-16 08:53:00  心房颤动;高血压3级;心房颤动;左心耳血栓;慢性病毒性肝炎;主动脉硬化;高血压3级  
5447448
    patient_id  case_no   

     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
147     7707115  5819516  15.0     0.0  65.0    160    65.0  25.390625   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
147  84.428484      NaN     NaN    0  NaN   1.0 2019-03-10 10:40:46   

               dis_date diagnostic_content  
147 2019-03-17 09:00:00    阵发性心房颤动;阵发性心房颤动  
5823870
    patient_id  case_no   日剂量  gender   age height    weight       BMI  \
39     3478182  5823870  15.0     1.0  78.0    160  40.50325  16.36613   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
39  35.696843      0.0     1.0    1  1.0   1.0 2019-03-11 08:23:53   

              dis_date                                 diagnostic_content  
39 2019-03-20 09:00:00  心房颤动;颈内动脉粥样硬化;颈总动脉狭窄;颈外动脉狭窄;心房颤动;心功能Ⅲ级;高血压3级;颈...  
5891393
    patient_id  case_no   日剂量  gender   age height  weight        BMI  \
32     2607927  5891393  20.0     1.0  58.0    175    80.0  26.122449   

       血肌酐清除率

166 2019-05-26 18:33:00  胸闷;持续性心房颤动;高血压2级;脂肪肝;持续性心房颤动;高血压1级;高脂血症;下肢静脉曲张...  
6431951
    patient_id  case_no   日剂量  gender   age height  weight       BMI  \
88     7162315  6431951  15.0     1.0  72.0    167    65.0  23.30668   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
88  59.459396      0.0     0.0    0  0.0   1.0 2019-05-18 17:56:45   

              dis_date                                 diagnostic_content  
88 2019-05-23 08:16:00  冠状动脉粥样硬化性心脏病;髂内动脉瘤;髂内动脉瘤;冠状动脉粥样硬化性心脏病;心房颤动;脑梗死...  
6448658
    patient_id  case_no   日剂量  gender   age height  weight        BMI  \
20     1195929  6448658  15.0     0.0  74.0      0  33.349  15.692541   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
20  30.495174      0.0     0.0    1  1.0   1.0 2019-05-20 17:47:06   

              dis_date                                 diagnostic_content  
20 2019-05-28 14:52:00  二尖瓣、三尖瓣关闭不全（中度）;2型糖尿病;心功能Ⅲ级;心房扑动;心房扑动;冠状动脉粥样硬化...  
6468655
     patient_id  case

     patient_id  case_no   日剂量  gender   age height  weight      BMI  \
171     7841442  7011798  15.0     1.0  65.0    168    65.5  23.2072   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
171  75.068765      NaN     NaN    0  NaN   1.0 2019-07-23 09:53:14   

               dis_date                                 diagnostic_content  
171 2019-08-02 11:21:00  胸腺恶性肿瘤;肺部感染;胸腺恶性肿瘤;多发淋巴结转移;肺转移;胸腔积液;高血压2级;冠状动脉...  
7024844
   patient_id  case_no   日剂量  gender   age height  weight        BMI  \
8      496817  7024844  15.0     1.0  61.0    169    63.0  22.058051   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
8  101.405868      0.0     0.0    0  0.0   1.0 2019-07-24 11:27:26   

             dis_date                                 diagnostic_content  
8 2019-08-01 09:17:00  胸痛;甲状腺功能减退;心耳血栓;持续性心房颤动;心脏起搏器植入术后状态;射频消融+左心耳封堵...  
7034569
    patient_id  case_no   日剂量  gender   age height  weight        BMI  \
58     5823843  7034569  15.0    

80 2019-09-13 08:57:00  心房颤动;慢性阻塞性肺病;支气管哮喘，非危重;右眼翼状胬肉术后;脑梗死;高血压病;慢性阻塞性...  
7426490
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
177     7871805  7426490  20.0     1.0  63.0    170    80.0  27.681661   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
177  78.443358      NaN     NaN    0  NaN   2.0 2019-09-08 08:20:56   

               dis_date diagnostic_content  
177 2019-09-09 12:25:00          心房颤动;心房颤动  
7435347
    patient_id  case_no   日剂量  gender   age height  weight        BMI  \
72     6390354  7435347  20.0     0.0  70.0    160    65.0  25.390625   

       血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
72  61.402534      0.0     1.0    0  1.0   2.0 2019-09-09 10:14:20   

              dis_date                                 diagnostic_content  
72 2019-09-12 10:37:00  心房颤动;乙肝病毒携带者;脑梗死后遗症;风湿性心脏病;胆囊结石;心房颤动;风湿性心脏病;脑梗...  
7501258
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
180 

2 2019-11-20 08:48:00  心房纤颤和扑动;心房颤动;冠状动脉粥样硬化性心脏病;高血压3级;甲状腺功能减退症，其他特指的...  
8069145
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
155     7740314  8069145  15.0     1.0  63.0    160    71.0  27.734375   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
155  73.443671      NaN     NaN    0  NaN   1.0 2019-11-21 08:22:20   

               dis_date        diagnostic_content  
155 2019-11-27 08:55:00  心房颤动;高血压3级;持续性心房颤动;高血压3级  
8104175
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
138     7664380  8104175  15.0     0.0  73.0    150   33.72  15.732561   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
138  28.629525      NaN     NaN    0  NaN   1.0 2019-11-25 10:51:02   

               dis_date                                 diagnostic_content  
138 2019-12-06 08:54:00  肝血管瘤;肾囊肿;神经根型颈椎病;持续性心房颤动;冠状动脉粥样硬化性心脏病;慢性左心功能不全...  
8152032
     patient_id  case_no   日剂量  gender   age height  weight

     patient_id  case_no   日剂量  gender   age height  weight       BMI  \
195     8141611  8660633  15.0     1.0  43.0  177.5  48.496  17.45351   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
195  71.884352      NaN     NaN    0  NaN   1.0 2020-02-04 11:20:08   

               dis_date                                 diagnostic_content  
195 2020-02-08 09:38:00  阵发性室性心动过速;高尿酸血症;冠状动脉粥样硬化;肺动脉高压(轻度);二尖瓣返流;三尖瓣中度...  
8742662
     patient_id  case_no   日剂量  gender   age height    weight        BMI  \
199     8148762  8742662  20.0     1.0  65.0    165  42.73175  16.603383   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
199  46.093503      NaN     NaN    0  NaN   2.0 2020-03-05 10:28:35   

               dis_date                                 diagnostic_content  
199 2020-03-19 10:09:40  肺占位性病变;肺不张;阻塞性肺炎;胸腔积液;阻塞性肺炎;胸腔积液;肺不张;冠心病;慢性胆囊炎...  
8783896
     patient_id  case_no   日剂量  gender   age   height   weight        BMI  \
145     76875

146 2020-08-05 08:00:50  慢性心力衰竭;脐疝;腹壁疝;二尖瓣返流;心功能Ⅲ级;冠状动脉粥样硬化性心脏病;冠状动脉支架植...  
9703591
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
170     7822780  9703591  15.0     1.0  75.0    172   80.25  27.126149   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
170  35.037548      NaN     NaN    1  NaN   1.0 2020-07-15 09:23:51   

               dis_date                                 diagnostic_content  
170 2020-07-25 08:00:00  持续性心房颤动;2型糖尿病伴有肾的并发症;卵圆孔未闭;皮层下动脉硬化性脑病;胆囊结石;酒精性...  
9748832
     patient_id  case_no   日剂量  gender   age height  weight        BMI  \
190     8086635  9748832  15.0     1.0  75.0    174    55.0  18.166204   

        血肌酐清除率  大便常规-隐血  尿常规-隐血  糖尿病  高血压  剂量分组            adm_date  \
190  63.304188      NaN     NaN    0  NaN   1.0 2020-07-20 11:12:33   

               dis_date                                 diagnostic_content  
190 2020-07-28 08:13:00  红皮病性银屑病;红皮病性银屑病;心房颤动;前列腺钙化灶;肺炎;肺气肿;主动脉硬化;胸腰椎退行...  
9756896
    pat

In [271]:
df_lfsb_merge_inp_diagnostic=temp_list[0]
for j in range(1,len(temp_list)):
    df_lfsb_merge_inp_diagnostic=pd.concat([df_lfsb_merge_inp_diagnostic,temp_list[j]])
df_lfsb_merge_inp_diagnostic=df_lfsb_merge_inp_diagnostic.sort_values(['patient_id','case_no','adm_date'])
df_lfsb_merge_inp_diagnostic=df_lfsb_merge_inp_diagnostic.reset_index(drop=True)
del temp_list

In [272]:
print(df_lfsb_merge_inp_diagnostic.shape)
print(df_lfsb_merge_inp_diagnostic['patient_id'].nunique())
print(df_lfsb_merge_inp_diagnostic['case_no'].nunique())

(201, 17)
84
201


In [273]:
df_lfsb_merge_inp_diagnostic

Unnamed: 0,patient_id,case_no,日剂量,gender,age,height,weight,BMI,血肌酐清除率,大便常规-隐血,尿常规-隐血,糖尿病,高血压,剂量分组,adm_date,dis_date,diagnostic_content
0,247563,4274137,10.0,1.0,70.0,172,44.50000,15.041915,89.822645,0.0,1.0,0,1.0,0.0,2018-08-16 12:03:03,2018-08-25 08:09:00,心房颤动;高血压1级;动脉硬化性心脏病;陈旧性肺结核;前列腺增生;心房颤动;心房颤动;心功能...
1,247563,6307058,10.0,1.0,70.0,173,56.12500,18.752715,39.367810,0.0,0.0,0,1.0,0.0,2019-05-05 15:11:02,2019-05-22 08:10:00,心房颤动;慢性左心功能不全;心房颤动;慢性左心功能不全;高尿酸血症;陈旧性肺结核
2,293857,8030106,10.0,0.0,91.0,155,35.00000,14.568158,20.023488,1.0,1.0,0,1.0,0.0,2019-11-17 10:04:30,2019-11-20 08:48:00,心房纤颤和扑动;心房颤动;冠状动脉粥样硬化性心脏病;高血压3级;甲状腺功能减退症，其他特指的...
3,293857,8225142,10.0,0.0,91.0,155,40.00000,16.649324,20.366748,1.0,0.0,0,0.0,0.0,2019-12-08 17:37:32,2019-12-16 13:40:00,慢性左心功能不全;肺部感染;心房颤动;甲状腺功能减退症，其他特指的;低蛋白血症;冠状动脉粥样...
4,332270,4003605,20.0,1.0,56.0,160,40.54800,16.919440,33.579462,0.0,0.0,0,1.0,2.0,2018-07-15 09:41:22,2018-07-18 10:18:00,心力衰竭;高脂血症;痛风性关节炎;频发性房性期外收缩;心功能Ⅰ级;慢性肾脏病2期;心律失常;...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,8141611,9057048,15.0,1.0,43.0,177.5,48.43500,17.465529,59.828278,,,0,,1.0,2020-04-27 17:02:39,2020-05-02 08:30:00,高脂血症;三尖瓣中度返流;糖代谢紊乱;肺部感染;痛风;扩张型心肌病;心功能Ⅱ级;糖代谢紊乱;...
197,8148588,9018899,15.0,1.0,56.0,173,47.70600,17.463862,59.742740,,,0,,1.0,2020-04-22 09:10:20,2020-04-25 10:21:00,弥漫性大B细胞淋巴瘤;阵发性心房颤动;T波改变;高血压1级;慢性乙型病毒性肝炎;弥漫性大B细...
198,8148588,9180674,15.0,1.0,56.0,173,90.00000,30.071168,116.988023,,,0,,1.0,2020-05-14 17:27:15,2020-05-17 09:50:00,弥漫性大B细胞淋巴瘤;免疫功能低下;恶性肿瘤维持性化学治疗;阵发性房颤;前列腺钙化灶;肺大泡...
199,8148762,8742662,20.0,1.0,65.0,165,42.73175,16.603383,46.093503,,,0,,2.0,2020-03-05 10:28:35,2020-03-19 10:09:40,肺占位性病变;肺不张;阻塞性肺炎;胸腔积液;阻塞性肺炎;胸腔积液;肺不张;冠心病;慢性胆囊炎...


In [275]:
# 保存并入出入院时间和诊断
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.5_并入出入院时间和诊断.xlsx')
df_lfsb_merge_inp_diagnostic.to_excel(writer)
writer.save()

## 统计再次入院的出血、卒中

In [245]:
# 按时间排序
df_lfsb_merge_inp_diagnostic=df_lfsb_merge_inp_diagnostic.sort_values(['patient_id','adm_date'])

In [246]:
# 判断两个列表中是否存在相同元素，存在返回True，否则False
def judge_list_element(list1,list2):
    judge_list=[x for x in list1 if x if list2]
    if judge_list:
        return True
    else:
        return False

In [247]:
# 根据郑-随访诊断，统计再次入院的出血、卒中
# 卒中事件
stroke_event=['脑梗死','脑梗死后遗症','腔隙性脑梗死','大脑动脉栓塞引起的脑梗死','脑梗塞','中风','脑梗死个人史','脑干梗死(康复期)','多发性脑梗死',
'左心耳封堵术后','左心耳封堵术','左心房栓子形成','多发腔隙性脑梗死','左侧基底节区陈旧性腔隙性脑梗塞','心耳血栓','小脑梗死','短暂性脑缺血',
'陈旧性脑梗死','左心耳附壁血栓','脑栓塞','基底动脉血栓形成脑梗死','左心耳血栓形成','脑梗死（基底节大动脉粥样硬化性）','多发性脑梗塞','陈旧性脑梗塞',
'脑梗死（大脑中动脉心源性）','大脑动脉狭窄脑梗死','短暂性脑缺血发作','脑梗塞后遗症','右侧小脑半球陈旧性脑梗死','脑血管取栓术后','陈旧性腔隙性脑梗死',
'大脑动脉血栓形成引起的脑梗死','肾缺血和肾梗死','左侧大脑中动脉支架取栓术后','多发腔隙性脑梗塞','胸主动脉附壁血栓','起搏器血栓形成','左心耳切除术后',
'左侧颈内动脉血管内抽吸术后','左心耳血栓']
# 出血事件
bleeding_event=['脑梗死后出血转化','消化道出血','出血性脑梗死','失血性休克','出血性内痔','脑出血后遗症','胃溃疡伴有穿孔','血尿，持续性',
'脑内出血','下消化道出血','肺泡出血可能','蛛网膜下腔出血','女性盆腔血肿','皮下出血']

In [336]:
# 排序
df_lfsb_merge_inp_diagnostic=df_lfsb_merge_inp_diagnostic.sort_values(['patient_id','case_no','adm_date'])
df_lfsb_merge_inp_diagnostic=df_lfsb_merge_inp_diagnostic.reset_index(drop=True)
# 第再次入院新出血卒中统计
group_0_num=0
group_1_num=0
group_2_num=0
# 患者id
for j in np.unique(df_lfsb_merge_inp_diagnostic['patient_id']):
#     print(type(j))
    # 患者的住院记录case_no
    temp=df_lfsb_merge_inp_diagnostic[df_lfsb_merge_inp_diagnostic['patient_id']==j]
    temp=temp.reset_index(drop=True)
    for k in range(temp.shape[0]):
        temp_diagnostic_list=str(temp.loc[k,'diagnostic_content']).split(';')
        # 如果第一次出院,存在出血卒中事件，则跳过；
        if k==0:
            if judge_list_element(temp_diagnostic_list,stroke_event) or judge_list_element(temp_diagnostic_list,bleeding_event):
#                 if j==7664380:
#                     print('看错了吧')
                break
        # 否则，统计再次入院的出血卒中事件
        if judge_list_element(temp_diagnostic_list,stroke_event) or judge_list_element(temp_diagnostic_list,bleeding_event):
            group_id=temp.loc[(k-1),'剂量分组']
            if group_id ==0:
                group_0_num +=1
            elif group_id ==1:
                group_1_num +=1
            elif group_id ==2:
                group_2_num +=1
                break

看错了吧


In [281]:
print('10mg组再次入院的新出血卒中率：',group_0_num, group_0_num/count_10)
print('15mg组再次入院的新出血卒中率：',group_1_num, group_1_num/count_15)
print('20mg组再次入院的新出血卒中率：',group_2_num, group_2_num/count_20)

10mg组再次入院的新出血卒中率： 0 0.0
15mg组再次入院的新出血卒中率： 0 0.0
20mg组再次入院的新出血卒中率： 0 0.0


In [None]:
# print('-------------------------计算多次出入院时间-----------------------------')
# temp_list=[]
# for i in np.unique(df_lfsb_group_PSM['patient_id']):
#     temp=df_lfsb_inner_mx[df_lfsb_merge_dm['patient_id']==i]
#     temp=temp.reset_index(drop=True)
#     temp_inp=df_inp_record[df_inp_record['patient_id']==i]
#     temp_inp=temp_inp.reset_index(drop=True)
#     # 判断是否存在多次入院信息，即多个case_no
#     if temp_inp.shape[0]>1:
#         for j in range(0,temp.shape[0]-1):
#             case_no=temp_inp.loc[j,'case_no']
# #             print(case_no)
# #             print(temp_inp)
#             # 出院时间
#             oup=temp_inp.loc[j,'dis_date']
#             oup_time='第%s次出院' % (j+1)
#             temp[oup_time]=oup
#             inp=temp_inp.loc[j+1,'adm_date']
#             # 入院时间和诊断
#             inp_time='第%s次入院' % (j+2)
#             diagnostic_inp='第%s次入院诊断' % (j+2)
#             diagnostic_inp_content=df_diagnostic_inp[case_no]
#             temp[inp_time]=inp
#             temp[diagnostic_inp]=diagnostic_inp
#             temp[diagnostic_inp_content]=diagnostic_inp_content
#             # 出院到入院时间间隔
#             interval_time=inp-oup
#             interval_days=interval_time.days
#             temp[oup_time+'-'+inp_time]=interval_days
#     temp_list.append(temp)
# df_lfsb_inp=temp_list[0]
# for k in range(1,len(temp_list)):
#     df_lfsb_inp=pd.concat([df_lfsb_inp,temp_list[k]],axis=0)

# df_lfsb_inp=df_lfsb_inp.sort_values(['case_no','start_datetime'])
# df_lfsb_inp=df_lfsb_inp.reset_index(drop=True)

In [None]:
writer=pd.ExcelWriter(project_path+'/data/processed_data/df_2.2_计算多次出入院时间和诊断.xlsx')
df_lfsb_inp.to_excel(writer)
writer.save()

In [None]:
# # 3.按剂量10、15、20分组，计算再次入院率
# df_lfsb_['剂量分组']=df_lfsb_not_valve['日剂量'].apply(lambda x: 0 if x==10 else 1 if x==15 else 2 if x==20 else np.nan)
# df_lfsb_0=df_lfsb_not_valve[df_lfsb_not_valve['剂量分组']==0]
# df_lfsb_1=df_lfsb_not_valve[df_lfsb_not_valve['剂量分组']==1]
# df_lfsb_2=df_lfsb_not_valve[df_lfsb_not_valve['剂量分组']==2]
# # 统计分组数
# num_10=df_lfsb_0['patient_id'].nunique()
# num_15=df_lfsb_1['patient_id'].nunique()
# num_20=df_lfsb_2['patient_id'].nunique()
# print('分组人数',num_10,num_15,num_20)
# #统计再次入院的每组人数
# count_10=0
# for i in np.unique(df_lfsb_0['patient_id']):
#     temp=df_lfsb_0[df_lfsb_0['patient_id']==i]
#     temp=temp.reset_index(drop=True)
#     if temp.shape[0]>1:
#         count_10 +=1
# count_15=0
# for i in np.unique(df_lfsb_1['patient_id']):
#     temp=df_lfsb_1[df_lfsb_1['patient_id']==i]
#     temp=temp.reset_index(drop=True)
#     if temp.shape[0]>1:
#         count_15 +=1
# count_20=0
# for i in np.unique(df_lfsb_2['patient_id']):
#     temp=df_lfsb_2[df_lfsb_2['patient_id']==i]
#     temp=temp.reset_index(drop=True)
#     if temp.shape[0]>1:
#         count_20 +=1
# print('10mg再次入院',count_10,count_10/num_10)
# print('15mg再次入院',count_15,count_15/num_10)
# print('20mg再次入院',count_20,count_20/num_10)