In [148]:
'''
在获得粗略数据后，构建可以被模型读取的数据
'''
# (English gloss of the string above: "after obtaining the rough data, build
#  data that can be read by the model". The string itself is left untouched
#  because the notebook echoes it as this cell's output.)
#
# Plan for the preprocessing below:
# - The window kept for each patient runs from the start of RRT to its end.
# - A reward is computed for every step, plus a terminal term based on the
#   7 days after RRT is stopped.
# - Actions are discretised into IHD, CRRT and "no RRT" segments.
# - States are completed by Gaussian imputation.
#   NOTE(review): the batch-loop cell currently fills gaps with linear
#   interpolation followed by forward/backward fill; the Gaussian-process
#   version only exists as commented-out code. Confirm which is intended.
# - When actions overlap, IHD takes priority.
#   NOTE(review): the batch-loop cell checks `crrt` first, so CRRT (2) wins
#   over IHD (1) when both are present. Confirm which priority is intended.

'\n在获得粗略数据后，构建可以被模型读取的数据\n'

In [149]:
# Time discretisation: TIMEDURING is the divisor that turns hours into steps,
# CON_WINDOWS is the number of steps covering 7 days (7 * 24 hours).
TIMEDURING = 6
CON_WINDOWS = (7 * 24) // TIMEDURING

# Reward parameters.
# C_1, C_2, C_3 weight the three terms of the per-step reward.
C_1, C_2, C_3 = 0.3, 0.8, -0.1
# C4, C5 weight the two terms of the terminal (last-step) reward.
C4, C5 = 0.8, 0.2
# Rate constant inside the exponential of the third per-step term.
k = -0.1


import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel


In [150]:

# file_demo = pd.read_csv('./data/temp/30166100.csv')

# df = file_demo
# Identifier columns of the output table.
ID_col = ['stay_id', 'step']

# Measurement columns that make up the state.
state_col = [
    'ph', 'po2', 'calcium', 'bicarbonate',
    'creatinine', 'sodium', 'potassium', 'uo_rt_24hr',
]

# Columns that get gap-filled: eGFR first, then every state column.
col_list = ['eGFR', *state_col]



In [158]:
# Build the model-ready table from the per-stay CSV files in ./data/temp/.
#
# For every file the loop:
#   1. orders the rows by start_time and numbers them with an integer `step`;
#   2. fills gaps in eGFR and the state columns (linear interpolation, then
#      forward/backward fill for the edges) -- note this is NOT the Gaussian
#      process imputation, which only survives as commented-out code below;
#   3. trims the rows to the span between the first and last IHD/CRRT record;
#   4. encodes the action (2 = CRRT, 1 = IHD, 0 = none);
#   5. computes the per-step reward and overwrites the last step's reward with
#      the terminal reward.
# Files without any IHD/CRRT record are reported and skipped.
# The result is collected in `df_merge` and written to ./data/res.csv.

# eGFR cut-off used by both the per-step and the terminal reward.
EGFR_THRESHOLD = 90
# Step after which the third reward term starts to grow (14 * 24 hours
# expressed in steps); loop-invariant, so computed once.
LATE_PENALTY_START = 14 * 24 / TIMEDURING

temp_dir = './data/temp/'
# Sorted so the row order of res.csv is reproducible across file systems;
# restricted to CSV files so stray entries (e.g. checkpoint folders) do not
# crash pd.read_csv.
file_pool = sorted(name for name in os.listdir(temp_dir) if name.lower().endswith('.csv'))

# Per-patient frames are gathered here and concatenated once at the end;
# concatenating inside the loop re-copies all previous rows on every iteration.
patient_frames = []
for file in tqdm(file_pool):
    df = pd.read_csv(temp_dir + file)

    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])
    # Chronological order, then a fresh 0..n-1 index used as the step number.
    df = df.sort_values('start_time').reset_index(drop=True)
    df['step'] = df.index
    df = df.set_index('step')

    # Gap filling: linear interpolation between known points, then the nearest
    # known value for leading/trailing gaps.
    df['eGFR'] = df['avg_eGFR']
    df[col_list] = df[col_list].interpolate(method='linear').ffill().bfill()

    # Locate the first and last step that carries an IHD or CRRT record.
    has_rrt = df['ihd'].notna() | df['crrt'].notna()
    if not has_rrt.any():
        print(f'{file} have no rrt record')
        continue
    rrt_steps = df.index[has_rrt]
    first_index, last_index = rrt_steps[0], rrt_steps[-1]

    # Lowest eGFR from the last RRT record up to CON_WINDOWS steps later
    # (clipped to the available rows); it drives the terminal reward.
    max_index = min(last_index + CON_WINDOWS, df.index.max())
    min_value = df.loc[last_index:max_index, 'eGFR'].min()

    # Trim to the RRT window and renumber the steps from 0.
    # NOTE(review): .loc slicing is inclusive on both ends, so this keeps one
    # row AFTER the last RRT record when such a row exists. Behaviour is kept
    # as-is; confirm the extra row is intended.
    # reset_index returns a new frame, so the column assignments below do not
    # write into a slice of the untrimmed data.
    df = df.loc[first_index:last_index + 1].reset_index(drop=True)
    df['step'] = df.index

    # Action encoding: the first matching condition wins, so CRRT takes
    # priority over IHD when both are recorded on the same row.
    df['action'] = np.select([df['crrt'].notna(), df['ihd'].notna()], [2, 1], default=0)

    # Term 1: indicator eGFR >= threshold.
    df['sgn_eGFR'] = (df['eGFR'] >= EGFR_THRESHOLD).astype(int)

    # Step-to-step change of eGFR; the first step has no predecessor -> 0.
    df['delta_eGFR'] = df['eGFR'].diff().fillna(0)

    # Term 2: the change squashed into (-1, 1) by a rescaled sigmoid.
    df['exp_part_2'] = 2 / (1 + np.exp(-df['delta_eGFR'])) - 1

    # Term 3: 0 until LATE_PENALTY_START, then 1 - exp(k * steps_past_start).
    df['exp_part_3'] = 1 - np.exp(k * np.maximum(0, df['step'] - LATE_PENALTY_START))

    # Per-step reward.
    df['reward'] = C_1 * df['sgn_eGFR'] + C_2 * df['exp_part_2'] + C_3 * df['exp_part_3']

    # Terminal reward replaces the reward of the last kept step.
    # NOTE(review): this comparison is strict (>) while term 1 uses >=;
    # kept as in the original, confirm whether the difference is intended.
    last_step = df.index[-1]
    terminal_sign = 1 if min_value > EGFR_THRESHOLD else -1
    df.loc[last_step, 'reward'] = C4 * terminal_sign + C5 * df.loc[last_step, 'delta_eGFR']

    patient_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file produced any rows.
df_merge = pd.concat(patient_frames) if patient_frames else pd.DataFrame()
df_merge.to_csv('./data/res.csv')


# # 将 starttime 和 endtime 转换为浮点数（UNIX 时间戳形式），然后计算平均值
# # df['time_float'] = (df['start_time'].astype('int64') + df['end_time'].astype('int64')) / 2 / 10**9
# df['eGFR'] = df['avg_eGFR']
# col = 'eGFR'
# # 分离出已知和未知的数据点
# known_data = df.dropna(subset=[col])
# unknown_data = df[df[col].isna()]

# # 定义高斯过程回归模型
# kernel = 1.0 * RBF(length_scale=10.0) + WhiteKernel(noise_level=0.1)
# gp = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=10)

# # 训练高斯过程回归器
# gp.fit(known_data[['step']], known_data[col])

# # 使用模型对未知数据进行预测
# x_pred, sigma = gp.predict(unknown_data[['step']], return_std=True)

# df.loc[df[col].isna(), 'eGFR'] = x_pred
# # eGFR要保留原值


  0%|          | 0/3453 [00:00<?, ?it/s]

  1%|          | 18/3453 [00:00<00:39, 85.96it/s]

30034929.csv have no rrt record


  3%|▎         | 88/3453 [00:00<00:37, 89.28it/s]

30215388.csv have no rrt record


  4%|▎         | 127/3453 [00:01<00:36, 90.35it/s]

30340367.csv have no rrt record


  9%|▊         | 296/3453 [00:03<00:36, 85.85it/s]

30799244.csv have no rrt record


  9%|▉         | 323/3453 [00:03<00:36, 85.62it/s]

30896903.csv have no rrt record


 12%|█▏        | 422/3453 [00:04<00:38, 77.87it/s]

31222340.csv have no rrt record


 13%|█▎        | 464/3453 [00:05<00:36, 81.60it/s]

31361483.csv have no rrt record


 15%|█▌        | 525/3453 [00:06<00:35, 82.38it/s]

31524467.csv have no rrt record
31528992.csv have no rrt record


 17%|█▋        | 599/3453 [00:07<00:37, 77.13it/s]

31720814.csv have no rrt record


 21%|██        | 727/3453 [00:08<00:37, 72.99it/s]

32136845.csv have no rrt record


 23%|██▎       | 807/3453 [00:10<00:36, 71.80it/s]

32348707.csv have no rrt record


 26%|██▌       | 904/3453 [00:11<00:35, 70.86it/s]

32625996.csv have no rrt record


 29%|██▉       | 1011/3453 [00:13<00:37, 64.72it/s]

32875170.csv have no rrt record


 32%|███▏      | 1110/3453 [00:14<00:35, 65.55it/s]

33129284.csv have no rrt record


 36%|███▌      | 1251/3453 [00:17<00:36, 60.09it/s]

33505376.csv have no rrt record


 38%|███▊      | 1303/3453 [00:17<00:36, 58.91it/s]

33600218.csv have no rrt record


 40%|███▉      | 1372/3453 [00:19<00:34, 60.84it/s]

33786587.csv have no rrt record


 45%|████▍     | 1547/3453 [00:22<00:32, 58.35it/s]

34318564.csv have no rrt record


 46%|████▋     | 1602/3453 [00:23<00:33, 55.75it/s]

34506990.csv have no rrt record


 49%|████▉     | 1699/3453 [00:25<00:32, 54.59it/s]

34767909.csv have no rrt record


 54%|█████▍    | 1871/3453 [00:28<00:30, 51.66it/s]

35274229.csv have no rrt record


 56%|█████▌    | 1938/3453 [00:29<00:28, 53.17it/s]

35463762.csv have no rrt record


 58%|█████▊    | 1992/3453 [00:30<00:27, 54.00it/s]

35635578.csv have no rrt record
35654664.csv have no rrt record


 58%|█████▊    | 2010/3453 [00:31<00:27, 53.00it/s]

35675484.csv have no rrt record


 64%|██████▍   | 2210/3453 [00:35<00:27, 45.92it/s]

36250287.csv have no rrt record


 74%|███████▍  | 2561/3453 [00:43<00:20, 43.17it/s]

37272868.csv have no rrt record


 82%|████████▏ | 2846/3453 [00:50<00:14, 41.64it/s]

38114702.csv have no rrt record


 84%|████████▎ | 2885/3453 [00:51<00:13, 41.47it/s]

38236658.csv have no rrt record


 88%|████████▊ | 3036/3453 [00:54<00:10, 40.82it/s]

38680413.csv have no rrt record


 89%|████████▉ | 3071/3453 [00:55<00:09, 40.13it/s]

38817538.csv have no rrt record


 91%|█████████ | 3133/3453 [00:57<00:08, 39.59it/s]

38971017.csv have no rrt record


 91%|█████████▏| 3158/3453 [00:58<00:07, 39.66it/s]

39045520.csv have no rrt record


 92%|█████████▏| 3183/3453 [00:58<00:06, 40.19it/s]

39104319.csv have no rrt record


 93%|█████████▎| 3202/3453 [00:59<00:06, 39.29it/s]

39149172.csv have no rrt record


 93%|█████████▎| 3215/3453 [00:59<00:06, 38.73it/s]

39190025.csv have no rrt record


 93%|█████████▎| 3223/3453 [00:59<00:06, 38.32it/s]

39210893.csv have no rrt record


 96%|█████████▌| 3320/3453 [01:02<00:03, 38.02it/s]

39554910.csv have no rrt record
39568034.csv have no rrt record


100%|██████████| 3453/3453 [01:06<00:00, 52.13it/s]


In [157]:
# Write the merged table to disk. This repeats the export done by the last
# statement of the batch-loop cell (same frame, same path).
df_merge.to_csv('./data/res.csv')

In [152]:
# 对数据的长度进行裁切
# 寻找第一个和最后一个IHD、CRRT记录点

# first_index = min(df['ihd'].first_valid_index(), df['crrt'].first_valid_index())
# last_index = max(df['ihd'].last_valid_index(), df['crrt'].last_valid_index())
# print(f"first non-null value at step: {first_index}")
# print(f"last non-null value at step: {last_index}")
# # 计算eGFR的末期值
# max_index = min(last_index + CON_WINDOWS, df.index.max())
# # 使用loc来安全地获取范围内的数据并计算最小值
# min_value = df.loc[last_index:max_index, 'eGFR'].min()

# df = df.loc[first_index:last_index+1]
# df.reset_index(drop=True, inplace=True)
# df['step'] = df.index  # index 是自动从0开始的整数序列

In [153]:
# # 对动作进行处理
# df['action'] = np.where(pd.notna(df['crrt']), 2, 
#                         np.where(pd.notna(df['ihd']), 1, 0)) #若crrt非空则2，若IHD非空则1，否则0


In [154]:
# # 参数
# C_1 = 0.3
# C_2 = 0.8
# C_3 = -0.1
# C4 = 0.8
# C5 = 0.2
# k = -0.1

# # 计算sgn(eGFR >= 90)
# df['sgn_eGFR'] = (df['eGFR'] >= 90).astype(int)

# # 计算ΔeGFR
# df['delta_eGFR'] = df['eGFR'].diff().fillna(0)  # 填充第一个值为0或其他适当方法

# # 计算第二项中的表达式
# df['exp_part_2'] = 2 / (1 + np.exp(-df['delta_eGFR'])) - 1

# # 计算第三项中的表达式
# df['exp_part_3'] = 1 - np.exp(k * np.maximum(0, df['step'] - 14 * 24 / TIMEDURING))

# # 计算最终reward
# df['reward'] = C_1 * df['sgn_eGFR'] + C_2 * df['exp_part_2'] + C_3 * df['exp_part_3']




In [155]:


# Terminal-reward computation and export for a single patient frame.
#
# Disabled: this is a leftover from developing on one demo file. The batch
# loop already applies the terminal reward to every patient and writes
# ./data/res.csv. Run top-to-bottom, this cell would reuse `df` and
# `min_value` leaked from the loop's last iteration and overwrite
# ./data/res.csv with that single patient's rows (or raise KeyError when the
# last file had no RRT record and therefore no 'action' column).
#
# df.loc[df.index[-1], 'reward'] = C4 *  (1 if min_value > 90 else -1) + C5 * df.loc[df.index[-1], 'delta_eGFR']
# df[ID_col+state_col+['action', 'reward']].to_csv('./data/res.csv')
