In [2]:
import pandas as pd
from tqdm import tqdm

root = './data/'
TIMEDURING = 6
col = ['stay_id', 'subject_id', 'time',
       'ph', 'po2', 'calcium', 'bicarbonate', 'creatinine', 'sodium', 'potassium', 'uo_rt_24hr']
state_col = ['ph', 'po2', 'calcium', 'bicarbonate', 'creatinine', 'sodium', 'potassium', 'uo_rt_24hr']

In [23]:
#将状态信息插入到同一个表中
df_list = []
for file in ['kdigo_stages.csv','bg.csv','chemistry.csv']:
    df_list.append(pd.read_csv(root + file))
df = pd.concat(df_list, keys='time')
df.columns.tolist()

['stay_id',
 'subject_id',
 'creat',
 'uo_rt_24hr',
 'time',
 'ph',
 'temperature',
 'lactate',
 'so2',
 'po2',
 'hemoglobin',
 'chloride',
 'calcium',
 'bicarbonate',
 'bun',
 'creatinine',
 'sodium',
 'potassium']

In [24]:

df_state_oral = df[col]
df_state_oral.to_csv('./data/df_state_oral.csv')

In [14]:
#下面读取动作，按照设定的参数进行时间区域的划分
df = pd.read_csv('./data/stay.csv')

# 确保时间列是datetime类型
df['intime'] = pd.to_datetime(df['intime'], format='%Y/%m/%d %H:%M')
df['outtime'] = pd.to_datetime(df['outtime'], format='%Y/%m/%d %H:%M')

# 创建一个空列表来存储时间切片数据
slices = []

# 遍历每行来创建时间切片
for index, row in tqdm(df.iterrows()):
    start_time = row['intime']
    end_time = min(start_time + pd.Timedelta(hours=1), row['outtime'])
    
    while start_time < row['outtime']:
        slices.append({
            'stay_id': row['stay_id'],
            'start_time': start_time,
            'end_time': end_time
        })
        
        start_time += pd.Timedelta(hours=TIMEDURING)
        end_time = min(start_time + pd.Timedelta(hours=TIMEDURING), row['outtime'])

# 利用列表创建DataFrame
slices_df = pd.DataFrame(slices)
slices_df.to_csv('./data/icu_stay.csv', index=False)

42258it [00:34, 1241.95it/s]


In [17]:
df_egfr = pd.read_csv('./data/egfr.csv').drop_duplicates()
df_slice = pd.read_csv('./data/icu_stay.csv')
df_crrt = pd.read_csv('./data/crrt.csv')
df_ihd = pd.read_csv('./data/ihd.csv')

stay_id_crrt = df_crrt['stay_id'].unique()
stay_id_ihd = df_ihd['stay_id'].unique()

df_slice = df_slice[(df_slice['stay_id'].isin(stay_id_crrt) ) | (df_slice['stay_id'].isin(stay_id_ihd))]


In [18]:
print('concat eGFR')
# 确保时间格式正确
df_egfr['charttime'] = pd.to_datetime(df_egfr['charttime'], format='%Y-%m-%d %H:%M:%S')
df_slice['start_time'] = pd.to_datetime(df_slice['start_time'], format='%Y-%m-%d %H:%M:%S')
df_slice['end_time'] = pd.to_datetime(df_slice['end_time'], format='%Y-%m-%d %H:%M:%S')

# 定义函数计算时间段内的平均 eGFR
def calculate_avg_egfr(row):
    # 筛选出同一 stay_id 且 charttime 在 start_time 和 end_time 之间的记录
    mask = (df_egfr['stay_id'] == row['stay_id']) & (df_egfr['charttime'] >= row['start_time']) & (df_egfr['charttime'] <= row['end_time'])
    filtered_egfr = df_egfr.loc[mask, 'eGFR']
    
    # 计算平均值
    return filtered_egfr.mean()
tqdm.pandas(desc="计算平均 eGFR")

# 使用 progress_apply 代替 apply
df_slice['avg_eGFR'] = df_slice.progress_apply(calculate_avg_egfr, axis=1)

# 确保时间字段是正确的 datetime 类型
df_crrt['starttime'] = pd.to_datetime(df_crrt['starttime'], format='%Y-%m-%d %H:%M:%S')
df_crrt['endtime'] = pd.to_datetime(df_crrt['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ihd['starttime'] = pd.to_datetime(df_ihd['starttime'], format='%Y-%m-%d %H:%M:%S')
df_ihd['endtime'] = pd.to_datetime(df_ihd['endtime'], format='%Y-%m-%d %H:%M:%S')
df_slice['start_time'] = pd.to_datetime(df_slice['start_time'], format='%Y-%m-%d %H:%M:%S')
df_slice['end_time'] = pd.to_datetime(df_slice['end_time'], format='%Y-%m-%d %H:%M:%S')

# 初始化新列
df_slice['crrt'] = ''
df_slice['ihd'] = ''
df_slice['patientweight'] = None
df_slice['originalamount'] = None

# 定义检查时间重叠的函数
def check_overlap(row, df, label):
    # 检查时间重叠
    overlaps = df[(df['starttime'] <= row['end_time']) & (df['endtime'] >= row['start_time']) & (df['stay_id'] == row['stay_id'])]
    if not overlaps.empty:
        row[label] = 'yes'
        row['patientweight'] = overlaps['patientweight'].values[0]  # 假设取第一个重叠记录的值
        row['originalamount'] = overlaps['originalamount'].values[0]  # 假设取第一个重叠记录的值
    return row

# 应用函数到 df_slice 的每一行
print('concat crrt')
tqdm.pandas(desc='cal crrt')
df_slice = df_slice.progress_apply(check_overlap, args=(df_crrt, 'crrt'), axis=1)
print('concat IHD')
tqdm.pandas(desc='cal ihd')
df_slice = df_slice.progress_apply(check_overlap, args=(df_ihd, 'ihd'), axis=1)


concat eGFR


计算平均 eGFR: 100%|██████████| 130120/130120 [22:17<00:00, 97.28it/s] 


concat crrt


cal crrt: 100%|██████████| 130120/130120 [01:25<00:00, 1522.95it/s]


concat IHD


cal ihd: 100%|██████████| 130120/130120 [01:21<00:00, 1587.57it/s]


In [20]:
df_slice.to_csv('./data/df_r_a.csv')

In [4]:
#将状态信息concat其中
df_slice = pd.read_csv('./data/df_r_a.csv')
df_state_oral = pd.read_csv('./data/df_state_oral.csv')
# 定义函数计算时间段内的平均 eGFR
def calculate_avg_state(row):
    # 筛选出同一 stay_id 且 charttime 在 start_time 和 end_time 之间的记录
    mask = (df_state_oral['stay_id'] == row['stay_id']) & (df_state_oral['time'] >= row['start_time']) & (df_state_oral['time'] <= row['end_time'])
    filtered_state = df_state_oral.loc[mask, state_col]
    # 计算平均值
    return filtered_state.mean()
# tqdm.pandas(desc="插入状态数据")

# 使用 progress_apply 代替 apply
stay_id = df_slice['stay_id'].unique().tolist()
for id in tqdm(stay_id[870:]):
    df = df_slice[df_slice['stay_id']==id]
    df[state_col] = df.apply(calculate_avg_state, axis=1)
    df.to_csv(f'./data/temp/{id}.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in