In [16]:
# 进行的是Data sampling & Profile
# 有效样本数量、正负样本、轨迹长度分布、含有三种动作切换的样本数量，状态列的统计信息

In [40]:
import pandas as pd

df = pd.read_csv('./data/res.csv')
state_col = ['ph',
 'po2',
 'calcium',
 'bicarbonate',
 'creatinine',
 'sodium',
 'potassium',
 'uo_rt_24hr']
col_list = ['reward'] + state_col
df['stay_id'].nunique()

3413

In [21]:
# 对df按照stay_id分割样本
# 首先筛选包含至少包含8条记录的样本
# 根据样本step最大的reward列记录的正负，判断正负样本
# 统计筛选后样本的行数记录分布
# 进行col列的统计
# 进行col列缺失率的统计

# 筛选action列含有三个unique值的样本
# 进行col列的统计
# 进行col列缺失率的统计

0         0
1         1
2         0
3         1
4         0
         ..
75153     8
75154     9
75155    10
75156    11
75157    12
Name: step, Length: 75158, dtype: int64

In [36]:
# 分组
grouped = df.groupby('stay_id')
# 筛选
filtered_df = grouped.filter(lambda x: len(x) >= 8)

In [38]:
dfdes = filtered_df.describe()
dfdes.to_csv('./data/desc.csv')

In [25]:
# filtered_df['stay_id'].nunique()
filtered_df

Unnamed: 0,step,stay_id,step.1,ph,po2,calcium,bicarbonate,creatinine,sodium,potassium,uo_rt_24hr,action,reward
4,0,30009597,0,7.35,30.0,0.97,22.0,4.825,138.0,3.975,0.3161,1,0.000000
5,1,30009597,1,7.35,30.0,0.97,23.0,4.350,138.0,3.950,0.3161,1,0.630903
6,2,30009597,2,7.35,30.0,0.97,24.0,3.875,138.0,3.925,0.3161,0,0.630903
7,3,30009597,3,7.35,30.0,0.97,25.0,3.400,138.0,3.900,0.3161,0,0.630903
8,4,30009597,4,7.35,30.0,0.97,24.5,3.675,138.0,3.950,0.3161,0,-0.502822
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75153,8,39998012,8,7.44,99.0,1.08,26.0,0.600,136.0,3.700,,2,0.000000
75154,9,39998012,9,7.34,123.0,1.22,24.0,0.600,137.0,3.900,,2,0.000000
75155,10,39998012,10,7.32,118.0,1.13,22.0,1.100,138.0,4.100,,2,0.000000
75156,11,39998012,11,7.33,104.0,1.09,21.0,1.100,138.0,4.100,,2,0.000000


In [35]:
# Category for Treatment Result

# 按照stay_id分组
grouped = filtered_df.groupby('stay_id')

# 找到每个组中step最大的行的索引
idx_max_step = grouped['step'].idxmax()

# 使用.loc访问这些索引对应的reward
max_step_rewards = filtered_df.loc[idx_max_step, 'eGFR']

# 判断每个reward的正负
trajectory_type = max_step_rewards.apply(lambda x: 'Positive' if x > 90 else 'Negative')

# 计算正负轨迹的数量
trajectory_counts = trajectory_type.value_counts()

# 打印结果
print(trajectory_counts)

Negative    1856
Positive      88
Name: eGFR, dtype: int64


In [27]:
# Category for Trajectory action Type

#按照stay_id分组，查询action列的值仅包含2，包含1和2及其他三种情况，统计组的数量
import pandas as pd

# 假设 df 是你的原始 DataFrame

# 按照 stay_id 分组
grouped = filtered_df.groupby('stay_id')

# 定义一个函数来分类 action 值的情况
def classify_actions(actions):
    unique_actions = set(actions)
    if unique_actions == {2}:
        return 'Only 2'
    elif unique_actions >= {1, 2}:
        return 'Contains 1 and 2'
    else:
        return 'Other'

# 应用分类函数并统计结果
action_categories = grouped['action'].apply(classify_actions)
category_counts = action_categories.value_counts()

# 打印结果
print(category_counts)

Other               1179
Contains 1 and 2     521
Only 2               244
Name: action, dtype: int64


In [28]:
# Category for Time Span
# 检查每个stay_id组别的行数，统计小于84和大于84的组数
group_sizes = filtered_df.groupby('stay_id').size()
# 用条件判断来分类组的大小
less_than_84 = group_sizes[group_sizes < 84].count()
greater_or_equal_84 = group_sizes[group_sizes >= 84].count()

# 打印结果
print(f"Number of groups with less than 84 rows: {less_than_84}")
print(f"Number of groups with 84 or more rows: {greater_or_equal_84}")

Number of groups with less than 84 rows: 1786
Number of groups with 84 or more rows: 158


In [20]:
# 使用transform找到每个stay_id分组中step最大时的reward
filtered_df['max_step_reward'] = grouped['reward'].transform(lambda x: x[x.index == x['step'].idxmax()])

# 标记正负样本
filtered_df['sample_type'] = filtered_df['max_step_reward'].apply(lambda x: 'Positive' if x > 0 else 'Negative')

KeyError: 'step'

In [None]:
# 统计每个stay_id的行数
sample_distribution = filtered_df.groupby('stay_id').size()
print(sample_distribution)

stay_id
30009597     22
30031755     11
30045407     14
30045625     53
30056217     31
           ... 
39972385     21
39977408     12
39985110     22
39986206    114
39998012     13
Length: 1944, dtype: int64


In [None]:
# 对col列进行描述性统计
col_stats = filtered_df['col'].describe()
print(col_stats)

KeyError: 'col'

In [None]:
# 计算缺失率
missing_rate = filtered_df['col'].isnull().mean()
print(f"Missing rate of 'col': {missing_rate * 100:.2f}%")

In [None]:
# 筛选含有三个unique action值的样本
samples_with_three_unique_actions = filtered_df.groupby('stay_id').filter(lambda x: x['action'].nunique() == 3)

In [None]:
# 对col列进行描述性统计
col_stats_post_action_filter = samples_with_three_unique_actions['col'].describe()
print(col_stats_post_action_filter)

In [None]:
# 计算缺失率
missing_rate_post_action_filter = samples_with_three_unique_actions['col'].isnull().mean()
print(f"Missing rate of 'col' after action filter: {missing_rate_post_action_filter * 100:.2f}%")