In [23]:
import json
import pandas as pd
with open('/home/weisi/TemporalAssessment/data/MIMIC-IV-Note/mimic_final.json', 'r', encoding='utf-8') as f:
    df=pd.read_json(f,lines=True)


In [24]:
print(df.groupby('time').size())

time
2008 - 2010    149591
2011 - 2013     77556
2014 - 2016     61476
2017 - 2019     43022
2020 - 2022         1
dtype: int64


In [25]:
empty_label_count = df['label'].apply(lambda x: len(x) == 0).sum()
print(empty_label_count)
filtered_df = df[df['label'].apply(lambda x: len(x) > 0)]
print(filtered_df.groupby('time').size())

28926
time
2008 - 2010    137631
2011 - 2013     70089
2014 - 2016     56411
2017 - 2019     38588
2020 - 2022         1
dtype: int64


In [26]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def compute_token_length(row):
    tokens = tokenizer.tokenize(row['text'])
    return len(tokens)

filtered_df['token_length'] = filtered_df.apply(compute_token_length, axis=1)
'''length_distribution = filtered_df['token_length'].value_counts().sort_index()
print(length_distribution)
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['token_length'] = filtered_df.apply(compute_token_length, axis=1)


"length_distribution = filtered_df['token_length'].value_counts().sort_index()\nprint(length_distribution)\n"

In [27]:
length_statistics = filtered_df['token_length'].describe()
print("Token length statistics:")
print(length_statistics)

def count_below_threshold(group, threshold):
    return (group['token_length'] < threshold).sum()

counts_below_256 = filtered_df.groupby('time').apply(count_below_threshold, threshold=256)
counts_below_512 = filtered_df.groupby('time').apply(count_below_threshold, threshold=512)
print("Number of entries with token length below 256 by time:")
print(counts_below_256)
print("\nNumber of entries with token length below 512 by time:")
print(counts_below_512)

Token length statistics:
count    302720.000000
mean       1979.489783
std         835.777183
min          13.000000
25%        1391.000000
50%        1856.000000
75%        2418.000000
max       10986.000000
Name: token_length, dtype: float64
Number of entries with token length below 256 by time:
time
2008 - 2010    91
2011 - 2013    26
2014 - 2016    28
2017 - 2019    28
2020 - 2022     0
dtype: int64

Number of entries with token length below 512 by time:
time
2008 - 2010    816
2011 - 2013    277
2014 - 2016    206
2017 - 2019    186
2020 - 2022      0
dtype: int64


  counts_below_256 = filtered_df.groupby('time').apply(count_below_threshold, threshold=256)
  counts_below_512 = filtered_df.groupby('time').apply(count_below_threshold, threshold=512)


In [28]:
filtered_df=filtered_df[filtered_df['token_length'] > 256]

In [29]:
grouped = filtered_df.groupby('time')['token_length'].describe()
print(grouped)

                count         mean         std     min     25%     50%  \
time                                                                     
2008 - 2010  137540.0  1922.443711  800.192187   257.0  1359.0  1812.0   
2011 - 2013   70062.0  1979.130213  830.837909   261.0  1392.0  1858.0   
2014 - 2016   56383.0  2025.577373  856.778981   261.0  1418.0  1894.0   
2017 - 2019   38560.0  2124.324300  906.687133   260.0  1481.0  1975.0   
2020 - 2022       1.0  2701.000000         NaN  2701.0  2701.0  2701.0   

                 75%      max  
time                           
2008 - 2010  2346.25  10970.0  
2011 - 2013  2421.00  10986.0  
2014 - 2016  2473.50  10273.0  
2017 - 2019  2598.25   9186.0  
2020 - 2022  2701.00   2701.0  


In [30]:
import os
from sklearn.model_selection import train_test_split
seed=1

folder_path ='/home/weisi/TemporalAssessment/data/MIMIC-IV-Note/seed{}/'.format(seed)

if not os.path.exists(folder_path):
    os.makedirs(folder_path)

ndf=filtered_df[['uid', 'did', 'time', 'text', 'label']]
# devide dataset to 4 time periods
df_2008_2010 = ndf[ndf['time'] == '2008 - 2010']
df_2011_2013 = ndf[ndf['time'] == '2011 - 2013']
df_2014_2016 = ndf[ndf['time'] == '2014 - 2016']
df_2017_2019 = ndf[ndf['time'] == '2017 - 2019']

# reduce the datasets to the same size
min_size = min(len(df_2008_2010), len(df_2011_2013), len(df_2014_2016), len(df_2017_2019))


df_2008_2010_sampled = df_2008_2010.sample(n=min_size, random_state=seed)
df_2011_2013_sampled = df_2011_2013.sample(n=min_size, random_state=seed)
df_2014_2016_sampled = df_2014_2016.sample(n=min_size, random_state=seed)
df_2017_2019_sampled = df_2017_2019.sample(n=min_size, random_state=seed)


def save_datasets(df, period,seed):
    print('label number: ',len(pd.Series(df['label'].explode().unique())))
    # split train, validation and test datasets by ratio 0.6 0.2 0.2
    train, test = train_test_split(df, test_size=0.4, random_state=seed)  
    validation, test = train_test_split(test, test_size=0.5, random_state=seed)  

    # save files
    train_filename = f'{period}_train.json'
    validation_filename = f'{period}_validation.json'
    test_filename = f'{period}_test.json'
    train.to_json(os.path.join(folder_path, train_filename), orient='records', lines=True)
    validation.to_json(os.path.join(folder_path, validation_filename), orient='records', lines=True)
    test.to_json(os.path.join(folder_path, test_filename), orient='records', lines=True)


save_datasets(df_2008_2010_sampled, 'T1',seed)
save_datasets(df_2011_2013_sampled, 'T2',seed)
save_datasets(df_2014_2016_sampled, 'T3',seed)
save_datasets(df_2017_2019_sampled, 'T4',seed)


label list:  0      I2510
1      I5032
2       I509
3      I4891
4       J449
5       K219
6       I252
7      Z9861
8      G8929
9       I129
10      N189
11      M109
12      E785
13     M1990
14    Z86718
15     Z7982
16     Z7901
17      Z794
18      F419
19      N179
20       I10
21      F329
22       D62
23      D696
24    Z87891
25      D649
26     Z8673
27      E875
28    F17200
29      J189
30      E871
31      E039
32      N400
33      E119
34       Z66
35      N183
36      M810
37      Z951
38      E860
39     G4733
40    J45998
41     E7800
42      D509
43      Y929
44      E669
45      N390
46      N186
47     Z7902
48      E872
49     K5900
dtype: object
label list:  0      Z7901
1       E785
2       M810
3     Z87891
4      E7800
5       E119
6       I129
7       N189
8      G4733
9       I509
10      D649
11     I2510
12       I10
13      E875
14    J45998
15      K219
16      N390
17      N179
18      N183
19     K5900
20     Z8673
21      N186
22     I5032
23       D6

In [None]:
samples_per_period = min_size // 4  # floor

# create a all year data that draw equal data from 4 time periods and has same size as other time periods
allyear_sampled = pd.concat([
    df_2008_2010.sample(n=samples_per_period, random_state=1),
    df_2011_2013.sample(n=samples_per_period, random_state=1),
    df_2014_2016.sample(n=samples_per_period, random_state=1),
    df_2017_2019.sample(n=samples_per_period, random_state=1)
])
# randomlize the order od all year data
allyear_sampled = allyear_sampled.sample(frac=1, random_state=1).reset_index(drop=True)
save_datasets(allyear_sampled, 'Allyear_sampled')