In [None]:
import pandas as pd
import plotly
import plotly.express as px
import numpy as np
from pathlib import Path
import re
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import config as code_config
import json

In [None]:
# Locate the TaskB train/validation CSVs and the augmented-data CSV relative to the working directory.
task_b_dir = Path.cwd() / "2023_ImageCLEFmed_Mediqa" / "dataset" / "TaskB"
train_path = task_b_dir / "TaskB-TrainingSet.csv"
validation_path = task_b_dir / "TaskB-ValidationSet.csv"
augmented_path = Path.cwd() / "TaskA-augmented_data.csv"

train_df = pd.read_csv(train_path, index_col="ID")
valid_df = pd.read_csv(validation_path, index_col="ID")

# Offset every validation ID by the number of training rows
# (presumably so the IDs stay unique once the frames are stacked -- confirm against the CSVs).
valid_index = {idx: idx + len(train_df) for idx in valid_df.index}
valid_df = valid_df.rename(index=valid_index)

augmented_data = pd.read_csv(augmented_path, index_col="ID")
augmented_sections = augmented_data["section_header"].unique().tolist()

# Stack train, validation and augmented rows, keeping their original index values.
merge_df = pd.concat([train_df, valid_df, augmented_data], axis=0, ignore_index=False)

# Collapse every run of whitespace/newlines in the dialogue into a single space.
whitespace_run = re.compile(r'[\r\n\s]+')
merge_df["dialogue_wo_whitespaces"] = merge_df["dialogue"].apply(
    lambda text: whitespace_run.sub(' ', text)
)

# Turn the index into a regular column; it is renamed to "ID" if it came out unnamed.
merge_df = merge_df.reset_index().rename(columns={'index': 'ID'})

# Partition rows by whether their section header occurs in the augmented data.
in_augmented_section = merge_df["section_header"].isin(augmented_sections)
merge_df_w_augmented_data = merge_df.loc[in_augmented_section]
merge_df_wo_augmented_data = merge_df.loc[~in_augmented_section]

In [None]:
# Display the rows whose section header does not occur in the augmented data.
merge_df_wo_augmented_data

In [None]:
# Fit a label encoder on every section header and persist both lookup directions as JSON.
le = LabelEncoder()
le.fit(merge_df["section_header"])

idx2label = dict(enumerate(le.classes_))
label2idx = {sec: i for i, sec in idx2label.items()}

# JSON object keys are always strings, so the integer keys of idx2label are written as "0", "1", ...
for filename, mapping in (
    ("TaskA_and_B-label2idx.json", label2idx),
    ("TaskA_and_B-idx2label.json", idx2label),
):
    with open(filename, "w") as f:
        f.write(json.dumps(mapping, indent=2))
# Optional (currently disabled): attach the integer label to every row.
# merge_df["label"] = merge_df["section_header"].apply(lambda x: label2idx[x])

In [None]:
# Relative frequency of each section header, as a two-column frame.
section_header_dist = (
    merge_df["section_header"]
    .value_counts(normalize=True)
    .reset_index()
    .set_axis(["section_header", "proportion"], axis=1)
)

# Absolute number of rows per section header, as a two-column frame.
section_header_cnt = (
    merge_df["section_header"]
    .value_counts()
    .reset_index()
    .set_axis(["section_header", "Count"], axis=1)
)

In [None]:
# Bar chart of the number of rows per section header.
section_count_fig = px.bar(
    data_frame=section_header_cnt,
    x='section_header',
    y='Count',
    title="Section Header Count",
)
# update_layout returns the figure, so it is rendered as the cell output.
section_count_fig.update_layout(
    xaxis_title="Section Header",
    yaxis_title="Count",
    title={'x': 0.5, 'xanchor': 'center'},
)

In [None]:
# Bar chart of the share of rows per section header.
# Axis titles and a centred title are set so the figure matches the count chart
# and can be read on its own.
px.bar(
    data_frame=section_header_dist,
    x='section_header',
    y='proportion',
    title="Section Header Proportion",
).update_layout(
    xaxis_title="Section Header",
    yaxis_title="Proportion",
    title={'x': 0.5, 'xanchor': 'center'},
)

In [None]:
# Tokenizer of the checkpoint whose token counts are measured in the cells below.
model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
# force_download is intentionally not set: the cached tokenizer files are reused, so the
# notebook does not hit the network on every run and still works offline once cached.
# Pass force_download=True once, by hand, if the local cache is suspected to be corrupted.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)

In [None]:
# Number of tokens (special tokens included) in each whitespace-normalised dialogue.
token_len_list = [
    len(tokenizer.encode(dialogue, add_special_tokens=True))
    for dialogue in merge_df["dialogue_wo_whitespaces"]
]

In [None]:
# Cumulative histogram (in percent) of the dialogue token counts.
dialogue_len_fig = px.histogram(token_len_list, cumulative=True, histnorm="percent")
dialogue_len_fig.update_layout(
    xaxis_title="Number of Tokens",
    yaxis_title="Percentage of IDs",
    title={
        'text': 'Cumulative Distribution of number of tokens in every Dialogue',
        'x': 0.5,
        'xanchor': 'center',
    },
)

In [None]:
# Number of tokens (special tokens included) in each section text.
summary_len_list = [
    len(tokenizer.encode(summary, add_special_tokens=True))
    for summary in merge_df["section_text"]
]

In [None]:
# Cumulative histogram (in percent) of the section-text token counts.
summary_len_fig = px.histogram(summary_len_list, cumulative=True, histnorm="percent")
summary_len_fig.update_layout(
    xaxis_title="Number of Tokens",
    yaxis_title="Percentage of IDs",
    title={
        'text': 'Cumulative Distribution of number of tokens in every Summary',
        'x': 0.5,
        'xanchor': 'center',
    },
)

In [None]:
# Shortest, median and longest dialogue, measured in tokens.
shortest_len = min(token_len_list)
median_len = np.median(token_len_list)
longest_len = max(token_len_list)
shortest_len, median_len, longest_len

In [None]:
# Dialogue token counts at selected percentiles (0 = minimum, 100 = maximum).
percentile_points = [0., 25, 50, 75, 80, 85, 90, 95, 99, 100]
np.percentile(token_len_list, q=percentile_points)

Dialogues of at most 300 tokens account for about 90% of the data.

In [None]:
# Token-length cut-off taken from the percentile analysis of the dialogue lengths.
# NOTE(review): not consumed anywhere in the visible cells -- confirm where it is used.
max_len = 300

In [None]:
# Stratified K-fold over the rows whose section header has no augmented data.
# For every fold the non-test part is further split 80/20 into train/valid, again
# stratified by section header, and the row IDs of each part are stored in split_dict.
skf = StratifiedKFold(
    n_splits=code_config.MULTI_CLASS_N_SPLITS,
    shuffle=True,
    random_state=code_config.SEED,
)
split_dict = dict()
for split, (train_idx, test_idx) in enumerate(
    skf.split(merge_df_wo_augmented_data, y=merge_df_wo_augmented_data["section_header"])
):
    train_df = merge_df_wo_augmented_data.iloc[train_idx, :]
    test_df = merge_df_wo_augmented_data.iloc[test_idx, :]
    train, valid = train_test_split(
        train_df,
        test_size=0.2,
        random_state=code_config.SEED,
        stratify=train_df["section_header"],
    )

    # Diagnostic only: per-class proportions of the three parts, side by side.
    # Built by aligning the three value_counts Series on their index instead of merging
    # reset_index() frames on an "index" column: that column only exists on pandas < 2.0,
    # so the merge raised KeyError on pandas 2.x.
    new_df = (
        pd.concat(
            {
                "train": train["section_header"].value_counts(normalize=True),
                "valid": valid["section_header"].value_counts(normalize=True),
                "test": test_df["section_header"].value_counts(normalize=True),
            },
            axis=1,
        )
        .fillna(0)  # a class missing from one part has proportion 0 there
        .rename_axis("index")
        .reset_index()
    )

    split_dict[split] = {
        "train": train["ID"].values.tolist(),
        "valid": valid["ID"].values.tolist(),
        "test": test_df["ID"].values.tolist(),
    }

In [None]:
# Collect the IDs that are strings containing "Augmented"; any other ID is skipped.
augmented_train_idx = []
for row_id in merge_df_w_augmented_data["ID"]:
    if not isinstance(row_id, str):
        continue
    if "Augmented" in row_id:
        augmented_train_idx.append(row_id)

In [None]:
# Add every augmented ID to the training part of each fold.
# Only IDs that are not already in the list are appended, so re-running this cell
# does not duplicate the augmented IDs in split_dict.
for split_idx, split in split_dict.items():
    already_present = set(split["train"])
    split["train"].extend(idx for idx in augmented_train_idx if idx not in already_present)

In [None]:
# Rows from the augmented sections whose ID is not one of the augmented training IDs.
is_augmented_row = merge_df_w_augmented_data["ID"].isin(augmented_train_idx)
merge_df_w_augmented_data_valid_test = merge_df_w_augmented_data.loc[~is_augmented_row, :]

In [None]:
# Stratified K-fold over the non-augmented rows of the augmented sections.
# These rows are only used for validation and testing: for every fold the K-1 "train"
# folds returned by the splitter are recorded as "valid" and the held-out fold as "test".
skf = StratifiedKFold(
    n_splits=code_config.MULTI_CLASS_N_SPLITS,
    shuffle=True,
    random_state=code_config.SEED,
)
new_split_dict = dict()
for split, (train_idx, test_idx) in enumerate(
    skf.split(merge_df_w_augmented_data_valid_test, y=merge_df_w_augmented_data_valid_test["section_header"])
):
    train_df = merge_df_w_augmented_data_valid_test.iloc[train_idx, :]
    test_df = merge_df_w_augmented_data_valid_test.iloc[test_idx, :]
    # Use the frames of THIS fold. The previous code read the stale `train`/`valid`
    # variables left over from the last fold of the earlier split loop, which put
    # training IDs into valid/test and made every fold identical.
    new_split_dict[split] = {
        "valid": train_df["ID"].values.tolist(),
        "test": test_df["ID"].values.tolist(),
    }

In [None]:
# Append the valid/test IDs of the augmented sections to the matching fold of split_dict.
# Only IDs that are not already in the list are appended, so re-running this cell
# does not duplicate entries.
for split_idx, split in split_dict.items():
    for part in ("valid", "test"):
        already_present = set(split[part])
        split[part].extend(
            idx for idx in new_split_dict[split_idx][part] if idx not in already_present
        )

In [None]:
# Persist the fold -> {train, valid, test} ID lists as indented JSON.
# JSON object keys are always strings, so the integer fold numbers are written as "0", "1", ...
split_json = json.dumps(split_dict, indent=2)
with open("taskA_and_B_train_valid_test_split.json", "w") as f:
    f.write(split_json)