In [None]:
import pandas as pd
import plotly
import plotly.express as px
import numpy as np
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from pathlib import Path
from datasets import Dataset,DatasetDict,load_dataset,load_metric
import evaluate
import re
from sklearn.model_selection import KFold, StratifiedKFold
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler

In [None]:
# TaskA training / validation CSVs, resolved relative to the current working directory.
train_path = Path.cwd() / "mediqa-chat-data" / "TaskA" / "TaskA-TrainingSet.csv"
validation_path = Path.cwd() / "mediqa-chat-data" / "TaskA" / "TaskA-ValidationSet.csv"

# Each split is read with its "ID" column as the index.
train_df = pd.read_csv(train_path, index_col="ID")
valid_df = pd.read_csv(validation_path, index_col="ID")

# Stack both splits. ignore_index=True discards the per-file IDs, so the "ID"
# column produced below is simply the 0-based row position in the merged frame.
merge_df = (
    pd.concat([train_df, valid_df], axis=0, ignore_index=True)
    .assign(
        # Collapse every run of whitespace (including line breaks) to one space.
        dialogue_wo_whitespaces=lambda frame: frame["dialogue"].apply(
            lambda text: re.sub(r'[\r\n\s]+', ' ', text)
        )
    )
    .reset_index()
    .rename(columns={'index': 'ID'})
)

In [None]:
# Preview the first five rows of the merged frame.
merge_df.head(n=5)

In [None]:
# Relative frequency of each section_header value, as a two-column frame.
section_header_dist = (
    merge_df["section_header"]
    .value_counts(normalize=True)
    .reset_index()
    .set_axis(["section_header", "proportion"], axis=1)
)

# Absolute number of rows per section_header value, as a two-column frame.
section_header_cnt = (
    merge_df["section_header"]
    .value_counts()
    .reset_index()
    .set_axis(["section_header", "Count"], axis=1)
)

In [None]:
# Bar chart of the row count for every section_header value.
px.bar(
    section_header_cnt,
    x='section_header',
    y='Count',
    title="Section_Header Count",
)

In [None]:
# Bar chart of the share of rows belonging to every section_header value.
px.bar(
    section_header_dist,
    x='section_header',
    y='proportion',
    title="Section_Header Proportion",
)

In [None]:
# Hugging Face Hub identifier of the checkpoint whose tokenizer is used for the
# token-length analysis in the following cells.
model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"

# force_download is intentionally not set: with the default (False) the locally
# cached tokenizer files are reused, so re-running the notebook does not hit the
# network every time. Clear the Hugging Face cache manually if it is ever corrupted.
# NOTE(review): do_lower_case=True is kept unchanged - confirm that lower-casing
# matches how this checkpoint's vocabulary was built.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)

In [None]:
# Show the first five rows of merge_df again before tokenising its text columns.
merge_df.head(n=5)

In [None]:
# Number of tokens (special tokens included) in each whitespace-normalised dialogue.
token_len_list = [
    len(tokenizer.encode(dialogue, add_special_tokens=True))
    for dialogue in merge_df["dialogue_wo_whitespaces"]
]

In [None]:
# Histogram of the token counts currently held in token_len_list.
(
    px.histogram(
        data_frame=token_len_list,
        title="Token Length distribution for Dialogue",
    )
    .update_layout(
        xaxis_title="Number of Tokens in a Dialogue",
        yaxis_title="Number of IDs",
        showlegend=False,
    )
)

In [None]:
# Shortest, median and longest token counts in token_len_list.
(
    min(token_len_list),
    np.median(token_len_list),
    max(token_len_list),
)

In [None]:
# Selected percentiles of token_len_list, from the 0th (minimum) to the 100th (maximum).
np.percentile(
    token_len_list,
    q=[0., 25, 50, 75, 80, 85, 90, 95, 99, 100],
)

Dialogues with a token length of 300 or less account for about 90% of the data, so 300 is used as the maximum sequence length.

In [None]:
# Token-length cap. 300 is the length that the note above associates with
# roughly 90% of the data; where max_len is consumed is outside this view.
max_len = 300

In [None]:
# Number of tokens (special tokens included) in each section_text entry.
# This rebinds token_len_list, replacing the dialogue token counts computed earlier.
token_len_list = [
    len(tokenizer.encode(section_text, add_special_tokens=True))
    for section_text in merge_df["section_text"]
]

In [None]:
# Histogram of the token counts currently held in token_len_list.
(
    px.histogram(
        data_frame=token_len_list,
        title="Token Length distribution for Section Text",
    )
    .update_layout(
        xaxis_title="Number of Tokens in a Section Text",
        yaxis_title="Number of IDs",
        showlegend=False,
    )
)