In [None]:
# Mount Google Drive at /content/drive; every dataset and checkpoint path used
# in this notebook lives under that mount point (Colab-only cell).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
!pip install tensorflow-addons
!pip install datasets
!pip install --upgrade accelerate

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd 
import os
import ast
import spacy
import random
import itertools
import matplotlib.pyplot as plt
from typing import List, Tuple

import tensorflow as tf
import tensorflow_addons as tfa

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoConfig, TFAutoModel

from transformers import DataCollatorForLanguageModeling
from transformers import DebertaTokenizer, DebertaForMaskedLM
from transformers import Trainer, TrainingArguments
from datasets import Dataset

Data Loading

In [None]:
# Load the training annotations. The `annotation` and `location` columns are
# stored as strings holding Python literals, so each cell is parsed back into a
# real Python object with ast.literal_eval.
train = pd.read_csv('/content/drive/MyDrive/TAR - projekt/TAR-dataset/train.csv').assign(
    annotation=lambda frame: frame['annotation'].apply(ast.literal_eval),
    location=lambda frame: frame['location'].apply(ast.literal_eval),
)
print(f"train.shape: {train.shape}")
train

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,[photobia],[274 282]
14298,95333_915,9,95333,915,[no sick contacts],[421 437]


In [None]:
# Load the feature table (feature_num, case_num, feature_text), report its
# shape, and preview the first rows.
features = pd.read_csv('/content/drive/MyDrive/TAR - projekt/TAR-dataset/features.csv')
print(f"features.shape: {features.shape}")
features.head()

features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [None]:
# Load the patient notes (pn_num, case_num, pn_history), report the shape, and
# preview the first rows. This table is later filtered and used as the
# pretraining corpus.
patient_notes = pd.read_csv('/content/drive/MyDrive/TAR - projekt/TAR-dataset/patient_notes.csv')
print(f"patient_notes.shape: {patient_notes.shape}")
patient_notes.head()

patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [None]:
# Enrich every annotation row with its feature text and with the full note
# text. Both joins are left joins, so every row of `train` is kept.
train = (
    train
    .merge(features, how='left', on=['feature_num', 'case_num'])
    .merge(patient_notes, how='left', on=['pn_num', 'case_num'])
)
# Number of entries in each row's annotation list.
train['annotation_length'] = train['annotation'].map(len)
print(f"train.shape: {train.shape}")

train.shape: (14300, 9)


In [None]:
# Hold out 15% of the rows as a test set, keeping only the five listed columns.
# The fixed seed makes the split reproducible.
# NOTE(review): `train` is overwritten with its own 85% split, so re-running
# this cell splits the already-reduced frame again; use Restart & Run All.
# NOTE(review): the split is row-wise, so the same pn_num can end up in both
# `train` and `test` (for different features) - confirm this is intended.
seed=42
train, test = train_test_split(train[['pn_history', 'pn_num', 'feature_text','annotation_length', 'location']], test_size=0.15, random_state=seed)

### Here we split off the test set, which must not be used in pretraining. The patient notes it contains have to be removed from the pretraining corpus.

In [None]:
# Inspect the held-out test rows.
test

Unnamed: 0,pn_history,pn_num,feature_text,annotation_length,location
4454,HPI 35 YO M IN OFFICE C/O BURNING EPIGASTRIC ...,30772,Awakens-at-night,0,[]
11329,"67-year-old female, has come to the physician'...",81385,Hallucinations-after-taking-Ambien,0,[]
1010,17 year old male presenting with heart poundin...,1809,heart-pounding-OR-heart-racing,2,"[33 47, 33 38;52 58]"
9329,17 YO MALE C/O CHEST PAIN SINCE YESTERDAY. P...,60922,Exercise-induced-asthma,1,[535 559]
3623,"Dolores Montgomery, a 44-year-old female, has ...",21372,Sexually-active,1,[399 414]
...,...,...,...,...,...
3559,Dolores Montgomery is a 44 year old previously...,21310,No-premenstrual-symptoms,0,[]
3323,3 year history of irregular menses. Pt has a c...,21114,Stress,0,[]
11098,Pt is 67 yr female presenting with trouble sle...,80706,Sleeping-medication-ineffective,1,[404 416;431 440]
8198,HPI: Patient is a 26F here for ED followup for...,55009,No-caffeine-use,0,[]


In [None]:
# Inspect the patient notes before the notes that occur in the test split are removed.
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [None]:
# pn_num of every row in the test split (may contain repeats, since `test` has
# one row per note/feature pair); these notes must not be seen in pretraining.
forbidden_notes = np.array(test['pn_num'])

In [None]:
# Keep only the notes whose pn_num does not occur in the test split.
patient_notes = patient_notes.loc[~patient_notes['pn_num'].isin(forbidden_notes)]

## Filtered patient notes for pretraining - without test set

In [None]:
# Inspect the filtered pretraining corpus (notes from the test split removed).
patient_notes


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42139,95328,9,20 YO F C/O DULL HEADACHE SINCE YESTERDAY. SHE...
42140,95329,9,20 y/o F c/o headache that started yesterday m...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...


In [None]:
from transformers import DebertaTokenizer, DebertaForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
# Wrap the filtered notes in a Hugging Face Dataset and tokenize them with the
# DeBERTa tokenizer for masked-language-model pretraining.
dataset = Dataset.from_pandas(patient_notes)
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

def tokenize_function(examples):
    """Tokenize a batch of patient notes.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; only the
            "pn_history" column (list of note texts) is read.

    Returns:
        The tokenizer output for the batch. Sequences longer than the
        tokenizer's maximum length are truncated. They are intentionally NOT
        padded here: DataCollatorForLanguageModeling pads each batch to the
        length of its longest member, which avoids storing and processing
        every note at the full maximum length.
    """
    return tokenizer(examples["pn_history"], truncation=True)


tokenized_dataset = dataset.map(tokenize_function, batched=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Map:   0%|          | 0/41253 [00:00<?, ? examples/s]

In [None]:
# Continue masked-language-model pretraining of DeBERTa on the filtered notes.
model = DebertaForMaskedLM.from_pretrained('microsoft/deberta-base')

# The collator builds MLM batches, selecting tokens for masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Checkpoints go to Drive; save_steps / save_total_limit control how often a
# checkpoint is written and how many are retained.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/TAR - projekt/savings/deberta-preTrain",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    save_steps=10000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()

# Persist the final weights to the same directory the checkpoints were written to.
trainer.save_model("/content/drive/MyDrive/TAR - projekt/savings/deberta-preTrain")


Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['cls.predictions.transform.dense.weig

Step,Training Loss
500,2.7396
1000,1.4657
1500,1.2427
2000,1.1351
2500,1.0807
3000,1.0189
3500,0.985
4000,0.9472
4500,0.9238
5000,0.9026


In [None]:
from transformers import DebertaTokenizer, TFDebertaForMaskedLM
import tensorflow as tf

# Convert a saved PyTorch DeBERTa masked-LM checkpoint into TensorFlow weights.
# NOTE(review): the pretraining cell in this notebook saves to
# '.../savings/deberta-preTrain', while this cell reads
# '.../savings/deberta-fineTuning2' - confirm this is the intended checkpoint.
model_dir = '/content/drive/MyDrive/TAR - projekt/savings/deberta-fineTuning2'
output_dir = '/content/drive/MyDrive/TAR - projekt/savings/deberta-fineTuning2Converted'

# `from_pt=True` makes the TF class read the PyTorch weights in `model_dir`
# directly, so the checkpoint is not additionally loaded as a PyTorch
# DebertaForMaskedLM (that object was never used and only cost time and memory).
tf_model = TFDebertaForMaskedLM.from_pretrained(model_dir, from_pt=True)
tf_model.save_pretrained(output_dir)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaForMaskedLM: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'deberta.embeddings.position_ids']
- This IS expected if you are initializing TFDebertaForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDebertaForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDebertaForMaskedLM for predictions without further training.
