<a href="https://colab.research.google.com/github/tanoManzo/mimic_attitude/blob/main/MIMIC_attitude_models_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MIMIC_attitude_models_training_few_shot

In [1]:
# install datasets and transformers libraries
! pip install transformers -q
! pip install datasets -q


In [2]:
#from huggingface_hub import notebook_login
#notebook_login()

In [3]:
# library import 
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

Steps 
1. Prepare dataset
2. Load pretrained Tokenizer,  call it with dataset --> encoding
3. Build Pytorch Dataset with encodings
4. Load pretrained Model
5. (a) Load Trainer and train it, (b) or use native Pytorch training pipeline

In [4]:
token = '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'

### 1) dataset

In [5]:
data = load_dataset('tanoManzo/mimic_attitude_train_val_test_few', use_auth_token=token)
data

Using custom data configuration tanoManzo--mimic_attitude_train_val_test_few-89d4ab29f7d4e8de
Found cached dataset parquet (/home/manzog2/.cache/huggingface/datasets/tanoManzo___parquet/tanoManzo--mimic_attitude_train_val_test_few-89d4ab29f7d4e8de/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 22
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 101
    })
})

In [6]:
sample = data['train'][0]
sample

{'id': 887,
 'text': '8 Jaundice\n\nNursing Progress Note:\n#2 - RESP: REmains on IMV settings of 19/5 rate of 30.\nWeaned rate from 35 at 1500pm for art gas of 7.51, 25, 105,\n21, 0.  FIO2 21%. RR (30). No spontaneous resp effort. Lungs\nclear and equal. No retractions. No spells.\n#3 - NEURo: MRI of head and spine done. Poor prognosis.\nBelieved to be a hypoxic ischemic event possibly metabolic\nrelated effecting the brain and spine.  REmains unresponsive\nwith no spontaneous movement noted. EEG yesterday showed\nabsence of cortical activity, wide spread severe\nencephalopathy, and no seizure activity.  Neuro and genetics\nare following. Chromosomes are pending.\n#4 - FEN: TF remain at 100cc/kilo/day. NPO. Continues with\nUAC and DLUVC. Remains on TPN and lipids as ordered. Dstick\n97. Abdomin soft and flat. Girth 18cm. No BS. Voided\n2.4cc/kilo for this 12 hours. No stool since birth.  Lytes\nand bili in am.\n#5 - DEV: TEmps stable on warmer. Did get cold when at MRI\ntest - was qui

In [7]:
set(data['train']['label'])

{'Overall Negative Note', 'Overall Neutral Note', 'Overall Positive Note'}

create label2id and id2label necessary for the training

In [8]:
labels = set(data['train']['label'])
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
id2label

{0: 'Overall Neutral Note',
 1: 'Overall Negative Note',
 2: 'Overall Positive Note'}

### 2) tokenizer

from text to token (ids)

In [9]:
######################################################################
#1
log_training_history_file = 'log_distilbert.json'

#2
#model_ckpt = "microsoft/MiniLM-L12-H384-uncased"
#model_ckpt = "bigscience/bloom-560m"
#model_ckpt = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
model_ckpt = 'distilbert-base-uncased-finetuned-sst-2-english'

#3
output_dir = "distilbert-attitude-few10p"
######################################################################

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [10]:
def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  encoding['labels'] = [label2id[x] for x in examples['label']]
  return encoding

encoded_dataset = data.map(preprocess_data, batched=True, remove_columns=data['train'].column_names)
encoded_dataset

Loading cached processed dataset at /home/manzog2/.cache/huggingface/datasets/tanoManzo___parquet/tanoManzo--mimic_attitude_train_val_test_few-89d4ab29f7d4e8de/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ef0df8e2cff1e33d.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/manzog2/.cache/huggingface/datasets/tanoManzo___parquet/tanoManzo--mimic_attitude_train_val_test_few-89d4ab29f7d4e8de/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-6fe4de43acf80322.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 22
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 101
    })
})

sample

torch format

In [11]:
encoded_dataset.set_format("torch")

## Traning Functions and Class Embalance

In [12]:
df = data['train'].to_pandas()

for idx,prob in enumerate(df["label"].value_counts(normalize=True).sort_index()):
  print(f'label: {idx}, {id2label[idx]:<30} probability: {prob:.4f}')

label: 0, Overall Neutral Note           probability: 0.1364
label: 1, Overall Negative Note          probability: 0.8182
label: 2, Overall Positive Note          probability: 0.0455


weights per each label

In [13]:
class_weights = (1-df["label"].value_counts(normalize=True).sort_index()).values
class_weights = torch.from_numpy(class_weights).float().to("cuda")
class_weights

tensor([0.8636, 0.1818, 0.9545], device='cuda:0')

weightedloss

In [14]:
from torch import nn

class WeightedLossTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    # feed input to the model
    outputs = model(**inputs)
    logits = outputs.get("logits")
    # extract labels
    labels = inputs.get("labels")
    # define loss with class weights
    loss_func = nn.CrossEntropyLoss(weight=class_weights)
    # compute loss
    loss = loss_func(logits, labels)
    return (loss, outputs) if return_outputs else loss

### Training

In [15]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
  # problem_type="multi_label_classification",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    use_auth_token=token
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


metrics

In [16]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1_micro   = f1_score(labels, preds, average="micro")
  f1_macro   = f1_score(labels, preds, average="macro")
  f1_weighted = f1_score(labels, preds, average='weighted')
  # roc_auc    = roc_auc_score(labels, preds, average = 'micro')
  accuracy   = accuracy_score(labels, preds)
  metrics = {'f1_weighted':f1_weighted,
             "f1_macro":f1_macro, 
             "f1_micro":f1_micro,
             #'roc_auc': roc_auc,
             'accuracy': accuracy}
  return metrics


In [17]:
from transformers import TrainingArguments

batch_size = 2
#log the training loss at each epoch
logging_steps = len(data["train"]) // batch_size
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=100,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy = "steps",
    logging_strategy="steps",
    eval_steps=11,
    logging_steps=11,
    # load_best_model_at_end=True,
    fp16=True, # make it train fast!
    push_to_hub=True,
    hub_token = token
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/gpfs/gsfs8/users/manzog2/repos/mimic_attitude/distilbert-attitude-few10p is already a clone of https://huggingface.co/tanoManzo/distilbert-attitude-few10p. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Using cuda_amp half precision backend


In [19]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [20]:
# training
train_result = trainer.train() 

# compute train results
metrics = train_result.metrics



***** Running training *****
  Num examples = 22
  Num Epochs = 100
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1100
  Number of trainable parameters = 66955779
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
***** Running Evaluation *****
  Num examples = 100
  Batch size 

In [21]:
trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                    =      100.0
  total_flos               =   271418GF
  train_loss               =     0.0366
  train_runtime            = 0:17:34.89
  train_samples_per_second =      2.086
  train_steps_per_second   =      1.043


In [22]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


{'eval_loss': 1.461593508720398,
 'eval_f1_weighted': 0.7555,
 'eval_f1_macro': 0.3416666666666666,
 'eval_f1_micro': 0.82,
 'eval_accuracy': 0.82,
 'eval_runtime': 5.8131,
 'eval_samples_per_second': 17.202,
 'eval_steps_per_second': 8.601,
 'epoch': 100.0}

In [23]:
import json
logs = trainer.state.log_history
with open(log_training_history_file,'w') as f:
    json.dump(logs,f)