<a href="https://colab.research.google.com/github/tal-yifat/deep-NLP-explainability/blob/main/Injury_Classifier_Fine_Tuning_1_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Run identity -----------------------------------------------------------
EXPERIMENT_NAME = 'days-lost-over-0'   # becomes part of the run's log-directory name
MODEL = r'microsoft/deberta-v3-base'   # checkpoint name passed to the Auto* loaders
RANDOM_SEED = 42                       # used for the data splits and the oversampling shuffle

# --- Data preparation -------------------------------------------------------
DAYS_LOST_THRESHOLD = 0      # a row is labelled 1 when DAYS_LOST > this value
TEST_RATIO = 0.2             # share of the full dataset held out for testing
VAL_RATIO = 0.05             # share of the full dataset held out for validation
OVERSAMPLING_FACTOR = None   # falsy value disables oversampling of label-1 rows
CLASS_WEIGHT_POWER = 0       # exponent on inverse class frequency; 0 gives equal weights
PIPELINE_DRY_RUN = False     # True: run on the first 100 rows with frequent logging/eval

# --- Optimisation -----------------------------------------------------------
EPOCHS = 4
LEARNING_RATE = 4.5e-6
WARMUP_STEPS = 500

# --- Logging / evaluation cadence (in optimisation steps) -------------------
LOG_STEPS = 500
EVAL_STEPS = 5000

In [None]:
# Enable the TensorBoard notebook extension (used at the end of the notebook).
%load_ext tensorboard

# Capture the output lines of `nvidia-smi` and join them into one string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The check looks for the word 'failed' in the command output to decide
# whether a GPU is attached.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
# NOTE: `.total` is the machine's total RAM (in bytes, scaled here to 1e9-byte
# gigabytes), even though the message below calls it "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Wed Jan 25 01:22:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    26W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from google.colab import drive
# Mount Google Drive (forcing a remount) and make the dataset folder the
# working directory, so the relative paths used below ('accidents.csv', the
# run log directory) resolve inside it.
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/MSHA_Datasets

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1eOUzsDr6TjqGV_8vuLRBrBZRS5ub9uQ8/MSHA_Datasets


In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
# !rm -rf ./MSHA_logs/
import time
# Run directory name: <model short name>_<experiment name>_<start timestamp>.
# It is a relative path, so it is created under the current working directory.
timestr = time.strftime('%Y-%m-%d-%H%M')
model_short_name = MODEL.split('/')[-1]
current_run_logs_dir = '_'.join([model_short_name, EXPERIMENT_NAME, timestr])

# TensorBoard writer that logs into the run directory.
tb_writer = SummaryWriter(current_run_logs_dir)

In [None]:
# Record the hyper-parameters of this run next to its logs so that results can
# be traced back to the configuration that produced them.
# Assumes the run directory already exists (presumably created by the
# SummaryWriter instantiated for `current_run_logs_dir`).
experiment_info = pd.Series({
    'model':MODEL,
    'validation ratio':VAL_RATIO,  # key was misspelled 'validatin ratio'
    'test ratio':TEST_RATIO,
    'class weight power':CLASS_WEIGHT_POWER,
    'oversampling factor':OVERSAMPLING_FACTOR,
    'epochs':EPOCHS,
    'learning rate':LEARNING_RATE,
    'warmup steps':WARMUP_STEPS,
    'log_steps':LOG_STEPS,
    'eval_steps':EVAL_STEPS,
    'random_seed':RANDOM_SEED,
    'days lost threshold':DAYS_LOST_THRESHOLD
    })
experiment_info.to_csv(current_run_logs_dir + '/ExperimentInfo.csv')

In [None]:
# The direct link to the shared dataset file is: https://drive.google.com/file/d/1CYKt6S-2BWAUbFai1BPGWb4Z5rZBNofk/view?usp=sharing
df = pd.read_csv('accidents.csv')
print('Total: {}'.format(len(df)))

# A row is usable only if it has both a narrative and a days-lost value.
df = df.dropna(subset=['DAYS_LOST', 'NARRATIVE'])
print('Total W/O NA: {}'.format(len(df)))

# Keep the two modelling columns, expand a leading "EE" in the narrative to
# "Employee", renumber the rows and switch to the text/label column names.
df = (
    df[['NARRATIVE', 'DAYS_LOST']]
    .assign(NARRATIVE=lambda frame: frame['NARRATIVE'].str.replace('^EE', 'Employee', case=True, regex=True))
    .reset_index(drop=True)
    .rename(columns={'NARRATIVE': 'text', 'DAYS_LOST': 'label'})
)

# turn values to binary: 1 when days lost exceed the threshold, else 0
df['label'] = np.where(df['label'] > DAYS_LOST_THRESHOLD, 1, 0)
df.head()

Total: 251956
Total W/O NA: 207641


Unnamed: 0,text,label
0,Employee says he slipped off back of fuel truc...,0
1,2 employees were attempting to load troughing ...,0
2,Employee was pushing against pry bar. Bar slip...,0
3,Employee tripped on c-channel at #6 silo. Fel...,0
4,Ground was icy. Employee slipped on ice and f...,1


In [None]:
from sklearn.model_selection import train_test_split

if PIPELINE_DRY_RUN: 
  # Smoke-test mode: tiny dataset, larger validation share, frequent logging/eval.
  # NOTE(review): these overrides happen after the experiment info was written,
  # so a dry run's ExperimentInfo.csv shows the non-dry-run values.
  df = df.head(100)
  VAL_RATIO = 0.2
  LOG_STEPS = 5
  EVAL_STEPS = 5

# Split whole rows (text and label together) instead of splitting X and y
# separately and assigning the label columns back through tuple unpacking.
# For a given row count and random_state the selected rows are the same.
df_train, df_test = train_test_split(df[['text', 'label']],
                                     test_size=TEST_RATIO, random_state=RANDOM_SEED)
# VAL_RATIO is a share of the full dataset, so rescale it to a share of what
# remains after the test rows were removed.
df_train, df_val = train_test_split(df_train,
                                    test_size=VAL_RATIO / (1 - TEST_RATIO), random_state=RANDOM_SEED)

# Verifying there's no overlap between the sets and that no row was lost
assert set(df_train.index).isdisjoint(df_val.index)
assert set(df_train.index).isdisjoint(df_test.index)
assert set(df_val.index).isdisjoint(df_test.index)
assert len(df_train) + len(df_val) + len(df_test) == len(df)

print(df_train.shape, df_val.shape, df_test.shape)

(155730, 2) (10382, 2) (41529, 2)


In [None]:
if OVERSAMPLING_FACTOR:
  # Add (OVERSAMPLING_FACTOR - 1) extra copies of every label-1 training row,
  # so each of them appears OVERSAMPLING_FACTOR times, then shuffle.
  # pd.concat replaces DataFrame.append (removed in pandas 2.0) and, unlike
  # round-tripping through `.values`, keeps the column dtypes.
  # NOTE: re-running this cell without re-running the split oversamples again.
  positive_rows = df_train[df_train['label'] == 1]
  extra_copies = [positive_rows] * (OVERSAMPLING_FACTOR - 1)
  df_train = pd.concat([df_train] + extra_copies).sample(frac=1, random_state=RANDOM_SEED)
  # A bare expression inside `if` is not displayed by the notebook; print it.
  print(df_train.shape)

In [None]:
# Transformers installation
! pip install transformers datasets sentencepiece evaluate torch-lr-finder 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m96.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-lr-finder
  Downloadin

In [None]:
import datasets
from datasets import Dataset, DatasetDict, ClassLabel

# `label` was built as (DAYS_LOST > DAYS_LOST_THRESHOLD), so the class names are
# derived from the threshold instead of being hard-coded.
label_feature = ClassLabel(num_classes=2,
                           names=[f'<={DAYS_LOST_THRESHOLD}', f'>{DAYS_LOST_THRESHOLD}'])


def _frame_to_dataset(frame, split):
    """Convert a text/label DataFrame into a Dataset with a ClassLabel `label` column.

    `cast_column` is used so that the column itself is converted to the
    ClassLabel feature; assigning into `Dataset.features` does not cast the
    underlying data.
    """
    return Dataset.from_pandas(frame, split=split).cast_column('label', label_feature)


train_ds = _frame_to_dataset(df_train, 'train')
val_ds = _frame_to_dataset(df_val, 'val')
test_ds = _frame_to_dataset(df_test, 'test')

raw_datasets = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds,
})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 155730
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 10382
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 41529
    })
})

In [None]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Collator handed to the Trainer below; it is built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# print(df_train['text'].iloc[0])
# t0 = tokenizer(df_train['text'].iloc[0])
# tokenizer.convert_ids_to_tokens(t0['input_ids'])

In [None]:
# Sanity check: show the encoding the tokenizer produces for a short string.
tokenizer('Hello World')

{'input_ids': [1, 5365, 964, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
def tokenize_function(sample, max_length=512):
    """Tokenize the ``text`` field of a dataset sample (or batch of samples).

    Args:
        sample: mapping with a ``'text'`` entry, as passed by ``Dataset.map``.
        max_length: maximum number of tokens kept per text. Without an explicit
            value, ``truncation=True`` has no effect for this tokenizer (it
            reports "no maximum length is provided ... Default to no
            truncation"). The default of 512 is assumed to match the model's
            position limit -- TODO confirm if MODEL is changed.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text(s).
    """
    return tokenizer(sample['text'], truncation=True, max_length=max_length)

In [None]:
# Tokenize every split; batched=True hands `tokenize_function` batches of rows
# rather than single rows. The tokenizer outputs are added as new columns
# next to the existing ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets 

  0%|          | 0/156 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/42 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 155730
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10382
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 41529
    })
})

In [None]:
import evaluate

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, computed without overflow.

    Args:
        z: a float or NumPy array.

    Returns:
        The element-wise sigmoid of ``z``, with values in [0, 1].
    """
    # log(1 + exp(-z)) == logaddexp(0, -z); evaluating it this way avoids the
    # overflow (and RuntimeWarning) that exp(-z) produces for large negative z.
    return np.exp(-np.logaddexp(0, -z))

def compute_metrics(eval_preds):
    """Compute ROC AUC, accuracy, F1, precision and recall for a two-class model.

    Args:
        eval_preds: ``(logits, labels)`` pair; ``logits`` must hold two values
            per example (class 0 and class 1).

    Returns:
        dict: the ``roc_auc`` metric result merged with the combined
        accuracy/f1/precision/recall result.
    """
    # Load the metric modules once and reuse them on later evaluations instead
    # of reloading them every time the Trainer evaluates.
    cached = getattr(compute_metrics, '_metric_cache', None)
    if cached is None:
        cached = (
            evaluate.combine(['accuracy', 'f1', 'precision', 'recall']),
            evaluate.load('roc_auc'),
        )
        compute_metrics._metric_cache = cached
    clf_metrics, roc_auc = cached

    logits, labels = eval_preds
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)
    # For two logits, softmax probability of class 1 equals sigmoid(z1 - z0).
    # Scoring with sigmoid(z1) alone ignores z0 and can rank examples
    # differently from the model's actual class-1 probability.
    class_1_pred_probs = sigmoid(logits[:, 1] - logits[:, 0]).tolist()
    metric_res = roc_auc.compute(prediction_scores=class_1_pred_probs, references=labels)
    metric_res.update(clf_metrics.compute(predictions=predictions, references=labels))

    return metric_res

In [None]:
import torch
from transformers import Trainer

# Share of the training rows that belong to class 0 and to class 1.
n_train_rows = df_train.shape[0]
val_counts = df_train['label'].value_counts()
class_freqs = tuple(val_counts[cls] / n_train_rows for cls in (0, 1))
print('class freqs: ', class_freqs)

# Raw weight per class: inverse frequency raised to CLASS_WEIGHT_POWER
# (a power of 0 therefore gives every class a weight of 1).
class_weights = []
for frq in class_freqs:
  class_weights.append((1/frq) ** CLASS_WEIGHT_POWER)
print('class weights: ', class_weights)

# Normalize the class weights so that the mean weight for the dataset is 1,
# in order to avoid impacting the learning rate. 
mean_weight = sum(frq * wt for frq, wt in zip(class_freqs, class_weights))
loss_normalizer = 1 / mean_weight
NORM_CLASS_WEIGHTS = [float(wt * loss_normalizer) for wt in class_weights]

print('Class weights: ', NORM_CLASS_WEIGHTS)

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The per-class weights are read from the module-level ``NORM_CLASS_WEIGHTS``
    list each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        Args:
            model: the model to run.
            inputs: batch mapping passed to the model; must contain ``'labels'``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted and ignored; presumably passed as a
                keyword by newer transformers releases -- confirm against the
                installed version.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.get('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weight tensor on the device the logits live on, rather than
        # guessing the device from torch.cuda.is_available().
        class_weights = torch.tensor(NORM_CLASS_WEIGHTS, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

class freqs:  (0.5063378925062608, 0.4936621074937392)
class weights:  [1.0, 1.0]
Class weights:  [1.0, 1.0]


In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification
from transformers import Trainer
from transformers.integrations import TensorBoardCallback

# Pass RANDOM_SEED to the training arguments so that the seed recorded in the
# experiment info is the one training actually uses.
training_args = TrainingArguments(output_dir=current_run_logs_dir, 
                                  learning_rate=LEARNING_RATE,
                                  warmup_steps=WARMUP_STEPS,
                                  evaluation_strategy='steps',
                                  logging_steps=LOG_STEPS,
                                  eval_steps=EVAL_STEPS,
                                  save_strategy='epoch', 
                                  num_train_epochs=EPOCHS,
                                  seed=RANDOM_SEED,
                                  report_to='tensorboard')

# Seed torch before the model is built: the model is created before the
# Trainer exists, so any randomly initialised weights (presumably the new
# classification head) would otherwise differ from run to run.
torch.manual_seed(RANDOM_SEED)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

# NOTE(review): report_to='tensorboard' together with an explicit
# TensorBoardCallback may register two TensorBoard callbacks -- confirm
# whether duplicate event files are written.
trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TensorBoardCallback(tb_writer=tb_writer)]
) 

Downloading:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [None]:
# Fine-tune the model; evaluation and checkpointing follow `training_args`.
trainer.train()    

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 155730
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 77868
  Number of trainable parameters = 184423682
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Roc Auc,Accuracy,F1,Precision,Recall
5000,0.4707,0.491309,0.855498,0.762955,0.739659,0.810385,0.680288
10000,0.4723,0.465501,0.866199,0.773165,0.757791,0.803665,0.716871
15000,0.4494,0.459219,0.870661,0.77644,0.758707,0.814509,0.71006
20000,0.4278,0.455209,0.872174,0.780678,0.773681,0.790735,0.757346
25000,0.4265,0.493715,0.872249,0.783375,0.774083,0.800042,0.749757
30000,0.4375,0.447855,0.874633,0.785301,0.779895,0.7917,0.768437
35000,0.4461,0.44905,0.876364,0.785783,0.781275,0.789819,0.772913
40000,0.3944,0.482033,0.872852,0.785398,0.782804,0.784333,0.78128
45000,0.4049,0.458626,0.877261,0.787517,0.784444,0.787831,0.781086
50000,0.4081,0.458374,0.877305,0.786746,0.786541,0.779476,0.793734


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10382
  Batch size = 8


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10382
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10382
  Batch size = 8
Saving model checkpoint to deberta-v3-base_days-lost-over-0_2023-01-25-0123/checkpoint-19467
Configuration saved in deberta-v3-base_days-lost-over-0_2023-01-25-0123/checkpoint-19467/config.json
Model weights saved in deberta-v3-base_days-lost-ove

TrainOutput(global_step=77868, training_loss=0.4265056041243901, metrics={'train_runtime': 18052.735, 'train_samples_per_second': 34.506, 'train_steps_per_second': 4.313, 'total_flos': 2.476230957853495e+16, 'train_loss': 0.4265056041243901, 'epoch': 4.0})

In [None]:
# Score the trained model on the held-out test split; metric names are
# prefixed with 'test'.
test_split = tokenized_datasets['test']
test_res = trainer.evaluate(eval_dataset=test_split, metric_key_prefix='test')

# Keep a copy of the test metrics in the run directory, then display them.
test_results_path = current_run_logs_dir + '/TestResults.csv'
pd.Series(test_res).to_csv(test_results_path)
test_res

The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 41529
  Batch size = 8


{'test_loss': 0.4680936932563782,
 'test_roc_auc': 0.880553885573959,
 'test_accuracy': 0.7924582821642707,
 'test_f1': 0.785185554420158,
 'test_precision': 0.8009355773630955,
 'test_recall': 0.7700430191630817,
 'test_runtime': 283.1748,
 'test_samples_per_second': 146.655,
 'test_steps_per_second': 18.335,
 'epoch': 4.0}

In [None]:
# # Evaluation of the version after 1 epoch

# model_e1 = AutoModelForSequenceClassification.from_pretrained(current_run_logs_dir + '/checkpoint-28872', num_labels=2)

# trainer_e1 = CustomTrainer(
#     model_e1,
#     training_args,
#     train_dataset=tokenized_datasets['train'],
#     eval_dataset=tokenized_datasets['validation'],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks=[TensorBoardCallback(tb_writer=tb_writer)]
# ) 

# test_res_epoch1 = trainer_e1.evaluate(eval_dataset=tokenized_datasets['test'])
# pd.Series(test_res_epoch1).to_csv(current_run_logs_dir + '/TestResultsEpoch1.csv')
# test_res_epoch1

In [None]:
if not PIPELINE_DRY_RUN:
  # Save the model's state dict into this run's directory on Drive
  # (skipped for dry runs).
  model_save_name = 'bert_injuries_model.pt'
  path = '/'.join(['/content/drive/MyDrive/MSHA_Datasets', current_run_logs_dir, model_save_name])
  torch.save(model.state_dict(), path)


In [None]:
# Open TensorBoard on this run's log directory.
tbpath = '/content/drive/MyDrive/MSHA_Datasets/'+current_run_logs_dir
%tensorboard --logdir $tbpath
# Write a notebook file into the run directory via the %notebook magic
# (presumably an export of the session's input history -- confirm).
nb_path = '/content/drive/MyDrive/MSHA_Datasets/'+current_run_logs_dir+'/Saved_Bert_Injuries.ipynb'
%notebook $nb_path

<IPython.core.display.Javascript object>

In [None]:
# Display the full set of training arguments used for this run, defaults included.
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=5000,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_n

In [None]:
# Display the module structure of the fine-tuned model.
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 