In [1]:
# Install the third-party libraries used by this notebook.
# NOTE(review): versions are not pinned, so a fresh run installs the latest releases.
# Later cells rely on `load_metric` and on the `evaluation_strategy` argument, which
# may not be available in newer releases — TODO confirm and pin the versions used.
!pip install transformers
!pip install sentencepiece
!pip install datasets



## Read the Preprocessed Dataset

#### Create one-hot encoding for labels and save separately to a new csv file

In [2]:
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from sklearn.model_selection import train_test_split

In [3]:
# Load the preprocessed training data into a DataFrame.
# NOTE(review): later cells read the columns 'term', 'label' and 'text', so the CSV
# is expected to contain at least those — verify against the file.
dataset_df = pd.read_csv('/content/training_dataset.csv')

In [4]:
# 'term' plays no role in training, so leave it out of the frame
dataset_df = dataset_df.drop(columns=['term'])

In [5]:
### One-hot encoding labels

# Class names, in the order their indicator columns are appended to the frame.
LABEL_NAMES = ['non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic']


def label_encoding(row, target='antibiotic'):
  """Return 1 when the row's 'label' value equals `target`, otherwise 0.

  Args:
    row: a row of the frame (anything indexable by 'label').
    target: class name to compare against. The default reproduces the last of
      the six original single-class definitions, so `label_encoding(row)`
      still gives the same result as before.

  Returns:
    int: 1 or 0.
  """
  return 1 if row['label'] == target else 0


# One parameterised helper applied once per class, instead of six copy-pasted
# definitions that each shadowed the previous one.
for name in LABEL_NAMES:
  dataset_df[name] = dataset_df.apply(label_encoding, axis=1, target=name)

In [7]:
# the string 'label' column is now redundant with the indicator columns
dataset_df = dataset_df.drop(columns=['label'])

In [8]:
# Write the encoded frame to CSV without the index column.
# The path is relative, so the file lands in the current working directory.
dataset_df.to_csv('new_train_dataset.csv', index=False)

### Create a data format suitable for fine-tuning

In [9]:
# Load the CSV as a single Hugging Face `Dataset` for fine-tuning.
# `Dataset.from_csv` is the single-file loader; `DatasetDict.from_csv` is meant for a
# {split_name: path} mapping. The path is the same relative path that `to_csv`
# wrote to, so the file read here is the one that was just produced.
dataset = Dataset.from_csv('new_train_dataset.csv')

Using custom data configuration default-15cf5285fb439285


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-15cf5285fb439285/0.0.0...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-15cf5285fb439285/0.0.0. Subsequent calls will reuse this data.


In [10]:
# Hold out 500 examples as a validation subset.
# A fixed seed makes the shuffle — and therefore the split and every metric
# computed on it — reproducible across runs.
dataset = dataset.train_test_split(test_size=500, shuffle=True, seed=42)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['text', 'non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic'],
        num_rows: 500
    })
})

In [34]:
# # make sure the label distribution is appropriate
# tmp_df = dataset['train'].to_pandas()

# # check for class imbalance
# tmp_df['label'].value_counts(sort=False)

cell            410
reagent         336
drug            382
antibiotic      457
non_science    2403
protein         512
Name: label, dtype: int64

In [35]:
# # make sure the label distribution is appropriate
# tmp_df = dataset['test'].to_pandas()

# # check for class imbalance
# tmp_df['label'].value_counts(sort=False)

cell            39
reagent         49
antibiotic      58
protein         57
drug            35
non_science    262
Name: label, dtype: int64

Observation: All labels are present in the validation dataset, with approximately the same relative frequencies as in the training dataset.

In [12]:
# peek one example
dataset["train"][0]

{'antibiotic': 0,
 'cell': 0,
 'drug': 0,
 'non_science': 1,
 'protein': 0,
 'reagent': 0,
 'text': 'rybitwy [rɨˈbitfɨ] is a village in the administrative district of gmina połaniec, within staszów county, świętokrzyskie voivodeship, in south-central poland. it lies approximately 3 kilometres (2 mi) south of połaniec, 19 km (12 mi) south-east of staszów, and 70 km (43 mi) south-east of the regional capital kielce.the village has a population of  327.\n\n\n== demography ==\naccording to the 2002 poland census, there were 310 people residing in rybitwy village, of whom 51.6% were male and 48.4% were female. in the village, the population was spread out, with 24.5% under the age of 18, 37.7% from 18 to 44, 18.7% from 45 to 64, and 18.7% who were 65 years of age or older.\n\n \n\n\n== references =='}

In [13]:
# Create the `labels` column: one vector of 0/1 indicators per example, in column order.
# 'labels' itself is excluded (along with 'text') so that re-running this cell does
# not nest a previously built vector inside the new one.
cols = dataset["train"].column_names
label_cols = [c for c in cols if c not in ("text", "labels")]
dataset = dataset.map(lambda x: {"labels": [x[c] for c in label_cols]})

dataset

  0%|          | 0/4500 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic', 'labels'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['text', 'non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic', 'labels'],
        num_rows: 500
    })
})

In [14]:
dataset['test'][0]

{'antibiotic': 1,
 'cell': 0,
 'drug': 0,
 'labels': [0, 0, 0, 0, 0, 1],
 'non_science': 0,
 'protein': 0,
 'reagent': 0,

## Fine-Tuning a model for multi-label classification


We use the Hugging Face (HF) library and one of its pre-trained models for contextual representations, choosing a model for which the library provides a version with a classification head.

In [15]:
# Pretrained checkpoint to fine-tune, and the per-device batch size
# (passed to TrainingArguments for both training and evaluation).
model_checkpoint = "distilbert-base-uncased"
batch_size = 4

In [16]:
# Load the `accuracy` metric object.
# NOTE(review): `metric` is not referenced by any other cell visible in this notebook —
# evaluation goes through the custom `compute_metrics` function; confirm whether this
# cell is still needed.
metric = load_metric('accuracy')

## Tokenize data

Preprocess the texts before feeding them to the model:
- tokenize the inputs (i.e. convert the tokens to their corresponding IDs in the pretrained vocabulary), put them in the format the model expects, and generate the other inputs that the model requires.

In [17]:
from transformers import AutoTokenizer
    
# Load the tokenizer belonging to `model_checkpoint`, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [18]:
def preprocess_tokenize(examples):
    """Tokenize the 'text' field of `examples` with the notebook's tokenizer.

    Intended as the function passed to `DatasetDict.map`. Inputs longer than
    512 tokens are truncated, and padding is enabled.

    Returns whatever the tokenizer returns for the given texts.
    """
    tokenizer_options = dict(truncation=True, padding=True, max_length=512)
    texts = examples['text']
    return tokenizer(texts, **tokenizer_options)

Apply this function to all the sentences (or pairs of sentences) in our dataset by using the `map` method.

In [19]:
# Collect the columns to drop after tokenization: every column except `labels`
# (i.e. the raw text and the per-class indicator columns).
cols = dataset["train"].column_names
cols.remove("labels")

cols

['text', 'non_science', 'reagent', 'drug', 'protein', 'cell', 'antibiotic']

In [20]:
# Tokenize every split in batches and drop the columns listed in `cols`,
# leaving the tokenizer outputs plus `labels`.
encoded_dataset = dataset.map(preprocess_tokenize, batched=True, remove_columns=cols)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
# data format for training
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 500
    })
})

## Fine-Tuning Model

In [25]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Number of target labels; must equal the length of each `labels` vector
# (one entry per class: non_science, reagent, drug, protein, cell, antibiotic).
num_labels = 6
# Load the pretrained checkpoint with a sequence-classification head sized to `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [38]:
# Training configuration (standard hyperparameters as reported in the HF examples):
# - evaluate at the end of every epoch, and do not save intermediate checkpoints
# - per-device batch size `batch_size` with 2 gradient-accumulation steps
#   (the training log reports a total train batch size of 8)

args = TrainingArguments(
    output_dir = "/content/labtwin_experiments/",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Customize Trainer using a custom loss function for multi-label classification:

Source: https://huggingface.co/transformers/main_classes/trainer.html

In [39]:
class MultilabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy with logits over all label outputs.

    Each of the model's `num_labels` outputs is scored independently against
    the corresponding entry of the `labels` vector.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the BCE-with-logits loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; its "labels" entry is removed before the forward
                pass so the model does not compute its own loss.
            return_outputs: when true, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted (and ignored) because newer `Trainer`
                versions pass this keyword to `compute_loss`; without it the
                override raises a TypeError on those versions. The default keeps
                older call sites working.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs float targets with the same shape as the logits.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [40]:
# define a compute_metrics function to perform evaluation

def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    """Element-wise accuracy of thresholded predictions against binary targets.

    Args:
        y_pred: NumPy array of scores (logits when `sigmoid` is true).
        y_true: NumPy array of targets; any non-zero entry counts as positive.
        thresh: a score strictly above this value counts as a positive prediction.
        sigmoid: apply a sigmoid to `y_pred` before thresholding.

    Returns:
        float: fraction of entries where the thresholded prediction equals the target.
    """
    scores = torch.from_numpy(y_pred)
    targets = torch.from_numpy(y_true)
    # Map raw logits to probabilities unless the caller already supplies probabilities.
    scores = scores.sigmoid() if sigmoid else scores
    hits = torch.eq(scores > thresh, targets.bool())
    return hits.float().mean().item()

def compute_metrics(eval_pred):
    """Turn a (predictions, labels) evaluation pair into a metrics dict.

    Returns a dict with the single key 'accuracy_thresh', computed by
    `accuracy_thresh` with its default threshold and sigmoid settings.
    """
    scores, targets = eval_pred
    thresholded_accuracy = accuracy_thresh(scores, targets)
    return {'accuracy_thresh': thresholded_accuracy}

In [41]:
# Build the multi-label trainer from the model and training arguments, using the
# 'train' split for training, the 'test' split for evaluation, the
# thresholded-accuracy metric, and the tokenizer.

multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [42]:
# "plain" (without training performance)

# Evaluate on the validation split before calling train().
# NOTE(review): the recorded output (eval_loss ~0.12, accuracy ~0.965) is very close to
# the post-training numbers, and the execution counters jump from In [25] to In [38];
# the model may already have been fine-tuned earlier in the same kernel session.
# Re-run from a fresh kernel to confirm this is a true untrained baseline.
multi_trainer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 4


{'eval_accuracy_thresh': 0.9649999737739563,
 'eval_loss': 0.12263728678226471,
 'eval_runtime': 16.6036,
 'eval_samples_per_second': 30.114,
 'eval_steps_per_second': 7.528}

In [43]:
# Fine-tune the model; validation loss and accuracy are reported after each epoch.

multi_trainer.train()

***** Running training *****
  Num examples = 4500
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 2810


Epoch,Training Loss,Validation Loss,Accuracy Thresh
0,0.1073,0.104888,0.968
1,0.0801,0.107024,0.969
2,0.0611,0.105239,0.968667
3,0.0468,0.114186,0.967333
4,0.0353,0.113282,0.968333


***** Running Evaluation *****
  Num examples = 500
  Batch size = 4


***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2810, training_loss=0.06179466994207525, metrics={'train_runtime': 2306.2596, 'train_samples_per_second': 9.756, 'train_steps_per_second': 1.218, 'total_flos': 2980199175487488.0, 'train_loss': 0.06179466994207525, 'epoch': 5.0})

In [44]:
# Save the fine-tuned model to the local filesystem
# (the log shows the config, weights and tokenizer files being written).
multi_trainer.save_model('/content/experiment_1/model_trained')

Saving model checkpoint to /content/experiment_1/model_trained
Configuration saved in /content/experiment_1/model_trained/config.json
Model weights saved in /content/experiment_1/model_trained/pytorch_model.bin
tokenizer config file saved in /content/experiment_1/model_trained/tokenizer_config.json
Special tokens file saved in /content/experiment_1/model_trained/special_tokens_map.json


In [45]:
# Mount Google Drive so the model can also be saved there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [46]:
# Save a second copy of the fine-tuned model to the mounted Google Drive folder
multi_trainer.save_model('/content/drive/MyDrive/labtwin-test/multi_model_trained')

Saving model checkpoint to /content/drive/MyDrive/labtwin-test/multi_model_trained
Configuration saved in /content/drive/MyDrive/labtwin-test/multi_model_trained/config.json
Model weights saved in /content/drive/MyDrive/labtwin-test/multi_model_trained/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/labtwin-test/multi_model_trained/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/labtwin-test/multi_model_trained/special_tokens_map.json
