<a href="https://colab.research.google.com/github/vifirsanova/empi/blob/main/demos/fine_tuning_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**The notebook author is [Victoria Firsanova](https://linktr.ee/vifirsanova), PhD student**

#**The notebook is a part of the EMPI project by Victoria Firsanova licensed under the MIT License. See more at https://github.com/vifirsanova/empi/**

# Import libraries

In [3]:
!pip install transformers # HuggingFace Transformers

In [4]:
# dataset upload and 
# the dataset format is JSON
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

# arrays processing
import numpy as np
import pandas as pd

# machine learning
import torch
import gc 
gc.collect() 
from torch.utils.data import DataLoader
from transformers import AdamW

# access to Drive
from google.colab import drive

# regex
import re

# Load dataset

In [None]:
# load from the project repo https://github.com/vifirsanova/empi/

!wget https://raw.githubusercontent.com/vifirsanova/empi/main/dataset/dataset.json

In [6]:
# load data

path = Path('/content/dataset.json')
data = json.loads(path.read_text(encoding='utf-8'))

data = data['data']

In [7]:
# split the data into train, validation and test sets

train, temp = train_test_split(data, test_size=0.3, shuffle=True)
val, test = train_test_split(temp, test_size=0.5, shuffle=True)

In [8]:
# extract contexts, questions and answers from the dataset structure
# in machine reading comprehension, we use contexts to extract answers to questions

def read_dataset(dataset):
    contexts = []
    questions = []
    answers = []
    for group in dataset:
        for paragraph in group['paragraphs']:
          context = paragraph['context']
          for qa in paragraph['qas']:
              question = qa['question']
              for answer in qa['answers']:
                  contexts.append(context)
                  questions.append(question)
                  answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_dataset(train)
val_contexts, val_questions, val_answers = read_dataset(val)
test_contexts, test_questions, test_answers = read_dataset(test)

# Load Transformers

In [None]:
# load a model and a tokenizer for it
# choose the model here https://huggingface.co/models

from transformers import BertForQuestionAnswering
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
model = BertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")

# Tokenization

In [10]:
# apply Tokenizer

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [11]:
# add token positions (see 'answer_start' and 'answer_end' in the dataset) to the encoded data

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [12]:
# use encodings to wrap-up the dataset for the machine learning stage

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = Dataset(train_encodings)
val_dataset = Dataset(val_encodings)

# Machine Learning. Fine-tuning of the Transformer model on a custom dataset (domain adaptation and / or cross-lingual learning)

In [13]:
# function that saves the best model

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        util.copyfile(filename, 'model_best.pth.tar')

# use GPUs when possible

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# send the model to GPU

model.to(device)

# begin training 

model.train()

# load the processes dataset

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# apply optimizer

optim = AdamW(model.parameters(), lr=1e-5)

# training loop

for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device) 
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

# save the model, load and eval
filepath = '/content/model.pth'
torch.save(model.state_dict(), filepath)
model.load_state_dict(torch.load(filepath))
model.eval()



BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [14]:
# get access to Drive

drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
# copy model to Drive

!cp "/content/model.pth" "/content/drive/MyDrive/model/"

# Use Neural Network

In [47]:
# get model from Drive

#!cp "/content/drive/MyDrive/model/" "/content/model.pth"

# load model

model.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [48]:
#send model to CPU

model.to('cpu')

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [50]:
# generate answers to questions from the test set

answers = []

for i in range(len(test_questions)):
  question, text = test_questions[i], test_contexts[i]

  input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"

  input_ids = tokenizer.encode(input_text)

  start_scores, end_scores = model(torch.tensor([input_ids])).values()

  all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
  ans = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])

  answers.append(ans)

In [69]:
# show an example
# the BERT outputs are tokenized, change them to initial form

def detokenize(sent):
  detokenized = ""
  for token in sent.split(' '):
      if token.startswith("##"): # conatenate tokens
          detokenized += token[2:]
      else:
          detokenized += " " + token
  detokenized = re.sub(r'^[^SEP\]]*SEP\]', r' ', detokenized)
  return detokenized[1:]
  

print(
    'SAMPLE QUESTION:', test_questions[3],
    '\nGENERATED ANSWER:', detokenize(answers[3])
    )

SAMPLE QUESTION: Кто может помочь с агрессивным или самоповреждающим поведенем ребёнка. 
GENERATED ANSWER:  Без профессиональной помощи с пониманием причин агрессивного поведения и самоповреждений вашего ребёнка в спектре аутизма может оказаться трудно реализовывать стратегии реагирования на его самоповреждения .
