<center><h2>ALTeGraD 2022<br>Lab Session 2: Transfer learning for NLP</h2> 27 / 10 / 2022<br> M. Kamal Eddine, H. Abdine<br><br>


<b>Student name:</b> Zakaria El Founoun

</center>

<br><br>
In this lab we will:
* Implement and pretrain a language model with transformer architecture.
* Use the pretrained model (transfer learning) to perform a sentiment analysis task which consists of classifying some books reviews into positive and negative ones.
* Compare the performance of the pretrained model to a model trained from scratch.
 <br>
 
<b>The deadline for this lab is November 14, 2022 11:59 PM.</b> More details about the submission and the architecture for this lab can be found in the handout PDF.

In [1]:
import math

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### The Model

In [2]:
class TransformerModel(nn.Module):
    def __init__(self, ntoken, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        '''
        ntokens: the size of vocabulary
        nhid: the hidden dimension of the model.
        We assume that embedding_dim = nhid
        nlayers: the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        nhead: the number of heads in the multiheadattention models
        dropout: the dropout value
         '''
        self.model_type = "Transformer"
        self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=nhid) # fill me, nhid = the dim_embed
        self.pos_encoder = PositionalEncoding(nhid, dropout) #fill me, the PositionalEncoding class is implemented in the next cell
        encoder_layers = nn.TransformerEncoderLayer(nhid, nhead, nhid) #fill me we assume nhid = d_model = dim_feedforward
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers,nlayers) #fill me
        self.nhid = nhid
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (
            mask.float()
            .masked_fill(mask == 0, float("-inf"))
            .masked_fill(mask == 1, float(0.0))
        )
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.nhid) 
        src = self.pos_encoder(src) #fill me
        output = self.transformer_encoder(src, src_mask) #fill me
        return output


class ClassificationHead(nn.Module):
    def __init__(self, nhid, nclasses):
        super(ClassificationHead, self).__init__()
        self.decoder = nn.Linear(nhid, nclasses)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        output = self.decoder(src)
        return output
    
class Model(nn.Module):
    def __init__(self, ntoken, nhead, nhid, nlayers, nclasses, dropout=0.5):
        super(Model, self).__init__()
        self.base = TransformerModel(ntoken, nhead, nhid, nlayers) #fill me
        self.classifier = ClassificationHead(nhid, nclasses) #fill me 

    def forward(self, src, src_mask):
        # base model
        x = self.base(src, src_mask) #fill me
        # classifier model
        output = self.classifier(x) #fill me
        return output

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, nhid, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, nhid)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, nhid, 2).float() * (-math.log(10000.0) / nhid)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

Let's verify if our model works, by applying one inference step

In [4]:
ntokens = 100 # the size of vocabulary
nhid = 200  # hidden dimension
nlayers = 4  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0  # the dropout value

model = Model(ntokens, nhead, nhid, nlayers, ntokens, dropout).to(device)
dummy_input = torch.tensor([[2, 6, 2, 5, 43, 21]]).to(device)
src_mask = model.base.generate_square_subsequent_mask(1).to(device)
out = model.forward(dummy_input, src_mask)

print(out.shape) # is it the right shape? yes 

torch.Size([1, 6, 100])


## Vocabulary and Tokenization

In [5]:
# !wget https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/dict.txt
# !head -5 dict.txt
#!pip install wget
import wget
wget.download( "https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/dict.txt")

'dict (5).txt'

In [6]:
path_vocab = "dict.txt"
token2ind = {"<sos>": 0, "<pad>": 1, "<eos>": 2, "<oov>": 3} # the 4 first indices are reserved to special tokens
with open(path_vocab, "r", encoding="utf8") as f: # setting encoding method to "utf8"
    for idx, line in enumerate(f):
        word = line.split()[0].strip()
        token2ind[word] = idx+4 #fill me

ind2token = {v:k for k,v in token2ind.items()} #fill me

print(ind2token[1111])

▁trop


### Data Loader


In [7]:
import numpy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset


class Dataset(Dataset):
    def __init__(
        self,
        path_documents,
        path_labels=None,
        token2ind={},
        max_len=512,
        task="language_modeling",
    ):
        self.task = task
        self.max_len = max_len
        self.token2ind = token2ind
        self.documents = []
        self.labels = []
        with open(path_documents, "r", encoding= "utf8") as f1:
            for line in f1:
                self.documents.append(line.strip())
        if task == "classification":
            with open(path_labels, "r" ,encoding="utf8") as f1:
                for line in f1:
                    self.labels.append(int(line.strip()))
            assert len(self.labels) == len(self.documents)

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, index):
        sequence = self.documents[index].split()
        if len(sequence) > self.max_len - 1:
            sequence = sequence[: self.max_len - 1]
        source_sequence = [self.token2ind.get(word,self.token2ind["<oov>"]) for word in sequence[:self.max_len] ] #fill me (constract the input sequence using token2ind, sequence and special tokens)
        if self.task == "language_modeling":
            target = source_sequence[1:]
            target.append(self.token2ind["<eos>"])
        elif self.task == "classification":
            target = [self.labels[index]]
        sample = {
            "source_sequence": torch.tensor(source_sequence),
            "target": torch.tensor(target),
        }
        return sample


def MyCollator(batch):
    source_sequences = pad_sequence(
        #we use padding to match the length of the sequences in the same batch
        [sample["source_sequence"] for sample in batch], padding_value=token2ind["<pad>"]
    )
    target = pad_sequence(
        [sample["target"] for sample in batch], padding_value=token2ind["<pad>"]
    )
    return source_sequences, target.reshape(-1)


def get_loader(
    path_documents,
    path_labels=None,
    token2ind={},
    max_len=512,
    batch_size=32,
    task="language_modeling",
):
    dataset = Dataset(
        path_documents,
        path_labels=path_labels,
        token2ind=token2ind,
        max_len=512,
        task=task,
    )
    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=MyCollator,
        pin_memory=True,
        drop_last=True,
    )
    return data_loader

## The Training

In [8]:
def train(
    path_data_train,
    path_labels_train=None,
    path_data_valid=None,
    save_interval=-1,
    log_interval=5,
    task="language_modeling",
    batch_size=32,
):
    model.train()
    total_loss = 0.0
    ntokens = len(token2ind)
    data_loader = get_loader(
        path_data_train,
        path_labels_train,
        token2ind,
        task=task,
        batch_size=batch_size,
    )
    
    losses = []
    for idx, data in enumerate(data_loader): #step 1
        optimizer.zero_grad()
        src_mask = model.base.generate_square_subsequent_mask(data[0].size(0)).to(
            device
        )
        input = data[0].to(device)
        output = model(input, src_mask) #step 2
        if task == 'classification':
            #last vector only
            output = output[-1] #fill me 
        output = output.view(-1, output.shape[-1])
        target = data[1] #fill me
        target = target.to(device)
        loss = criterion(output, target) #fill me, Cross entropy check next cells
        #fill me step 3
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) # prevent exploding gradient 
        #fill me step 4
        optimizer.step()
        total_loss += loss.item() 
        if idx % log_interval == 0 and idx > 0:
            cur_loss = total_loss / log_interval
            print(
                "| epoch {:3d} | {:5d}/{:5d} steps | "
                "loss {:5.5f} | ppl {:8.3f}".format(
                    epoch, idx, len(data_loader), cur_loss, math.exp(cur_loss),
                )
            )
            losses.append(cur_loss)
            total_loss = 0
    return losses

In [9]:
ntokens = len(ind2token)#fill me # the size of vocabulary
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0  # the dropout value

nclasses = 2 # for classification task only

model = Model(ntokens, nhead, nhid, nlayers, ntokens, dropout).to(device)

In [10]:
# optimization paramerters

criterion = nn.CrossEntropyLoss(ignore_index=token2ind['<pad>'])
lr = 0.0003  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [11]:
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretraining_subset.txt")
path_data_train = "pretraining_subset.txt"

In [12]:
import torch
torch.cuda.empty_cache()

In [13]:
#pretraining on a tiny subset
log_interval = 500
epochs = 2
for epoch in range(1, epochs + 1): #5
    train(
        path_data_train,
        save_interval=-1,
        task= "language_modeling", # fill me
        batch_size=16,  
        log_interval=log_interval,
    )

| epoch   1 |   500/ 3125 steps | loss 7.50133 | ppl 1810.449
| epoch   1 |  1000/ 3125 steps | loss 6.80184 | ppl  899.498
| epoch   1 |  1500/ 3125 steps | loss 6.56338 | ppl  708.664
| epoch   1 |  2000/ 3125 steps | loss 6.36743 | ppl  582.556
| epoch   1 |  2500/ 3125 steps | loss 6.22581 | ppl  505.630
| epoch   1 |  3000/ 3125 steps | loss 6.16712 | ppl  476.811
| epoch   2 |   500/ 3125 steps | loss 5.98057 | ppl  395.666
| epoch   2 |  1000/ 3125 steps | loss 5.89746 | ppl  364.111
| epoch   2 |  1500/ 3125 steps | loss 5.86114 | ppl  351.122
| epoch   2 |  2000/ 3125 steps | loss 5.81006 | ppl  333.639
| epoch   2 |  2500/ 3125 steps | loss 5.78791 | ppl  326.331
| epoch   2 |  3000/ 3125 steps | loss 5.76424 | ppl  318.698


## Text Generation


In [22]:
!pip install sentencepiece



Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-win_amd64.whl (1.1 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97





In [17]:
if not os.path.isfile("pretrained_model_4layers.pt"):
    wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/pretrained_model_4layers.pt")

model = Model(ntokens, nhead, nhid, nlayers, ntokens).to(device) 

#load the checkpoint
checkpoint = torch.load('pretrained_model_4layers.pt', map_location=torch.device('cpu')) 
#load state dict
model.load_state_dict(checkpoint['model_state_dict']) 

<All keys matched successfully>

In [23]:
# !pip install sentencepiece   # uncomment this if you are using google colab
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/sentencepiece.french.model")
import sentencepiece as spm

s = spm.SentencePieceProcessor(model_file='sentencepiece.french.model') #load sentencepiece model

#examples
encoded = s.encode_as_pieces("Bonjour les amis!")
decoded = s.decode_pieces(encoded)
print(encoded)
print(decoded)

['▁Bonjour', '▁les', '▁amis', '!']
Bonjour les amis!


In [25]:
def infer_next_token(sent):
    model.eval()
    sent_pieces = s.encode_as_pieces(sent)
    source = [token2ind['<sos>']] + [token2ind[el] for el in sent_pieces] # list of tokens
    source = torch.tensor(source).to(device)
    source = source.reshape(-1, 1)
    src_mask = model.base.generate_square_subsequent_mask(source.size(0)).to(device)
    out = model(source, src_mask)
    next_token_ind =  torch.argmax(out[-1,0]).item() #fill me
    return next_token_ind, out
    
def infer_next_tokens(sent, max_len=50):
    # to be implemented
    new_sent=s.encode_as_pieces(sent)
    while len(new_sent)<max_len and new_sent[-1]!="<eos>":
        new_sent+=[ind2token[infer_next_token(s.decode_pieces(new_sent))[0]]]
    return s.decode_pieces(new_sent)

In [26]:
sent = "Bonjour les"
infer_next_tokens(sent)

'Bonjour les gens qui ont été très accueillants et sympathiques.<eos>'

### Supervised task

In [27]:
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/train.review.spm")
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/train.label")
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/test.review.spm")
wget.download("https://raw.githubusercontent.com/moussaKam/transfer_learning_transformers/main/cls-books/test.label")


path_data_train = "train.review.spm"
path_labels_train = "train.label"

path_data_valid = "test.review.spm"
path_labels_valid = "test.label"

In [28]:
# a function to evaluate the validation accuracy of the model.
def evaluate_accuracy(data_loader):
    #to be implemented
    model.eval() 
    all_corr=0
    size=0
    with torch.no_grad(): # Only testing :)
        for id,data in enumerate(data_loader):
            src_mask=model.base.generate_square_subsequent_mask(data[0].size(0)).to(device)
            output=model(data[0].to(device),src_mask)[-1]
            output=torch.argmax(output,dim=1)
            target=data[1].to(device)
            corr=torch.sum((target==output)).item()
            all_corr+=corr
            size+=len(target)
    model.train() 
    return all_corr/size

In [29]:
#save the base model to be loaded later in the fine-tuning phase
torch.save({"model_state_dict": model.base.state_dict(),}, "pretrained_model_4layers_no_class_head.pt")

In [30]:
from_scratch_settings = [True, False]

from_scratch_valid_acc = []
pretrained_valid_acc = []
lr = 0.0001

for from_scratch in from_scratch_settings:
    model = Model(ntokens, nhead, nhid, nlayers, 2, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    if not from_scratch:
        print("=====PRETRAINED MODEL======")
        #load checkpoint
        checkpoint = torch.load("pretrained_model_4layers_no_class_head.pt")
        #load state dict
        model.base.load_state_dict(checkpoint['model_state_dict'])
    else:
        print("=====Trainig FROM SCRATCH======")
    epochs = 15
    for epoch in range(1, epochs + 1):
        train(
            path_data_train,
            path_labels_train,
            save_interval=-1,
            task='classification',
            batch_size=8,
            log_interval=50,
        )
        acc = evaluate_accuracy(
            get_loader(
                path_data_valid,
                path_labels_valid,
                token2ind=token2ind,
                batch_size=20,
                task='classification',
            )
        )
        if from_scratch:
            from_scratch_valid_acc.append(acc)
        else:
            pretrained_valid_acc.append(acc)
    print()

| epoch   1 |    50/  200 steps | loss 0.76297 | ppl    2.145
| epoch   1 |   100/  200 steps | loss 0.74000 | ppl    2.096
| epoch   1 |   150/  200 steps | loss 0.76083 | ppl    2.140
| epoch   2 |    50/  200 steps | loss 0.73135 | ppl    2.078
| epoch   2 |   100/  200 steps | loss 0.72550 | ppl    2.066
| epoch   2 |   150/  200 steps | loss 0.71673 | ppl    2.048
| epoch   3 |    50/  200 steps | loss 0.70282 | ppl    2.019
| epoch   3 |   100/  200 steps | loss 0.68453 | ppl    1.983
| epoch   3 |   150/  200 steps | loss 0.64560 | ppl    1.907
| epoch   4 |    50/  200 steps | loss 0.62261 | ppl    1.864
| epoch   4 |   100/  200 steps | loss 0.57179 | ppl    1.771
| epoch   4 |   150/  200 steps | loss 0.52461 | ppl    1.690
| epoch   5 |    50/  200 steps | loss 0.42974 | ppl    1.537
| epoch   5 |   100/  200 steps | loss 0.39437 | ppl    1.483
| epoch   5 |   150/  200 steps | loss 0.34617 | ppl    1.414
| epoch   6 |    50/  200 steps | loss 0.31978 | ppl    1.377
| epoch 

In [36]:
#Visualize the accuracy
import numpy as np
import pandas as pd
import plotly.express as px
history = pd.DataFrame({'epochs': np.arange(1,16), 'Model trained from scratch': from_scratch_valid_acc, 'Pretrained model': pretrained_valid_acc})
px.line(history, x = 'epochs', y = ['Model trained from scratch', 'Pretrained model'], title= "Accuracy")