# Training and validation Data 

In [0]:
import json
from pandas import json_normalize
import pandas as pd
import numpy as np

In [0]:
def formatting_squad(dataset_file, display=False):
    with open(dataset_file, encoding='utf-8') as f:
        # print(json.dumps(f))
        raw_data = json.load(f)
        raw_data = raw_data['data']
        raw_data = json_normalize(raw_data)['paragraphs']
        context = []
        question = []
        answer_start = []
        text = []
        for i in range(len(raw_data)): # paragraphs
            for j in range(len(raw_data[i])): #qas
                for k in range(len(raw_data[i][j]['qas'])):
                    if (len(raw_data[i][j]['qas'][k]['answers']) != 0):
                        question.append(raw_data[i][j]['qas'][k]['question'])
                        answer_start.append(raw_data[i][j]['qas'][k]['answers'][0]['answer_start'])
                        text.append(raw_data[i][j]['qas'][k]['answers'][0]['text'])
                        context.append(raw_data[i][j]['context'])
                    else:
                        continue
        data = pd.DataFrame({"context":context, "question": question, "answer_start": answer_start, "text": text})
#         print(data.head())
        if display is True:
            print(data.shape)
        return data

In [0]:
train_data = formatting_squad(r"train.json")
train_data = train_data[train_data['context'].apply(lambda x: len(x)<500)].reset_index(drop=True)
# print(train_data.shape)
# train_data.head(3)

In [0]:
valid_data = formatting_squad(r"valid.json")
valid_data = valid_data.reset_index(drop=True)
# print(valid_data.shape)
# valid_data.head(3)

# Defining the model

In [5]:
!pip install transformers



In [0]:
import transformers
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
from transformers.optimization import AdamW
from tqdm import trange, tqdm_notebook

In [0]:
from transformers.modeling_camembert import CamembertModel
from transformers.tokenization_camembert import CamembertTokenizer

## BILSTM

In [0]:
# import torch
from torch.autograd import Variable
# import torch.nn as nn
import torch.nn.functional as F

class BILSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, use_gpu , batch_size, dropout):
        super(BILSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.use_gpu  = torch.cuda.is_available()
        self.batch_size = batch_size
        self.dropout = dropout
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, bidirectional=True)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        if self.use_gpu:
            return (Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda()),
                    Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda()))
        else:
            return (Variable(torch.zeros(2, self.batch_size, self.hidden_dim)),
                    Variable(torch.zeros(2, self.batch_size, self.hidden_dim)))

    def forward(self, x):
        # x = self.embeddings(sentence).view(len(sentence), self.batch_size, -1)
        output, hidden = self.lstm(x)
        output, _ = self.lstm(x, hidden)
        # lstm_out, self.hidden = self.lstm(x.view(512, self.batch_size, -1), self.hidden)
        return output
        # y = self.hidden2label(lstm_out[-1])
        # log_probs = F.log_softmax(y)
        # return log_probs

## CamemBERT

In [0]:
class CamemBERTQA(nn.Module):
    def __init__(self,bert_type, hidden_size, num_labels, n_blocks = 2, n_heads = 12, dropout=0.1):
        super(CamemBERTQA, self).__init__()
        self.bert_type = bert_type
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.dropout = dropout
        self.camembert = CamembertModel.from_pretrained(self.bert_type)
        use_gpu  = torch.cuda.is_available()

        self.bilstm = BILSTM(embedding_dim=768, hidden_dim=768, use_gpu=use_gpu, batch_size = 5, dropout=self.dropout)
        self.fc = nn.Linear(self.hidden_size * 2, hidden_size)

        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, mask=None):
        output = self.camembert(input_ids = input_ids) # input_ids is a tensor
        
        sequence_output = self.fc(self.bilstm(output[0])) # BILSTM
        
        logits = self.qa_outputs(sequence_output) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
        start_logits = start_logits.squeeze(-1)  #(None, seq_len)
        end_logits = end_logits.squeeze(-1)    #(None, seq_len)
        outputs = (start_logits, end_logits,) 
        return outputs

### Loss function

In [0]:
def loss_func(out, s_target, e_target):
    criterion = nn.CrossEntropyLoss()
    s_loss = criterion(out[0], s_target)
    e_loss = criterion(out[1], e_target)
    total_loss = s_loss+e_loss
    return total_loss

## Dataloader for train and eval set


In [0]:
class BertDatasetModule(Dataset):
    def __init__(self, tokenizer, context, question, max_length, text):
        self.context = context
        self.question = question
        self.text = text
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.context)
  
    def __getitem__(self, idx):
        context_ = self.context[idx]
        question_ = self.question[idx]
        text_ = self.text[idx]
    
        #encoding
        input_ids = self.tokenizer.encode(question_, context_)
        answer_ids = self.tokenizer.encode(text_)
        token_type_ids = [0 if i <= input_ids.index(6) else 1 for i in range(len(input_ids))]
    
        #calculating start and end position of the answer in input_ids
        s_pos, e_pos = 0, 0
        for i in range(len(input_ids)):
            if (input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]):
                s_pos = i
                e_pos = i + len(answer_ids[1:-1]) - 1
                break

        assert((s_pos<len(input_ids)) & (e_pos<len(input_ids)) & (s_pos<=e_pos))

        if (len(input_ids) < self.max_length):
            padding_len = self.max_length - len(input_ids)
            ids = input_ids + ([0]*padding_len)
        else:
            ids = input_ids[:self.max_length]

        if (len(token_type_ids)<self.max_length):
            padding_len = self.max_length - len(token_type_ids)
            token_ids = token_type_ids  + ([1]*padding_len)
        else:
            token_ids = token_type_ids[:self.max_length]

        return {'ids': torch.tensor(ids, dtype = torch.long),
                'token_type_ids': torch.tensor(token_ids, dtype = torch.long),
                'start_pos': torch.tensor(s_pos, dtype = torch.long),
                'end_pos': torch.tensor(e_pos, dtype = torch.long)}

## Training and evaluation function


In [0]:
def train_loop(dataloader, model, optimizer, device, max_grad_norm, scheduler=None):
    model.train()
    for bi, d in enumerate(tqdm_notebook(dataloader, desc="Iteration")):
        ids = d['ids']
        # mask_ids = d['mask']
        token_ids = d['token_type_ids']
        start_pos = d['start_pos']
        end_pos = d['end_pos']

        ids = ids.to(device, dtype = torch.long)
        # mask_ids = mask_ids.to(device, dtype = torch.long)
        token_ids = token_ids.to(device, dtype = torch.long)
        start_pos = start_pos.to(device, dtype = torch.long)
        end_pos = end_pos.to(device, dtype = torch.long)

        optimizer.zero_grad()
        start_and_end_scores = model(ids)
        # start_scores, end_scores = model(ids, token_ids)
        loss = loss_func(start_and_end_scores, start_pos, end_pos)
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi%100==0:
            print (f"bi: {bi}, loss: {loss}")

In [0]:
def eval_loop(dataloader, model, device):
    model.eval()
    pred_s = None
    pred_e = None
    eval_loss = 0.0
    eval_steps = 0

    for bi, d in enumerate(dataloader):
        ids = d['ids']
        # mask_ids = d['mask']
        token_ids = d['token_type_ids']
        start_pos = d['start_pos']
        end_pos = d['end_pos']

        ids = ids.to(device, dtype = torch.long)
        # mask_ids = mask_ids.to(device, dtype = torch.long)
        token_ids = token_ids.to(device, dtype = torch.long)
        start_pos = start_pos.to(device, dtype = torch.long)
        end_pos = end_pos.to(device, dtype = torch.long)

        with torch.no_grad():
            start_and_end_scores = model(ids)
#             start_and_end_scores = model(ids, token_ids)

            loss = loss_func(start_and_end_scores, start_pos, end_pos)
            eval_loss += loss.mean().item()

        eval_steps += 1
        if pred_s is None:
            pred_s = start_and_end_scores[0].detach().cpu().numpy()
            pred_e = start_and_end_scores[1].detach().cpu().numpy()
        else:
            pred_s = np.append(pred_s, start_and_end_scores[0].detach().cpu().numpy(), axis=0)
            pred_e = np.append(pred_e, start_and_end_scores[1].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss/eval_steps
    pred_start = np.argmax(pred_s, axis=1)
    pred_end = np.argmax(pred_e, axis=1)

    return eval_loss, pred_start, pred_end

## Configuration

In [0]:
MAX_SEQ_LENGTH = 512
TRAIN_BATCH_SIZE = 8  # 8, 16 or 32
EVAL_BATCH_SIZE = 8 # 8, 16 or 32
LEARNING_RATE = 2e-5 # 1e-5, 5e-5
NUM_TRAIN_EPOCHS = 4 # or 2, 3 or 4
BERT_TYPE = "camembert-base" #"fmikaelian/camembert-base-fquad"
max_grad_norm = 1.0

In [0]:
tokenizer = CamembertTokenizer.from_pretrained(BERT_TYPE, do_lower_case = True)

train_dataset = BertDatasetModule(
    tokenizer = tokenizer,
    context = train_data['context'],
    question = train_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = train_data['text']
)

train_dataloader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, shuffle=True)

In [0]:
eval_dataset = BertDatasetModule(
    tokenizer = tokenizer,
    context = valid_data['context'],
    question = valid_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = valid_data['text']
)

eval_dataloader = DataLoader(eval_dataset, batch_size = EVAL_BATCH_SIZE, shuffle=False)

In [22]:
# import transformers
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# n_blocks is the number of encoderBlocks in the transformer
model = CamemBERTQA(bert_type = BERT_TYPE, hidden_size = 768, num_labels = 2, n_blocks = 1, n_heads = 12).to(device) 
optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, correct_bias = False)

NUM_TRAIN_STEPS = int(len(train_dataset)/TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 
scheduler = transformers.get_constant_schedule_with_warmup(
                optimizer,
                num_warmup_steps=500,
                last_epoch=-1)

cuda


In [18]:
NUM_TRAIN_STEPS

918

In [19]:
model.eval()

CamemBERTQA(
  (camembert): CamembertModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

## Training Iterations


In [0]:
#training
for epoch in trange(NUM_TRAIN_EPOCHS):
    train_loop(train_dataloader, model, optimizer, device, max_grad_norm, scheduler)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=307.0, style=ProgressStyle(description_wi…

bi: 0, loss: 12.450721740722656


## Evaluation

In [0]:
res = eval_loop(eval_dataloader, model, device)
print("Evaluation loss: "+str(res[0]))

Token indices sequence length is longer than the specified maximum sequence length for this model (10 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (13 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (13 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (10 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (15 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this mod

Evaluation loss: 8.871234133727569


## Testing the model


In [0]:
dev_data = formatting_squad(r"test.json")
print(dev_data.shape)
dev_data.head(3)

(3188, 4)


Unnamed: 0,context,question,answer_start,text
0,Les deux tableaux sont certes décrits par des ...,Que concerne principalement les documents ?,161,La Vierge aux rochers
1,Les deux tableaux sont certes décrits par des ...,Par quoi sont décrit les deux tableaux ?,46,documents contemporains
2,Les deux tableaux sont certes décrits par des ...,Quels types d'objets sont les deux tableaux au...,204,objets de spéculations


In [0]:
context_ = dev_data['context']
question_ = dev_data['question']
text_ = dev_data['text']
pred_start = res[1]
pred_end = res[2]
res_text_ = []
act_start = []
act_end = []

input_ids_list = list(map(lambda x,y: tokenizer.encode(x, y), question_, context_))
answer_ids_list = list(map(lambda x: tokenizer.encode(x), text_))

for i in range(len(input_ids_list)):
    res_text_.append(tokenizer.decode(input_ids_list[i][pred_start[i]:pred_end[i]+1]))
    s_pos, e_pos = 0, 0
    for j in range(len(input_ids_list[i])):
        if (input_ids_list[i][j: j+len(answer_ids_list[i][1:-1])] == answer_ids_list[i][1:-1]):
            s_pos = j
            e_pos = j + len(answer_ids_list[i][1:-1]) - 1
            break
    act_start.append(s_pos)
    act_end.append(e_pos)

Token indices sequence length is longer than the specified maximum sequence length for this model (10 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (13 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (13 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (10 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (15 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this mod

In [0]:
dev_data['start_pos'] = act_start
dev_data['end_pos'] = act_end
dev_data['predicted_text'] = res_text_
dev_data['predicted_start_pos'] = pred_start
dev_data['predicted_end_pos'] = pred_end

In [0]:
show_columns = ['text', 'predicted_text', 'start_pos', 'end_pos', 'predicted_start_pos', 'predicted_end_pos']
dev_data[show_columns].head(20)

Unnamed: 0,text,predicted_text,start_pos,end_pos,predicted_start_pos,predicted_end_pos
0,La Vierge aux rochers,la vierge aux rochers,39,42,39,42
1,documents contemporains,documents contemporains à leur création,20,21,20,24
2,objets de spéculations,la vierge aux rochers,56,59,47,50
3,droite,jambe droite,62,62,61,62
4,gauche,leur pied gauche,63,63,61,63
5,vert,vert,176,176,176,176
6,atelier de Léonard de Vinci,l'atelier de léonard de vinci,0,0,26,35
7,séchage,processus de séchage,126,126,124,126
8,doigts,doigts,91,91,91,91
9,La Vierge aux rochers,<s> quel est le nom du panneau central du reta...,0,0,0,67


In [0]:
cond1 = dev_data['predicted_start_pos']>dev_data['predicted_end_pos']
cond2 = dev_data['end_pos']<dev_data['predicted_start_pos']
cond3 = dev_data['start_pos']>dev_data['predicted_end_pos']

incorrect_pred = dev_data[(cond1) | (cond2) | (cond3)].shape[0]
incorrect_pred

910

In [0]:
t = dev_data.shape[0]
print(f"accuracy = {(t - incorrect_pred)*100/t}")

accuracy = 71.45545796737767


In [0]:
dev_data[(cond1) | (cond2) | (cond3)][show_columns].head()


Unnamed: 0,text,predicted_text,start_pos,end_pos,predicted_start_pos,predicted_end_pos
2,objets de spéculations,la vierge aux rochers,56,59,47,50
6,atelier de Léonard de Vinci,l'atelier de léonard de vinci,0,0,26,35
9,La Vierge aux rochers,(la vierge aux rochers),0,0,63,68
40,adhérence,problèmes d'adhérence,0,0,33,36
44,la tête,,74,75,74,45


In [0]:
csv_incorrect = dev_data[(cond1) | (cond2)| (cond3)][show_columns]
csv_correct = dev_data.drop(csv_incorrect.index)[show_columns]

In [0]:
csv_incorrect.to_csv('csv_incorrect.csv')
csv_correct.to_csv('csv_correct.csv')