In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 2.8MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 16.7MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 35.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[

In [2]:
# Install dependencies
import time
import datetime
import random
from tqdm import tqdm

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

import torch
import torch.nn.functional as F
import transformers as optimus

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
# Import torch utilities to loading data
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import RandomSampler

In [9]:
tokenizer_class, pretrained_weights = (
    optimus.BertTokenizer,
    "bert-base-uncased",
)

# load the tokenizer from pre-trained model
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [17]:
# define model - BertForSequenceClassification
# using bert-base-uncased
model_class, pretrained_weights = (
    optimus.BertForSequenceClassification,
    "bert-base-uncased",
)

# loading pre-trained bert model
model = model_class.from_pretrained(pretrained_weights, num_labels=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
# configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [41]:
# val_data = pd.read_csv('./translated-hi-dev.csv')

# val_data = pd.read_csv('./translated-mr-dev.csv')

# val_data = pd.read_csv('./translated-gu-dev.csv')

val_data = pd.read_csv('./dev.csv')

print(len(val_data))

71


In [42]:
# initialize empty dataframe
val = pd.DataFrame()

# iterate each row in previously loaded train_data
# tokenizer(sent_a, sent_b) returns a dictionary of encoded values namely input_ids, attention_mask, token_type_ids
# input_ids - IDs returned by bert-tokenizer
# attention_mask - 1's for all the ids in the sentence and 0's for all padded tokens
# token_type_ids - 0's for all tokens in sent_a, 1's for all tokens in sent_b
for i, row in val_data.iterrows():
  d = {}
  encoded = tokenizer(row['sentence1'], row['sentence2'])
  d['input_ids'] = encoded['input_ids']
  d['attention_mask'] = encoded['attention_mask']
  d['token_type_ids'] = encoded['token_type_ids']
  d['label'] = row['label']
  val = val.append(d, ignore_index=True)

# padding all lists to MAX_SEQ_LENGTH
val['input_ids'] = val['input_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])
val['attention_mask'] = val['attention_mask'].apply(lambda x: x + (max_seq_len - len(x))*[0])
val['token_type_ids'] = val['token_type_ids'].apply(lambda x: x + (max_seq_len - len(x))*[0])

In [44]:
# convert all lists to pytorch tensors
input_ids = torch.tensor([r for r in val['input_ids']], dtype=torch.long)
attention_mask = torch.tensor([r for r in val['attention_mask']], dtype=torch.long)
token_type_ids = torch.tensor([r for r in val['token_type_ids']], dtype=torch.long)
label_ids = torch.tensor([r for r in val['label']], dtype=torch.long)

In [45]:
# wraping tensors into TensorDataset
# Initialing DataLoader for efficiently loading data
val_dataset = TensorDataset(input_ids, attention_mask, token_type_ids, label_ids)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [46]:
# load model to device
model = model.to(device)

In [47]:
# convert model to eval mode
model.eval()

# initialize placeholders
total_eval_accuracy = 0
total_eval_loss = 0

# iterate for all batched in the dataloader
for step, batch in enumerate(tqdm(val_dataloader)):
    # unpack values in the batch
    input_ids, input_mask, segment_ids, label_ids = batch

    # load tensors to device
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    
    # perform forward pass without tracking gradients
    with torch.no_grad():        
        loss, logits = model(input_ids=input_ids,
                    attention_mask=input_mask,
                    token_type_ids=segment_ids,
                    labels=label_ids)
    # update loss
    total_eval_loss += loss.item()

    # Output of the model is the scores for 3 classes namely None, Positive, and Negative
    # Computing softmax for finding max probablity
    logits = F.softmax(logits, dim=-1)
    # move ground truths and predictions to cpu
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    # get index of max probability
    outputs = np.argmax(logits, axis=1)

    # comapre ground truths and predictions
    total_eval_accuracy += np.sum(outputs == label_ids)

avg_val_accuracy = total_eval_accuracy / (len(val_dataloader) * val_dataloader.batch_size)
print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

avg_val_loss = total_eval_loss / (len(val_dataloader) * val_dataloader.batch_size)

100%|██████████| 3/3 [00:00<00:00,  5.09it/s]

  Accuracy: 0.43



