# Inference code for bert classifier

In [1]:
import transformers
import torch

from transformers import BertForSequenceClassification

from tqdm import tqdm as tqdm
import pandas as pd

In [2]:
## GLOBAL SETTINGS
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
label_dict = {
    0: 'contradiction',
    1: 'entailment',
    2: 'neutral'
}  # output label and there index
max_len = 128  # Max length of input sentence

In [4]:
print('Loading BERT tokenizer from transformer library...')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased',
                                                       do_lower_case=True)

print('Loading trained model for inference...')
model = BertForSequenceClassification.from_pretrained('./final_model_bert/')
model = model.to(device)

Loading BERT tokenizer from transformer library...
Loading trained model for inference...


In [5]:
# Helper function to get tokenized vectors from bert for each sentence
def get_tokernized_values(each_sen_pair):
    encoded_dict = tokenizer.encode_plus(
        text=each_sen_pair[0],  # Sentence to encode.
        text_pair=each_sen_pair[1],  # for text pair ## TODO check this??
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_len,  # Pad & truncate all sentences.
        pad_to_max_length=True,
        return_attention_mask=True,  # Construct attn. masks.
        return_tensors='pt',  # Return pytorch tensors.
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']


# Helper function to test similarity
def test_similarity(test_sentence_pair):
    sen_input_id, sen_attn_mask = get_tokernized_values(test_sentence_pair)
    sen_input_id = sen_input_id.to(device)
    sen_attn_mask = sen_attn_mask.to(device)
    outputs = model(sen_input_id,
                    token_type_ids=None,
                    attention_mask=sen_attn_mask)
    logits = outputs[0]
    logits = logits.argmax(dim=1)
    logits = logits.to('cpu').numpy()
    return label_dict[logits[0]]

In [6]:
final_test_df = pd.read_csv('../dataset/assignment_data_set/test.csv')
final_df_label = pd.DataFrame(columns=['gold_label'])

In [7]:
for index, row in tqdm(final_test_df.iterrows()):
    sen_1,sen_2 = row['sentence1'],row['sentence2']
    result = test_similarity([sen_1,sen_2])
    final_df_label.at[index, 'gold_label'] = str(result)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
493it [00:04, 100.53it/s]


In [8]:
final_df_label

Unnamed: 0,gold_label
0,entailment
1,neutral
2,entailment
3,contradiction
4,neutral
...,...
488,entailment
489,neutral
490,contradiction
491,contradiction


In [9]:
# save the final predicted classes
final_df_label.to_csv('predicted.csv',index=False)

In [None]:
# pd.read_csv('./predicted.csv')