*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of MultiNLI Sentences using Multiple Transformer Models

In [1]:
import json
import os
import sys
import json
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
import scrapbook as sb
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils_nlp.common.timer import Timer
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.models.transformers.sequence_classification import (
    Processor, SequenceClassifier)

  import pandas.util.testing as tm


## Introduction
In this notebook, we fine-tune and evaluate a number of pretrained models on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.

We use a [sequence classifier](../../utils_nlp/models/transformers/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/transformers) of different transformers, like [BERT](https://github.com/google-research/bert), [XLNet](https://github.com/zihangdai/xlnet), and [RoBERTa](https://github.com/pytorch/fairseq).

In [14]:
# notebook parameters
# Only the path strings are kept below; the TemporaryDirectory objects are not
# retained. NOTE(review): per the tempfile docs the directory is removed when
# the object is finalized, so these paths presumably get (re)created by
# whatever writes to them -- confirm.
DATA_FOLDER = TemporaryDirectory().name  # not used by any active cell in this notebook
CACHE_DIR = TemporaryDirectory().name  # passed as cache_dir to Processor and SequenceClassifier
NUM_EPOCHS = 1  # passed as num_epochs to classifier.fit
BATCH_SIZE = 16  # batch_size for the training and prediction dataloaders
NUM_GPUS = 1  # forwarded to dataloader_from_dataset, fit and predict
MAX_LEN = 100  # passed as max_len to processor.dataset_from_dataframe
TRAIN_DATA_FRACTION = 1  # NOTE(review): not used by any active cell; the sampling cell hard-codes its fraction
TEST_DATA_FRACTION = 1  # not used by any active cell
TRAIN_SIZE = 0.95  # train_size for train_test_split
LABEL_COL = "label"  # column holding the class label
TEXT_COL = "response"  # column holding the input text
# Printed below; the active fine-tuning cell sets its own model_name instead of
# iterating over this list.
MODEL_NAMES = ["distilbert-base-uncased", "roberta-base", "xlnet-base-cased"]

## Read Dataset
We start by loading the training data from a local `train.jsonl` file (one JSON object per line, with `label` and `response` fields). The file is not downloaded automatically; it must already be present in the working directory.

The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.

For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.

In [15]:
# Load the training set from a local JSON Lines file: one JSON object per line,
# from which the "label" and "response" fields are read.
# utf-8 is requested explicitly because the text contains non-ASCII characters
# (e.g. emoji); without it, decoding depends on the platform default encoding.
train_records = []
with open("train.jsonl", encoding="utf-8") as train_file:
    for raw_line in train_file:
        if not raw_line.strip():
            continue  # tolerate blank lines, e.g. an extra newline at end of file
        record = json.loads(raw_line)
        # Drop every '@USER ' token from the response text.
        cleaned_text = record["response"].replace("@USER ", "")
        train_records.append([record["label"], cleaned_text])

df = pd.DataFrame(data=train_records, columns=["label", "response"])

In [16]:
# Inspect whether the training DataFrame loaded: as the last expression of the
# cell, `df` is rendered as the cell output.
df

Unnamed: 0,label,response
0,SARCASM,I don't get this .. obviously you do care or y...
1,SARCASM,trying to protest about . Talking about him an...
2,SARCASM,He makes an insane about of money from the MOV...
3,SARCASM,Meanwhile Trump won't even release his SAT sco...
4,SARCASM,Pretty Sure the Anti-Lincoln Crowd Claimed Tha...
...,...,...
4995,NOT_SARCASM,You don't . I have purchased a lot on Amazon (...
4996,NOT_SARCASM,#Emotions you say 🤔 never knew that I think I ...
4997,NOT_SARCASM,"You are so right ... "" Yes ! #Silence is not #..."
4998,NOT_SARCASM,Another lazy delusional voter who takes the wo...


In [17]:
# Preview the first rows of the two columns used for training (label, text).
df[[LABEL_COL, TEXT_COL]].head()

Unnamed: 0,label,response
0,SARCASM,I don't get this .. obviously you do care or y...
1,SARCASM,trying to protest about . Talking about him an...
2,SARCASM,He makes an insane about of money from the MOV...
3,SARCASM,Meanwhile Trump won't even release his SAT sco...
4,SARCASM,Pretty Sure the Anti-Lincoln Crowd Claimed Tha...


We split the data for training and testing, sample a fraction for faster execution, and encode the class labels:

In [18]:
# split
# TRAIN_SIZE of the rows go to df_train, the remainder to df_test; the fixed
# random_state keeps the partition the same on every run.
df_train, df_test = train_test_split(
    df,
    train_size=TRAIN_SIZE,
    random_state=0,
)



In [22]:
# sample
# Shuffle and subsample the training rows. A fixed random_state makes the draw
# reproducible across kernel restarts (it was unseeded before).
# NOTE(review): the fraction is hard-coded to 0.95 instead of using
# TRAIN_DATA_FRACTION -- confirm which value is intended.
# NOTE: df_train is reassigned from itself, so re-running this cell without
# re-running the split cell shrinks the training set again.
df_train = df_train.sample(frac=0.95, random_state=0).reset_index(drop=True)
# df_test = df_test.sample(frac=TEST_DATA_FRACTION).reset_index(drop=True)

The examples in the dataset are labeled with one of 2 classes (`SARCASM` or `NOT_SARCASM`):

In [23]:
# Count how many training rows carry each label value.
df_train[LABEL_COL].value_counts()

1    2270
0    2242
Name: label, dtype: int64

In [None]:
# encode labels
# Fit the encoder on the training labels and overwrite the label column with
# the resulting integer ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_train[LABEL_COL])
df_train[LABEL_COL] = encoded_labels
# df_test[LABEL_COL] = label_encoder.transform(df_test[LABEL_COL])

# Number of distinct classes, used to size the classifier head.
num_labels = len(np.unique(encoded_labels))

In [28]:
# Summarise the encoded training set: one line per (message template, value).
summary_lines = (
    ("Number of unique labels: {}", num_labels),
    ("Number of training examples: {}", df_train.shape[0]),
)
for template, value in summary_lines:
    print(template.format(value))
# print("Number of testing examples: {}".format(df_test.shape[0]))

Number of unique labels: 2
Number of training examples: 4512


## Select Pretrained Models

Several pretrained models have been made available by [Hugging Face](https://github.com/huggingface/transformers). For text classification, the following pretrained models are supported.

In [57]:
# Show the model names reported by SequenceClassifier.list_supported_models()
# as a one-column table.
pd.DataFrame({"model_name": SequenceClassifier.list_supported_models()})

Unnamed: 0,model_name
0,albert-base-v1
1,albert-base-v2
2,albert-large-v1
3,albert-large-v2
4,albert-xlarge-v1
...,...
68,xlm-roberta-large-finetuned-conll02-spanish
69,xlm-roberta-large-finetuned-conll03-english
70,xlm-roberta-large-finetuned-conll03-german
71,xlnet-base-cased


## Fine-tune

Our wrappers make it easy to fine-tune different models in a unified way, hiding the preprocessing details that are needed before training. In this example, we're going to select the following models and use the same piece of code to fine-tune them on our genre classification task. Note that some models were pretrained on multilingual datasets and can be used with non-English datasets.

In [58]:
# Echo the model names configured in the parameters cell.
print(MODEL_NAMES)

['distilbert-base-uncased', 'roberta-base', 'xlnet-base-cased']


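For each pretrained model, we preprocess the data, fine-tune the classifier, score the test set, and store the evaluation results. Note: the loop in the next cell is currently commented out, so `results` is not populated when the notebook is run from a fresh kernel.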

In [59]:
# results = {}
# result_doc = {}
# for model_name in tqdm(MODEL_NAMES, disable=True):

#     # preprocess
#     processor = Processor(
#         model_name=model_name,
#         to_lower=model_name.endswith("uncased"),
#         cache_dir=CACHE_DIR,
#     )
#     train_dataset = processor.dataset_from_dataframe(
#         df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN
#     )
#     train_dataloader = dataloader_from_dataset(
#         train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True
#     )
#     test_dataset = processor.dataset_from_dataframe(
#         df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN
#     )
#     test_dataloader = dataloader_from_dataset(
#         test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False
#     )

#     # fine-tune
#     classifier = SequenceClassifier(
#         model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR
#     )
#     with Timer() as t:
#         classifier.fit(
#             train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,
#         )
#     train_time = t.interval / 3600

#     # predict
#     preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)

#     # eval
#     accuracy = accuracy_score(df_test[LABEL_COL], preds)
#     class_report = classification_report(
#         df_test[LABEL_COL], preds, target_names=label_encoder.classes_, output_dict=True
#     )

#     # save results
#     results[model_name] = {
#         "accuracy": accuracy,
#         "f1-score": class_report["macro avg"]["f1-score"],
#         "time(hrs)": train_time,
#     }

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=442.0), HTML(value='')))





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))




	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=481.0), HTML(value='')))





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898823.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=501200538.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=760.0), HTML(value='')))





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=798011.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=467042463.0), HTML(value='')))




In [27]:
# Load the unlabeled prediction set from a local JSON Lines file: one JSON
# object per line, from which the "id" and "response" fields are read.
# utf-8 is requested explicitly so decoding does not depend on the platform
# default encoding.
prod_records = []
with open("test.jsonl", encoding="utf-8") as test_file:
    for raw_line in test_file:
        if not raw_line.strip():
            continue  # tolerate blank lines, e.g. an extra newline at end of file
        record = json.loads(raw_line)
        # Drop every '@USER ' token, mirroring the cleaning applied to the training text.
        cleaned_text = record["response"].replace("@USER ", "")
        prod_records.append([record["id"], cleaned_text])

# Build the frame under its own name so the training frame `df` is not
# overwritten by the prediction data.
df_prod = pd.DataFrame(data=prod_records, columns=["id", "response"]).reset_index(drop=True)
df_prod


Unnamed: 0,id,response
0,twitter_1,"My 3 year old , that just finished reading Nie..."
1,twitter_2,"How many verifiable lies has he told now ? 15,..."
2,twitter_3,Maybe Docs just a scrub of a coach ... I mean ...
3,twitter_4,is just a cover up for the real hate inside . ...
4,twitter_5,The irony being that he even has to ask why .
...,...,...
1795,twitter_1796,is definitely the best out there . No question...
1796,twitter_1797,Ye let her out run wild and infect 10000 more ...
1797,twitter_1798,"Thanks for that , I would have never known ."
1798,twitter_1799,Yes also #found this on #new with loads of <UR...


In [48]:
# Pretrained checkpoint fine-tuned in this cell.
model_name = 'bert-large-cased-whole-word-masking'

# preprocess
# Lowercasing is enabled only when the checkpoint name ends with "uncased".
lowercase_input = model_name.endswith("uncased")
processor = Processor(
    model_name=model_name,
    to_lower=lowercase_input,
    cache_dir=CACHE_DIR,
)
train_dataset = processor.dataset_from_dataframe(
    df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN
)
train_dataloader = dataloader_from_dataset(
    train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True
)

# fine-tune
classifier = SequenceClassifier(
    model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR
)
with Timer() as t:
    classifier.fit(
        train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,
    )

# t.interval divided by 3600: the fine-tuning time expressed in hours.
train_time = t.interval / 3600


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=625.0), HTML(value='')))





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=213450.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1338743948.0), HTML(value='')))




In [49]:
# Build the prediction dataset from the text column only (no label column is
# passed), using the same processor and max_len as for training.
prod_dataset = processor.dataset_from_dataframe(df_prod, TEXT_COL, max_len=MAX_LEN)

# shuffle=False so the dataloader is not asked to reorder the rows.
prod_dataloader = dataloader_from_dataset(
    prod_dataset,
    batch_size=BATCH_SIZE,
    num_gpus=NUM_GPUS,
    shuffle=False,
)

# predict
preds = classifier.predict(prod_dataloader, num_gpus=NUM_GPUS, verbose=False)

In [50]:
# Write one "twitter_<n>,<label>" line per prediction to answer.txt, numbering
# the rows from 1 in prediction order.
# LabelEncoder orders classes alphabetically, so id 0 is NOT_SARCASM and id 1
# is SARCASM.
with open('answer.txt', 'w') as out:
    for index, label in enumerate(preds):
        if label == 1:
            res = 'SARCASM'
        elif label == 0:
            # Fixed: this was misspelled 'NOT_CARCASM', which does not match
            # the dataset's label name.
            res = 'NOT_SARCASM'
        else:
            res = ""  # unexpected class id: keep the label field empty
        out.write("twitter_%s,%s\n" % (index + 1, res))



## Evaluate

Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours.

In [61]:
# Tabulate the per-model metrics: one column per key of `results`.
# NOTE(review): the only assignment to `results` visible in this notebook is in
# a commented-out cell, so this line presumably raises NameError on a fresh
# kernel -- confirm and re-enable the evaluation loop if so.
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,distilbert-base-uncased,roberta-base,xlnet-base-cased
accuracy,0.7848,0.7536,0.7704
f1-score,0.784064,0.75123,0.769815
time(hrs),0.029202,0.038811,0.053048


In [None]:
# for testing
sb.glue("accuracy", df_results.iloc[0, :].mean())
sb.glue("f1", df_results.iloc[1, :].mean())