# Sarcasm Classification Using BERT Model

In [1]:
# Need PyTorch 1.5+. 1.4 will report segment error when streaming to GPU memory.
import json
import os
import sys
import json
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
import scrapbook as sb
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils_nlp.common.timer import Timer
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.models.transformers.sequence_classification import (
    Processor, SequenceClassifier)

  import pandas.util.testing as tm


## Introduction
In this notebook, we use BERT to classify sarcasm in Twitter responses.
We use pre-trained BERT model followed by fine-tuning with labeled data in the training set.

In [14]:
# notebook parameters
DATA_FOLDER = TemporaryDirectory().name
CACHE_DIR = TemporaryDirectory().name
NUM_EPOCHS = 1
BATCH_SIZE = 16
# Student tier, only 1 GPU available
NUM_GPUS = 1
MAX_LEN = 100
TRAIN_DATA_FRACTION = 1
TEST_DATA_FRACTION = 1
TRAIN_SIZE = 0.95
LABEL_COL = "label"
TEXT_COL = "response"
TRAIN_DATA_PATH = "train.jsonl"

## Read Dataset
We read the training data and keep only label and responses. Then remove the '@USER' tag since it doesn't contribute to sarcasm.
Context can be used in improving accuracy with [SEP] seperator in BERT but not explored in this notebook.

In [15]:
data = []
with open(TRAIN_DATA_PATH) as f:
    for data_row in f:
        row = []
        parsed_json = json.loads(data_row)
        row.append(parsed_json['label'])
        row.append(parsed_json['response'])
        data.append(row)

for row in data:
    row[1] = row[1].replace('@USER ', '')

# Build dataframe
df = pd.DataFrame(data=data, columns=["label", "response"])

In [16]:
# Inspect the training dta
df

Unnamed: 0,label,response
0,SARCASM,I don't get this .. obviously you do care or y...
1,SARCASM,trying to protest about . Talking about him an...
2,SARCASM,He makes an insane about of money from the MOV...
3,SARCASM,Meanwhile Trump won't even release his SAT sco...
4,SARCASM,Pretty Sure the Anti-Lincoln Crowd Claimed Tha...
...,...,...
4995,NOT_SARCASM,You don't . I have purchased a lot on Amazon (...
4996,NOT_SARCASM,#Emotions you say 🤔 never knew that I think I ...
4997,NOT_SARCASM,"You are so right ... "" Yes ! #Silence is not #..."
4998,NOT_SARCASM,Another lazy delusional voter who takes the wo...


In [17]:
df[[LABEL_COL, TEXT_COL]].head()

Unnamed: 0,label,response
0,SARCASM,I don't get this .. obviously you do care or y...
1,SARCASM,trying to protest about . Talking about him an...
2,SARCASM,He makes an insane about of money from the MOV...
3,SARCASM,Meanwhile Trump won't even release his SAT sco...
4,SARCASM,Pretty Sure the Anti-Lincoln Crowd Claimed Tha...


We evaluated the model by splitting the data for training and testing. In this case, since the model was proved to be good,
we use all data for training.
Next we encode the class labels. SARCASM = 1, NOT_SARCASM = 0.

In [18]:
# split
df_train, df_test = train_test_split(df, train_size = TRAIN_SIZE, random_state=0)



In [22]:
# sample
df_train = df_train.sample(frac=0.95).reset_index(drop=True)


In [23]:
# A simple statistics of the training data regarding label and count.
df_train[LABEL_COL].value_counts()

1    2270
0    2242
Name: label, dtype: int64

In [None]:
# encode labels
label_encoder = LabelEncoder()
df_train[LABEL_COL] = label_encoder.fit_transform(df_train[LABEL_COL])
num_labels = len(np.unique(df_train[LABEL_COL]))

In [28]:
print("Number of unique labels: {}".format(num_labels))
print("Number of training examples: {}".format(df_train.shape[0]))

Number of unique labels: 2
Number of training examples: 4512


In [27]:
# Load test data. Same processing method.
data = []
with open("test.jsonl") as f:
    for data_row in f:
        row = []
        parsed_json = json.loads(data_row)
        row.append(parsed_json['id'])
        row.append(parsed_json['response'])
        data.append(row)

for row in data:
    row[1] = row[1].replace('@USER ', '')
# print(data[0])

df = pd.DataFrame(data=data, columns=["id", "response"])
df_prod = df.reset_index(drop=True)
df_prod


Unnamed: 0,id,response
0,twitter_1,"My 3 year old , that just finished reading Nie..."
1,twitter_2,"How many verifiable lies has he told now ? 15,..."
2,twitter_3,Maybe Docs just a scrub of a coach ... I mean ...
3,twitter_4,is just a cover up for the real hate inside . ...
4,twitter_5,The irony being that he even has to ask why .
...,...,...
1795,twitter_1796,is definitely the best out there . No question...
1796,twitter_1797,Ye let her out run wild and infect 10000 more ...
1797,twitter_1798,"Thanks for that , I would have never known ."
1798,twitter_1799,Yes also #found this on #new with loads of <UR...


## Select Pretrained Models

We use pre-trained model provided by [Hugging Face](https://github.com/huggingface/transformers).
After evaluating "distilbert-base-uncased", "roberta-base", "xlnet-base-cased", we decided to go with a more complex model:
"bert-large-cased-whole-word-masking"

In [48]:
# Now we fine-tune the model to perform sarcasm detection.
model_name = 'bert-large-cased-whole-word-masking'
processor = Processor(
        model_name=model_name,
        to_lower=model_name.endswith("uncased"),
        cache_dir=CACHE_DIR,
    )
train_dataset = processor.dataset_from_dataframe(
        df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN
    )
train_dataloader = dataloader_from_dataset(
        train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True
    )

# fine-tune
classifier = SequenceClassifier(
        model_name=model_name, num_labels=num_labels, cache_dir=CACHE_DIR
    )
with Timer() as t:
        classifier.fit(
            train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,
        )
train_time = t.interval / 3600


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=625.0), HTML(value='')))





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=213450.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1338743948.0), HTML(value='')))




In [None]:
# Produce classification result.
prod_dataset = processor.dataset_from_dataframe(
        df_prod, TEXT_COL, max_len=MAX_LEN
    )
prod_dataloader = dataloader_from_dataset(
        prod_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False
    )
# predict
preds = classifier.predict(prod_dataloader, num_gpus=NUM_GPUS, verbose=False)

In [None]:
# Materialize results to file.
with open('answer.txt', 'w') as out:
    for index,label in enumerate(preds):
        res = ""
        if label == 1:
            res = 'SARCASM'
        if label == 0:
            res = 'NOT_CARCASM'
        line = "twitter_%s,%s\n"%(str(index + 1),res)
        out.write(line)



In [50]:
preds
with open('answer.txt', 'w') as out:
    for index,label in enumerate(preds):
        res = ""
        if label == 1:
            res = 'SARCASM'
        if label == 0:
            res = 'NOT_CARCASM'
        line = "twitter_%s,%s\n"%(str(index + 1),res)
        out.write(line)



## Evaluate

Finally, we report the accuracy and F1-score metrics for each model, as well as the fine-tuning time in hours.

In [61]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,distilbert-base-uncased,roberta-base,xlnet-base-cased
accuracy,0.7848,0.7536,0.7704
f1-score,0.784064,0.75123,0.769815
time(hrs),0.029202,0.038811,0.053048


In [None]:
# for testing
sb.glue("accuracy", df_results.iloc[0, :].mean())
sb.glue("f1", df_results.iloc[1, :].mean())