```
// Copyright 2020 Twitter, Inc.
// SPDX-License-Identifier: Apache-2.0
```

# Finetune Text Classification Models

Take an existing BERT model (with or without TPP pre-training) and fine-tune it on a text classification dataset.

## Setup libraries

In [None]:
%pip install transformers==3.5.1 datasets==1.1.2 torch==1.4.0 seqeval==1.2.2 gensim==3.8.1

## Define parameters

In [None]:
# Root directory that the data and model folders are derived from.
# NOTE: HOMEDIR already ends in "/", so the derived paths contain a doubled
# separator ("..//data", "..//models").
HOMEDIR = "../"
DATADIR = HOMEDIR + "/data"
MODELDIR = HOMEDIR + "/models"

# Checkpoint name the tokenizer and model are initially loaded from.
pre_trained_model_path = "bert-base-multilingual-uncased"
# Language sub-directory pattern used when globbing the dataset ("en" or "**").
langs = "en"

# Task selector plus the JSON keys holding each record's input text and label.
task_type = "sentiment"
TEXT_KEY = "text"
LABEL_KEY = "label"


## Setup Helpers

In [None]:
import json
import random
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
from IPython.core.debugger import set_trace
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizerFast,
    EvalPrediction,
    Pipeline,
    RobertaTokenizerFast,
    TokenClassificationPipeline,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import (
    BatchEncoding,
    PaddingStrategy,
    PreTrainedTokenizerBase,
)

In [None]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars, NumPy arrays and sets."""

    def default(self, obj):
        """Convert ``obj`` into a builtin the standard encoder can serialize.

        NumPy arrays and sets become lists, NumPy integers become ``int`` and
        NumPy floats become ``float``. Anything else is delegated to the base
        class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

In [None]:
# Anchored with "^" (no MULTILINE flag), so only a URL at the very start of
# the text is matched.
# NOTE(review): URLs later in the text are left untouched -- confirm intended.
URL_REGEX = re.compile(r"^http[s]?://[^ ]+")

# Code point 65039 is U+FE0F (VARIATION SELECTOR-16).
_VARIATION_SELECTOR = chr(65039)


def clean_text(text):
    """Normalize a raw text string before tokenization.

    A single leading U+FE0F character is removed, every newline is replaced
    by the literal marker ``[LF]`` and a URL at the start of the text is
    replaced by ``[URL]``.

    Args:
        text: The raw string; may be empty.

    Returns:
        The cleaned string (``""`` for empty input).
    """
    # startswith() is safe on an empty string, unlike indexing text[0].
    if text.startswith(_VARIATION_SELECTOR):
        text = text[1:]
    text = text.replace("\n", "[LF]")
    return URL_REGEX.sub("[URL]", text)


def read_classification_data(file_list, label_key=LABEL_KEY, return_dict=False):
    """Load JSON-lines classification records from one or more files.

    Args:
        file_list: Iterable of file paths; each file holds one JSON object
            per line.
        label_key: Key of the label field in each record.
        return_dict: When true, return the parsed records untouched (no text
            cleaning and no label filtering).

    Returns:
        If ``return_dict`` is true, a list of the parsed records. Otherwise a
        ``(texts, labels)`` tuple of parallel lists in which the text has been
        passed through ``clean_text`` and records labelled ``"neutral"`` have
        been dropped.
    """
    all_text = []
    all_label = []
    all_data = []

    for file_path in tqdm(file_list):
        # Explicit encoding so reading does not depend on the platform default.
        with open(file_path, encoding="utf-8") as fp:
            for line in fp:
                # Iterating a file keeps the trailing newline, so a blank line
                # is "\n" (truthy); strip first so it is really skipped
                # instead of crashing json.loads.
                line = line.strip()
                if not line:
                    continue
                record = json.loads(line)
                if return_dict:
                    all_data.append(record)
                    continue
                label = record[label_key]
                text = clean_text(record[TEXT_KEY])
                # Only the two polar classes are kept for training.
                if label == "neutral":
                    continue
                all_text.append(text)
                all_label.append(label)
    if return_dict:
        return all_data
    return all_text, all_label


# Label vocabulary: the sentiment task uses negative/positive, any other
# task type falls back to the NOT/OFF pair.
if task_type == "sentiment":
    label2id = {"negative": 0, "positive": 1}
else:
    label2id = {"NOT": 0, "OFF": 1}

# Reverse lookup from class id to label string.
id2label = dict(zip(label2id.values(), label2id.keys()))


# Token limit passed to the tokenizer (with truncation) in this notebook.
max_length = 128

In [None]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Dataset of pre-tokenized text examples paired with integer labels."""

    def __init__(self, file_paths, tokenizer):
        """Remember the inputs and tokenize every example immediately."""
        self.file_paths = file_paths
        self.tokenizer = tokenizer
        self._setup()

    def _setup(self):
        """Read the files, tokenize all texts in one call and store the rows.

        ``self.data`` ends up as a list of ``(features, label_id)`` tuples,
        where ``features`` maps each tokenizer output name to that example's
        values.
        """
        texts, labels = read_classification_data(self.file_paths)
        self.data = []
        batch_encodings = self.tokenizer(texts, max_length=max_length, truncation=True)
        for idx, raw_label in tqdm(enumerate(labels)):
            label_id = label2id[raw_label]
            features = {name: values[idx] for name, values in batch_encodings.items()}
            self.data.append((features, label_id))

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``"label"`` tensor."""
        features, target = self.data[idx]
        item = {}
        for name, values in features.items():
            item[name] = torch.tensor(values)
        item["label"] = torch.tensor(target)
        return item

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

In [None]:
def compute_metrics(p: EvalPrediction):
    """Build a scikit-learn classification report for a set of predictions.

    When ``p.predictions`` is a tuple only its first element is used. The
    predicted class is the argmax over axis 1.

    NOTE(review): ``classification_report`` is called with default arguments,
    which presumably yields a text report rather than a metrics dict --
    confirm before passing this function to a ``Trainer``.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted_ids = np.argmax(scores, axis=1)
    return classification_report(p.label_ids, predicted_ids)


def get_preds(df_data, classification_pipeline, text_key=TEXT_KEY, batch_size=64):
    """Run the pipeline over a text column in batches.

    For now binary classification is assumed: the pipeline must return, for
    every input, a list of at least two ``{"label", "score"}`` entries.

    Args:
        df_data: DataFrame holding the texts in column ``text_key``.
        classification_pipeline: Callable that takes a list of strings and
            returns one list of score entries per string.
        text_key: Name of the text column.
        batch_size: Number of texts sent to the pipeline per call.

    Returns:
        A list with one ``(best_label, score_of_entry_0, score_of_entry_1)``
        tuple per row, in row order.
    """
    # Extract the column once instead of on every iteration.
    texts = df_data[text_key].values
    all_preds = []
    # Stopping at len(texts) never produces an empty trailing batch.
    for start in trange(0, len(texts), batch_size):
        batch = texts[start : start + batch_size].tolist()
        for scores in classification_pipeline(batch):
            best = max(scores, key=lambda entry: entry["score"])
            all_preds.append((best["label"], scores[0]["score"], scores[1]["score"]))
    return all_preds


def get_report(df_preds, label_key=LABEL_KEY):
    """Print and return classification metrics for a predictions frame.

    Args:
        df_preds: DataFrame with the true labels in ``label_key``, predicted
            labels in ``"pred"`` and the class-1 score in ``"1_score"``.
        label_key: Name of the column holding the true labels.

    Returns:
        ``None`` when ``df_preds`` has no rows. Otherwise a dict with
        ``"classification_report"`` (scikit-learn's report as a dict) and
        ``"pr_curve"`` (precision, recall, thresholds and the area under the
        precision-recall curve for the label ``id2label[1]``).
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been run first.
    from sklearn.metrics import auc, precision_recall_curve

    if df_preds.shape[0] == 0:
        print("No data in df_preds")
        return None
    y_true = df_preds[label_key]
    y_pred = df_preds["pred"]
    print(classification_report(y_true=y_true, y_pred=y_pred))
    report = {
        "classification_report": classification_report(
            y_true=y_true, y_pred=y_pred, output_dict=True
        )
    }
    scores = df_preds["1_score"]
    # The score argument is passed positionally because newer scikit-learn
    # releases renamed its keyword (probas_pred -> y_score).
    precision, recall, thresholds = precision_recall_curve(
        y_true, scores, pos_label=id2label[1]
    )
    pr_auc = auc(recall, precision)
    report["pr_curve"] = {
        "precision": precision,
        "recall": recall,
        "thresholds": thresholds,
        "pr_auc": pr_auc,
    }
    print(f"PRAUC: {pr_auc}")
    return report


def run_eval(file_paths, classification_pipeline):
    """Evaluate the pipeline on the records stored in ``file_paths``.

    The raw records are loaded into a DataFrame, predictions are computed
    with ``get_preds`` and joined column-wise to the records (minus the text
    column), and the joined frame is handed to ``get_report``.

    Args:
        file_paths: Iterable of JSON-lines files to evaluate on.
        classification_pipeline: Pipeline passed through to ``get_preds``.

    Returns:
        Whatever ``get_report`` returns for the joined frame.
    """
    records = read_classification_data(file_paths, return_dict=True)
    df_data = pd.DataFrame(records)

    all_preds = get_preds(df_data, classification_pipeline)
    df_preds = pd.DataFrame(all_preds, columns=["pred", "0_score", "1_score"])
    # drop(columns=...) instead of a positional axis, which recent pandas
    # versions no longer accept.
    df_preds = pd.concat([df_data.drop(columns=TEXT_KEY), df_preds], axis=1)

    return get_report(df_preds, label_key=LABEL_KEY)

## Run Training

In [None]:
# model_dir = Path(f"{HOMEDIR}/en_sentiment_model/").expanduser()
# Start from the checkpoint named in the parameters cell; the commented line
# above is an alternative that points at a local model directory instead.
model_dir = pre_trained_model_path
tokenizer = AutoTokenizer.from_pretrained(
    str(model_dir), max_len=512, truncation=True, padding=True, use_fast=True
)

In [None]:
# Locate the training/testing JSON files under <DATADIR>/SentimentData/<langs>/.
data_dir = Path(f"{DATADIR}/SentimentData/").expanduser()
train_data_path = list(data_dir.glob(f"./{langs}/training.json"))
test_data_path = list(data_dir.glob(f"./{langs}/testing.json"))


# Show the matched training files as the cell output.
train_data_path

In [None]:
# Read and tokenize the training files up front. No validation data is
# loaded here; val_dataset is just an empty placeholder.
train_dataset = ClassificationDataset(train_data_path, tokenizer)
val_dataset = []

In [None]:
# Display the number of training / validation examples.
len(train_dataset), len(val_dataset)

In [None]:
# Output locations are named after the language selection and the task.
model_prefix = "multi" if langs == "**" else langs
finetuned_model_dir = str(
    Path(f"{MODELDIR}/{model_prefix}_{task_type}_model").expanduser()
)
logging_dir = str(Path(f"{MODELDIR}/{model_prefix}_{task_type}_logs").expanduser())


# A value >= 1 switches to a short debug run: max_steps becomes 5 and a
# checkpoint is saved every `eval_every_steps` steps with no limit on the
# number kept.
eval_every_steps = -1  # 1 # -1 for eval at end

# NOTE: `label_names` is intentionally not set. That argument names the input
# keys that hold the labels, not the class names, so passing the class names
# was incorrect. The class names are already handed to the model through
# id2label / label2id.
training_args = TrainingArguments(
    output_dir=str(finetuned_model_dir),  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir=str(logging_dir),  # directory for storing logs
    logging_steps=10,
    # evaluation_strategy="steps",
    # eval_steps=100,
    save_steps=500 if eval_every_steps < 1 else eval_every_steps,
    save_total_limit=2 if eval_every_steps < 1 else None,
    max_steps=-1 if eval_every_steps < 1 else 5,
)


def train_model():
    """Fine-tune a sequence classifier and save it with its tokenizer.

    Uses the notebook-level ``model_dir``, ``label2id``/``id2label``,
    ``training_args``, ``train_dataset``, ``tokenizer`` and
    ``finetuned_model_dir``.

    Returns:
        The ``(model, trainer)`` pair after training has finished.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        str(model_dir),
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )
    # No evaluation dataset or custom collator is supplied.
    trainer_kwargs = {
        "model": classifier,
        "args": training_args,
        "train_dataset": train_dataset,
        "tokenizer": tokenizer,
    }
    fitter = Trainer(**trainer_kwargs)
    fitter.train()
    # Write model and tokenizer to the same directory.
    fitter.save_model(finetuned_model_dir)
    tokenizer.save_pretrained(finetuned_model_dir)
    return classifier, fitter

In [None]:
%%time
# Fine-tune and keep the model and trainer objects for later cells.
model, trainer = train_model()

## Run evaluation

In [None]:
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import auc, classification_report, precision_recall_curve
from tqdm import trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Pipeline,
    TokenClassificationPipeline,
    pipeline,
)

In [None]:
# Drop the reference to the training-time model and ask PyTorch to release
# its cached GPU memory.
del model
# del classification_pipeline
torch.cuda.empty_cache()

In [None]:
# Recompute the fine-tuned model location so this cell restates the task it
# evaluates (model_prefix still comes from the training section).
task_type = "sentiment"

finetuned_model_dir = str(
    Path(f"{MODELDIR}/{model_prefix}_{task_type}_model").expanduser()
)


# The model weights come from the fine-tuned directory.
model = AutoModelForSequenceClassification.from_pretrained(finetuned_model_dir)
# NOTE(review): the tokenizer is loaded from `model_dir` (the starting
# checkpoint), not from finetuned_model_dir where train_model() saved it --
# confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(
    str(model_dir), max_len=512, truncation=True, padding=True, use_fast=True
)


def _parse_and_tokenize(self, inputs, padding=True, add_special_tokens=True, **kwargs):
    """Tokenize pipeline inputs, always truncating to the notebook's ``max_length``.

    Written as a method replacement: ``self`` must expose ``tokenizer`` and
    ``framework``. Extra keyword arguments are accepted but ignored.

    Returns:
        The tokenizer output, as tensors of the ``self.framework`` flavour.
    """
    tokenizer_options = {
        "add_special_tokens": add_special_tokens,
        "return_tensors": self.framework,
        "padding": padding,
        "truncation": True,
        "max_length": max_length,
    }
    return self.tokenizer(inputs, **tokenizer_options)

In [None]:
# Build the inference pipeline; return_all_scores=True makes it return the
# score of every class for each input, which get_preds() relies on.
classification_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    # Use the first GPU when one is available, otherwise fall back to CPU (-1)
    # instead of failing on machines without CUDA.
    device=0 if torch.cuda.is_available() else -1,
)
# Replace the tokenization hook on the pipeline's first base class so inputs
# are truncated to max_length. This patches the class, not just this instance.
classification_pipeline.__class__.__bases__[0]._parse_and_tokenize = _parse_and_tokenize

In [None]:
# Evaluate on every language directory: "**" makes the glob below match
# testing.json files in any sub-directory of the dataset folder.
langs = "**"  # "en" # "**"

data_dir = Path(f"{DATADIR}/SentimentData/").expanduser()
train_data_path = list(data_dir.glob(f"./{langs}/training.json"))
test_data_path = list(data_dir.glob(f"./{langs}/testing.json"))

# Show the matched test files as the cell output.
test_data_path

In [None]:
%%time
# Evaluate each test file separately and collect one report frame per file,
# keyed by the last two directory names of its path.
reports = {}
for test_path in test_data_path:
    # Parent directory name is taken as the language, its parent as the subset.
    lang = test_path.parts[-2]
    subset = test_path.parts[-3]
    print(lang, subset, test_path.name)
    report = run_eval([test_path], classification_pipeline)
    # Transposed so each class / average is a row and each metric a column.
    df_report = pd.DataFrame(report["classification_report"]).T
    reports[(lang, subset)] = df_report

In [None]:
# Stack the per-file reports into one frame; the (lang, subset) dict keys
# become the outer index levels.
df_report = pd.concat(reports)
df_report

In [None]:
# Save the combined report as a tab-separated file next to the fine-tuned model.
df_report.to_csv(Path(finetuned_model_dir) / "test_eval_report.txt", sep="\t")

In [None]:
# Show only the "macro avg" rows (third index level) of every report.
df_report[df_report.index.isin(["macro avg"], level=2)]

In [None]:
# Read the saved report back, restoring the three index columns, and show
# the "macro avg" rows again.
df_report = pd.read_csv(
    Path(finetuned_model_dir) / "test_eval_report.txt", sep="\t", index_col=[0, 1, 2]
)
df_report[df_report.index.isin(["macro avg"], level=2)]

In [None]:
# Reload the saved report only when the file exists; otherwise the df_report
# already in memory from the earlier cells is displayed.
if Path(f"{str(finetuned_model_dir)}/test_eval_report.txt").expanduser().exists():
    df_report = pd.read_csv(
        f"{str(finetuned_model_dir)}/test_eval_report.txt", sep="\t", index_col=[0, 1, 2]
    )
df_report[df_report.index.isin(["macro avg"], level=2)]