In [1]:
import os
# Switch off the Weights & Biases integration; this runs before `transformers`
# is imported in the next cell.
# NOTE(review): the warning printed under the TrainingArguments cell says this
# environment variable is deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import random
from ast import literal_eval
from typing import Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import datasets
from transformers import (DataCollator, EvalPrediction, PreTrainedModel,
                          PreTrainedTokenizerBase, Trainer, TrainerCallback,
                          TrainingArguments, AutoTokenizer,
                          AutoModelForSequenceClassification)

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torchmetrics.functional import f1, precision_recall, accuracy

2021-11-22 14:15:42.232000: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [4]:
# Seed Python's, NumPy's and PyTorch's global RNGs with one shared value so
# later random draws in this kernel are repeatable.
SEED = 2021
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f087c114850>

In [5]:
# Load the training and validation frames from CSV files in the working
# directory. Later cells read the `excerpt` and `gender` columns of both frames
# and the `male` / `female` columns of `df_train`.
df_train = pd.read_csv("df_train_balanced_gender.csv")
df_val = pd.read_csv("df_val_balanced_gender.csv")

In [6]:
# Checkpoint name shared by the tokenizer and the classifier; the commented-out
# line is an alternative checkpoint.
# model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_name = "microsoft/xtremedistil-l6-h256-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classifier head is newly initialised (see the warning printed below this
# cell), so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Both splits are tokenised with the same settings: truncate to at most 512
# tokens and pad within each tokenizer call.
encode_options = dict(truncation=True, padding=True, max_length=512)
train_texts = df_train['excerpt'].tolist()
val_texts = df_val['excerpt'].tolist()
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)



In [8]:
class GenderClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-hot gender labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            collection (whatever the tokenizer call returned).
        labels: per-example label container; a label counts as present when
            Python's ``in`` operator finds its name in ``labels[idx]``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.dataset_size = len(labels)
        # Column order of the multi-hot target vector.
        self.id_to_lbl = ["Female", "Male"]
        self.lbl_to_id = {"Female": 0, "Male": 1}

    def __len__(self):
        """Number of examples, fixed at construction time from ``labels``."""
        return self.dataset_size

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``, including a float ``labels`` vector."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])

        present = self.labels[idx]
        # One slot per known label, in `lbl_to_id` order: 1.0 if present, else 0.0.
        multi_hot = [float(name in present) for name in self.lbl_to_id]
        sample['labels'] = torch.tensor(multi_hot)

        return sample


# Pair each split's encodings with its raw `gender` column.
train_labels = df_train['gender'].tolist()
val_labels = df_val['gender'].tolist()
train_dataset = GenderClassificationDataset(train_encodings, train_labels)
val_dataset = GenderClassificationDataset(val_encodings, val_labels)

In [11]:
def compute_metrics(eval_pred):
    """Compute macro F1, precision/recall and accuracy from raw logits.

    Args:
        eval_pred: a ``(logits, target)`` pair. Both callers in this notebook
            pass raw (pre-sigmoid) logits. Each element may be a numpy array
            (what ``Trainer`` provides) or a torch tensor (what the manual
            evaluation cell provides).

    Returns:
        dict keyed by the metric function's ``__name__`` (``f1``,
        ``precision_recall``, ``accuracy``) holding whatever that torchmetrics
        function returns.
    """
    logits, target = eval_pred
    # The torchmetrics functional API works on tensors, while Trainer passes
    # numpy arrays; `as_tensor` accepts both without copying tensors.
    # Sigmoid converts logits into per-label probabilities.
    probs = torch.sigmoid(torch.as_tensor(logits).float())
    # The dataset emits float multi-hot labels; metric targets must be integers.
    target = torch.as_tensor(target).int()

    res = dict()
    for metric in [f1, precision_recall, accuracy]:
        res[metric.__name__] = metric(probs,
                                      target,
                                      average="macro",
                                      num_classes=2)
    return res

In [12]:
# Per-label probability of keeping a negative example in the loss:
# twice the column sum of `df_train` divided by the number of rows.
female_neg_sampling_prob = 2 * df_train["female"].sum() / len(df_train)
male_neg_sampling_prob = 2 * df_train["male"].sum() / len(df_train)
neg_sampling_ratios = dict(
    Female=female_neg_sampling_prob,
    Male=male_neg_sampling_prob,
)

In [14]:
class MultilabelTrainer(Trainer):
    """``Trainer`` using a weighted multi-label BCE loss with negative sampling.

    For every label column, positive targets always contribute to the loss
    with weight 1. Each negative target is kept (weight 1) with probability
    ``neg_sampling_ratios[label]`` and dropped (weight 0) otherwise.

    Args:
        neg_sampling_ratios: maps the label names ``"Female"`` and ``"Male"``
            to the probability of keeping a negative example of that label.
            Values are clamped to ``[0, 1]``.
        model, args, data_collator, train_dataset, eval_dataset, tokenizer,
        model_init, compute_metrics, callbacks, optimizers: forwarded
            unchanged to ``Trainer.__init__``.
    """

    # Label name of each target column, in the order produced by
    # GenderClassificationDataset ({"Female": 0, "Male": 1}).
    _LABEL_COLUMNS = ("Female", "Male")

    def __init__(
        self,
        neg_sampling_ratios: Dict[str, float],
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Callable[[], PreTrainedModel] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer,
                          torch.optim.lr_scheduler.LambdaLR] = (None, None),
    ):
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
        )
        self.neg_sampling_ratios = neg_sampling_ratios

    def _negative_sampling_weights(self, labels):
        """Build the per-element loss weights for one batch.

        Starts from a float copy of ``labels`` (so positives weigh 1) and
        overwrites every negative entry with an independent Bernoulli draw
        whose success probability is that label's keep ratio.
        """
        weights = labels.detach().clone().float()
        for col, name in enumerate(self._LABEL_COLUMNS):
            # Clamp so a ratio above 1 means "always keep" rather than an
            # invalid (negative) sampling probability.
            keep_prob = min(max(float(self.neg_sampling_ratios[name]), 0.0), 1.0)
            # 1-D boolean mask of negative rows for this label. A 1-D index
            # keeps the assignment element-wise, so every negative example
            # receives its own draw.
            negatives = labels[:, col] != 1
            n = int(negatives.sum())
            if n == 0:
                continue
            keep = torch.bernoulli(
                torch.full((n,), keep_prob, device=weights.device))
            weights[negatives, col] = keep.to(weights.dtype)
        return weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Weighted ``binary_cross_entropy_with_logits`` over all label columns.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; ``"labels"`` is popped from it before the
                remaining entries are passed to the model.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with newer
                ``Trainer`` versions that pass it; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        num_labels = self.model.config.num_labels
        loss_weights = self._negative_sampling_weights(labels)

        loss = F.binary_cross_entropy_with_logits(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
            weight=loss_weights.view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [15]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    report_to="none",                # disable logging integrations (replaces the deprecated WANDB_DISABLED env var)
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
# Wire the model, data, metrics and negative-sampling ratios into the custom trainer.
trainer = MultilabelTrainer(
    neg_sampling_ratios=neg_sampling_ratios,  # per-label keep probability for negatives
    model=model,                              # the instantiated 🤗 Transformers model to be trained
    args=training_args,                       # training arguments, defined above
    compute_metrics=compute_metrics,          # metric callback used during evaluation
    train_dataset=train_dataset,              # training dataset
    eval_dataset=val_dataset,                 # evaluation dataset
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trained model under ./model.ckpt; the evaluation section reloads it
# from this same path.
trainer.save_model("./model.ckpt")

### Evaluation

In [None]:
# Reload the checkpoint written by `trainer.save_model`.
model_finetuned = AutoModelForSequenceClassification.from_pretrained("./model.ckpt")
# Prefer the GPU for batched inference when one is available and fall back to
# the CPU otherwise, so the model and the batches fed to it can share a device.
model_finetuned = model_finetuned.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Evaluate the fine-tuned model on the test split.
# NOTE(review): `root_dir`, `metrics` and `multilabel_list_to_binary_mat` are not
# defined in the visible part of this notebook. `root_dir` is presumably a
# `pathlib.Path` and `metrics` presumably `sklearn.metrics` — confirm they are
# in scope before running this cell.
df_test = pd.read_csv(root_dir / "data/test_v0.7.1.csv",
                      usecols=['entry_id', 'excerpt', 'gender', 'lang',])
##
# `gender` is stored as text in the CSV; parse each cell back into a Python literal.
df_test["gender"] = df_test["gender"].apply(literal_eval)
test_encodings = tokenizer(df_test['excerpt'].tolist(),
                            truncation=True,
                            padding=True,
                            max_length=512)
test_dataset = GenderClassificationDataset(test_encodings, df_test["gender"].tolist())
##
model_finetuned.eval()
# Send batches to whatever device the model lives on, so the forward pass
# never mixes CPU and CUDA tensors.
inference_device = next(model_finetuned.parameters()).device
dl_test = DataLoader(test_dataset, batch_size=16)
logits = []
labels = []
for batch in tqdm(dl_test):
    labels.append(batch.pop("labels"))
    batch = {k: v.to(inference_device) for k, v in batch.items()}
    with torch.inference_mode():
        outputs = model_finetuned(**batch)
    logits.append(outputs["logits"].cpu())
    if inference_device.type == "cuda":
        torch.cuda.empty_cache()
##
print(compute_metrics((torch.vstack(logits), torch.vstack(labels).type(torch.int))))
##
# Save and reload the logits through one and the same path.
test_logits_path = root_dir / "test_logits.npy"
np.save(test_logits_path, torch.vstack(logits).numpy())
##
label_to_id = {"Female": 0, "Male":1}
##
test_logits = np.load(test_logits_path)
test_logits = torch.tensor(test_logits)
# A label is predicted when its sigmoid probability exceeds 0.5.
preds = []
for pred_proba in torch.sigmoid(test_logits):
    x = []
    if pred_proba[0] > .5:
        x.append("Female")
    if pred_proba[1] > .5:
        x.append("Male")
    preds.append(x)
##
y_true = df_test["gender"].tolist()
y_true_bmat = multilabel_list_to_binary_mat(y_true, label_to_id)
y_pred_bmat = multilabel_list_to_binary_mat(preds, label_to_id)
##
acc = metrics.accuracy_score(y_true_bmat, y_pred_bmat)
# Named `macro_f1` so the torchmetrics `f1` function that `compute_metrics`
# relies on is not overwritten.
macro_f1 = metrics.f1_score(y_true_bmat, y_pred_bmat, average='macro')
# Recall and precision use their own scorers (both were previously f1_score).
recall = metrics.recall_score(y_true_bmat, y_pred_bmat, average='macro')
precision = metrics.precision_score(y_true_bmat, y_pred_bmat, average='macro')
cm = metrics.multilabel_confusion_matrix(y_true_bmat, y_pred_bmat)
print(
    metrics.classification_report(y_true_bmat,
                                  y_pred_bmat,
                                  target_names=list(label_to_id)))
print(f"acc={acc}, f1={macro_f1}, precision={precision}, recall={recall}")
print(cm)

### Generate Predictions on the Train Set

In [None]:
# Generate model predictions for every row of the original training split.
# NOTE(review): `root_dir` is not defined in the visible part of this notebook;
# presumably a `pathlib.Path` — confirm it is in scope.
df_train_orig = pd.read_csv(root_dir / "data/train_v0.7.1.csv",
                      usecols=['entry_id', 'excerpt', 'gender', 'lang',])
##
# `gender` is stored as text in the CSV; parse each cell back into a Python literal.
df_train_orig["gender"] = df_train_orig["gender"].apply(literal_eval)
train_orig_encodings = tokenizer(df_train_orig['excerpt'].tolist(),
                            truncation=True,
                            padding=True,
                            max_length=512)
train_orig_dataset = GenderClassificationDataset(train_orig_encodings, df_train_orig["gender"].tolist())
##
model_finetuned.eval()
# Send batches to whatever device the model lives on, so the forward pass
# never mixes CPU and CUDA tensors.
inference_device = next(model_finetuned.parameters()).device
dl_train_orig = DataLoader(train_orig_dataset, batch_size=16)
logits = []
labels = []
for batch in tqdm(dl_train_orig):
    labels.append(batch.pop("labels"))
    batch = {k: v.to(inference_device) for k, v in batch.items()}
    with torch.inference_mode():
        outputs = model_finetuned(**batch)
    logits.append(outputs["logits"].cpu())
    if inference_device.type == "cuda":
        torch.cuda.empty_cache()
##
# `logits` is a list of per-batch tensors; stack it into one tensor before
# applying the sigmoid. A label is predicted when its probability exceeds 0.5.
preds = []
for pred_proba in torch.sigmoid(torch.vstack(logits)):
    x = []
    if pred_proba[0] > .5:
        x.append("Female")
    if pred_proba[1] > .5:
        x.append("Male")
    preds.append(x)
##
# The predictions are row-aligned with `df_train_orig` (the frame they were
# computed from), so that is the frame they are attached to and exported from.
df_train_orig["gender_model_pred"] = preds
df_train_orig[["entry_id", "gender_model_pred"]].to_csv("kw_model_gender_preds_on_train.csv", index=False)

In [None]:
# Generate model predictions for every row of the original validation split.
# NOTE(review): `root_dir` is not defined in the visible part of this notebook;
# presumably a `pathlib.Path` — confirm it is in scope.
df_val_orig = pd.read_csv(root_dir / "data/val_v0.7.1.csv",
                      usecols=['entry_id', 'excerpt', 'gender', 'lang',])
##
# `gender` is stored as text in the CSV; parse each cell back into a Python literal.
df_val_orig["gender"] = df_val_orig["gender"].apply(literal_eval)
val_orig_encodings = tokenizer(df_val_orig['excerpt'].tolist(),
                            truncation=True,
                            padding=True,
                            max_length=512)
val_orig_dataset = GenderClassificationDataset(val_orig_encodings, df_val_orig["gender"].tolist())
##
model_finetuned.eval()
# Send batches to whatever device the model lives on, so the forward pass
# never mixes CPU and CUDA tensors.
inference_device = next(model_finetuned.parameters()).device
dl_val_orig = DataLoader(val_orig_dataset, batch_size=16)
logits = []
labels = []
for batch in tqdm(dl_val_orig):
    labels.append(batch.pop("labels"))
    batch = {k: v.to(inference_device) for k, v in batch.items()}
    with torch.inference_mode():
        outputs = model_finetuned(**batch)
    logits.append(outputs["logits"].cpu())
    if inference_device.type == "cuda":
        torch.cuda.empty_cache()
##
# `logits` is a list of per-batch tensors; stack it into one tensor before
# applying the sigmoid. A label is predicted when its probability exceeds 0.5.
preds = []
for pred_proba in torch.sigmoid(torch.vstack(logits)):
    x = []
    if pred_proba[0] > .5:
        x.append("Female")
    if pred_proba[1] > .5:
        x.append("Male")
    preds.append(x)
##
# The predictions are row-aligned with `df_val_orig` (the frame they were
# computed from), so that is the frame they are attached to and exported from.
df_val_orig["gender_model_pred"] = preds
df_val_orig[["entry_id", "gender_model_pred"]].to_csv("kw_model_gender_preds_on_val.csv", index=False)

***

### Data Augmentation Techniques

- Random Cropping
    This is a standard augmentation in CV, but I haven't seen it applied to NLP, although it fits perfectly here: we need to chunk a very long text into fixed-token-length fragments. Rather than doing it once and showing the same examples in each epoch, why not make it a dynamic, random crop? Initially, I implemented a function that took a random crop with padding around the answer text for positive examples, and a random crop for negative examples. Once I moved to the data recipes described above, I realized that the same can be achieved more simply by using a different stride and negative sampling every time I add a dataset to my recipe.

- Progressive Resizing
    This is another trick that Jeremy teaches in the fastai lectures, commonly applied to images. With progressive resizing, I use different sequence lengths during training, e.g. start with 256, then move to 384, then to 448.

- Cutout
    In computer vision, we sometimes black out patches of an image as augmentation. In NLP, we can do the same by replacing some tokens with \[MASK\]. I saw that @cdeotte used this technique in the Tweet Sentiment competition. I used 0-10% cutout in my models.

