In [1]:
import os
# Set the WANDB_DISABLED flag before `transformers` is imported in the next cell.
# NOTE(review): presumably this stops the Trainer from reporting to Weights & Biases — confirm
# against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [5]:
import random
from ast import literal_eval
from typing import Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import datasets
from transformers import (DataCollator, EvalPrediction, PreTrainedModel,
                          PreTrainedTokenizerBase, Trainer, TrainerCallback,
                          TrainingArguments, AutoTokenizer,
                          AutoModelForSequenceClassification)

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torchmetrics.functional import f1, precision_recall, accuracy

In [7]:
# One shared seed for the three RNGs this notebook relies on: Python, NumPy and PyTorch.
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd68de59d50>

In [3]:
# Every split is read with the same column subset, so the list is declared once.
_SPLIT_COLUMNS = [
    'entry_id', 'excerpt', 'gender', 'lang',
    "translation_en", "translation_fr", "translation_es",
]
df_train = pd.read_csv("../data/train_v0.7.1.csv", usecols=_SPLIT_COLUMNS)
df_val = pd.read_csv("../data/val_v0.7.1.csv", usecols=_SPLIT_COLUMNS)
df_test = pd.read_csv("../data/test_v0.7.1.csv", usecols=_SPLIT_COLUMNS)

In [6]:
def _parse_label_list(raw):
    """Return the sorted, de-duplicated list of labels encoded in ``raw``.

    ``raw`` is either the string form of a Python list as read from the CSV
    (e.g. ``"['Male', 'Female']"``) or an already parsed collection. Accepting
    the parsed form makes the cell safe to re-run: ``literal_eval`` only takes
    strings and would raise on the second execution otherwise.
    """
    values = literal_eval(raw) if isinstance(raw, str) else raw
    return sorted(set(values))


col = "gender"
for df in [df_train, df_val, df_test]:
    # Replace the raw CSV string with a canonical (sorted, unique) list of labels.
    df[col] = df[col].apply(_parse_label_list)

In [8]:
# Checkpoint to fine-tune; the tokenizer and the classifier are both loaded from this name.
# Alternative checkpoint, currently disabled:
# model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_name = "microsoft/xtremedistil-l6-h256-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# No `num_labels` is passed, so the size of the classification head is whatever the
# checkpoint config / library default gives. The load warning below shows that the
# classifier weights are freshly initialised and therefore need training.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenizer options shared by the train and validation splits:
# truncate to at most 512 tokens and pad every sequence in the call to a common length.
_tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(df_train['excerpt'].tolist(), **_tokenize_kwargs)
val_encodings = tokenizer(df_val['excerpt'].tolist(), **_tokenize_kwargs)



In [10]:
class GenderClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-hot gender labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` holds, for every example, a collection of label names.
    Each item is a dict with one tensor per encoding field plus a ``labels``
    tensor holding 1.0 / 0.0 for "Female" and "Male", in that order.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.dataset_size = len(labels)
        # Position in this list is the column index of the label vector.
        self.id_to_lbl = ["Female", "Male"]
        self.lbl_to_id = {
            name: position
            for position, name in enumerate(self.id_to_lbl)
        }

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])

        # Multi-hot target: one slot per known label, in lbl_to_id order.
        multi_hot = []
        for name in self.lbl_to_id:
            multi_hot.append(1. if name in self.labels[idx] else 0.)
        sample['labels'] = torch.tensor(multi_hot)

        return sample


# Wrap each split's encodings together with its parsed gender label lists.
train_dataset = GenderClassificationDataset(
    encodings=train_encodings,
    labels=df_train['gender'].tolist(),
)
val_dataset = GenderClassificationDataset(
    encodings=val_encodings,
    labels=df_val['gender'].tolist(),
)

In [11]:
def compute_metrics(eval_pred):
    """Compute macro-averaged F1, precision, recall and accuracy for the Trainer.

    Args:
        eval_pred: pair ``(predictions, label_ids)`` as handed over by the
            Trainer. Both are converted with ``torch.as_tensor``, so NumPy
            arrays and tensors are accepted. Predictions are raw logits.

    Returns:
        Dict with the keys ``f1``, ``precision``, ``recall`` and ``accuracy``,
        each a plain Python float so the values can be logged and serialised.
    """
    logits, target = eval_pred
    # torchmetrics works on tensors: probabilities for the predictions and
    # integer class indicators for the targets (the dataset stores 0./1. floats).
    probs = torch.sigmoid(torch.as_tensor(logits, dtype=torch.float))
    target = torch.as_tensor(target).long()

    shared = dict(average="macro", num_classes=2)
    # precision_recall returns a (precision, recall) pair; report them separately
    # so that every metric is a scalar.
    precision, recall = precision_recall(probs, target, **shared)
    return {
        "f1": f1(probs, target, **shared).item(),
        "precision": precision.item(),
        "recall": recall.item(),
        "accuracy": accuracy(probs, target, **shared).item(),
    }

In [12]:
# 0/1 indicator columns telling whether each training example carries the label.
df_train["male"] = df_train["gender"].apply(lambda x: 1 if "Male" in x else 0)
df_train["female"] = df_train["gender"].apply(lambda x: 1 if "Female" in x else 0)
##
# Probability of keeping a negative example in the loss: twice the positive rate
# of the class. It is capped at 1.0 because it is used as a probability downstream;
# without the cap a class present in more than half of the rows would yield a
# value above 1 (and a negative complementary probability).
male_neg_sampling_prob = min(1.0, float((2 * df_train["male"].sum()) / len(df_train)))
female_neg_sampling_prob = min(1.0, float((2 * df_train["female"].sum()) / len(df_train)))
neg_sampling_ratios = {
    "Female": female_neg_sampling_prob,
    "Male": male_neg_sampling_prob
}

In [14]:
class MultilabelTrainer(Trainer):
    """Trainer computing a multi-label BCE loss with down-sampled negatives.

    Every positive label entry contributes to the loss. Every negative entry is
    kept with probability ``neg_sampling_ratios[<class name>]`` and otherwise
    receives a zero loss weight. The loss is still averaged over all entries,
    including the ones whose weight is zero.

    Args:
        neg_sampling_ratios: keep-probability for negative entries, keyed by
            class name ("Female" and "Male").
        All remaining arguments are forwarded unchanged to ``Trainer``.
    """

    # Column order of the multi-hot label tensor: column 0 is "Female", column 1 is "Male".
    _SAMPLED_CLASSES = ("Female", "Male")

    def __init__(
        self,
        neg_sampling_ratios: Dict[str, float],
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Callable[[], PreTrainedModel] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer,
                          torch.optim.lr_scheduler.LambdaLR] = (None, None),
    ):
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
        )
        self.neg_sampling_ratios = neg_sampling_ratios

    def _sample_loss_weights(self, labels):
        """Return per-entry loss weights with negatives randomly dropped.

        Positive entries keep their label value as weight. Each negative entry
        gets its own independent draw: weight 1 with the class's keep
        probability, weight 0 otherwise.
        """
        weights = labels.clone().float()
        batch_size = labels.shape[0]
        for column, class_name in enumerate(self._SAMPLED_CLASSES):
            keep_prob = float(self.neg_sampling_ratios[class_name])
            is_negative = labels[:, column] != 1
            # One draw per row; only the draws on negative rows are used. This
            # avoids the (n, 1) x (n,) index broadcast that made all negatives
            # of a batch share a single sampled value.
            keep = torch.rand(batch_size, device=labels.device) < keep_prob
            weights[:, column] = torch.where(is_negative,
                                             keep.to(weights.dtype),
                                             weights[:, column])
        return weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted binary cross-entropy loss for one batch.

        Args:
            model: model to run the forward pass with.
            inputs: batch dict; its ``labels`` entry is removed before the
                forward pass so the model does not compute a loss itself.
            return_outputs: when True, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass it; not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        num_labels = self.model.config.num_labels
        loss_weights = self._sample_loss_weights(labels)

        loss = F.binary_cross_entropy_with_logits(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
            weight=loss_weights.view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyper-parameters for the fine-tuning run, collected in one place.
_training_config = {
    "output_dir": './results',             # where checkpoints are written
    "logging_dir": './logs',               # where logs are written
    "logging_steps": 100,                  # log every 100 steps
    "num_train_epochs": 1,                 # passes over the training set
    "per_device_train_batch_size": 4,      # training batch size per device
    "per_device_eval_batch_size": 8,       # evaluation batch size per device
    "warmup_steps": 500,                   # learning-rate warmup steps
    "weight_decay": 0.01,                  # weight decay strength
}
training_args = TrainingArguments(**_training_config)

In [None]:
# Wire model, data, metrics and the negative-sampling ratios into the custom trainer.
trainer = MultilabelTrainer(
    neg_sampling_ratios=neg_sampling_ratios,  # keep-probability of negatives per class
    model=model,                              # the instantiated Hugging Face Transformers model to train
    args=training_args,                       # hyper-parameters defined above
    compute_metrics=compute_metrics,          # metric function called on evaluation
    train_dataset=train_dataset,              # training split
    eval_dataset=val_dataset,                 # validation split
)

In [None]:
# Run the fine-tuning loop configured in `training_args`.
trainer.train()

In [None]:
# Persist the fine-tuned model to a local directory.
trainer.save_model("./model_on_orig_train")

### Generate Predictions on Train Set

In [None]:
# Score the training set with the model fine-tuned above and store the predicted
# label lists next to each entry id.
# `model_finetuned` and `train_orig_dataset` were not defined anywhere in this
# notebook; bind them explicitly so the cell runs on a fresh kernel.
model_finetuned = trainer.model
train_orig_dataset = train_dataset
# Run on whatever device the model already lives on instead of assuming "cuda".
device = next(model_finetuned.parameters()).device

model_finetuned.eval()
# No shuffling, so the order of the logits matches the row order of df_train.
dl_train = DataLoader(train_orig_dataset, batch_size=16)
logits = []
labels = []
for batch in tqdm(dl_train):
    # Labels are removed so the forward pass only receives the encoding fields.
    labels.append(batch.pop("labels"))
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.inference_mode():
        outputs = model_finetuned(**batch)
    logits.append(outputs["logits"].cpu())
##
logits = torch.cat(logits)
# A label is predicted when its sigmoid probability exceeds 0.5;
# column 0 is "Female" and column 1 is "Male".
above_threshold = (torch.sigmoid(logits) > .5).tolist()
preds = [
    [name for name, hit in zip(("Female", "Male"), row) if hit]
    for row in above_threshold
]
##
df_train["gender_model_orig_pred"] = preds
df_train[["entry_id", "gender_model_orig_pred"]].to_csv("orig_model_gender_preds_on_train.csv", index=False)