In [2]:
%matplotlib inline

from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [3]:
RANDOM_SEED = 42
MAX_TOKEN_COUNT = 512
BERT_MODEL_NAME = 'bert-base-cased'
TEXT_COLUMN_NAME = 'text'
LABEL_COLUMN_NAME = 'hs'

In [4]:
df = pd.read_csv('../data/all-processed/B_english_basile_processed.csv')

In [5]:
empty_real_index = [index for index, text in enumerate(df.text.tolist()) if str(text).strip()==""] 
df.iloc[empty_real_index]

Unnamed: 0,text,hs


In [6]:
df

Unnamed: 0,text,hs
0,"hurray , saving us $ $ $ many ways @ potus @ r...",1
1,would young fighting age men vast majority one...,1
2,@ kamalaharris illegals dump kids border like ...,1
3,ny times : 'nearly white ' states pose 'an arr...,0
4,orban brussels : european leaders ignoring peo...,0
...,...,...
12995,@ samenvers unfollowed ? fuck pussy,0
12996,@ danreynolds stfu bitch ! go make satanic mus...,1
12997,"@ 2beornotbeing honey , fellow white chick , l...",0
12998,"hate bitches talk niggaz kids , everybody cant...",1


In [7]:
df[LABEL_COLUMN_NAME].value_counts()

0    7530
1    5470
Name: hs, dtype: int64

In [8]:
df['text'].map(
    lambda text: len(str(text).split(" "))
).describe()

count    13000.000000
mean        18.853385
std         10.067602
min          1.000000
25%         11.000000
50%         17.000000
75%         24.000000
max        121.000000
Name: text, dtype: float64

In [9]:
train_df, val_df = train_test_split(df, test_size=0.05)
train_df.shape, val_df.shape

((12350, 2), (650, 2))

In [10]:
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [11]:
sample_row = df.iloc[0]
sample_comment = sample_row[TEXT_COLUMN_NAME]
sample_labels = sample_row[LABEL_COLUMN_NAME]

print(sample_comment)

hurray , saving us $ $ $ many ways @ potus @ realdonaldtrump # lockthemup # buildthewall # enddaca # boycottnfl # boycottnike


In [12]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: BertTokenizer,
        max_token_len: int = MAX_TOKEN_COUNT
    ):
    
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row[TEXT_COLUMN_NAME]
        labels = data_row[LABEL_COLUMN_NAME]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=labels
        )   

In [13]:
train_dataset = HateDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

sample_item = train_dataset[0]
sample_item.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [14]:
sample_item["labels"]

0

In [15]:
sample_item["input_ids"]

tensor([  101,   137, 12485, 10061,  9367,  1394, 20239,  2816,  5913,  1567,
         1294,  6884, 13280, 21602,  1221,  9304,  5412,  1663,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [16]:
sample_item["attention_mask"]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
class HateDataModule(pl.LightningDataModule):
    def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        
    def setup(self, stage=None):
        self.train_dataset = HateDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )

        self.test_dataset = HateDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size
        )

In [18]:
N_EPOCHS = 10
BATCH_SIZE = 16

data_module = HateDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

In [19]:
import pytorch_lightning as pl
import torchmetrics

class HateTagger(pl.LightningModule):
    def __init__(self, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.i2h = nn.Linear(self.bert.config.hidden_size, 128)
        self.h2o = nn.Linear(128, 1)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()
        self.auroc = torchmetrics.AUROC()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.i2h(output.pooler_output)
        output = self.h2o(output)
        output = torch.sigmoid(output).flatten()
        loss = 0
        if labels is not None:
            loss = self.criterion(output.to(dtype=float), labels.to(dtype=float))
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("train_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("train_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return {"loss": loss, "predictions": outputs, "labels": labels}
        
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("val_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("val_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("test_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("test_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        roc_auc = self.auroc(predictions, labels)
        self.logger.experiment.add_scalar("roc_auc/Train", roc_auc, self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )
    
    def calculate_auroc(self, outputs, labels):
        return self.auroc(torch.round(outputs).to(int), labels.to(int))

    def calculate_accuracy(self, outputs, labels):
        return self.accuracy(torch.round(outputs).to(int), labels.to(int))

In [20]:
model = HateTagger(
  n_warmup_steps=20,
  n_training_steps=100
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [22]:
logger = TensorBoardLogger("lightning_logs", name="fake-news-ff")

In [23]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

In [24]:
trainer = pl.Trainer(
  logger=logger,
  checkpoint_callback=checkpoint_callback,
  callbacks=[early_stopping_callback],
  max_epochs=10,
  gpus=1,
  progress_bar_refresh_rate=30
)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [25]:
trainer.fit(model, data_module)
# trainer.fit(model, data_loader1)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | bert      | BertModel | 108 M 
1 | i2h       | Linear    | 98.4 K
2 | h2o       | Linear    | 129   
3 | criterion | BCELoss   | 0     
4 | auroc     | AUROC     | 0     
5 | accuracy  | Accuracy  | 0     
----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.635   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


                                                              

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Epoch 0:   0%|          | 0/813 [00:00<?, ?it/s] 

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Epoch 3: 100%|██████████| 813/813 [05:24<00:00,  2.50it/s, loss=0.641, v_num=2, train_loss=0.474, train_acc=0.786, val_loss=0.633, val_acc=0.633]


In [26]:
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 100%|██████████| 41/41 [00:06<00:00,  5.96it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6329268217086792, 'test_loss': 0.6334168314933777}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 41/41 [00:06<00:00,  5.95it/s]


[{'test_loss': 0.6334168314933777, 'test_acc': 0.6329268217086792}]