In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit 
import matplotlib.pyplot as plt
import nltk
from torch import optim
from nltk.corpus import stopwords
from models import utils, MLP
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch



## Data preparation

In [2]:
# Read the three tables used below: the annotations, the annotated sentences,
# and the annotators (workers).
annotations_df, text_df, annotators_df = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in (
        "data/Toxicity_content/toxic_content_annotation",
        "data/Toxicity_content/toxic_content_sentences",
        "data/Toxicity_content/toxic_content_workers",
    )
)

In [3]:
# Preview the sentence table; the rendered output is truncated to the first and last rows.
text_df

Unnamed: 0,sentence_id,comment,comment_id,perspective_score,source
0,0,Just a matter of time before pick up on this s...,1135_1,0.280691,twitter
1,1,this is QUINN you DUMBASS 😭😭😭,1135_5,0.909117,twitter
2,2,"I like Maxi, long term for sure. Just wouldn’t...",1135_6,0.051221,twitter
3,3,"anna really out there embarrassing amber, i’d ...",1135_7,0.785292,twitter
4,4,mfw we need to purge the system,1135_8,0.366173,4chan
...,...,...,...,...,...
100975,107615,Precisely. Drug testing does fuck-all to ensur...,7157_19,0.758788,4chan
100976,107616,Adult women I know are generally smarter than ...,7157_20,0.363075,4chan
100977,107617,This is as stupid as saying having a 3rd degre...,7157_21,0.766444,4chan
100978,107618,"""Germany is the enemy of Judaism and must be p...",7157_22,0.607978,4chan


In [4]:
# Single seed for the group split and both row shuffles, so the whole cell is reproducible.
SPLIT_SEED = 2

# Attach the sentence text and the annotator's gender to every annotation row.
annotations_df["comment"] = annotations_df["sentence_id"].map(text_df.set_index("sentence_id")["comment"])
annotations_df["gender"] = annotations_df["worker_id"].map(annotators_df.set_index("worker_id")["gender"])

# One list of toxic_score values per sentence_id.
x = annotations_df.groupby('sentence_id').agg({'toxic_score': lambda scores: list(scores)})
# Optional filter (currently disabled): keep only sentences that received more than
# one distinct score.
# x = x[x['toxic_score'].apply(lambda x: len(set(x))) > 1]
# annotations_df = annotations_df[annotations_df['sentence_id'].isin(x.index)]

# Keep only annotators that appear in the annotations. `.copy()` gives an independent
# frame, so re-mapping `worker_id` below does not assign into a slice of the loaded
# table (which raises pandas' SettingWithCopyWarning).
annotators_df = annotators_df[annotators_df['worker_id'].isin(annotations_df['worker_id'])].copy()
print(len(annotators_df))

# Re-index annotators as consecutive integers starting at 0.
# NOTE: `worker_id` is overwritten in place, so this cell must not be re-run without
# first re-loading the CSV files.
annotator_ids = annotators_df['worker_id'].unique().tolist()
id2annotator = {index: row for (index, row) in enumerate(annotator_ids)}
annotator2id = {row: index for (index, row) in enumerate(annotator_ids)}
annotations_df["worker_id"] = annotations_df["worker_id"].map(annotator2id)
annotators_df["worker_id"] = annotators_df["worker_id"].map(annotator2id)

# Split with sentence_id as the group, so all annotations of a sentence fall on the
# same side of the train/test boundary.
splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state=SPLIT_SEED)
split = splitter.split(annotations_df, groups=annotations_df['sentence_id'])
train_inds, test_inds = next(split)
train_df = annotations_df.iloc[train_inds]
test_df = annotations_df.iloc[test_inds]

# Shuffle rows with a fixed seed (an unseeded sample() gives a different order every run).
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED)
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED)

11604


In [5]:
# Distinct toxic_score values found in the training split.
labels = train_df.loc[:, 'toxic_score'].unique()

In [6]:
# Sort the label values in place (labels comes from Series.unique() above).
labels.sort()

In [7]:
# Width of the learned per-annotator embedding that MyBERTClassifier concatenates
# to the pooled BERT representation.
embedding_dim = 100

## BERT classifier

In [8]:
class MyBERTClassifier(nn.Module):
    """Text classifier that combines a BERT encoding with an annotator embedding.

    The last hidden layer of ``bert_model`` is mean-pooled over the tokens selected
    by the attention mask, passed through dropout, concatenated with a learned
    embedding of the annotator id and fed to a single linear layer.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` whose output carries
            ``hidden_states`` (for Hugging Face models, load or call it with
            ``output_hidden_states=True``).
        num_labels: Number of output classes (size of the logits' last dimension).
        num_annotators: Number of rows in the annotator embedding table.
        embedding_dim: Width of each annotator embedding vector.
    """

    def __init__(self, bert_model, num_labels, num_annotators, embedding_dim):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(bert_model.config.hidden_size + embedding_dim, num_labels)
        self.annotator_embeddings = nn.Embedding(num_annotators, embedding_dim)

    def forward(self, input_ids, attention_mask, annotator_ids):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids, forwarded unchanged to the encoder.
            attention_mask: Mask forwarded to the encoder and used to weight the
                pooling; expected to be ``(batch, seq_len)`` with 1 for real tokens
                and 0 for padding. ``None`` falls back to an unweighted mean.
            annotator_ids: Integer annotator index per example, shape ``(batch,)``.

        Returns:
            Tensor of unnormalised logits with shape ``(batch, num_labels)``.

        Raises:
            ValueError: If the encoder output has no ``hidden_states``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states
        if hidden_states is None:
            raise ValueError(
                "The encoder returned no hidden states; "
                "load it with output_hidden_states=True."
            )
        last_hidden_state = hidden_states[-1]

        if attention_mask is None:
            pooled_output = last_hidden_state.mean(dim=1)
        else:
            # Average only over real tokens: a plain mean would mix in the padding
            # positions, making the result depend on how much a sequence was padded.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = (last_hidden_state * mask).sum(dim=1) / token_counts
        pooled_output = self.dropout(pooled_output)

        # Concatenate the pooled text representation with the annotator embedding.
        annotator_embeddings = self.annotator_embeddings(annotator_ids)
        pooled_output = torch.cat((pooled_output, annotator_embeddings), dim=1)

        logits = self.classifier(pooled_output)

        return logits


In [9]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# The encoder is asked to return hidden states because MyBERTClassifier pools
# the last hidden layer itself.
bert_model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=len(labels),
    output_attentions=True,
    output_hidden_states=True,
)

# Wrap the encoder with the annotator-aware classification head.
model = MyBERTClassifier(
    bert_model,
    num_labels=len(labels),
    num_annotators=len(annotator_ids),
    embedding_dim=embedding_dim,
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Define batch size and number of workers for data loaders
batch_size = 8
num_workers = 2

# Create training and testing datasets
train_dataset = utils.CustomDataset(train_df, tokenizer, labels)
test_dataset = utils.CustomDataset(test_df, tokenizer, labels)

# Training loader: reshuffle at every epoch so the model does not see the same
# batches in the same order each time; the seeded generator keeps the order
# reproducible across runs.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
# Evaluation loader: keep a fixed order.
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)


In [11]:
# Device that each training batch and the loss module are moved to; fixed to the CPU.
device = torch.device("cpu")

In [15]:
# Training loop: fine-tune the whole model with AdamW and cross-entropy,
# printing the mean batch loss after every epoch.
num_epochs = 10
# Put the parameters on the same device as the batches before the optimizer is built.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Use batch_* names so the notebook-level `annotator_ids` and `labels`
        # (read by the model/dataset cells through len(...)) are not overwritten
        # by per-batch tensors.
        batch_annotator_ids = batch['annotator_id'].to(device)
        batch_labels = batch['label'].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask=attention_mask, annotator_ids=batch_annotator_ids)

        # Calculate the loss
        loss = loss_fn(logits, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_data_loader)

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {average_loss:.4f}')


KeyboardInterrupt: 