In [1]:
import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit 
import matplotlib.pyplot as plt
from models import utils

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel, BertConfig
from torch.utils.data import DataLoader
import torch.nn as nn
import torch



## data prep

In [2]:
# Read the three comma-separated tables of the toxicity dataset.
# Later cells use `sentence_id`, `worker_id` and `toxic_score` from the
# annotation table, `comment` from the sentence table and `gender` from the
# worker table.
annotations_df = pd.read_csv("data/Toxicity_content/toxic_content_annotation", sep=",")
text_df = pd.read_csv("data/Toxicity_content/toxic_content_sentences", sep=",")
annotators_df = pd.read_csv("data/Toxicity_content/toxic_content_workers", sep=",")

In [3]:
# Seed shared by the group split and the row shuffles so the whole cell is reproducible.
SPLIT_SEED = 2

# Attach the sentence text and the annotator's gender to every annotation row
# through id lookups into the sentence and worker tables.
annotations_df["comment"] = annotations_df["sentence_id"].map(text_df.set_index("sentence_id")["comment"])
annotations_df["gender"] = annotations_df["worker_id"].map(annotators_df.set_index("worker_id")["gender"])

# List of all scores given to each sentence; only consumed by the disabled filter below.
x = annotations_df.groupby('sentence_id').agg({'toxic_score': lambda scores: list(scores)})
#keep only sentences that have more than 1 unique annotation in annotations_df
# x = x[x['toxic_score'].apply(lambda x: len(set(x))) > 1]
# annotations_df = annotations_df[annotations_df['sentence_id'].isin(x.index)]

# Keep only annotators that actually produced annotations. `.copy()` makes the
# result an independent frame, so the `worker_id` re-mapping below does not write
# into a slice of the original table (pandas SettingWithCopy hazard).
annotators_df = annotators_df[annotators_df['worker_id'].isin(annotations_df['worker_id'])].copy()
print(len(annotators_df))

# Re-index annotators to contiguous integers 0..N-1 so they can index rows of a
# parameter matrix in the model.
total_annotator_ids = annotators_df['worker_id'].unique().tolist()
id2annotator = dict(enumerate(total_annotator_ids))
annotator2id = {annotator: index for index, annotator in enumerate(total_annotator_ids)}
# NOTE(review): a worker_id present in annotations_df but absent from
# annotators_df would map to NaN here - confirm the tables are consistent.
annotations_df["worker_id"] = annotations_df["worker_id"].map(annotator2id)
annotators_df["worker_id"] = annotators_df["worker_id"].map(annotator2id)

# Split by sentence so that all annotations of one sentence land on the same side.
splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state=SPLIT_SEED)
split = splitter.split(annotations_df, groups=annotations_df['sentence_id'])
train_inds, test_inds = next(split)  # only the first of the generated splits is used
train_df = annotations_df.iloc[train_inds]
test_df = annotations_df.iloc[test_inds]
# Shuffle rows with a fixed seed; the unseeded shuffle gave a different row
# order on every run even though the split itself was seeded.
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED)
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED)

11604


In [4]:
# Distinct toxicity scores seen in the training split, in order of first appearance.
labels = pd.unique(train_df['toxic_score'])

In [5]:
# Sort the label array in place (ascending), so the position of a score in
# `labels` is stable across runs regardless of the order rows were shuffled in.
labels.sort()

In [6]:
# NOTE(review): not referenced by any visible cell; the model-configuration cell
# sets `group_embedding_dim` to the literal 100 directly - confirm whether this
# constant was meant to feed that value.
embedding_dim = 100

## bert text+groups

In [7]:
class BertForSequenceClassification(BertPreTrainedModel):
    """BERT classifier that produces one set of class logits per latent annotator group.

    The pooled BERT output of each text is concatenated with every learned group
    embedding and passed through a shared linear classifier, giving logits of
    shape ``(batch, num_groups, num_labels)``. Each annotator additionally owns a
    learned row of ``num_groups`` scores, returned as log-probabilities over groups.

    Note: this class intentionally reuses the name of the ``transformers`` class
    imported at the top of the notebook and therefore shadows it.

    Expected extra attributes on ``config`` (besides the usual BERT fields):
        num_labels, num_groups, group_embedding_dim, num_annotators.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The classifier consumes [pooled text ; group embedding].
        self.classifier = nn.Linear(config.hidden_size + config.group_embedding_dim, config.num_labels)

        # Both parameter tables are drawn from a zero-mean normal with variance 0.5.
        variance = 0.5
        std_deviation = torch.sqrt(torch.tensor(variance))

        # One embedding vector per latent group: (num_groups, group_embedding_dim).
        self.group_embeddings = nn.Parameter(std_deviation*torch.randn(config.num_groups, config.group_embedding_dim))
        # One row of unnormalised group scores per annotator: (num_annotators, num_groups).
        self.group_assignment = nn.Parameter(std_deviation*torch.randn(config.num_annotators, config.num_groups))

        self.init_weights()

    def forward(
        self,
        annotator_ids=None,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        **kwargs,
    ):
        """Compute per-group class logits and per-annotator group log-probabilities.

        Args:
            annotator_ids: integer indices into the annotator table; required.
                Presumably a 1-D tensor of length ``batch`` - the softmax below
                is taken over dim 1 of the indexed rows.
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states: forwarded
                unchanged to the underlying ``BertModel``.
            labels: accepted for call-site compatibility; not used here (the
                loss is computed by the caller).
            **kwargs: accepted and ignored.

        Returns:
            tuple ``(logits, rater_group_assignment)`` where ``logits`` has shape
            ``(batch, num_groups, num_labels)`` and ``rater_group_assignment``
            holds log-softmaxed group scores for the given annotators.

        Raises:
            ValueError: if ``annotator_ids`` is ``None``.
        """
        if annotator_ids is None:
            # Indexing a tensor with None inserts a new axis instead of selecting
            # rows, which would silently normalise over the annotator axis.
            raise ValueError("annotator_ids must be provided")

        rater_group_assignment = self.group_assignment[annotator_ids].log_softmax(dim=1)

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = outputs[1]  # second element of the BertModel output tuple
        batch_size = pooled_output.size(0)
        num_groups = self.group_embeddings.size(0)

        # Broadcast text features across groups and group embeddings across the
        # batch. `expand` creates views; `torch.cat` materialises the result once.
        text_features = pooled_output.unsqueeze(1).expand(-1, num_groups, -1)
        group_features = self.group_embeddings.unsqueeze(0).expand(batch_size, -1, -1)

        combined = torch.cat((text_features, group_features), dim=2)
        combined = self.dropout(combined)
        logits = self.classifier(combined)

        return logits, rater_group_assignment


In [8]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
# Start from the default BertConfig and add the fields the custom model reads.
configuration = BertConfig()

# Encoder width.
configuration.hidden_size = 768

# Latent-group settings: number of groups and size of each group embedding.
configuration.num_groups = 5
configuration.group_embedding_dim = 100

# Sizes derived from the data: annotator table rows and number of output classes.
configuration.num_annotators = len(total_annotator_ids)
configuration.num_labels = len(labels)

# NOTE(review): the model is constructed from a config object rather than loaded
# from a checkpoint, so no pretrained encoder weights are read here - confirm
# that training from scratch is intended.
model = BertForSequenceClassification(configuration)
model = model.to(device)

In [10]:
# Batching settings shared by the training and testing loaders.
batch_size = 8
num_workers = 2
loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}

# WordPiece tokenizer handed to both datasets.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split in the project's dataset class, then in a DataLoader.
# No `shuffle` argument is passed, so both loaders use the DataLoader default order.
train_dataset = utils.CustomDataset(train_df, tokenizer, labels)
train_data_loader = DataLoader(train_dataset, **loader_kwargs)

test_dataset = utils.CustomDataset(test_df, tokenizer, labels)
test_data_loader = DataLoader(test_dataset, **loader_kwargs)


In [21]:
# Training loop
num_epochs = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Not used by the loop below (the loss is the mixture likelihood defined next);
# kept so the name stays defined for any other cell that refers to it.
loss_fn = torch.nn.CrossEntropyLoss().to(device)


def group_mixture_nll(group_logits, log_group_probs, targets):
    """Mean negative log-likelihood of the targets under the mixture over groups.

    For each sample the class log-probabilities of every group are weighted by
    the annotator's group log-probability and the groups are marginalised out
    with a log-sum-exp.

    Args:
        group_logits: tensor of shape (batch, num_groups, num_classes).
        log_group_probs: tensor of shape (batch, num_groups) of log-probabilities.
        targets: integer tensor of shape (batch,) with class indices.

    Returns:
        Scalar tensor on the same device as the inputs.
    """
    # (batch, groups, classes): log p(class | group) + log p(group | annotator)
    log_joint = group_logits.log_softmax(dim=2) + log_group_probs.unsqueeze(2)
    # Marginalise over groups -> (batch, classes)
    log_marginal = log_joint.logsumexp(dim=1)
    picked = log_marginal.gather(1, targets.long().unsqueeze(1)).squeeze(1)
    return -picked.mean()


for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        annotator_ids = batch['annotator_id'].to(device)
        # Named `batch_labels` so the global `labels` array (the class labels
        # used by the configuration and dataset cells) is not overwritten.
        batch_labels = batch['label'].to(device)

        # Forward pass: per-group logits and per-annotator group log-probabilities.
        w, log_p = model(annotator_ids=annotator_ids, input_ids=input_ids, attention_mask=attention_mask)

        # Vectorised loss, computed on the model's device instead of filling a
        # separately allocated buffer one sample at a time.
        loss = group_mixture_nll(w, log_p, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_data_loader)

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {average_loss:.4f}')


new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch
new_batch


KeyboardInterrupt: 

In [None]:
# Persist only the model's parameters (its state dict) to disk.
torch.save(obj=model.state_dict(), f='bert_model_groups.pth')

In [12]:
# Accuracy on the test split. For every sample the prediction is the arg-max
# class of the single most probable group of its annotator.
# NOTE(review): training marginalises over groups while this picks one group -
# confirm the mismatch is intended.
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for batch in test_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        annotator_ids = batch['annotator_id'].to(device)
        # Named `batch_labels` so the global `labels` array of class labels is
        # not replaced by a batch tensor.
        batch_labels = batch['label'].to(device)

        # Keyword arguments keep the call independent of the parameter order.
        w, log_p = model(annotator_ids=annotator_ids, input_ids=input_ids, attention_mask=attention_mask)

        # Select, per sample, the logits of the annotator's most probable group.
        best_group = log_p.argmax(dim=1)
        rows = torch.arange(w.size(0), device=w.device)
        group_logits = w[rows, best_group]
        predicted = group_logits.argmax(dim=1)

        total_correct += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

    accuracy = total_correct / total_samples
    print(f"Test Accuracy: {accuracy * 100:.2f}%")



KeyboardInterrupt: 

In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import sys
sys.path.append("..")


import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit 
import matplotlib.pyplot as plt
from models import bert

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel, BertConfig
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch



# ## data prep

# In[3]:


# Read the three comma-separated tables of the toxicity dataset (paths are
# relative to the parent directory).
annotations_df = pd.read_csv("../data/Toxicity_content/toxic_content_annotation", sep=",")
text_df = pd.read_csv("../data/Toxicity_content/toxic_content_sentences", sep=",")
annotators_df = pd.read_csv("../data/Toxicity_content/toxic_content_workers", sep=",")


# In[5]:


# Seed shared by the group split and the row shuffles so the whole cell is reproducible.
SPLIT_SEED = 2

# Attach the sentence text and the annotator's gender to every annotation row
# through id lookups into the sentence and worker tables.
annotations_df["comment"] = annotations_df["sentence_id"].map(text_df.set_index("sentence_id")["comment"])
annotations_df["gender"] = annotations_df["worker_id"].map(annotators_df.set_index("worker_id")["gender"])

# List of all scores given to each sentence; only consumed by the disabled filter below.
x = annotations_df.groupby('sentence_id').agg({'toxic_score': lambda scores: list(scores)})
#keep only sentences that have more than 1 unique annotation in annotations_df
# x = x[x['toxic_score'].apply(lambda x: len(set(x))) > 1]
# annotations_df = annotations_df[annotations_df['sentence_id'].isin(x.index)]

# Keep only annotators that actually produced annotations. `.copy()` makes the
# result an independent frame, so the `worker_id` re-mapping below does not write
# into a slice of the original table (pandas SettingWithCopy hazard).
annotators_df = annotators_df[annotators_df['worker_id'].isin(annotations_df['worker_id'])].copy()
print(len(annotators_df))

# Re-index annotators to contiguous integers 0..N-1.
total_annotator_ids = annotators_df['worker_id'].unique().tolist()
id2annotator = dict(enumerate(total_annotator_ids))
annotator2id = {annotator: index for index, annotator in enumerate(total_annotator_ids)}
# NOTE(review): a worker_id present in annotations_df but absent from
# annotators_df would map to NaN here - confirm the tables are consistent.
annotations_df["worker_id"] = annotations_df["worker_id"].map(annotator2id)
annotators_df["worker_id"] = annotators_df["worker_id"].map(annotator2id)

# Rename to the generic column names; reassignment instead of `inplace=True`
# keeps the step explicit and chain-friendly.
annotations_df = annotations_df.rename(
    columns={'worker_id': 'annotator_id', 'comment': 'text', 'toxic_score': 'annotation'}
)

# Flag every row whose sentence received more than one distinct annotation value.
grouped = (
    annotations_df.groupby('sentence_id')['annotation']
    .nunique()
    .reset_index(name='unique_annotations')
)
annotations_df = annotations_df.merge(grouped, on='sentence_id')
annotations_df['disagreement'] = annotations_df['unique_annotations'] > 1

# Split by sentence so that all annotations of one sentence land on the same side.
splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state=SPLIT_SEED)
split = splitter.split(annotations_df, groups=annotations_df['sentence_id'])
train_inds, test_inds = next(split)  # only the first of the generated splits is used
train_df = annotations_df.iloc[train_inds]
test_df = annotations_df.iloc[test_inds]
# Shuffle rows with a fixed seed; the unseeded shuffle gave a different row
# order on every run even though the split itself was seeded.
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED)
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED)


11604


In [7]:
# Number of distinct sentences in the training split (missing ids, if any, count as one value).
train_df["sentence_id"].nunique(dropna=False)

80784