In [None]:
#!pip install datasets

In [None]:
#!pip install accelerate -U

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Prep Data

Data source: the "Hate Speech Detection Curated Dataset" on Kaggle (published 2022), https://www.kaggle.com/datasets/waalbannyantudre/hate-speech-detection-curated-dataset/data

In [38]:
# Read the full hate-speech CSV into memory (no sampling happens here)
# and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='HateSpeechDataset.csv')
df.head(n=3)

Unnamed: 0,Content,Label,Content_int
0,denial of normal the con be asked to comment o...,1,"[146715, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,..."
1,just by being able to tweet this insufferable ...,1,"[146715, 14, 15, 16, 17, 7, 18, 19, 20, 21, 22..."
2,that is retarded you too cute to be single tha...,1,"[146715, 28, 29, 30, 26, 31, 32, 7, 5, 33, 28,..."


In [39]:
# (rows, columns) of the frame exactly as loaded from the CSV.
df.shape

(440906, 3)

In [40]:
# Rows whose `Label` is not a digit string (e.g. header lines repeated inside
# the CSV body). Casting to str first keeps the check from raising on
# non-string values such as NaN or already-parsed integers; the vectorised
# `.str` accessor replaces the per-row Python lambda.
non_numeric_labels = df[~df['Label'].astype(str).str.isnumeric()]
print(non_numeric_labels)

        Content  Label             Content_int
190108  content  Label  [146715, 7139, 146714]
418486  content  Label  [146715, 7139, 146714]
422333  content  Label  [146715, 7139, 146714]
424241  content  Label  [146715, 7139, 146714]
426162  content  Label  [146715, 7139, 146714]
435474  content  Label  [146715, 7139, 146714]
437104  content  Label  [146715, 7139, 146714]


In [41]:
# Keep only rows whose `Label` is a digit string. `.astype(str)` makes the
# mask robust to non-string values, and `.copy()` detaches the result from
# the original frame so later column assignments do not raise
# SettingWithCopyWarning.
df = df[df['Label'].astype(str).str.isnumeric()].copy()

In [42]:
# Convert the label column from strings to integers. `DataFrame.astype`
# returns a new frame, which avoids assigning into a filtered slice (the
# source of SettingWithCopyWarning).
df = df.astype({'Label': int})

In [43]:
# Preview after the non-numeric label rows were removed and `Label` was cast to int.
df.head(3)

Unnamed: 0,Content,Label,Content_int
0,denial of normal the con be asked to comment o...,1,"[146715, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,..."
1,just by being able to tweet this insufferable ...,1,"[146715, 14, 15, 16, 17, 7, 18, 19, 20, 21, 22..."
2,that is retarded you too cute to be single tha...,1,"[146715, 28, 29, 30, 26, 31, 32, 7, 5, 33, 28,..."


In [44]:
# (rows, columns) after filtering out the non-numeric label rows.
df.shape

(440899, 3)

In [45]:
# The pre-computed integer encoding is not needed; the text is tokenized later.
df = df.drop(columns=['Content_int'])

In [46]:
# Rename the target column to `labels`.
df = df.rename({'Label': 'labels'}, axis='columns')

In [47]:
# Preview after dropping `Content_int` and renaming `Label` to `labels`.
df.head(3)

Unnamed: 0,Content,labels
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1


In [48]:
# Move the existing index into a regular column (drop=False keeps its values),
# install a fresh 0..n-1 index, and name the new column 'id'.
df = df.reset_index(drop=False).rename(columns={'index': 'id'})

In [49]:
# Preview with the new `id` column in place.
df.head(3)

Unnamed: 0,id,Content,labels
0,0,denial of normal the con be asked to comment o...,1
1,1,just by being able to tweet this insufferable ...,1
2,2,that is retarded you too cute to be single tha...,1


In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so a kernel restart reproduces the same
# partition; stratify keeps the class ratio of `labels` equal in both parts.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

In [51]:
# Peek at the first rows of the training partition.
train.head(3)

Unnamed: 0,id,Content,labels
398464,398465,notability of will blazer a tag has been place...,0
420223,420225,here the user said i am in portuguese para pol...,0
271310,271311,it was not a how too it what a slut an environ...,0


In [52]:
# Peek at the first rows of the held-out test partition.
test.head(3)

Unnamed: 0,id,Content,labels
238985,238986,donna keegan pictures i got a lot of them from...,0
67111,67111,well unfortunately he cannot eat pork as he is...,1
150380,150380,studio if all of you support him so much after...,0


# BERT model with hate speech training

In [61]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

# Assuming df is your pandas DataFrame

# Name of the pretrained checkpoint; the tokenizer is loaded from it.
checkpoint = 'bert-base-uncased'  # Change this to match your model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize the ``Content`` field of a batch of examples.

    Relies on the module-level ``tokenizer``. Sequences are truncated and
    padded with ``padding="max_length"``.

    Args:
        examples: Mapping with a ``'Content'`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples['Content']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Wrap the cleaned DataFrame in a Hugging Face Dataset, then tokenize it in batches.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Seeded shuffle followed by taking the first 20,000 rows as a sample.
# NOTE(review): the split cell that follows reads `tokenized_dataset`, not
# `sampled_dataset` — confirm which of the two training is meant to use.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
sampled_dataset = shuffled_dataset.select(range(20_000))

# Check the sampled dataset
print(sampled_dataset.column_names)

Map:   0%|          | 0/440899 [00:00<?, ? examples/s]

['id', 'Content', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [62]:
from datasets import DatasetDict

# Split the tokenized dataset 90/10 into training and validation sets.
# The intermediate result is deliberately not called `train_test_split`:
# that name is bound to the scikit-learn function imported earlier in this
# notebook, and rebinding it to a DatasetDict would shadow the function.
# A fixed seed makes the split reproducible across kernel restarts.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test'],
})

# Now, dataset_dict can be used with the Trainer


In [63]:
# Show the split names with their feature columns and row counts.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'Content', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 396809
    })
    validation: Dataset({
        features: ['id', 'Content', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 44090
    })
})


In [64]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict with a single key ``'accuracy'``.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_classes)
    return {'accuracy': accuracy}


In [None]:
#!pip install transformers[torch]

In [58]:
#!pip list > Pip_List.txt

In [None]:
#!pip install accelerate==0.21.0

In [67]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Pretrained BERT with a sequence-classification head producing two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # Adjust `num_labels` as necessary
)

# Settings handed to the Trainer: where to write results and logs, the
# epoch count, the warmup/weight-decay values, and the evaluation strategy.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the model, arguments, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,  # metric computation function
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],  # separate split for evaluation
)

# Start training
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss


# BERT CNN Model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn, optim

# Assuming 'df' is your DataFrame
# Force the dtypes the dataset class relies on: text as str, labels as int.
df = df.astype({'Content': str, 'labels': int})

# 90/10 train/validation split, seeded and stratified on the label column.
train_df, val_df = train_test_split(
    df,
    stratify=df['labels'],
    test_size=0.1,
    random_state=42,
)


In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text row each time it is indexed.

    Args:
        dataframe: Frame with a ``'Content'`` (text) and a ``'labels'`` column.
            Rows are addressed by position, so any index is accepted.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'`` tensors.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe['Content']
        self.labels = dataframe['labels']
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the tokenized text and label found at position ``idx``.

        Returns:
            dict with 1-D tensors ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'``, plus a scalar ``torch.long`` ``'labels'``.
        """
        text = str(self.text.iloc[idx])
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten removes it
        # so the DataLoader can stack items into (batch, max_len).
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'labels': torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long),
        }


In [None]:
# Sequence length and mini-batch size shared by both loaders.
MAX_LEN = 256
BATCH_SIZE = 16
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One TextDataset per partition, built with the same tokenizer and length.
train_dataset, val_dataset = (
    TextDataset(frame, tokenizer, MAX_LEN) for frame in (train_df, val_df)
)

# Only the training loader shuffles; validation keeps the default order.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)


In [None]:
class BertCnnModel(nn.Module):
    """BERT encoder followed by a 1-D convolutional classification head.

    The per-token hidden states from BERT are convolved along the sequence
    axis, max-pooled over time, and projected to ``n_classes`` logits that
    can be fed to ``nn.CrossEntropyLoss``.

    Args:
        n_classes: Number of output classes (width of the logits).
    """

    def __init__(self, n_classes):
        super(BertCnnModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.conv1 = nn.Conv1d(in_channels=hidden_size, out_channels=128, kernel_size=2)
        self.activation = nn.ReLU()
        # Output size 1 == global max-over-time pooling: one value per filter.
        self.adaptive_pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(128, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape ``(batch, n_classes)``.

        The sequence length must be at least 2 (the convolution kernel size).
        """
        # With return_dict=False the encoder returns a tuple whose first item
        # is the per-token hidden states, (batch, seq_len, hidden).
        sequence_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )[0]
        # Zero the hidden states of padding positions so they cannot dominate
        # the max-pool.
        keep = attention_mask.unsqueeze(-1).to(sequence_output.dtype)
        sequence_output = sequence_output * keep
        # Conv1d expects channels first: (batch, hidden, seq_len).
        cnn_input = sequence_output.permute(0, 2, 1)
        features = self.activation(self.conv1(cnn_input))
        pooled = self.adaptive_pool(features).squeeze(-1)  # (batch, 128)
        return self.classifier(self.dropout(pooled))



In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertCnnModel(n_classes=2).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
model.train()
for epoch in range(3):  # example: 3 epochs
    for batch in train_loader:
        # Move the four tensors of the batch onto the target device.
        input_ids, attention_mask, token_type_ids, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
        )

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


In [None]:
# Evaluation mode, with gradient tracking off for the whole pass.
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, token_type_ids, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
        )

        outputs = model(input_ids, attention_mask, token_type_ids)
        # Predicted class = index of the largest score along dim 1.
        predicted = outputs.data.argmax(dim=1)
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

print(f'Validation accuracy: {100 * correct / total}%')