In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset

In [2]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Check if a GPU is available and set PyTorch to use the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Move the model to the GPU
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [5]:
# Tokenize the dataset
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
# # Function to predict a batch
# def predict_batch(batch):
#     inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
#     outputs = model(**inputs)
#     return torch.argmax(outputs.logits, dim=1).numpy()

In [7]:
# # Modify the predict_batch function to move data to the GPU
# def predict_batch(batch):
#     inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
    
#     # Move tensors to the specified device
#     inputs = {k: v.to(device) for k, v in inputs.items()}
    
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return torch.argmax(outputs.logits, dim=1).cpu().numpy()  # Move results back to CPU for further processing

In [8]:
def predict_batch(batch):
    try:
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        
        # Move tensors to the specified device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        return torch.argmax(outputs.logits, dim=1).cpu().numpy()  # Move results back to CPU for further processing
    except ValueError as e:
        print(f"Error during prediction: {e}")
        # Assign default prediction of 1 for the entire batch in case of an error
        return [1] * len(batch)

In [9]:
data = pd.read_csv("ucberkeley_measuring_hate_speech_dataset_training.csv")

In [10]:
data['hatespeech'] = data['hatespeech'].astype(int)

In [11]:
df = data.sample(n=13500, random_state=42).reset_index(drop=True)

In [12]:
df

Unnamed: 0,comment_id,annotator_id,hatespeech,text,annotator_gender,annotator_educ,annotator_income,annotator_ideology,annotator_age,annotator_trans,annotator_race,annotator_religion,annotator_sexuality
0,20005,692,2,careful wish always wanted feel huge cock hole...,male,college_grad_ba,50k-100k,slightly_liberal,30.0,cisgender,white,nothing,straight
1,20050,8399,0,dont care threatened feel cant go throwing res...,female,college_grad_ba,10k-50k,neutral,36.0,cisgender,latinx,christian,straight
2,22882,6516,1,interracial threeway young pawg druffbandit ma...,female,some_college,10k-50k,liberal,23.0,cisgender,asian,nothing,other
3,20055,9571,0,plenty evidence woman nonwhite people disadvan...,female,college_grad_aa,10k-50k,liberal,53.0,cisgender,white,other,straight
4,20065,1061,0,notwithstanding marriyum aurangzeb sahibas pol...,female,professional_degree,100k-200k,liberal,44.0,cisgender,white,nothing,straight
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13495,20013,4207,2,hate people woman especially look mad got dog ...,female,professional_degree,10k-50k,slightly_liberal,25.0,cisgender,asian,hindu,straight
13496,38182,4124,2,haha love fag empty stare dumb willing obeying...,female,college_grad_aa,10k-50k,slightly_liberal,34.0,cisgender,white,nothing,straight
13497,737,9885,2,shes disgusting id let suck cock,male,some_college,10k-50k,slightly_conservative,34.0,cisgender,white,nothing,gay
13498,46466,6117,0,he black sorry disappoint dewayne craddock,male,high_school_grad,10k-50k,slightly_liberal,31.0,cisgender,white,christian,straight


In [13]:
# Prepare the dataset
texts = df['text'].tolist()
labels = df['hatespeech'].tolist()
dataset = HateSpeechDataset(texts, labels)

# Split the dataset for training and validation
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [15]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [16]:
# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_bert")

  0%|          | 0/4557 [00:00<?, ?it/s]

{'loss': 0.7475, 'learning_rate': 5e-05, 'epoch': 0.33}
{'loss': 0.6411, 'learning_rate': 4.383781119053488e-05, 'epoch': 0.66}
{'loss': 0.6276, 'learning_rate': 3.767562238106976e-05, 'epoch': 0.99}
{'loss': 0.5887, 'learning_rate': 3.151343357160463e-05, 'epoch': 1.32}
{'loss': 0.548, 'learning_rate': 2.5351244762139513e-05, 'epoch': 1.65}
{'loss': 0.5692, 'learning_rate': 1.918905595267439e-05, 'epoch': 1.97}
{'loss': 0.5052, 'learning_rate': 1.3026867143209267e-05, 'epoch': 2.3}
{'loss': 0.4593, 'learning_rate': 6.864678333744145e-06, 'epoch': 2.63}
{'loss': 0.4675, 'learning_rate': 7.024895242790239e-07, 'epoch': 2.96}
{'train_runtime': 646.6193, 'train_samples_per_second': 56.37, 'train_steps_per_second': 7.047, 'train_loss': 0.5708289034460259, 'epoch': 3.0}


In [17]:
# Predict using the fine-tuned model (for your 100000 texts)
# Load your dataset here
large_dataset = pd.read_csv("ucberkeley_measuring_hate_speech_dataset_testing.csv")

testing_dataset = large_dataset["text"].tolist()

# Tokenize and predict
#predictions = model(testing_dataset)

In [18]:
# Test prediction with a small set of texts (20 texts)
test_texts = text_list = large_dataset['text'].head(20).tolist()

test_predictions = predict_batch(test_texts)
print("Test Predictions:", test_predictions)

Test Predictions: [0 0 0 0 0 0 2 2 0 2 0 0 0 0 2 2 0 2 2 0]


In [19]:
# Checkpointing and resuming
checkpoint_file = "predictions_checkpoint.csv"
try:
    # Try to load existing checkpoint
    checkpoint_data = pd.read_csv(checkpoint_file)
    start_index = checkpoint_data.shape[0]
    predictions = checkpoint_data["predictions"].tolist()
except FileNotFoundError:
    # If no checkpoint exists, start from scratch
    start_index = 0
    predictions = []

In [20]:
# Process in batches and save checkpoints
batch_size = 1000
for i in tqdm(range(start_index, len(testing_dataset), batch_size)):
    batch = testing_dataset[i:i + batch_size]
    batch_predictions = predict_batch(batch)
    predictions.extend(batch_predictions)
    
    # Save checkpoint
    pd.DataFrame({"predictions": predictions}).to_csv(checkpoint_file, index=False)

  1%|          | 1/124 [00:00<00:28,  4.32it/s]

Error during prediction: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.


100%|██████████| 124/124 [46:31<00:00, 22.51s/it] 


In [25]:
# Convert predictions to DataFrame
final_predictions = pd.DataFrame({"text": testing_dataset, "predictions": predictions})

In [26]:
final_predictions

Unnamed: 0,text,predictions
0,yes indeed sort reminds elder lady played part...,0
1,trans woman reading tweet right beautiful,0
2,question broad criticize america country flee ...,0
3,time illegals go back country origin keep free...,0
4,starter bend one pink kick as pussy get taste ...,0
...,...,...
135448,breaking news sayyed abdulmalikthe saudi regim...,0
135449,million yemeni participated mass rally square ...,0
135450,abeshinzo realdonaldtrump shinzoabe dictator g...,2
135451,million yemeni participated mass rally square ...,0


In [27]:
# Optionally, save the final predictions
final_predictions.to_csv("bert_final_predictions.csv", index=False)