## Preprocess Data

In [2]:
import pandas as pd

In [6]:
# load dataset
DATA_FILE = 'data/labeled_data.csv'
df = pd.read_csv(DATA_FILE)
df = df.drop(columns=['Unnamed: 0'])
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [10]:
df['tweet'][0]

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

### Count the number of each classification
0 - hate speech \
1 - offensive language \
2 - neither

In [None]:
df['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [27]:
df['label'] = df['class'].apply(lambda x: 0 if x == 2 else 1)
df['label'].value_counts()

label
1    20620
0     4163
Name: count, dtype: int64

## Train Model

In [7]:
!pip install transformers
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m0:01[0m00:01[0m
[?25hCollecting huggingface-hub<1.0,>=0.26.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock
  Downloading filelock-3.17.0-py3-none-any.whl (16 kB)
Collecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regex

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification, 
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)

# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Check for MPS (Apple Silicon) device
use_mps = torch.backends.mps.is_available()
device = torch.device('mps' if use_mps else 'cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

# Model configuration with smaller memory footprint
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 2  # Binary classification
OUTPUT_DIR = "./results"
MODEL_DIR = "./saved_model"
MAX_LENGTH = 128  # Limit sequence length to save memory

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=NUM_LABELS,
    id2label={0: "negative", 1: "positive"}
)

# Split data with stratification to maintain class distribution
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["tweet"], 
    df["label"], 
    test_size=0.2, 
    stratify=df["label"],
    random_state=RANDOM_SEED
)

# Convert to numpy arrays efficiently
train_texts = train_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True).to_numpy()
val_texts = val_texts.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True).to_numpy()

# Memory-efficient dataset implementation
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, idx):
        # Tokenize on-the-fly instead of storing all tokenized data in memory
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        # Remove batch dimension added by tokenizer when return_tensors="pt"
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        
        encoding["labels"] = torch.tensor(self.labels[idx])
        return encoding
    
    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = TweetDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
val_dataset = TweetDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, 
        predictions, 
        average="binary", 
        zero_division=0
    )
    
    accuracy = accuracy_score(labels, predictions)
    
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "true_positives": tp,
        "false_negatives": fn
    }

# Training hyperparameters
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,  # test again if not overtrained
    per_device_train_batch_size=8,  # keep small for cpu 
    per_device_eval_batch_size=16,  # keep small for cpu
    gradient_accumulation_steps=4,  # accumulate gradients to simulate larger batch
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epochs",
    save_total_limit=2,  # keep fewer checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    dataloader_num_workers=0,  # disable multiprocessing to reduce memory usage
    seed=RANDOM_SEED,
    optim="adamw_torch"  # Use PyTorch's native optimizer which can be more memory efficient
)

# Initialize trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

# Save the model and tokenizer
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Interface to predict sentiment using the model
def predict_sentiment(text):
    # Process one example at a time
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=MAX_LENGTH,
        padding="max_length"
    )
    
    # Move inputs to the right device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Use inference mode to save memory
    with torch.inference_mode():
        outputs = model(**inputs)
    
    logits = outputs.logits.cpu()  # Move back to CPU for post-processing
    prediction = torch.argmax(logits, dim=-1).item()
    
    return {
        "text": text,
        "predicted_class": prediction,
        "class_name": model.config.id2label[prediction],
        "confidence": torch.softmax(logits, dim=-1)[0][prediction].item()
    }

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,True Positives,False Negatives
200,0.1565,0.126252,0.94533,0.96644,0.987598,0.946169,3902,222
400,0.1236,0.118114,0.961872,0.976909,0.984487,0.969447,3998,126
600,0.1088,0.105777,0.955417,0.972853,0.98581,0.960233,3960,164
800,0.0793,0.11317,0.963688,0.978271,0.974038,0.982541,4052,72
1000,0.0765,0.104679,0.964495,0.978479,0.986926,0.970175,4001,123
1200,0.0761,0.102673,0.965907,0.979468,0.981495,0.977449,4031,93
1400,0.0389,0.123743,0.966109,0.979631,0.979631,0.979631,4040,84
1600,0.0409,0.129087,0.967319,0.980359,0.980359,0.980359,4043,81
1800,0.0395,0.126747,0.967319,0.980349,0.980825,0.979874,4041,83


Final evaluation results: {'eval_loss': 0.12908653914928436, 'eval_accuracy': 0.9673189429090175, 'eval_f1': 0.9803588748787585, 'eval_precision': 0.9803588748787585, 'eval_recall': 0.9803588748787585, 'eval_true_positives': 4043, 'eval_false_negatives': 81, 'eval_runtime': 49.3998, 'eval_samples_per_second': 100.344, 'eval_steps_per_second': 6.275, 'epoch': 2.9987898346107302}


## Test for Bias

In [2]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification, 
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification
)
import os

# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Check for MPS (Apple Silicon) device
use_mps = torch.backends.mps.is_available()
try:
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS device")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA device")
    else:
        device = torch.device("cpu")
        print("Using CPU device")
except:
    device = torch.device("cpu")
    print("Fallback to CPU device due to error")

print(f"Using device: {device}")

# Model configuration with smaller memory footprint
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 2  # Binary classification
OUTPUT_DIR = "./results"
MODEL_DIR = "./saved_model"
MAX_LENGTH = 128  # Limit sequence length to save memory


MODEL_DIR = "./saved_model"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Using MPS device
Using device: mps


In [3]:
def predict_sentiment(text):
    # Process one example at a time
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=MAX_LENGTH,
        padding="max_length"
    )
    
    # Move inputs to the right device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Use inference mode to save memory
    with torch.inference_mode():
        outputs = model(**inputs)
    
    logits = outputs.logits.cpu()  # Move back to CPU for post-processing
    prediction = torch.argmax(logits, dim=-1).item()
    
    return {
        "text": text,
        "predicted_class": prediction,
        "class_name": model.config.id2label[prediction],
        "confidence": torch.softmax(logits, dim=-1)[0][prediction].item()
    }
model = model.to(device)

In [10]:
test_input = "I like candy and toys"
predict_sentiment(test_input)

{'text': 'I like candy and toys',
 'predicted_class': 0,
 'class_name': 'negative',
 'confidence': 0.9962435960769653}