In [3]:
# Basics
import pandas as pd
import numpy as np
import pickle

# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments

# Evaluation
from sklearn.metrics import classification_report, f1_score


In [4]:
# Load the pickle file
training_data = pd.read_pickle('/content/training_data_multiclass.pkl')

# Extract the 'davidson2017' dataset
df_raw = {}
df_raw['davidson2017'] = training_data['davidson2017'].copy()

# Ensure the label column is of type string
df_raw['davidson2017']['label'] = df_raw['davidson2017']['label'].astype(str)

# Replace string labels with integers
df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)

# Print label counts to verify
print('davidson2017')
print(df_raw['davidson2017'].groupby('label')['text'].count())
print()


davidson2017
label
0     1430
1    19190
2     4163
Name: text, dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)
  df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)


In [5]:
# Split the dataset into training and validation/test sets
df_train, df_valtest = train_test_split(
    df_raw['davidson2017'],
    test_size=0.2,
    stratify=df_raw['davidson2017'].label,
    random_state=123
)

# Further split validation/test into validation and test sets
df_val, df_test = train_test_split(
    df_valtest,
    test_size=0.5,
    stratify=df_valtest.label,
    random_state=123
)

# Split up text and label columns into lists
train_texts = df_train['text'].astype("string").tolist()
val_texts = df_val['text'].astype("string").tolist()
test_texts = df_test['text'].astype("string").tolist()

train_labels = df_train['label'].tolist()
val_labels = df_val['label'].tolist()
test_labels = df_test['label'].tolist()


In [6]:
# Initialize RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Add special tokens for URLs, emojis, and mentions
special_tokens_dict = {'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']}
num_added_toks = roberta_tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_toks} special tokens.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Added 3 special tokens.


In [7]:
# Tokenize the texts
train_encodings_roberta = roberta_tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=512  # Optional: Specify max_length if needed
)
val_encodings_roberta = roberta_tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=512
)
test_encodings_roberta = roberta_tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=512
)


In [8]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert each item to a dictionary of tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)


In [9]:
# Create dataset objects
train_dataset_roberta = HateDataset(train_encodings_roberta, train_labels)
val_dataset_roberta = HateDataset(val_encodings_roberta, val_labels)
test_dataset_roberta = HateDataset(test_encodings_roberta, test_labels)


In [10]:
from transformers import TrainingArguments

training_args_roberta = TrainingArguments(
    output_dir='./Roberta',  # Output directory
    num_train_epochs=3,                # Total number of training epochs
    per_device_train_batch_size=16,    # Batch size per device during training
    per_device_eval_batch_size=64,     # Batch size for evaluation
    evaluation_strategy='epoch',       # Evaluation strategy
    save_strategy='epoch',             # Save strategy aligned with evaluation
    save_total_limit=2,                 # Limit the total number of checkpoints
    warmup_steps=500,                  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # Strength of weight decay
    learning_rate=5e-5,                # Learning rate
    seed=123,                           # Seed for reproducibility
    load_best_model_at_end=True,        # Load the best model at the end of training
    metric_for_best_model='f1',         # Use F1 score to evaluate the best model
    greater_is_better=True,
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=100,                  # Log every 100 steps
    report_to="none"                    # Disable reporting to wandb and others
)




In [11]:
from transformers import RobertaForSequenceClassification

def model_init_roberta_D17():
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=3  # Adjust based on the number of classes
    )
    model.resize_token_embeddings(len(roberta_tokenizer))  # Resize embeddings to accommodate new tokens
    return model


In [12]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(labels, preds, average='weighted')
    return {'f1': f1}

from transformers import Trainer

trainer_roberta = Trainer(
    args=training_args_roberta,
    train_dataset=train_dataset_roberta,
    eval_dataset=val_dataset_roberta,
    tokenizer=roberta_tokenizer,
    model_init=model_init_roberta_D17,
    compute_metrics=compute_metrics
)


  trainer_roberta = Trainer(


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [13]:
# Train the model
print('Training RoBERTa multiclass davidson2017 model')
try:
    trainer_roberta.train()
except Exception as e:
    print(f"Error encountered while training davidson2017 with RoBERTa: {e}")


Training RoBERTa multiclass davidson2017 model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3269,0.297383,0.887466
2,0.264,0.263758,0.907718
3,0.2267,0.249568,0.918699


In [16]:
# Save the best model explicitly
trainer_roberta.save_model('./Models/RoBERTa_davidson2017_multiclass/Final')
roberta_tokenizer.save_pretrained('./Models/RoBERTa_davidson2017_multiclass/Final')

print("RoBERTa model for davidson2017 saved successfully!")


RoBERTa model for davidson2017 saved successfully!


In [17]:
import os

saved_dir = './Models/RoBERTa_davidson2017_multiclass/Final'
if os.path.exists(saved_dir):
    print(f"Model saved in directory: {saved_dir}")
    print("Contents:")
    print(os.listdir(saved_dir))
else:
    print(f"Model directory {saved_dir} does not exist.")


Model saved in directory: ./Models/RoBERTa_davidson2017_multiclass/Final
Contents:
['tokenizer_config.json', 'vocab.json', 'model.safetensors', 'training_args.bin', 'special_tokens_map.json', 'config.json', 'tokenizer.json', 'merges.txt', 'added_tokens.json']


In [18]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
import torch

# Define the directory where the model is saved
model_dir = './Models/RoBERTa_davidson2017_multiclass/Final'

# Load the tokenizer
loaded_tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)

# Load the model
loaded_model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [19]:
from torch.utils.data import DataLoader

# Create a DataLoader for the test set
test_loader = DataLoader(test_dataset_roberta, batch_size=64, shuffle=False)


In [20]:
from sklearn.metrics import classification_report, f1_score

def evaluate_model(model, dataloader, device):
    model.eval()
    preds = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            # Move inputs to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Get predictions
            predictions = torch.argmax(logits, dim=-1)

            # Collect results
            preds.extend(predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return true_labels, preds

# Perform evaluation
true_labels, pred_labels = evaluate_model(loaded_model, test_loader, device)

# Print classification report
print('DAVIDSON2017 MULTICLASS')
print(classification_report(true_labels, pred_labels))


DAVIDSON2017 MULTICLASS
              precision    recall  f1-score   support

           0       0.43      0.46      0.44       143
           1       0.95      0.94      0.95      1919
           2       0.90      0.89      0.89       417

    accuracy                           0.91      2479
   macro avg       0.76      0.76      0.76      2479
weighted avg       0.91      0.91      0.91      2479



In [21]:
# Calculate F1 scores
print('F1 Scores for davidson2017:')
for average in ['micro', 'macro', 'weighted']:
    score = f1_score(true_labels, pred_labels, average=average)
    print(f'{average.capitalize()} F1 score: {score:.2%}')


F1 Scores for davidson2017:
Micro F1 score: 90.68%
Macro F1 score: 76.09%
Weighted F1 score: 90.79%


In [22]:
import shutil
from google.colab import files

# Compress the model directory
shutil.make_archive('RoBERTa_davidson2017_multiclass_Final', 'zip', './Models/RoBERTa_davidson2017_multiclass/Final')

# Download the zipped model
files.download('RoBERTa_davidson2017_multiclass_Final.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# ============================================
# 1. Import Necessary Libraries
# ============================================
import os
import re  # Regular expressions for text preprocessing
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ============================================
# 2. Define the Custom Attention Layer
# ============================================
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for attention
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='random_normal',
                                 trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # Compute attention scores
        e = K.tanh(K.dot(inputs, self.W))
        a = K.softmax(e, axis=1)
        output = inputs * a
        return K.sum(output, axis=1)

# ============================================
# 3. Load and Preprocess the Dataset
# ============================================
# Ensure NLTK data is downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'[@#]\w+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words and lemmatize
    text = ' '.join([
        lemmatizer.lemmatize(word) for word in text.split()
        if word not in stop_words
    ])
    return text

# Path to your dataset
data_path = '/content/ABC.csv'

# Load the dataset
df = pd.read_csv(data_path)

# Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Check for missing values
print("\nChecking for missing values:")
print(df.isnull().sum())

# Handle missing values (if any)
# For simplicity, we'll drop rows with missing 'tweet' values
df = df.dropna(subset=['tweet']).reset_index(drop=True)

# Analyze Class Distribution
print("\nClass Distribution:")
print(df['class'].value_counts())

# Plot class distribution
plt.figure(figsize=(6,4))
sns.countplot(x='class', data=df)
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# Apply preprocessing to the 'tweet' column
df['tweet'] = df['tweet'].astype(str).apply(preprocess_text)

# ============================================
# 4. Encode Labels
# ============================================
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['class'])

# Display encoded labels mapping
print("\nEncoded Labels Mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"{i}: {class_name}")

# ============================================
# 5. Split the Data into Training and Testing Sets
# ============================================
from sklearn.model_selection import train_test_split

X = df['tweet'].values
y = encoded_labels

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining samples: {len(X_train_texts)}")
print(f"Testing samples: {len(X_test_texts)}")

# ============================================
# 6. Tokenization and Padding for Keras Model
# ============================================
# Path to the tokenizer
tokenizer_path = '/content/Enhanced_CNN_LSTM_Model_tokenizer.json'

# Load tokenizer if exists, else fit a new one
if os.path.exists(tokenizer_path):
    with open(tokenizer_path) as f:
        data = json.load(f)
        tokenizer = Tokenizer.from_json(data)
    print(f"Tokenizer loaded from {tokenizer_path}")
else:
    # Fit a new tokenizer on training texts
    tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_texts)
    # Save the tokenizer for future use
    tokenizer_json = tokenizer.to_json()
    with open(tokenizer_path, 'w') as f:
        json.dump(tokenizer_json, f)
    print(f"Tokenizer fitted on training data and saved to {tokenizer_path}")

# Convert texts to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_texts)
X_test_sequences = tokenizer.texts_to_sequences(X_test_texts)

# Define maximum sequence length (should match training)
max_sequence_length = 100  # Adjust based on your training

# Pad the sequences
X_train_padded = pad_sequences(
    X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='post'
)
X_test_padded = pad_sequences(
    X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='post'
)

print(f"\nExample of tokenized and padded sequence:")
print(X_test_padded[0])

# ============================================
# 7. Handle Class Imbalance
# ============================================
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}
print("\nClass Weights:")
print(class_weights_dict)

# ============================================
# 8. Load the Enhanced Hybrid CNN + LSTM Model
# ============================================
# Path to the Enhanced Hybrid CNN + LSTM Model
enhanced_model_path = '/content/Enhanced_CNN_LSTM_Model.h5'

# Load the model with the custom AttentionLayer
enhanced_model = load_model(enhanced_model_path, custom_objects={'AttentionLayer': AttentionLayer})
print("Enhanced Hybrid CNN + LSTM Model loaded successfully.")

# ============================================
# 9. Load RoBERTa and BERT Models Using Hugging Face Transformers
# ============================================
# Create directories for RoBERTa and BERT models
os.makedirs('/content/RoBERTa_model', exist_ok=True)
os.makedirs('/content/BERT_model', exist_ok=True)

# List all files in /content
model_files = os.listdir('/content')

# Function to move RoBERTa files
def move_roberta_files():
    roberta_files = [f for f in model_files if f.endswith('.json') or f.endswith('.txt') or f.endswith('.safetensors') or f.endswith('.bin')]
    # Specifically, RoBERTa uses 'vocab.json' and 'merges.txt'
    roberta_specific = ['vocab.json', 'merges.txt']
    other_roberta = [f for f in roberta_files if f not in ['vocab.txt']]  # Exclude BERT's 'vocab.txt'
    for file in roberta_specific + other_roberta:
        if file in model_files:
            os.rename(f'/content/{file}', f'/content/RoBERTa_model/{file}')
            print(f"Moved {file} to RoBERTa_model/")

# Function to move BERT files
def move_bert_files():
    bert_files = [f for f in model_files if f.endswith('.json') or f.endswith('.txt') or f.endswith('.safetensors') or f.endswith('.bin')]
    # Specifically, BERT uses 'vocab.txt'
    bert_specific = ['vocab.txt']
    other_bert = [f for f in bert_files if f not in ['vocab.json', 'merges.txt']]  # Exclude RoBERTa's
    for file in bert_specific + other_bert:
        if file in model_files:
            os.rename(f'/content/{file}', f'/content/BERT_model/{file}')
            print(f"Moved {file} to BERT_model/")

# Move RoBERTa and BERT files
move_roberta_files()
move_bert_files()

# Check contents of RoBERTa_model and BERT_model directories
print("\nContents of RoBERTa_model directory:")
print(os.listdir('/content/RoBERTa_model'))

print("\nContents of BERT_model directory:")
print(os.listdir('/content/BERT_model'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


FileNotFoundError: [Errno 2] No such file or directory: '/content/ABC.csv'

In [None]:
from google.colab import drive
drive.mount('/content/drive')