In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load BERTweet model and tokenizer
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

hate_categories = 9  # Number of classes/categories
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=hate_categories)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import preprocessor
import demoji
import pandas as pd

# Load and preprocess your dataset
def preprocess_text(text):
    # Use tweet-preprocessor to clean tweets
    cleaned_text = preprocessor.clean(text)
    # Remove emojis
    cleaned_text = remove_emojis(cleaned_text)
    return cleaned_text

def remove_emojis(text):
    return demoji.replace(text, '')

# Load your dataset with columns 'tweet' and 'categories'
df = pd.read_csv("/Users/Hsuweic/Desktop/AI4healthcare/dataset/dataset_categories.csv")
hate_speech = df[df['label'] == 1].copy() 
# Reset the index to ensure it is consecutive
hate_speech.reset_index(drop=True, inplace=True)

# Preprocess text
hate_speech['cleaned_tweet'] = hate_speech['tweet'].apply(preprocess_text)

print(hate_speech.shape)
print(hate_speech.columns)
print(len(hate_speech))


(1430, 5)
Index(['Unnamed: 0', 'tweet', 'label', 'categories', 'cleaned_tweet'], dtype='object')
1430


In [5]:
# Tokenize and prepare input data
max_length = 128  # You can adjust this based on your requirements

def tokenize_data(text):
    return tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Tokenize input data
tokenized_data = hate_speech['cleaned_tweet'].apply(tokenize_data)

# Convert categories to numerical labels
label_dict = {category: idx for idx, category in enumerate(hate_speech['categories'].unique())}
hate_speech['label'] = hate_speech['categories'].map(label_dict)

# Print label_dict to see the mapping
print("Label Dictionary:")
print(label_dict)
# print(hate_speech['label'].unique())
print(len(tokenized_data))
print(tokenized_data)

# Prepare input tensors
input_ids = torch.cat([tokenized_data[i]['input_ids'] for i in range(len(tokenized_data))], dim=0)
attention_masks = torch.cat([tokenized_data[i]['attention_mask'] for i in range(len(tokenized_data))], dim=0)
labels = torch.tensor(hate_speech['label'].values)

# Create DataLoader
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

<class 'pandas.core.series.Series'>
[0 1 2 3 4 5 6 7 8]
1430
0       [input_ids, token_type_ids, attention_mask]
1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
                           ...                     
1425    [input_ids, token_type_ids, attention_mask]
1426    [input_ids, token_type_ids, attention_mask]
1427    [input_ids, token_type_ids, attention_mask]
1428    [input_ids, token_type_ids, attention_mask]
1429    [input_ids, token_type_ids, attention_mask]
Name: cleaned_tweet, Length: 1430, dtype: object


In [11]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss

# Set up optimizer and loss function
# optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

# Set up training parameters
num_epochs = 3  # You can adjust this based on your requirements
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model and data to the device
model.to(device)
input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    print("Start Training")
    total_loss = 0
    n = 0
    for batch in dataloader:
        print("=== Start Batch " + str(n) + " ===")
        batch_input_ids, batch_attention_masks, batch_labels = batch
        batch_input_ids, batch_attention_masks, batch_labels = (
            batch_input_ids.to(device),
            batch_attention_masks.to(device),
            batch_labels.to(device),
        )

        # Zero the gradients
        print("Start Doing Gradients")
        optimizer.zero_grad()

        # Forward pass
        print("Start Forward")
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )

        # Calculate loss
        print("Start Calculating Loss")
        loss = outputs.loss

        # Backward pass
        print("Start Backward")
        loss.backward()

        # Update parameters
        print("Start Update Parameters")
        optimizer.step()

        total_loss += loss.item()
        print(total_loss)

        n = n+1

    # Print average loss for the epoch
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

# Save the trained model (This method is specific to the transformers library and is designed for saving transformer-based models.
# It saves the model in a format that includes the architecture, parameters, and additional information specific to the transformers library.
# It provides a higher-level abstraction that is specific to transformer models and allows for easily loading the model using AutoModel.from_pretrained later.)
model.save_pretrained("/Users/Hsuweic/Desktop/AI4health/model/BERTweet classification model")
tokenizer.save_pretrained("/Users/Hsuweic/Desktop/AI4health/model/BERTweet classification model")



Start Training
=== Start Batch 0 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
1.2722333669662476
=== Start Batch 1 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
1.6686564087867737
=== Start Batch 2 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
2.858840048313141
=== Start Batch 3 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
4.184438526630402
=== Start Batch 4 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
5.213654577732086
=== Start Batch 5 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
5.620341658592224
=== Start Batch 6 ===
Start Doing Gradients
Start Forward
Start Calculating Loss
Start Backward
Start Update Parameters
6.368605434894562
=== Start Batch 7

In [None]:
# trained model
# batch size = 8
# total training time = 160 mins
# optimizer = AdamW
# Epoch 1/3, Loss: 0.7618927443660172
# Epoch 2/3, Loss: 0.5140882317788441
# Epoch 3/3, Loss: 0.39429688695089776

### Model Inference
Label Dictionary:
{'Race': 0, 'Sexual Orientation': 1, 'Gender': 2, 'Religion': 3, 'Disability': 4, 'Physical Appearance': 5, 'Class': 6, 'Ethnicity': 7, 'Behavior': 8}

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Example input tweet
input_tweet = "you are a nasty bitch and I hate you"

# Load the BERTweet tokenizer and model
model_path = "/Users/Hsuweic/Desktop/AI4health/model/BERTweet classification model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=9)  # Assuming 9 classes

# Tokenize the input tweet
inputs = tokenizer(input_tweet, return_tensors="pt")

# Forward pass through the model to obtain logits
with torch.no_grad():
    outputs = model(**inputs)

# Get logits from the output
logits = outputs.logits

# Apply softmax to get probabilities
probabilities = torch.nn.functional.softmax(logits, dim=1)

# Choose the predicted class
predicted_class = torch.argmax(probabilities, dim=1).item()

print("Predicted class:", predicted_class)



Predicted class: 2
