In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load BERTweet model and tokenizer
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

num_categories = 9  # Number of classes/categories
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_categories)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import preprocessor
import demoji
import pandas as pd

# Load and preprocess your dataset
def preprocess_text(text):
    # Use tweet-preprocessor to clean tweets
    cleaned_text = preprocessor.clean(text)
    # Remove emojis
    cleaned_text = remove_emojis(cleaned_text)
    return cleaned_text

def remove_emojis(text):
    return demoji.replace(text, '')

# Load your dataset with columns 'tweet' and 'categories'
df = pd.read_csv("/Users/Hsuweic/Desktop/AI4healthcare/dataset_categories.csv")
hate_speech = df[df['label'] == 1].copy() 
# Reset the index to ensure it is consecutive
hate_speech.reset_index(drop=True, inplace=True)

# Preprocess text
hate_speech['cleaned_tweet'] = hate_speech['tweet'].apply(preprocess_text)

print(hate_speech.shape)
print(hate_speech.columns)
print(len(hate_speech))


(1430, 5)
Index(['Unnamed: 0', 'tweet', 'label', 'categories', 'cleaned_tweet'], dtype='object')
1430


In [41]:
# Tokenize and prepare input data
max_length = 128  # You can adjust this based on your requirements

def tokenize_data(text):
    return tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Tokenize input data
tokenized_data = hate_speech['cleaned_tweet'].apply(tokenize_data)

# Convert categories to numerical labels
label_dict = {category: idx for idx, category in enumerate(hate_speech['categories'].unique())}
hate_speech['label'] = hate_speech['categories'].map(label_dict)

print(type(tokenized_data))
print(hate_speech['label'].unique())
print(len(tokenized_data))
print(tokenized_data)

# Prepare input tensors
input_ids = torch.cat([tokenized_data[i]['input_ids'] for i in range(len(tokenized_data))], dim=0)
attention_masks = torch.cat([tokenized_data[i]['attention_mask'] for i in range(len(tokenized_data))], dim=0)
labels = torch.tensor(hate_speech['label'].values)

# Create DataLoader
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

<class 'pandas.core.series.Series'>
[1 2 3 4 5 6 7 8 9]
1430
0       [input_ids, token_type_ids, attention_mask]
1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
                           ...                     
1425    [input_ids, token_type_ids, attention_mask]
1426    [input_ids, token_type_ids, attention_mask]
1427    [input_ids, token_type_ids, attention_mask]
1428    [input_ids, token_type_ids, attention_mask]
1429    [input_ids, token_type_ids, attention_mask]
Name: cleaned_tweet, Length: 1430, dtype: object


In [42]:
# from transformers import AdamW
# from torch.nn import CrossEntropyLoss
# from tqdm import tqdm

# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Set optimizer and loss function
# optimizer = AdamW(model.parameters(), lr=1e-5)
# loss_fn = CrossEntropyLoss()

# # Training loop
# num_epochs = 3  # You can adjust this based on your requirements

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}"):
#         input_ids, attention_mask, label = batch
#         input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)

#         optimizer.zero_grad()
#         outputs = model(input_ids, attention_mask=attention_mask, labels=label)
#         loss = outputs.loss
#         total_loss += loss.item()

#         loss.backward()
#         optimizer.step()

#     average_loss = total_loss / len(dataloader)
#     print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

# # Save the trained model
# model.save_pretrained("bertweet_multi_classification_model")
# tokenizer.save_pretrained("bertweet_multi_classification_model")


from transformers import AdamW
from torch.nn import CrossEntropyLoss

# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

# Set up training parameters
num_epochs = 3  # You can adjust this based on your requirements
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model and data to the device
model.to(device)
input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = batch
        batch_input_ids, batch_attention_masks, batch_labels = (
            batch_input_ids.to(device),
            batch_attention_masks.to(device),
            batch_labels.to(device),
        )

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels,
        )

        # Calculate loss
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        total_loss += loss.item()

    # Print average loss for the epoch
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

# Save the trained model
model.save_pretrained("/Users/Hsuweic/Desktop/AI4healthcare/BERTweet classification model")





IndexError: Target 9 is out of bounds.