In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import re

  from pandas.core import (


In [2]:
# Load the dataset
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
def clean_tweet(text):
    text = text.lower()                                      
    text = re.sub(r'http\S+|www.\S+', '', text)            
    text = re.sub(r'@\w+', '', text)                        
    text = re.sub(r'#', '', text)                            
    text = re.sub(r'&[a-z]+;', '', text)                     
    text = re.sub(r'\s+', ' ', text).strip()                
    text = re.sub(r'[^\w\s]', '', text)                    
    return text

In [4]:
# Prepare text and labels
train_texts = train_df["text"].astype(str).apply(clean_tweet).tolist()
test_texts = test_df["text"].astype(str).apply(clean_tweet).tolist()
train_labels = train_df["target"].tolist()

# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

In [5]:
# Load tokenizer and model
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize datasets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_df["text"].astype(str).tolist(), truncation=True, padding=True)


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# Create Hugging Face Datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})


In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate= 5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [8]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3437,0.40457
2,0.3599,0.401312
3,0.3584,0.408227




TrainOutput(global_step=1143, training_loss=0.40889347882408483, metrics={'train_runtime': 513.1214, 'train_samples_per_second': 35.606, 'train_steps_per_second': 2.228, 'total_flos': 422493660477000.0, 'train_loss': 0.40889347882408483, 'epoch': 3.0})

In [9]:
# Evaluate the model
preds = trainer.predict(val_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

val_accuracy = accuracy_score(val_labels, pred_labels)
val_f1 = f1_score(val_labels, pred_labels)

print("Validation Accuracy:", val_accuracy)
print("Validation F1 Score:", val_f1)
print("Classification Report:\n", classification_report(val_labels, pred_labels))

# Predict on test set
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask']
})
test_preds = trainer.predict(test_dataset)
test_pred_labels = np.argmax(test_preds.predictions, axis=1)



Validation Accuracy: 0.8443860801050558
Validation F1 Score: 0.8087167070217918
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       874
           1       0.85      0.77      0.81       649

    accuracy                           0.84      1523
   macro avg       0.85      0.84      0.84      1523
weighted avg       0.84      0.84      0.84      1523



