<a href="https://colab.research.google.com/github/waqarmm/AI-Labs/blob/master/Roberta_sentimentanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load the dataset from a CSV file located in the working directory
your_dataset = pd.read_csv('new_data_with_emotions.csv')

# Drop rows that lack either the label or the text to classify: the label
# mapping below needs a known label and the tokenizer rejects NaN inputs.
your_dataset = your_dataset.dropna(subset=['emotion', 'combined_translated'])

# Extract text data, sentiment labels, and cities
text_data = your_dataset['combined_translated'].tolist()
sentiment_labels = your_dataset['emotion'].tolist()
cities = your_dataset['city'].tolist()

# Map sentiment labels to numerical class ids
label_dict = {'Love': 0, 'Happy': 1, 'Neutral': 2, 'Sad': 3, 'Angry': 4}

# Fail early, with a readable message, if the data contains a label that the
# mapping does not cover (otherwise this surfaces later as a bare KeyError).
unknown_labels = sorted({repr(label) for label in sentiment_labels if label not in label_dict})
if unknown_labels:
    raise ValueError(
        "Unexpected values in the 'emotion' column: " + ", ".join(unknown_labels)
    )

# Split the dataset into training (80%) and validation (20%) sets;
# the fixed random_state keeps the split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    text_data,
    sentiment_labels,
    test_size=0.2,
    random_state=42
)

# Convert sentiment labels to numerical values
train_labels = [label_dict[label] for label in train_labels]
val_labels = [label_dict[label] for label in val_labels]

# Initialize RoBERTa tokenizer and encode text data.
# NOTE: roberta-base ships with its own '<pad>' token, so the pad token is
# intentionally left untouched. Re-pointing it at the EOS token would make the
# padding ids disagree with the pad_token_id the model is configured with.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# truncation=True clips over-long texts to the tokenizer's maximum length;
# padding=True pads every example in a split to that split's longest sequence.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
# Define datasets using PyTorch Dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence of values, indexable by example position.
        labels: Sequence of numeric class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors with an added ``labels`` entry."""
        # as_tensor converts plain lists and passes existing tensors through,
        # so every field of the item has the same (tensor) type.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, defined by the number of labels."""
        return len(self.labels)

# Wrap the tokenized splits so the Trainer can index them example by example
train_dataset, val_dataset = (
    SentimentDataset(train_encodings, train_labels),
    SentimentDataset(val_encodings, val_labels),
)

# Pretrained RoBERTa encoder with a classification head that has one output
# per entry in label_dict
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label_dict),
)

# Training configuration
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results_roberta',
    logging_dir='./logs_roberta',
    logging_steps=10,
    # schedule: three passes over the data, evaluating after each one
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # one example per step, for both training and evaluation
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # optimizer settings
    warmup_steps=500,
    weight_decay=0.01,
)

# Bundle the model, configuration and both splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.108,1.444778


KeyboardInterrupt: 

In [None]:
# Classify a single query with the fine-tuned model.
# Relies on `tokenizer`, `model` and `label_dict` defined by the training cell.

# Prepare a sample query
query = "I am feeling happy and excited today. I would like to visit"

# Tokenize the query; truncation keeps over-long inputs within the model's limit
tokenized_input = tokenizer(query, return_tensors="pt", truncation=True)

# Move the inputs to whichever device the model's weights live on, rather than
# assuming a GPU is present.
device = next(model.parameters()).device
tokenized_input = {key: value.to(device) for key, value in tokenized_input.items()}

# Switch to evaluation mode so dropout is disabled and predictions are deterministic
model.eval()

# Make predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**tokenized_input)
    logits = outputs.logits

# Get the predicted class (sentiment label): index of the largest logit
predicted_class = torch.argmax(logits, dim=1).item()

# Reverse the label dictionary to get the sentiment label from the predicted class
reverse_label_dict = {v: k for k, v in label_dict.items()}
predicted_sentiment = reverse_label_dict[predicted_class]

# Print the predicted sentiment
print(f"The predicted sentiment based on the query is: {predicted_sentiment}")


In [2]:
pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
