In [None]:
import pandas as pd
import numpy as np
import torch
import os
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm


In [None]:
# Load the train/test splits. Both CSVs must provide the 'Review' text column
# and the 'Sentiment' label column that the rest of the notebook reads.
train_df = pd.read_csv('../data/train_data.csv')
test_df = pd.read_csv('../data/test_data.csv')


# String label -> integer class id used as the classification target.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}


def _encode_sentiment(frame, split_name):
    """Return ``frame['Sentiment']`` converted to integer class ids via ``label_map``.

    ``Series.map`` silently replaces every value that is not a key of the
    mapping with NaN, which also turns the column into floats. Checking here
    reports the offending values immediately instead of handing NaN/float
    labels to the training step.

    Args:
        frame: DataFrame holding a 'Sentiment' column of string labels.
        split_name: Human-readable split name used in the error message.

    Returns:
        An int64 Series of class ids aligned with ``frame``'s index.

    Raises:
        ValueError: if any label is not a key of ``label_map``.
    """
    encoded = frame['Sentiment'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = sorted(set(frame.loc[unmapped, 'Sentiment'].astype(str)))
        raise ValueError(
            f"Unrecognised Sentiment value(s) in {split_name} data: {bad_values}; "
            f"expected one of {sorted(label_map)}"
        )
    return encoded.astype('int64')


train_df['Sentiment'] = _encode_sentiment(train_df, 'train')
test_df['Sentiment'] = _encode_sentiment(test_df, 'test')


# Wrap the frames as Hugging Face datasets for tokenization and training.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_function(examples):
    """Tokenize the 'Review' entries of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping handed over by ``Dataset.map``; only its
            'Review' entry is read.

    Returns:
        Whatever the tokenizer returns for those reviews.
    """
    reviews = examples['Review']
    # No max_length is given, so "max_length" padding / truncation fall back to
    # the tokenizer's own limit — presumably the model maximum; confirm against
    # the tokenizer documentation before changing.
    return tokenizer(reviews, truncation=True, padding="max_length")


# Tokenize each split in batches, then rename the target column to "labels".
train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .rename_column("Sentiment", "labels")
)
test_dataset = (
    test_dataset
    .map(tokenize_function, batched=True)
    .rename_column("Sentiment", "labels")
)


# set_format works in place: expose only the three listed columns, as torch tensors.
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# RoBERTa base checkpoint with a 3-way classification head
# (one output per sentiment class: negative / neutral / positive).
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

training_args = TrainingArguments(
    # output_dir is a required argument in many transformers releases; omitting
    # it raises a TypeError there. Pass it explicitly so the cell runs on every
    # version and checkpoints land in a known place.
    output_dir='../model/roberta_checkpoints',
    num_train_epochs=12,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5
)

def compute_metrics(p):
    """Turn a prediction object into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three use weighted averaging.
    """
    true_labels = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)

    # Fourth element (support) is not needed for the weighted averages.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted, average='weighted'
    )
    acc = accuracy_score(true_labels, predicted)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Bundle model, hyper-parameters, metric callback and data into one Trainer.
# The test split doubles as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning with the settings in training_args.
trainer.train()

In [None]:
# Score the fine-tuned model on the test split; metric keys carry an "eval_" prefix.
results = trainer.evaluate(test_dataset)

# Report the four metrics to 4 decimal places, in the same order as before.
for metric_title, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("Recall", "eval_recall"),
    ("Precision", "eval_precision"),
):
    print(f"Testing {metric_title}: {results[metric_key]:.4f}")

Evaluating: 100%|██████████| 178/178 [5:35<00:00,  1.88s/it]
Testing Accuracy: 0.9660
Testing F1 Score: 0.9664
Testing Recall: 0.9660
Testing Precision: 0.9669


In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so later cells
# can reload both from the same folder.
directory = '../model/roberta'
os.makedirs(directory, exist_ok=True)
for artifact in (model, tokenizer):  # model first, then tokenizer
    artifact.save_pretrained(directory)

In [None]:
# Reload the saved artifacts and classify a single hand-written review.
directory = '../model/roberta'
tokenizer = RobertaTokenizer.from_pretrained(directory)
model = RobertaForSequenceClassification.from_pretrained(directory)
model.eval()  # inference only; matches the batch-scoring cell further down

sample_review = "0% quality , bo touch work total pass waste phone"
inputs = tokenizer(sample_review, return_tensors="pt", padding=True, truncation=True)

# No gradients are needed for prediction; skipping autograd bookkeeping keeps
# this cell consistent with the other inference helpers in the notebook.
with torch.no_grad():
    outputs = model(**inputs)
predictions = np.argmax(outputs.logits.numpy(), axis=1)

# Class id -> label (inverse of the mapping used to encode the training data).
label_map = {2: 'positive', 1: 'neutral', 0: 'negative'}
predicted_sentiment = label_map[int(predictions[0])]
print(f"Predicted sentiment: {predicted_sentiment}")


In [None]:
# Hand-picked probe inputs: short clear-cut reviews, single words, pre-processed
# (stemmed-looking) text, and longer sentences with mixed or hedged sentiment.
test_strings = [
    "This product is amazing!",
    "disappoint with this purchase",
    "Value for money",
    "bad",
    "Great value for the price",
    "Product worse",
    "Sucks, I wanna die",
    "I want to get another one its so good",
    "Worse",
    "sometim game answer question correctli alexa say got wrong answer like turn dont light away home",
    "abl",
    "Not bad",
    "Good",
    "Sure, the movie wasn't *awful*, but it was far from a masterpiece.",
    "I can't believe they won the game! They totally choked in the last quarter, though.",
    "Don't get me wrong, the food was good, but the service was painfully slow.",
    "They say they improved the product, but I haven't noticed a difference yet.",
    "Lucky me, I found a parking spot right in front of the store.",
    "It's whatever. I guess the movie was okay.",
    "That was a close one! Glad we pulled through in the end.",
    "Eye roll. This new update is just a bunch of bugs.",
    "Not bad for a first try! I can see potential here.",
    "While the graphics were impressive, the story felt a bit lacking."
]

# Reload tokenizer and model (in that order) from the saved folder.
tokenizer, model = (
    RobertaTokenizer.from_pretrained('../model/roberta'),
    RobertaForSequenceClassification.from_pretrained('../model/roberta'),
)

def predict_sentiment(model, tokenizer, text):
    """Return the arg-max class id the model assigns to ``text``.

    Args:
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``logits`` tensor.
        tokenizer: Callable turning ``text`` into that keyword mapping; it is
            asked for PyTorch tensors with padding and truncation enabled.
        text: The review to classify.

    Returns:
        Index of the largest logit in the first row (a NumPy integer).
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no autograd graph needed
        logits = model(**encoded).logits
    class_ids = np.argmax(logits.detach().numpy(), axis=1)
    return class_ids[0]

# Class id -> label; ids follow the tuple order below (0, 1, 2).
label_map = dict(enumerate(('negative', 'neutral', 'positive')))

# Classify every probe sentence and print it next to its predicted label.
for text in test_strings:
    sentiment_label = predict_sentiment(model, tokenizer, text)
    predicted_sentiment = label_map[sentiment_label]
    print(f"Review: {text}")
    print(f"Predicted sentiment: {predicted_sentiment}\n")


In [None]:
# Reload the fine-tuned artifacts for scoring a whole CSV of product reviews.
model_path = '../model/roberta'
tokenizer = RobertaTokenizer.from_pretrained(model_path)
# eval() returns the module itself, so loading and switching mode can be chained.
model = RobertaForSequenceClassification.from_pretrained(model_path).eval()

# NOTE(review): this id -> label mapping is not used anywhere in the visible
# code below — the written 'Sentiment' column holds integer ids. Confirm
# whether label names were intended.
label_map = {2: 'positive', 1: 'neutral', 0: 'negative'}

# NOTE(review): the other cells read from '../data/...'; confirm this relative
# path is intentional.
df = pd.read_csv('data/products.csv')

def get_sentiment(text):
    """Classify one review with the notebook-level model and tokenizer.

    Args:
        text: Review content; it is passed through ``str()`` first, so
            non-string cell values are classified by their string form.

    Returns:
        Index of the largest logit in the first row (a NumPy integer).
    """
    encoded = tokenizer(
        str(text),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():  # prediction only, no gradients required
        logits = model(**encoded).logits
    class_ids = np.argmax(logits.detach().numpy(), axis=1)
    return class_ids[0]

# Score each review one at a time and store the predicted class id per row.
df['Sentiment'] = df['Review'].map(get_sentiment)

# Write the scored frame to the working directory, without the index column.
df.to_csv('products_sentiment.csv', index=False)
