# Roberta Model

In [None]:
%pip install transformers datasets pandas scikit-learn
%pip install transformers[torch]

In [None]:
import pandas as pd
import numpy as np
import torch
import csv
import os
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### Ready the Dataset for training

In [None]:
# Load the dataset
df = pd.read_csv('../review_data/dataset_7(senti).csv')

# Rows without a review or a label cannot be used for training: a missing
# review is read as NaN rather than a string, and a missing label would
# survive the mapping below as NaN and turn the label column into floats.
df = df.dropna(subset=['Sentiment', 'Review'])

# Convert sentiment labels to numerical values
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
mapped_labels = df['Sentiment'].map(label_map)

# Series.map yields NaN for every value that is not a key of label_map.
# Fail loudly instead of training on NaN labels.
unknown_mask = mapped_labels.isna()
if unknown_mask.any():
    unknown_values = sorted(df.loc[unknown_mask, 'Sentiment'].astype(str).unique())
    raise ValueError(f"Unknown sentiment label(s): {unknown_values}")

# assign() builds a new frame, so the integer labels never touch a view of
# the frame returned by dropna().
df = df.assign(Sentiment=mapped_labels.astype('int64'))

# Split the data into train and test sets (stratified so both splits keep
# the class proportions; fixed seed for reproducibility)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Sentiment'])

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the 'Review' entries of a batch with the module-level tokenizer.

    Args:
        examples: mapping with a 'Review' key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts when called with the
        "max_length" padding strategy and truncation enabled.
    """
    reviews = examples['Review']
    return tokenizer(reviews, truncation=True, padding="max_length")

# Number of examples handed to tokenize_function per map() call
batch_size = 1000


def _to_torch_dataset(split):
    """Tokenize one split, expose its labels as 'labels' and return torch tensors."""
    # Tokenize in batches
    split = split.map(tokenize_function, batched=True, batch_size=batch_size)
    # Rename 'Sentiment' column to 'labels' to match the expected input key
    split = split.rename_column("Sentiment", "labels")
    # Set the format for PyTorch (set_format works in place)
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split


train_dataset = _to_torch_dataset(train_dataset)
test_dataset = _to_torch_dataset(test_dataset)

### Train the model

In [None]:
# Load the model
# Starts from the 'roberta-base' checkpoint with a 3-output classification
# head, one output per sentiment class in label_map (negative/neutral/positive).
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Define the training arguments
# Active configuration: a short 3-epoch run with small per-device batches.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): the commented-out configuration in this cell spells this
    # keyword `evaluation_strategy`; which spelling is accepted presumably
    # depends on the installed transformers version — confirm.
    eval_strategy="epoch",
)

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=20,
#     per_device_train_batch_size=64,
#     per_device_eval_batch_size=64,
#     warmup_steps=2000,
#     weight_decay=0.1,
#     logging_dir='./logs',
#     logging_steps=100,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=1e-4,
#     fp16=True,  
#     gradient_accumulation_steps=4,
#     lr_scheduler_type="constant_with_warmup",  
#     save_total_limit=3, 
#     gradient_checkpointing=True,  
#     report_to="none",
# )

# Initialize the Trainer
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (true classes).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1';
        precision, recall and F1 use the 'weighted' average.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element returned (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

# Wire the model, training arguments, datasets and metric function together.
# NOTE(review): the held-out test split is also the eval_dataset, so the
# metrics logged during training and the final "test" metrics come from the
# same rows; there is no separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # NOTE(review): newer transformers releases reportedly prefer
    # `processing_class` over `tokenizer` here — confirm against the
    # installed version before changing.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

### Trained Results

In [None]:
# Run one evaluation pass over the eval dataset the trainer was built with
eval_result = trainer.evaluate()

# Headline number first, then the complete metrics dictionary
test_accuracy = eval_result['eval_accuracy']
print(f"Test Accuracy: {test_accuracy}")
print(eval_result)

### Save the trained model

In [None]:
# Target directory for the fine-tuned model and its tokenizer
directory = '../ml_trained_model/roberta_senti'

# Record whether the directory is new so the message below is only printed
# when it is actually created. exist_ok=True makes the creation itself safe
# even if the directory appears between the check and the call.
directory_existed = os.path.exists(directory)
os.makedirs(directory, exist_ok=True)
if not directory_existed:
    print(f"Directory '{directory}' created.")

# Save the model and tokenizer side by side so both can be reloaded from the
# same path with from_pretrained()
model.save_pretrained(directory)
tokenizer.save_pretrained(directory)

### Test accuracy

In [None]:
# Load the dataset (malformed lines are skipped instead of aborting the read)
try:
    df = pd.read_csv('../review_data/dataset_7(senti).csv', on_bad_lines='skip', quoting=csv.QUOTE_ALL)
except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")
    # Re-raise so the cell (and a "Run All") stops here with a traceback.
    # Calling exit() inside a notebook goes to IPython's exit hook rather
    # than failing the cell, and would leave `df` undefined or stale.
    raise

# Drop rows with missing values in 'Sentiment' and 'Review'
df = df.dropna(subset=['Sentiment', 'Review'])

# Convert sentiment labels to numerical values
def map_sentiment_to_labels(sentiment):
    """Translate a sentiment name into its integer class id.

    Args:
        sentiment: one of 'negative', 'neutral' or 'positive'.

    Returns:
        0 for 'negative', 1 for 'neutral', 2 for 'positive'.

    Raises:
        ValueError: if ``sentiment`` is anything else.
    """
    class_ids = {'negative': 0, 'neutral': 1, 'positive': 2}
    try:
        return class_ids[sentiment]
    except (KeyError, TypeError):
        # KeyError: unknown value; TypeError: unhashable value (e.g. a list).
        # Both are reported the same way as any other unrecognised label.
        raise ValueError(f"Unknown sentiment label: {sentiment}") from None

df['labels'] = df['Sentiment'].apply(map_sentiment_to_labels)

# Randomly select up to 500 reviews for testing. DataFrame.sample raises when
# asked for more rows than exist, so cap the request at the number of rows
# that survived the dropna above.
# NOTE(review): the sample is drawn from the same CSV the model was trained
# on (80% of it went into the training split), so many sampled rows were
# probably seen during training and this accuracy is likely optimistic.
sample_size = min(500, len(df))
test_df = df.sample(n=sample_size, random_state=42)

# Convert to Hugging Face dataset
test_dataset = Dataset.from_pandas(test_df)

# Load the tokenizer that was saved next to the fine-tuned model
tokenizer = RobertaTokenizer.from_pretrained('../ml_trained_model/roberta_senti')

# Tokenize the data
def tokenize_function(examples):
    """Encode the 'Review' texts of ``examples`` with the module-level tokenizer.

    The tokenizer is called with the "max_length" padding strategy and with
    truncation switched on; its result is returned unchanged.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples['Review'], **encoding_options)

# Load the model (the fine-tuned classifier saved earlier in this notebook)
model = RobertaForSequenceClassification.from_pretrained('../ml_trained_model/roberta_senti')

# Tokenize in batches
batch_size = 1000
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=batch_size)

# Set the format for PyTorch: only these columns are returned, as tensors
tensor_columns = ['input_ids', 'attention_mask', 'labels']
test_dataset.set_format(type='torch', columns=tensor_columns)

# Evaluate the model
def evaluate(model, dataset):
    """Return the accuracy of ``model`` on ``dataset``, one example at a time.

    Args:
        model: module that is switched to eval mode and called as
            ``model(input_ids=..., attention_mask=...)``; the result must
            expose ``logits`` with one row per input example.
        dataset: supports ``len()`` and integer indexing; each row must
            provide 'input_ids', 'attention_mask' and 'labels' as tensors.

    Returns:
        The value of ``accuracy_score`` over all examples.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    for i in range(len(dataset)):
        # Read the row once. Indexing the column first (dataset[key][i])
        # re-reads the entire column on every iteration.
        row = dataset[i]
        inputs = {key: row[key].unsqueeze(0) for key in ['input_ids', 'attention_mask']}
        with torch.no_grad():
            outputs = model(**inputs)
        # argmax in torch avoids a numpy round trip
        all_predictions.append(int(outputs.logits.argmax(dim=1).item()))
        all_labels.append(int(row['labels']))

    return accuracy_score(all_labels, all_predictions)

# Calculate accuracy
accuracy = evaluate(model, test_dataset)
print(f"Testing Accuracy: {accuracy:.4f}")


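#### Testing Accuracy: 0.9660 (measured on 500 reviews sampled from the full labelled dataset, which includes the rows used for training)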

### Single line sentence Testing

In [None]:
# Load the model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('../ml_trained_model/roberta_senti')
model = RobertaForSequenceClassification.from_pretrained('../ml_trained_model/roberta_senti')

sample_review = "0% quality , bo touch work total pass waste phone"

# Tokenize the sample review
inputs = tokenizer(sample_review, return_tensors="pt", padding=True, truncation=True)

# Perform prediction. Gradients are not needed for inference, so skip building
# the autograd graph (same as the other prediction code in this notebook).
with torch.no_grad():
    outputs = model(**inputs)
predictions = np.argmax(outputs.logits.detach().numpy(), axis=1)

# Map numerical predictions back to sentiment labels
label_map = {2: 'positive', 1: 'neutral', 0: 'negative'}
predicted_sentiment = label_map[predictions[0]]
print(f"Predicted sentiment: {predicted_sentiment}")


### Testing an array of sentences

In [None]:
# Define the test strings
# Hand-written spot-check inputs: short one- or two-word verdicts, full
# sentences with mixed or hedged sentiment, and a few entries ("abl",
# "sometim game answer ...") that look like stemmed/pre-processed review
# text — presumably mirroring the training data format; confirm.
test_strings = [
    "This product is amazing!",
    "disappoint with this purchase",
    "Value for money",
    "bad",
    "Great value for the price",
    "Product worse",
    "Sucks, I wanna die",
    "I want to get another one its so good",
    "Worse",
    "sometim game answer question correctli alexa say got wrong answer like turn dont light away home",
    "abl",
    "Not bad",
    "Good",
    "Sure, the movie wasn't *awful*, but it was far from a masterpiece.",
    "I can't believe they won the game! They totally choked in the last quarter, though.",
    "Don't get me wrong, the food was good, but the service was painfully slow.",
    "They say they improved the product, but I haven't noticed a difference yet.",
    "Lucky me, I found a parking spot right in front of the store.",
    "It's whatever. I guess the movie was okay.",
    "That was a close one! Glad we pulled through in the end.",
    "Eye roll. This new update is just a bunch of bugs.",
    "Not bad for a first try! I can see potential here.",
    "While the graphics were impressive, the story felt a bit lacking."
]

# Load the tokenizer and model
# Both come from the directory the fine-tuned model was saved to earlier.
tokenizer = RobertaTokenizer.from_pretrained('../ml_trained_model/roberta_senti')
model = RobertaForSequenceClassification.from_pretrained('../ml_trained_model/roberta_senti')

# Function to predict sentiment
def predict_sentiment(model, tokenizer, text):
    """Classify ``text`` and return the index of the highest-scoring class.

    Args:
        model: callable taking the tokenizer output as keyword arguments and
            returning an object with a ``logits`` tensor.
        tokenizer: callable producing the model inputs for ``text``.
        text: the review to classify.

    Returns:
        The argmax over the first row of the logits, as a numpy integer.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradient bookkeeping needed
    with torch.no_grad():
        logits = model(**encoded).logits
    class_ids = np.argmax(logits.detach().numpy(), axis=1)
    return class_ids[0]

# Class index -> sentiment name
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Classify every test string first, then print each review with its verdict
predicted_labels = [predict_sentiment(model, tokenizer, review) for review in test_strings]

for text, sentiment_label in zip(test_strings, predicted_labels):
    predicted_sentiment = label_map[sentiment_label]
    print(f"Review: {text}")
    print(f"Predicted sentiment: {predicted_sentiment}\n")


### CSV file analysis

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import numpy as np
import pandas as pd

# Restore the fine-tuned tokenizer and classifier from the shared save directory
model_dir = '../ml_trained_model/roberta_senti'
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Function to predict sentiment
def predict_sentiment(model, tokenizer, text):
    """Return the predicted class index for a single piece of text.

    The text is tokenized into PyTorch tensors (with padding and truncation
    enabled) and passed through ``model`` without gradient tracking. The
    result is the position of the largest logit in the first output row:
    a single numpy integer, not an array.
    """
    encoding_options = dict(return_tensors="pt", padding=True, truncation=True)
    batch = tokenizer(text, **encoding_options)

    with torch.no_grad():
        result = model(**batch)

    scores = result.logits.detach().numpy()
    return scores.argmax(axis=1)[0]

# Map numerical predictions back to sentiment labels
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Read the CSV file
df = pd.read_csv('../review_data/dataset_7.csv')

# Initialize counters
total_reviews = len(df)
processed_reviews = 0
predicted_sentiments = []

# Predict sentiment for each review and store the results. Iterating the
# column directly avoids building a Series per row as iterrows() does.
for review in df['Review']:
    if pd.isna(review):
        # An empty CSV field is read as NaN, which the tokenizer cannot
        # encode. Record "no prediction" rather than crash.
        predicted_sentiments.append(None)
    else:
        # str() guards against reviews that pandas parsed as numbers
        sentiment_label = predict_sentiment(model, tokenizer, str(review))
        predicted_sentiments.append(label_map[sentiment_label])

    processed_reviews += 1
    if processed_reviews % 1000 == 0:
        print(f"Progress: {processed_reviews}/{total_reviews} reviews processed")

# Add predicted sentiments to dataframe
df['Sentiment'] = predicted_sentiments

# Write the results to both output locations, creating the target
# directories first so a missing folder does not lose the whole run
output_paths = [
    '../review_data/dataset_7(senti)_roberta.csv',
    '../analyzed_data/roberta_analyzed_data.csv',
]
for output_path in output_paths:
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)

# Report the files that were actually written
print(f"Sentiment prediction completed and saved to: {', '.join(output_paths)}")
