In [1]:
#import libraries
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import subprocess
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm
import os
import torch

# Hugging Face checkpoint id, and the local file used to cache the pickled model
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
MODEL_PATH = "saved_roberta_model.pth"

# Reuse the locally cached model when it exists; otherwise fetch it and cache it.
if os.path.exists(MODEL_PATH):
    print("Loading saved RoBERTa model...")
    # MODEL_PATH holds a whole pickled model object (written by torch.save(model, ...)
    # below), not a state_dict. Recent PyTorch releases default torch.load to
    # weights_only=True, which refuses to unpickle such a file, so opt out
    # explicitly. This is only acceptable because the file is produced by this
    # script itself -- never point MODEL_PATH at a file from an untrusted source.
    # NOTE(review): the weights_only keyword needs a reasonably recent torch
    # (1.13 or newer, as far as I know) -- confirm against the pinned version.
    model = torch.load(MODEL_PATH, weights_only=False)
else:
    print("Downloading and saving RoBERTa model...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    torch.save(model, MODEL_PATH)  # Save the model locally for future use

# The tokenizer is not part of the MODEL_PATH cache, so it is obtained the same
# way in both cases; load it once here instead of once per branch.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def polarity_scores_roberta(ex, max_length=512):
    """Score one text with the module-level RoBERTa sentiment model.

    Args:
        ex: Text to score; passed straight to the module-level ``tokenizer``.
        max_length: Upper bound on the number of tokens fed to the model.
            Longer inputs are truncated instead of being sent to the model at
            full length. NOTE(review): 512 is assumed to match the position
            limit of this checkpoint -- confirm against ``model.config``.

    Returns:
        A dict with the softmax-normalised scores under ``roberta_neg``,
        ``roberta_neu`` and ``roberta_pos`` (model outputs 0, 1 and 2
        respectively), plus ``roberta_polarity``: 1 when the positive score is
        strictly greater than the negative score, otherwise 0.
    """
    # Truncate explicitly: without it an over-long text reaches the model at
    # full length and fails, and the row is lost from the results.
    encoded_text = tokenizer(
        ex,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)

    # First element of the model output, first (only) item of the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    scores_dict = {'roberta_neg': scores[0],
                   'roberta_neu': scores[1],
                   'roberta_pos': scores[2],
                   'roberta_polarity': 1 if scores[2] > scores[0] else 0
                  }
    return scores_dict

# Read the input CSV; the scoring loop below uses its 'Content' and 'Id' columns.
df = pd.read_csv('Breakfast Cooking - Kids Game - Copy - Copy.csv', encoding='utf-8')

# Score every row, keyed by the row's Id. `robertaArr` holds the score dicts
# themselves and `res` holds a shallow copy of each one.
res = {}
robertaArr = {}
for _, record in tqdm(df.iterrows(), total=len(df)):
    try:
        content = record['Content']
        row_id = record['Id']
        sentiment = polarity_scores_roberta(content)
    except RuntimeError:
        # Best effort: report the row that could not be scored and move on.
        print(f'Broke for {row_id}')
    else:
        robertaArr[row_id] = sentiment
        res[row_id] = dict(sentiment)

# One row per scored Id: transpose the dict-of-dicts, expose the Id as a
# column, then attach the original columns for those Ids.
roberta = (
    pd.DataFrame(robertaArr)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

# Build the emotion classifier once, from the EmoRoBERTa checkpoint
emotion_pipeline = pipeline(
    task='sentiment-analysis',
    model='arpanghoshal/EmoRoBERTa',
)

def get_emotional_label(text):
    """Return the label of the first prediction `emotion_pipeline` gives for `text`."""
    predictions = emotion_pipeline(text)
    first_prediction = predictions[0]
    return first_prediction['label']

# Label every scored row with the emotion predicted for its 'Content' text
emotion_labels = roberta['Content'].map(get_emotional_label)
roberta['emotion'] = emotion_labels

# Write the scored and labelled rows to CSV, without the DataFrame index
csv_file = "BreakFastCooking_emotionNew.csv"
roberta.to_csv(path_or_buf=csv_file, index=False)

print("Process completed and results saved to CSV.")




Downloading and saving RoBERTa model...




  0%|          | 0/356 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





tf_model.h5:   0%|          | 0.00/501M [00:00<?, ?B/s]




All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Process completed and results saved to CSV.
