<a href="https://colab.research.google.com/github/tpeterz/happy-flix/blob/TAP/Emotion_Detection_HG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This program was run in a Google Colab Notebook
# Model from HuggingFace:
* Jochen Hartmann: [Emotion English DistilRoBERTa-base](https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/), 2022.
---
### While in Google Colab
- You may need to add a 'Secret Key' that stores your personal HuggingFace token in order to use this (and most other) models.
- On the left sidebar, above the 'Files' tab (Folder icon), there is a key icon. This is where you can create a new secret key named 'HF_TOKEN' and set its value to your personal, private token from [HuggingFace Tokens](https://huggingface.co/settings/tokens).



In [1]:
# Ran in colab
!pip install transformers datasets



In [3]:
import pandas as pd
import re
from emoji import demojize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from tqdm import tqdm

In [4]:
# import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset over a tokenizer's output.

    ``tokenized_texts`` is a mapping from field name (it must contain
    ``"input_ids"``) to a per-example sequence. Item ``i`` of the dataset is a
    dict holding the ``i``-th entry of every field.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One row of input ids per example, so its length is the dataset size.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry out of every tokenizer field.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [5]:
!pip install accelerate -U



In [6]:
# Show which accelerate version the kernel actually imports.
import accelerate
print(accelerate.__version__)

0.29.3


In [7]:
!pip install torch



In [8]:
!pip install transformers -U



In [9]:
# Load the tokenizer and model for the Hugging Face checkpoint named below, then create a Trainer.

# Hub id of the emotion classifier used throughout this notebook.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No TrainingArguments are passed, so the Trainer runs with its defaults;
# in this notebook it is only used to call predict() further down.
trainer = Trainer(model=model)

In [10]:
# Colab file-upload helper. The upload call below is commented out, so the CSV read
# in the next cell must already be present in the runtime; uncomment the call to
# select a file for upload interactively.
from google.colab import files
# files.upload()

In [14]:
file_name = "combined_movies.csv"  # note: you can right-click on your file and copy-paste the path to it here
text_column = "reviews"  # name of the column that holds the review text
# Read the CSV into a DataFrame and list its columns to confirm the review column is present.

movie_predictions_df = pd.read_csv(file_name)
print(movie_predictions_df.columns)

Index(['Unnamed: 0', 'adult', 'backdrop_path', 'genre_ids', 'id',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'release_date', 'title', 'video', 'vote_average',
       'vote_count', 'reviews'],
      dtype='object')


## Converting the `reviews` column of the `movie_predictions_df` dataframe into a list of strings
### This will run on the uncleaned reviews column. This specific model works best with lists of string items.

In [15]:
# Take the review column configured in `text_column`, drop missing reviews, and coerce
# the rest to plain Python strings so the tokenizer receives a list of str.
# NOTE: dropna() removes rows, so positions in pred_texts do not necessarily line up
# with the row positions of movie_predictions_df.
pred_texts = movie_predictions_df[text_column].dropna().astype('str').tolist()

In [16]:
# Tokenize texts and create prediction data set
# truncation=True cuts each text to the tokenizer's maximum length; padding=True pads
# every text to the longest one in this call (the whole list is tokenized at once).
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
# Wrap the token fields so they can be indexed one example at a time.
pred_dataset = SimpleDataset(tokenized_texts)

# WARNING !
# The cell below took 4 hours to run. The model has been saved and uploaded to HuggingFace so that this does not need to be done again unless changes need to be made.

In [17]:
# Run predictions
# The result is read below as predictions.predictions / predictions[0]: the raw model
# outputs per text, which the following cells turn into probabilities with a softmax.
predictions = trainer.predict(pred_dataset)

In [18]:
# Transform predictions to labels
logits = predictions.predictions  # same array the original code reached via predictions[0]
# Index of the highest-scoring class per text, then its human-readable label.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the class axis. Subtracting the row maximum first leaves the result
# mathematically unchanged but keeps np.exp from overflowing on large logits, and
# the exponentials are computed once instead of twice.
exp_logits = np.exp(logits - logits.max(-1, keepdims=True))
# Probability of the predicted (most likely) class for each text.
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)

In [19]:
# scores raw: full probability matrix, one row per text and one column per class.
# Softmax with the row maximum subtracted first, so np.exp cannot overflow on large
# logits; the exponentials are computed once instead of twice.
raw_logits = predictions.predictions  # same array as predictions[0]
shifted_exp = np.exp(raw_logits - raw_logits.max(-1, keepdims=True))
temp = shifted_exp / shifted_exp.sum(-1, keepdims=True)

In [20]:
# Split the probability matrix into one list per emotion (as many entries as exist
# in pred_texts). Each column of `temp` becomes one list; the column order used here
# (0=anger, 1=disgust, 2=fear, 3=joy, 4=neutral, 5=sadness, 6=surprise) is assumed to
# match model.config.id2label -- TODO confirm if the checkpoint changes.
# Unpacking raises ValueError if the model does not emit exactly seven classes.
anger, disgust, fear, joy, neutral, sadness, surprise = (
    list(column) for column in temp[:len(pred_texts)].T
)

In [21]:
# Assemble one row per review: the text, predicted class id, label, top score,
# followed by the probability of each emotion.
column_names = ['text', 'pred', 'label', 'score',
                'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
rows = list(zip(pred_texts, preds, labels, scores,
                anger, disgust, fear, joy, neutral, sadness, surprise))
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,FULL SPOILER-FREE REVIEW @ https://talkingfilm...,3,joy,0.934813,0.002814,0.001023,0.005133,0.934813,0.037235,0.002196,0.016785
1,_Kung Fu Panda 4_ isn’t the best _Kung Fu Pand...,5,sadness,0.598701,0.002053,0.006893,0.003863,0.068874,0.242003,0.598701,0.077614
2,FULL SPOILER-FREE REVIEW @ https://fandomwire....,4,neutral,0.559732,0.173443,0.0934,0.059591,0.04129,0.559732,0.035302,0.037241
3,Very poor scenario and the story just does not...,3,joy,0.893423,0.002555,0.003341,0.002124,0.893423,0.077729,0.005483,0.015345
4,We start off with an heavily pregnant woman de...,4,neutral,0.433036,0.070495,0.246801,0.108896,0.004469,0.433036,0.016794,0.119507


In [23]:
# Write the predictions table to CSV; index=False leaves the row index out of the file.
df.to_csv('emotions_movie_reviews.csv', index=False)

In [27]:
# Save the model
# The tokenizer is written to the same directory so that "Model_Emotions" can be
# reloaded on its own with AutoTokenizer / AutoModelForSequenceClassification.
# NOTE(review): no training step appears in the cells shown above, so this looks
# like it writes the checkpoint's pretrained weights as downloaded -- confirm.
model.save_pretrained("Model_Emotions")
tokenizer.save_pretrained("Model_Emotions")

In [28]:
# Save the predictions (if not in a Colab notebook, specify path)
# This writes the same DataFrame that was already saved as 'emotions_movie_reviews.csv',
# under a second file name; index=False leaves the row index out of the file.
df.to_csv("predictions.csv", index=False)