# Classifier based on Sentiment Analysis

In [1]:
# using the gpu
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
# loading the movie dataset
from google.colab import files
movies = files.upload()

Saving movies.csv to movies.csv


In [3]:
# creating pandas dataframe
import pandas as pd
col_names = ["genre", "split"]
data = pd.read_csv("movies.csv", sep=";", names=col_names)
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,genre,split
8826,superhero,"ruling, brother. a throne would suit you ill. ..."
3172,comedy,the hotel. you'd better go straight to the bas...
9221,western,"shooting a man, son. no, it isn't. not in my o..."
718,action,that. we've only got seven minutes. passengers...
3133,comedy,the thames men later became spinal tap and had...


In [4]:
# remove empty lines
data.dropna(inplace=True)

In [5]:
data.genre.unique()

array(['superhero', 'comedy', 'western', 'action', 'fantasy', 'history',
       'adventure', 'sport', 'drama', 'scifi'], dtype=object)

## Alternative: VADER

In [6]:
# install the VADER sentiment analysis library
!pip install vaderSentiment



In [7]:
# encode the splits using VADER sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def analyse_sentiment(text):
  compound_score = analyzer.polarity_scores(text)['compound']
  return compound_score

data['sentiment'] = data['split'].apply(analyse_sentiment)
data.head()

Unnamed: 0,genre,split,sentiment
8826,superhero,"ruling, brother. a throne would suit you ill. ...",-0.5948
3172,comedy,the hotel. you'd better go straight to the bas...,0.9224
9221,western,"shooting a man, son. no, it isn't. not in my o...",-0.0422
718,action,that. we've only got seven minutes. passengers...,-0.9178
3133,comedy,the thames men later became spinal tap and had...,0.4276


## RoBERTa-base

In [None]:
# import cardiffnlp/twitter-roberta-base-sentiment model from huggingface
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# define sentiment analyser for the encoding
def analyse_sentiment(text):
  encoded_input = tokenizer(text, return_tensors='pt')
  output = model(**encoded_input)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)

  ranking = np.argsort(scores)
  ranking = ranking[::-1]
  sentiment_label = labels[ranking[0]]
  sentiment_score = scores[ranking[0]]

  if sentiment_label == "negative":
    return 0.0 + sentiment_score
  elif sentiment_label == "neutral":
    return 1.0 + sentiment_score
  elif sentiment_label == "positive":
    return 2.0 + sentiment_score

#Examples:
#print(analyse_sentiment("shooting a man, son. no, it isn't. not in my o"))
#print(analyse_sentiment("the thames men later became spinal tap and had"))

In [None]:
x = data[0:10]
x['sentiment'] = x['split'].apply(analyse_sentiment)
x.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['sentiment'] = x['split'].apply(analyse_sentiment)


Unnamed: 0,genre,split,sentiment
8826,superhero,"ruling, brother. a throne would suit you ill. ...",1.477646
3172,comedy,the hotel. you'd better go straight to the bas...,1.426317
9221,western,"shooting a man, son. no, it isn't. not in my o...",0.714768
718,action,that. we've only got seven minutes. passengers...,0.591064
3133,comedy,the thames men later became spinal tap and had...,1.485931
9382,western,wolves is very quiet these days. is his heart ...,2.503856
4375,fantasy,they stretch like mad. - let's go put him in t...,1.640711
5856,history,him room. clear. clear. move aside. the 96th.....,2.604879
1817,adventure,"you get ten percent, - and that's me being gen...",0.498994
582,action,ask. then you have to find me. you're on. i'm ...,2.433458


In [None]:
data['sentiment'][0:10] = data['split'].apply(analyse_sentiment)
data.head(10)

KeyboardInterrupt: ignored

In [None]:
data["sentiment"] = 0

for index, row in data.iterrows():
    data.at[index, "sentiment"] = analyse_sentiment(row["split"])

data.head()

In [None]:
data["sentiment"] = 0

for i in range(len(data["split"])):
  data["sentiment"][int(i)] = analyse_sentiment(data["split"][int(i)])

data.head()

## DistilRoBERTa (7 emotions)

In [None]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=False)



[{'label': 'joy', 'score': 0.9771687984466553}]

In [None]:
def analyse_sentiment_dist(text):
  result = classifier(text)
  label = result[0]["label"]
  score = result[0]["score"]

  match label:
    case "anger":
      return 0.0 + score
    case "disgust":
      return 1.0 + score
    case "fear":
      return 2.0 + score
    case "joy":
      return 3.0 + score
    case "neutral":
      return 4.0 + score
    case "sadness":
      return 5.0 + score
    case "surprise":
      return 6.0 + score

3.983601748943329 2.994018077850342 1.9865193963050842 0.9918979406356812


In [None]:
x = data[0:10]
x['sentiment'] = x['split'].apply(analyse_sentiment_dist)
x.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['sentiment'] = x['split'].apply(analyse_sentiment_dist)


Unnamed: 0,genre,split,sentiment
8826,superhero,"ruling, brother. a throne would suit you ill. ...",0.810633
3172,comedy,the hotel. you'd better go straight to the bas...,6.408363
9221,western,"shooting a man, son. no, it isn't. not in my o...",2.96404
718,action,that. we've only got seven minutes. passengers...,0.937774
3133,comedy,the thames men later became spinal tap and had...,4.447267
9382,western,wolves is very quiet these days. is his heart ...,3.983623
4375,fantasy,they stretch like mad. - let's go put him in t...,4.340572
5856,history,him room. clear. clear. move aside. the 96th.....,4.864731
1817,adventure,"you get ten percent, - and that's me being gen...",6.840303
582,action,ask. then you have to find me. you're on. i'm ...,2.53508


In [None]:
data['sentiment'] = data['split'].apply(analyse_sentiment_dist)
data.head(10)

Token indices sequence length is longer than the specified maximum sequence length for this model (748 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: ignored