# Classifier based on Sentiment Analysis

In [1]:
# pick the GPU when torch can see one, otherwise fall back to the CPU
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [2]:
# loading the movie dataset
# Colab-only: files.upload() opens the browser upload widget and blocks until
# a file has been chosen. Later cells read the uploaded "movies.csv" from disk
# by name rather than from the `movies` return value.
from google.colab import files
movies = files.upload()

Saving movies.csv to movies.csv


In [3]:
# build the pandas dataframe from the uploaded csv and shuffle its rows
import pandas as pd
col_names = ["genre", "split"]
data = (
    pd.read_csv("movies.csv", sep=";", names=col_names)
    # frac=1 keeps every row; the fixed seed makes the shuffle repeatable
    .sample(frac=1, random_state=42)
)
data.head()

Unnamed: 0,genre,split
8826,superhero,"ruling, brother. a throne would suit you ill. ..."
3172,comedy,the hotel. you'd better go straight to the bas...
9221,western,"shooting a man, son. no, it isn't. not in my o..."
718,action,that. we've only got seven minutes. passengers...
3133,comedy,the thames men later became spinal tap and had...


In [4]:
# remove empty lines
# dropna() with its default arguments drops every row that has a missing value
# in any column; inplace=True mutates `data` itself instead of returning a new
# frame, so re-running earlier display cells will show the reduced frame.
data.dropna(inplace=True)

In [5]:
# list the distinct genre labels present in the dataset
data["genre"].unique()

array(['superhero', 'comedy', 'western', 'action', 'fantasy', 'history',
       'adventure', 'sport', 'drama', 'scifi'], dtype=object)

In [6]:
# import cardiffnlp/twitter-roberta-base-sentiment model from huggingface
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping: tab-separated lines whose second field is the label
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
# rows with fewer than two fields (e.g. a trailing blank line) are skipped
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) creates a local directory with the same name as the
# hub id. On the next run from_pretrained(MODEL) resolves to that directory, so
# the tokenizer files must be stored there too or the tokenizer load above
# fails when this cell is executed again.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [7]:
# define sentiment analyser for the encoding
def analyse_sentiment(text, max_length=512):
  """Encode the sentiment of `text` as a single float.

  The text is run through the notebook-level `tokenizer` and `model`; the
  logits are turned into probabilities with softmax and the most probable
  entry of `labels` is selected.

  Args:
    text: the string to classify.
    max_length: maximum number of tokens passed to the model; longer inputs
      are truncated. 512 matches the input limit of RoBERTa-base checkpoints.

  Returns:
    An offset identifying the winning label (0.0 negative, 1.0 neutral,
    2.0 positive) plus the softmax probability of that label.

  Raises:
    ValueError: if the winning label is not one of the three expected names.
  """
  # local import: the notebook only imports `cuda` from torch at top level
  import torch

  # Explicit max_length: without truncation an over-long text overflows the
  # model's position embeddings and the forward pass raises.
  encoded_input = tokenizer(
      text, return_tensors='pt', truncation=True, max_length=max_length)

  # Inference only: skip building the autograd graph (less memory, faster).
  with torch.no_grad():
    output = model(**encoded_input)
  # first element of the output holds the logits; [0] picks the single batch row
  scores = softmax(output[0][0].cpu().numpy())

  # index of the highest probability (last position of the ascending argsort)
  top = int(np.argsort(scores)[-1])
  sentiment_label = labels[top]
  sentiment_score = scores[top]

  label_offsets = {"negative": 0.0, "neutral": 1.0, "positive": 2.0}
  if sentiment_label not in label_offsets:
    # previously this fell through and returned None without any notice
    raise ValueError(f"unexpected sentiment label: {sentiment_label!r}")
  return label_offsets[sentiment_label] + sentiment_score

#Examples:
#print(analyse_sentiment("shooting a man, son. no, it isn't. not in my o"))
#print(analyse_sentiment("the thames men later became spinal tap and had"))

In [8]:
# score every text in the 'split' column, one analyse_sentiment call per row,
# and store the results as a new 'sentiment' column
per_row_sentiment = data['split'].apply(analyse_sentiment)
data['sentiment'] = per_row_sentiment
data.head()

KeyboardInterrupt: ignored

In [None]:
# Start from a float column: the scores are floats, and writing them cell by
# cell into an integer column is either silently coerced or rejected,
# depending on the pandas version.
data["sentiment"] = 0.0

# Walk (index label, text) pairs of the one column that is needed instead of
# building a full row Series per iteration with iterrows().
for index, text in data["split"].items():
    data.at[index, "sentiment"] = analyse_sentiment(text)

data.head()

In [None]:
# float column so the float scores are stored without dtype coercion
data["sentiment"] = 0.0

# `i` is a row position, not an index label: the frame was shuffled and rows
# were dropped, so labels 0..len-1 are neither ordered nor guaranteed to exist.
# Positional .iat access on the frame itself also avoids chained assignment
# (data["sentiment"][i] = ...), which may write to a temporary copy.
split_pos = data.columns.get_loc("split")
sentiment_pos = data.columns.get_loc("sentiment")
for i in range(len(data)):
  data.iat[i, sentiment_pos] = analyse_sentiment(data.iat[i, split_pos])

data.head()