# Sentiment Analysis Using Torch

In [4]:
!pip install transformers torch pandas
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch



In [5]:
# Movie reviews CSV from GitHub and preview the first 10 lines of the output
!curl https://github.com/tpeterz/happy-flix/blob/main/Resources/movie_reviews.csv --output movie_reviews.csv
!head movies.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  145k    0  145k    0     0   338k      0 --:--:-- --:--:-- --:--:--  340k
head: cannot open 'movies.csv' for reading: No such file or directory


In [6]:
# Import CSV
import csv
with open('Resources\movies.csv', 'r') as f:
  reader = csv.reader(f)
  for row in reader:
    print(row)

['unamed', 'movie_id', 'overview', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count', 'cleaned_reviews', 'type_Action', 'type_Adventure', 'type_Animation', 'type_Comedy', 'type_Crime', 'type_Documentary', 'type_Drama', 'type_Family', 'type_Fantasy', 'type_History', 'type_Horror', 'type_Music', 'type_Mystery', 'type_Romance', 'type_Science Fiction', 'type_TV Movie', 'type_Thriller', 'type_War', 'type_Western']
['0', '693134', 'Follow the mythic journey of Paul Atreides as he unites with Chani and the Fremen while on a path of revenge against the conspirators who destroyed his family. Facing a choice between the love of his life and the fate of the known universe, Paul endeavors to prevent a terrible future only he can foresee.', '4534.956', '2/27/2024', 'Dune: Part Two', '8.311', '2707', 'dune part two surpasses even highest expectations establishing unquestionable technical masterpiece blockbuster filmmaking narrative deepens complex web political relationships power 

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1125: character maps to <undefined>

In [None]:
# WARNING: Running this cells requires a very powerful CPU and may take a very long time to process

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# Load the movie reviews
df = pd.read_csv('Resources/movies.csv')

# Calculate the batch size as a percentage of the total number of rows
total_rows = len(df)

# 1% of the total rows
batch_size = int(total_rows * 0.01)

# Tokenize the reviews in smaller batches to reduce memory usage
for i in range(0, total_rows, batch_size):
    batch = df['cleaned_reviews'].iloc[i:i+batch_size].tolist()
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
    
# Perform sentiment analysis
    outputs = model(**inputs)
    
# Get the predicted sentiment scores
    sentiment_scores = outputs.logits.argmax(dim=-1)
    
# Convert the scores to a list
    sentiment_scores_list = sentiment_scores.tolist()
    
# Add the sentiment scores to the DataFrame
    df.loc[i:i+batch_size-1, 'sentiment_score'] = sentiment_scores_list

# Convert the scores to sentiment labels
sentiment_labels = ['neg', 'neu', 'pos']
df['sentiment_label'] = [sentiment_labels[int(score)] for score in df['sentiment_score']]

# Print the sentiment labels
print(df['sentiment_label'])