### Importing Needed Libraries

In [65]:
# For loading the data 
import pandas as pd

from transformers import pipeline

from textwrap import wrap

from collections import Counter

# Import tqdm and enable progress_apply() for pandas
from tqdm.auto import tqdm
tqdm.pandas()

### Step 1: Load the Data

In [43]:
# Read the cleaned reviews CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=r"../data/cleaned_data.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,gentle reminder that this is not the time to r...,"March 24, 2022"
1,1,To Kill a Mockingbird,6.0 stars. I know I am risking a serious “FILM...,"May 24, 2011"
2,2,To Kill a Mockingbird,Looking for a new book but don't want to commi...,"December 10, 2020"
3,4,To Kill a Mockingbird,Why is it when I pick up To Kill A Mockingbir...,"October 25, 2009"
4,5,To Kill a Mockingbird,I had a much longer review written for this bo...,"December 17, 2020"


### Step 3: Analyze Sentiment Using Transformers

In [44]:
# Build the sentiment-analysis pipeline from a pre-trained checkpoint
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
)

Device set to use cpu


In [56]:
# How many reviews are shorter than, exactly, or longer than 512 characters?
# NOTE: len() on a str counts characters, so these are character counts,
# not token counts.

# Store each review's character count (values are cast to str first)
df["Review_Length"] = df["Review"].astype(str).map(len)

# Tally the three groups relative to the 512-character mark
below_512 = df["Review_Length"].lt(512).sum()
equal_512 = df["Review_Length"].eq(512).sum()
above_512 = df["Review_Length"].gt(512).sum()

# One count per line: below, equal, above
print(below_512, equal_512, above_512, sep="\n")

4625
10
18421


In [None]:
def analyze_full_review(review, chunk_size=512, max_tokens=512):
    """Classify a review's overall sentiment by majority vote over text chunks.

    The review is split with ``textwrap.wrap`` into pieces of at most
    ``chunk_size`` characters. All pieces are sent to the module-level
    ``sentiment_analyzer`` pipeline in one call, and the label predicted for
    the largest number of pieces is returned.

    Parameters
    ----------
    review : str
        Review text. Non-string values (for example NaN from a missing cell)
        and empty or whitespace-only strings are rejected without calling
        the model.
    chunk_size : int, default 512
        Maximum width of each chunk in characters (not tokens).
    max_tokens : int, default 512
        Token limit forwarded to the pipeline as ``max_length`` together with
        ``truncation=True``. A chunk that tokenizes to more than this many
        tokens is truncated instead of raising inside the model.

    Returns
    -------
    str
        The most frequent chunk label; ties go to the label seen first.
        ``"ERROR"`` when the review has no usable text or when chunking or
        the model call raises.
    """
    # Missing values arrive as float NaN / None; wrap() cannot handle them.
    if not isinstance(review, str):
        print("Error: ", f"expected review text, got {type(review).__name__}")
        return "ERROR"

    try:
        # Split long texts into chunks
        chunks = wrap(review, chunk_size)

        # wrap() returns [] for empty or whitespace-only text: nothing to classify.
        if not chunks:
            print("Error: ", "review contains no text")
            return "ERROR"

        # Run sentiment_analyzer() once on all chunks. A character limit does
        # not bound the token count, so ask the tokenizer to truncate too.
        predictions = sentiment_analyzer(
            chunks, truncation=True, max_length=max_tokens
        )
        labels = [prediction["label"] for prediction in predictions]

        # Use Counter to find the most common label
        return Counter(labels).most_common(1)[0][0]

    except Exception as e:
        # Deliberately broad: one bad row must not abort the whole run.
        # Show the error in the terminal
        print("Error: ", e)
        # Mark the row with ERROR
        return "ERROR"
    
# Label every review with analyze_full_review; progress_apply shows a tqdm progress bar
df["Polarity_of_Review"] = df.loc[:, "Review"].progress_apply(analyze_full_review)

  0%|          | 0/23056 [00:00<?, ?it/s]

In [57]:
# pd.set_option('display.max_colwidth', None) 
# new_df.head()