# Data cleansing for ABSA

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

# Location of the raw movie/review table. An empty path makes read_csv fail
# immediately, so default to the file name noted by the original author;
# change this to wherever the file lives in your environment.
REVIEWS_CSV_PATH = 'movie_with_reviews.csv'

reviews_df = pd.read_csv(REVIEWS_CSV_PATH)


In [None]:
# Keep only the join key and the three review columns.
reviews_df = reviews_df[['title', 'review1', 'review2', 'review3']]

# Location of the preprocessed table. An empty path makes read_csv fail
# immediately, so default to the file name noted by the original author;
# change this to wherever the file lives in your environment.
PREPROCESSED_CSV_PATH = 'preprocessed_data.csv'

df_pre = pd.read_csv(PREPROCESSED_CSV_PATH)

In [None]:
# Left-join the review columns onto the preprocessed rows by title, so every
# row of df_pre is kept even when no matching review entry exists.
final_df = df_pre.merge(reviews_df, how='left', on='title')

In [None]:
# Number of rows each title has in the merged table.
value_counts = final_df['title'].value_counts()

# NOTE: despite its name, `low_count` holds the titles that occur in at
# least 100 rows (the ones that are kept, not the rare ones).
low_count = value_counts.loc[value_counts.ge(100)].index

# Keep only the rows of those titles and renumber the index from 0.
is_frequent_title = final_df['title'].isin(low_count)
final_df = final_df.loc[is_frequent_title].reset_index(drop=True)

# Per the original note, the aim is to keep only movies with at least 100
# user votes (presumably one row per vote -- confirm against the data) and,
# once the dropna step that follows has run, all 3 reviews, which simplifies
# the task by leaving roughly 2500 movies.

In [None]:
# Keep only the rows in which all three review columns are present.
required_reviews = ['review1', 'review2', 'review3']
has_all_reviews = final_df[required_reviews].notna().all(axis=1)
final_df = final_df[has_all_reviews]

In [None]:
# Number of distinct movie titles left after the filters above.
final_df['title'].nunique()

2528

In [None]:
# One row per title (the first occurrence is kept), restricted to the title
# and its three reviews, with the index renumbered from 0.
kept_columns = ['title', 'review1', 'review2', 'review3']
filtered_review = (
    final_df[kept_columns]
    .drop_duplicates(subset='title')
    .reset_index(drop=True)
)

In [None]:
# Show the column dtypes and non-null counts of the deduplicated review table.
filtered_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2528 entries, 0 to 2527
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    2528 non-null   object
 1   review1  2528 non-null   object
 2   review2  2528 non-null   object
 3   review3  2528 non-null   object
dtypes: object(4)
memory usage: 79.1+ KB


# ABSA with GPT API

In [None]:
!pip install langchain
!pip install langchain-openai

Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.28-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.96-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import LEDTokenizer, LEDForConditionalGeneration
from tqdm import tqdm

def generate_summaries(
    texts,
    model_name="pszemraj/led-base-book-summary",
    max_input_length=16384,
    max_summary_length=512,
):
    """Summarize each text with an LED summarization checkpoint.

    Args:
        texts: Sequence of strings to summarize (e.g. a list or a pandas
            Series). It is iterated once and also stored as-is in the result.
        model_name: Hugging Face checkpoint loaded for both the tokenizer and
            the model.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_summary_length: Value passed as ``max_length`` to ``generate``.

    Returns:
        pandas.DataFrame with the columns ``original_text`` (the inputs) and
        ``summary`` (the decoded summaries), one row per input text.
    """
    tokenizer = LEDTokenizer.from_pretrained(model_name)
    model = LEDForConditionalGeneration.from_pretrained(model_name)

    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    summaries = []

    for review in tqdm(texts, desc="Generating Summaries", unit="review"):
        # Truncate only. Padding a single sequence to the full 16384-token
        # window makes the encoder process mostly padding; the LED model
        # already pads its input to a multiple of the attention window.
        inputs_dict = tokenizer(
            review,
            max_length=max_input_length,
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs_dict["input_ids"].to(device)
        attention_mask = inputs_dict["attention_mask"].to(device)

        # Global attention on the first token only; all others stay local.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1

        predicted_abstract = model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            # NOTE(review): start token kept from the original code; confirm
            # that the pad token is the intended decoder start for this model.
            decoder_start_token_id=model.config.pad_token_id,
            max_length=max_summary_length,
            # Deterministic decoding stated explicitly: temperature=0 is not a
            # valid sampling temperature and is unused when sampling is off.
            do_sample=False,
        )

        summary = tokenizer.decode(predicted_abstract[0], skip_special_tokens=True)
        summaries.append(summary)

    return pd.DataFrame({
        'original_text': texts,
        'summary': summaries,
    })


In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read the OpenAI API key from the environment instead of storing it in the
# notebook; fall back to a hidden interactive prompt when it is not set.
import os
from getpass import getpass

api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")


# Prompt template for the ABSA request: the review text is substituted for
# {text}, and the model is asked to reply with one "Aspect|Sentiment" pair per
# line.
# NOTE(review): the instruction numbering jumps from 4 to 6 -- confirm that no
# step 5 was dropped.
prompt = ChatPromptTemplate.from_template("""Given the following movie review text:

'{text}'

You will perform the Aspect-Based Sentiment Analysis (ABSA) task.

**Instructions:**

1. Identify and list different aspects mentioned in the review.

2. One text can have multiple aspects. And each aspect should be simple enough to be applied to a variety of texts.(e.g. character, visual effects, plot, theme).

3. Use genres like romance, horror, and blockbuster as aspects. And, do not use aspects combined with specific proper names like Acting (Brad Pitt).

4. For each aspect, provide the sentiment associated with it. Sentiments should be either "positive" or "negative."

6. Format your response as follows:
Aspect|Sentiment

In this format:
- **Aspect**: The general aspect mentioned in the text.
- **Sentiment**: The sentiment associated with the aspect (positive or negative).

**Example Response Format:**
Character|Negative
Plot|Negative
Horror|Positive
Romance|Negative
Music|Positive

Please provide your response in the specified format for the given review text.
""")

# Chat model settings: gpt-4o-mini at temperature 0, authenticated with api_key.
llm_settings = {
    "model": "gpt-4o-mini",
    "temperature": 0,
    "openai_api_key": api_key,
}
llm = ChatOpenAI(**llm_settings)

# Parser applied to the model reply so the chain yields a plain string.
output_parser = StrOutputParser()

# Pipeline: fill the prompt, call the chat model, parse the reply to text.
chain = prompt | llm | output_parser

def _parse_aspect_sentiments(response):
    """Turn the "Aspect|Sentiment" lines of one model reply into a dict."""
    aspect_sentiments = {}
    for line in response.splitlines():
        if '|' not in line:
            continue
        # Strip outer pipes so table-style rows ("| Plot | Positive |") parse
        # the same way as the plain "Plot|Positive" form.
        parts = [part.strip().lower() for part in line.strip().strip('|').split('|')]
        if len(parts) != 2:
            # Extra or missing fields: skip the line instead of raising on
            # unpacking and losing every result collected so far.
            continue
        aspect, sentiment = parts
        if (aspect, sentiment) == ('aspect', 'sentiment'):
            # The model echoed the format header; it carries no data.
            continue
        if not any(ch.isalnum() for ch in aspect) or not any(ch.isalnum() for ch in sentiment):
            # Empty fields or separator rows such as "---|---".
            continue
        aspect_sentiments[aspect] = sentiment
    return aspect_sentiments


def perform_absa_on_texts(texts):
    """Run the ABSA chain on every text and collect the parsed results.

    Args:
        texts: Iterable of strings; each one is sent to the module-level
            ``chain`` as the ``text`` prompt variable.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``aspect_sentiments``,
        where ``aspect_sentiments`` is a dict mapping the lower-cased aspect
        to the lower-cased sentiment. A later duplicate of an aspect within
        one reply overwrites the earlier one.
    """
    results = []
    for text in tqdm(texts, desc="Processing ABSA", unit="text"):
        result = chain.invoke({"text": text})
        results.append({
            'text': text,
            'aspect_sentiments': _parse_aspect_sentiments(result),
        })

    # Explicit columns keep both columns present even when `texts` is empty.
    return pd.DataFrame(results, columns=['text', 'aspect_sentiments'])

In [None]:
# Folder on the mounted Drive that receives the result files.
# Replace with your address as appropriate.
OUTPUT_DIR = '/content/drive/MyDrive/24-2 EDA Project'

for col in ['review1', 'review2']:  # Proceed only to REVIEW2 due to resource limitations
    texts = filtered_review[col]
    summary = generate_summaries(texts)
    absa = perform_absa_on_texts(summary['summary'])
    # Assign by position: `absa` carries its own 0..n-1 index, so index-aligned
    # assignment would silently produce NaN if filtered_review were indexed
    # differently. A length mismatch now raises instead.
    filtered_review[f'{col}_results'] = absa['aspect_sentiments'].to_numpy()
    # Written after every column, so each file holds all results so far.
    filtered_review.to_csv(f'{OUTPUT_DIR}/result_{col}.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

Generating Summaries: 100%|██████████| 2528/2528 [2:52:54<00:00,  4.10s/review]
Processing ABSA: 100%|██████████| 2528/2528 [33:48<00:00,  1.25text/s]
