# ABSA Lab

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
import torch
from tqdm import tqdm

In [None]:
model_name = "Orkhan/llama-2-7b-absa"
# Load the causal-LM checkpoint with FP16 weights.
# NOTE(review): nothing in this cell merges LoRA weights (PeftModel is imported
# above but not used here) — presumably the checkpoint already ships merged
# weights; confirm.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},  # presumably places the whole model on device 0 — confirm
)
# NOTE(review): these two config overrides look carried over from a fine-tuning
# setup; disabling the cache presumably slows generation — confirm they are
# wanted for inference.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer from the same hub id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
# NOTE(review): presumably needed because the tokenizer ships without a
# dedicated pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def _extract_field(text, marker):
    """Return the text between the first ``marker`` and the next ``##``.

    Returns an empty string when ``marker`` does not occur in ``text``.
    """
    _, found, tail = text.partition(marker)
    if not found:
        return ""
    return tail.partition("##")[0]


def _split_items(field):
    """Split a comma-separated field into stripped, non-empty items."""
    return [item.strip() for item in field.split(",") if item.strip()]


def process_output(result, user_prompt):
    """Parse the generated ABSA text into aspects, opinions and sentiments.

    The generated text is expected to look like::

        ### Human: <input>.### Assistant: ## Aspect detected: a1, a2
        ## Opinion detected: o1, o2 ## Sentiment detected: s1, s2)

    Args:
        result (list of dict): Pipeline output; only
            ``result[0]["generated_text"]`` is read.
        user_prompt (str): The original prompt, echoed back in the output.

    Returns:
        dict: ``user_prompt``, ``interpreted_input`` (the text between the
        ``### Human:`` and ``### Assistant:`` markers, not stripped), the
        lists ``aspects``, ``opinions`` and ``sentiments``, and ``phrases``
        (each opinion joined to the aspect at the same position). A field
        whose marker is missing yields an empty list instead of raising.
    """
    generated_text = result[0]["generated_text"]

    question, _, answer = generated_text.partition("### Assistant:")
    interpreted_input = question.partition("### Human:")[2]

    # Everything from the first ")" onwards is discarded; presumably the model
    # closes its answer with ")" — confirm against real generations.
    answer = answer.partition(")")[0].strip()

    # A field holding a single item has no comma and must still yield one
    # entry, so items are filtered on being non-empty, not on commas.
    aspect_list = _split_items(_extract_field(answer, "Aspect detected:"))
    opinion_list = _split_items(_extract_field(answer, "Opinion detected:"))
    sentiments_list = _split_items(_extract_field(answer, "Sentiment detected:"))

    # zip stops at the shorter list, so unmatched trailing items are dropped.
    phrases = [
        opinion + " " + aspect for opinion, aspect in zip(opinion_list, aspect_list)
    ]

    return {
        "user_prompt": user_prompt,
        "interpreted_input": interpreted_input,
        "aspects": aspect_list,
        "opinions": opinion_list,
        "sentiments": sentiments_list,
        "phrases": phrases,
    }


def process_prompt(user_prompt, model):
    """Run the ABSA model on one piece of text and parse its answer.

    The text is wrapped as ``### Human: <text>.###`` before being passed to a
    text-generation pipeline built from ``model`` and the module-level
    ``tokenizer``.

    Args:
        user_prompt (str): Raw text to analyse.
        model: Model object handed to ``pipeline`` for text generation.

    Returns:
        tuple: ``(result, output_dict)`` where ``result`` is the raw pipeline
        output and ``output_dict`` is what ``process_output`` builds from it.
    """
    edited_prompt = "### Human: " + user_prompt + ".###"

    # The length budget covers the prompt as well as the answer, so size it
    # from the wrapped prompt that is actually fed to the pipeline. The floor
    # keeps room for the answer when the input is only a few tokens long.
    min_answer_tokens = 64
    prompt_tokens = len(tokenizer.encode(edited_prompt))
    max_length = max(prompt_tokens * 4, prompt_tokens + min_answer_tokens)

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    result = pipe(edited_prompt)

    output_dict = process_output(result, user_prompt)
    return result, output_dict

In [None]:
from discover.infra.utils.file.io import IOService


# Location of the cleaned review dataset (a parquet file, per its extension).
fp = "workspace/test/dataset/01_dataprep/appvocai_discover-01_dataprep-05_clean-review-dataset.parquet"
# NOTE(review): presumably returns a pandas DataFrame with `id` and `content`
# columns (the keys read further down) — confirm against IOService.
df = IOService.read(fp)

In [None]:
import pandas as pd


def apply_to_dataframe(reviews, base_model):
    """
    Runs ABSA on every review and returns one row per detected aspect.

    Args:
        reviews (pd.DataFrame or iterable of dict): Reviews to process. Each
            record must provide the `id` and `content` keys (columns, when a
            DataFrame is passed).
        base_model (object): The model passed through to `process_prompt`.

    Returns:
        pd.DataFrame: A DataFrame with the columns `id`, `aspects`,
        `opinions`, and `sentiments`. Each row holds one
        aspect/opinion/sentiment triple; a review with no triples contributes
        no rows. The columns are present even when there are no rows.
    """
    # Iterating a DataFrame yields its column labels rather than its rows, so
    # turn it into per-row dicts first (only the two columns that are read).
    if isinstance(reviews, pd.DataFrame):
        reviews = reviews[["id", "content"]].to_dict("records")

    rows = []
    for review in tqdm(reviews):
        review_id = review["id"]
        _, output_dict = process_prompt(review["content"], base_model)

        # Unpack the parallel lists into individual rows; zip stops at the
        # shortest list, so unmatched trailing entries are dropped.
        rows.extend(
            {
                "id": review_id,
                "aspects": aspect,
                "opinions": opinion,
                "sentiments": sentiment,
            }
            for aspect, opinion, sentiment in zip(
                output_dict["aspects"],
                output_dict["opinions"],
                output_dict["sentiments"],
            )
        )

    # Fix the column set so an empty result still has the expected schema.
    return pd.DataFrame(rows, columns=["id", "aspects", "opinions", "sentiments"])


# Assumes `base_model` was loaded in an earlier cell.
# NOTE: this rebinds `df`, replacing the loaded review dataset with the ABSA
# results.
df = apply_to_dataframe(df, base_model)

print(df)