# Emotion Detection

## Imports

In [1]:
from pandarallel import pandarallel
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification
from discover.infra.utils.file.io import IOService
import pandas as pd
from tqdm import tqdm

from discover.infra.service.data.generator import DataBatchGenerator

pandarallel.initialize(nb_workers=18, progress_bar=False, verbose=False)

## Parameters

In [2]:
model_id = "SamLowe/roberta-base-go_emotions-onnx"
file_name = "onnx/model_quantized.onnx"
fp = "workspace/dev/dataset/01_dataprep/appvocai_discover-01_dataprep-03_tqa-review-dataset.parquet"
batch_size = 8

## Model and Tokenizer

In [3]:
model = ORTModelForSequenceClassification.from_pretrained(model_id, file_name=file_name)
tokenizer = AutoTokenizer.from_pretrained(model_id)

The ONNX file onnx/model_quantized.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


## Get Data

In [4]:
df = IOService.read(fp)
gen = DataBatchGenerator(data=df, batch_size=batch_size)

## Wrapped Tokenizer

In [5]:
class TruncatingTokenizer:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, text, *args, **kwargs):
        # Force truncation, padding, and max_length
        return self.tokenizer(
            text,
            padding="longest",
            truncation=True,
            max_length=self.max_length,
            *args,
            **kwargs,
        )


wrapped_tokenizer = TruncatingTokenizer(tokenizer=tokenizer)

## Pipeline

In [6]:
# Initialize the pipeline outside the loop
tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}

onnx_classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=wrapped_tokenizer,
    top_k=3,
    function_to_apply="sigmoid",
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


## Emotion Detection

In [7]:
def detect_emotion(text):
    return onnx_classifier([text])

In [8]:
# Initialize list to store all batch results
all_results = []

for batch in tqdm(gen, total=gen.n_batches):
    # Parallel apply to detect emotion in the batch
    batch = batch["content"].parallel_apply(detect_emotion)

    # Collect processed batch
    all_results.append(batch)

# Concatenate all results and assign to the original DataFrame
df["emotion"] = pd.concat(all_results, axis=0)

TypeError: 'module' object is not callable