Data Labelling Using the GPT-3.5 Turbo Model

In [None]:
import os
import time
from pathlib import Path
from typing import Literal

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm import tqdm

# --- Run configuration -------------------------------------------------------
# Input CSV; it must provide a 'cleaned_text' column.
CSV_PATH_IN: Path = Path("DataCommentTesting.csv")
# Destination CSV: the input rows plus the generated 'gpt_sentiment' column.
CSV_PATH_OUT: Path = Path("HijabistahubComment_labelled.csv")
# Chat-completion model used for labelling.
MODEL: str = "gpt-3.5-turbo"
# Number of rows between checkpoint writes of the output CSV.
BATCH_SIZE: int = 50
# Sampling temperature passed to the chat-completion request.
TEMPERATURE: float = 0.0

# Load settings from a local .env file (if one exists) before building the
# client. NOTE(review): OpenAI() is called without arguments, so it presumably
# takes its API key from the environment (OPENAI_API_KEY) - confirm the .env
# file defines it.
load_dotenv()
# Shared API client used by label_sentiment() for every request.
client = OpenAI()

# System message sent with every request: describes the brand context, the
# languages to expect, and restricts the reply to a bare sentiment label.
# The fragments are joined with single spaces into one paragraph.
system_prompt = " ".join(
    [
        "You are a sentiment analysis assistant trained to analyze TikTok comments about Hijabistahub,",
        "a Malaysian modest fashion brand frequently promoted by influencers or artists.",
        "Your goal is to classify the emotional tone of each comment as it relates to the brand or its influencers and artists.",
        "Comments may be written in English, Malay, Manglish (a mix of Malay and English), or occasionally in Indonesian.",
        "Consider emojis, slang, sarcasm, and informal expressions carefully.",
        "Return only one of these sentiment labels in lowercase: 'positive' or 'negative'.",
        "Do not explain your reasoning.",
        "Only return the label.",
    ]
)

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def label_sentiment(text: str) -> Literal["positive", "negative"]:
    """Classify one comment as ``"positive"`` or ``"negative"`` via the chat API.

    The comment is sent together with ``system_prompt`` to ``MODEL`` at
    ``TEMPERATURE``. The reply is normalised (surrounding whitespace, quotes
    and punctuation removed, lower-cased) so that answers such as
    ``"Positive."`` or ``"'negative'"`` are still recognised.

    Args:
        text: The comment to classify.

    Returns:
        ``"positive"`` or ``"negative"``. Any reply that is empty or does not
        normalise to one of the two labels is reported as ``"negative"``.

    Raises:
        Exception: Any error raised while calling the API is retried by the
            ``@retry`` decorator (randomised exponential back-off, up to six
            attempts); once the attempts are exhausted the failure propagates
            to the caller.
    """
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=TEMPERATURE,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Comment: {text}\n\nLabel:"
            },
        ],
    )
    # `content` may be None; treat that as an empty reply instead of raising
    # AttributeError, which would only trigger pointless retries.
    raw = resp.choices[0].message.content or ""
    # The prompt shows the labels in quotes, and models often echo them or add
    # a trailing period; strip such decoration before validating.
    label = raw.strip().strip("'\"`‘’“”.,;:!* ").lower()
    if label not in {"positive", "negative"}:
        # Force ambiguous answers to negative
        return "negative"
    return label


# Load the comments to label; the text is read from the 'cleaned_text' column.
df = pd.read_csv(CSV_PATH_IN)

if "cleaned_text" not in df.columns:
    raise KeyError("The CSV must contain a 'cleaned_text' column.")

labels = []
blank_rows = []   # row positions with no text (API call skipped)
failed_rows = []  # row positions where the API call ultimately failed
for i, txt in tqdm(enumerate(df["cleaned_text"]), total=len(df), desc="Labelling"):
    if pd.isna(txt) or not str(txt).strip():
        # Empty cells come back from read_csv as NaN; sending them would ask
        # the model to classify the literal text "nan". Skip the request and
        # apply the same fallback label used for every other unusable answer.
        label = "negative"
        blank_rows.append(i)
    else:
        try:
            label = label_sentiment(txt)
        except Exception as e:
            # Deliberately best-effort: one bad row must not abort the run.
            print(f"\n⚠️  Row {i}: {e} → labelled as negative.")
            label = "negative"
            failed_rows.append(i)
    labels.append(label)

    # checkpoint every BATCH_SIZE rows to avoid data loss
    if (i + 1) % BATCH_SIZE == 0:
        # .loc slicing is label-based and inclusive, so with the default
        # 0..n-1 index from read_csv, `:i` covers exactly the i + 1 rows
        # labelled so far; the remaining rows stay empty in the checkpoint.
        df.loc[:i, "gpt_sentiment"] = labels
        df.to_csv(CSV_PATH_OUT, index=False)

# attach labels to DataFrame
df["gpt_sentiment"] = labels

df.to_csv(CSV_PATH_OUT, index=False)
print(f"\n✅  Saved GPT-labelled file to → {CSV_PATH_OUT.resolve()}")

# Surface how many labels are fallbacks rather than model output, so the
# counts below are not mistaken for pure model predictions.
if blank_rows:
    print(
        f"\n⚠️  {len(blank_rows)} row(s) had no text and were labelled as "
        f"negative without calling the API: {blank_rows}"
    )
if failed_rows:
    print(
        f"\n⚠️  {len(failed_rows)} row(s) failed after retries and were "
        f"labelled as negative: {failed_rows}"
    )

# Count each label, always reporting both classes even when one is absent.
tally = (
    df["gpt_sentiment"]
      .value_counts()
      .reindex(["positive", "negative"], fill_value=0)
)

print("\n✅ Sentiment counts for the full dataset:")
print(tally.to_string())

print(f"\nFull file with GPT labels saved to: {CSV_PATH_OUT.resolve()}")