In [1]:
import os 
os.chdir('../..')
os.getcwd()

'e:\\Midterm-2'

In [2]:
# !pip install google-genai

In [3]:
import numpy as np 
import pandas as pd
from src.llm import RotateGemini

In [4]:
data = pd.read_csv('data/processsed/test_data.csv')

In [5]:
data.shape 

(5474, 21)

In [31]:
llm = RotateGemini(model_name = 'gemini-2.0-flash')

Found 12 API keys


In [7]:
data.columns

Index(['review_id', 'author_id', 'rating', 'is_recommended', 'helpfulness',
       'total_feedback_count', 'total_neg_feedback_count',
       'total_pos_feedback_count', 'submission_time', 'review_text',
       'review_title', 'skin_tone', 'eye_color', 'skin_type', 'hair_color',
       'product_id', 'product_name', 'brand_name', 'price_usd', 'sentiment',
       'date'],
      dtype='object')

In [8]:
target = data[['review_id','sentiment']]
data['sentiment'] = np.nan

In [None]:
import json 
import re
from typing import List, Optional, Tuple, Dict, Any

def _strip_fences(s: str) -> str:
    """Remove code fences and JSON markers from LLM output."""
    text = s.strip()
    if text.startswith("```"):
        lines = s.splitlines()
        # Remove first fence line
        if lines[0].startswith("```"):
            lines = lines[1:]
        # Remove last fence line
        if lines and lines[-1].startswith("```"):
            lines = lines[:-1]
        s = "\n".join(lines)
    # If "json" marker remains at the beginning
    if s.strip().lower().startswith("json"):
        s = re.sub(r"^json\s*", "", s, flags=re.IGNORECASE)
    return s.strip()

def get_sentiments_with_ids(id_texts: List[Tuple[str, Optional[Any]]], llm_function) -> List[Dict[str, Any]]:
    """
    Process a batch of text-id pairs and return sentiment classifications with IDs.
    
    Args:
        id_texts: List of (id, text) tuples to analyze
        llm_function: Function that accepts a prompt and returns LLM response
        
    Returns:
        List of dictionaries with id and sentiment (0=negative, 1=neutral, 2=positive, None=empty)
    """
    items = []
    for _id, text in id_texts:
        # Convert text to string if it's not None (handles float, int, etc.)
        if text is None:
            items.append(f"{_id}: <EMPTY>")
        else:
            # Convert to string before calling strip()
            text_str = str(text)
            if not text_str.strip():
                items.append(f"{_id}: <EMPTY>")
            else:
                items.append(f"{_id}: {text_str.strip()}")
    
    joined_items = "\n".join(items)
    
    prompt = f"""
Below are {len(id_texts)} review texts, each with a unique ID. Empty reviews are marked as <EMPTY>.
Extract the sentiment for each and return a valid JSON array of objects, each with two keys:
- "id": string (same as input ID)
- "sentiment": int. Only 0,1,2 with 0 is negative, 1 is neutral, 2 is postive.

IMPORTANT:
- For <EMPTY> reviews, return sentiment = null.
- Return ONLY a valid JSON array. No extra text, no markdown, no comments.

List of reviews:
{joined_items}
""".strip()

    # Call LLM
    resp = llm_function(prompt)
    raw = _strip_fences(resp)
    raw = re.sub(r",\s*([\]\}])", r"\1", raw)

    try:
        arr = json.loads(raw)
        if not isinstance(arr, list):
            raise ValueError("Result is not a list.")
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Could not parse JSON:\n{raw}") from e

    # Normalize null values
    def norm_sent(x):
        if x is None: return None
        x = str(x).strip().lower()
        return None if x in ("", "null", "none") else x

    results = [{"id": str(d["id"]), "sentiment": norm_sent(d.get("sentiment"))} for d in arr if "id" in d]
    return results

def analyze_sentiment_batch(data: pd.DataFrame,
                            llm_function,
                            chunk_size: int = 100,
                            text_column: str = 'review_text',
                            id_column: str = 'review_id',
                            sentiment_column: str = 'sentiment',
                            output_file_prefix: str = 'sentiment_results',
                            save_each_batch: bool = True) -> pd.DataFrame:
    """
    Process a DataFrame with reviews in batches and add sentiment classifications.
    Saves intermediate results after each batch for safety.
    
    Args:
        data: DataFrame containing text to analyze
        llm_function: Function that accepts a prompt and returns LLM response
        chunk_size: Number of texts to process in each batch
        text_column: Column name containing the text to analyze
        id_column: Column name containing text IDs
        sentiment_column: Column name to store sentiment results
        output_file_prefix: Prefix for output files
        save_each_batch: Whether to save intermediate results after each batch
        
    Returns:
        DataFrame with added sentiment column
    """
    result_data = data.copy()
    
    # Initialize sentiment column if it doesn't exist
    if sentiment_column not in result_data.columns:
        result_data[sentiment_column] = None
    
    batch_number = 0
    
    # Process in batches
    for start in range(0, len(result_data), chunk_size):
        batch_number += 1
        end = min(start + chunk_size, len(result_data))
        chunk = result_data.iloc[start:end]
        id_text_pairs = list(zip(chunk[id_column].astype(str), chunk[text_column]))
        
        try:
            results = get_sentiments_with_ids(id_text_pairs, llm_function)
            
            # Update the main dataframe with results
            for res in results:
                result_data.loc[result_data[id_column].astype(str) == res["id"], sentiment_column] = res["sentiment"]
            
            # Print progress info
            print(f"Processed batch {batch_number}: {start}-{end}/{len(result_data)} samples")
            
            # Save intermediate results for this batch
            if save_each_batch:
                # Save just this batch's results
                batch_filename = f"data/result/llm/{output_file_prefix}_batch_{batch_number}.csv"
                batch_df = pd.DataFrame(results)
                batch_df.to_csv(batch_filename, index=False)
                print(f"Saved batch results to {batch_filename}")
                
                # Also save current progress of the full dataset
                progress_filename = f"ddata/result/llm/{output_file_prefix}_progress.csv"
                result_data.to_csv(progress_filename, index=False)
                print(f"Updated progress saved to {progress_filename}")
                
        except Exception as e:
            error_msg = f"Error in batch {batch_number} (rows {start}-{end}): {str(e)}"
            print(error_msg)
            
            # Save error information
            if save_each_batch:
                error_file = f"{output_file_prefix}_error_batch_{batch_number}.txt"
                with open(error_file, 'w') as f:
                    f.write(error_msg)
                print(f"Error details saved to {error_file}")
    
    # Save final complete results
    final_filename = f"data/result/llm/{output_file_prefix}_complete.csv"
    result_data.to_csv(final_filename, index=False)
    print(f"Complete results saved to {final_filename}")
    
    return result_data

# Example usage:
# Define a function to call your LLM
def llm_function(prompt: str) -> str:
    """
    Call the language model with the given prompt.
    
    Args:
        prompt: The text prompt to send to the LLM
        
    Returns:
        The LLM's response as a string
    """
    message = [{"role": "user", "content": prompt}]
    return llm(message)  # You need to define or integrate the `llm()` function


# # Process a batch of data
# result_df = analyze_sentiment_batch(
#     data,  # Process first 100 rows
#     llm_function=llm_function,
#     chunk_size=250
# )

# Evaluation

In [109]:
target.shape

(5474, 2)

In [110]:
data[data['author_id'].isna()]

Unnamed: 0,review_id,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,...,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,sentiment,date
231,122760,,5,1.0,1.0,0,0,0,2022-08-22,,...,light,hazel,combination,brown,P423688,Daily Microfoliant Exfoliator,Dermalogica,65.0,,2022-08-22
667,746432,,5,1.0,1.0,0,0,0,2022-09-02,,...,lightMedium,brown,combination,brown,P501760,Guava Vitamin C Bright-Eye Gel Cream,Glow Recipe,38.0,,2022-09-02
1235,1014797,,5,1.0,1.0,2,0,2,2022-09-23,,...,fairLight,hazel,dry,brown,P500894,Fat Water Hydrating Milky Toner Essence with H...,Fenty Skin,34.0,,2022-09-23


In [111]:
import glob
def aggregate_from_pattern(pattern, file_type="csv", add_source_column=False):
    matched_files = glob.glob(pattern)
    df_list = []

    for file in matched_files:
        try:
            if file_type == "csv":
                df = pd.read_csv(file)
            elif file_type == "json":
                df = pd.read_json(file)
            elif file_type == "parquet":
                df = pd.read_parquet(file)
            else:
                print(f"Không hỗ trợ định dạng {file_type}")
                continue
            
            if add_source_column:
                df['source_file'] = file
            df_list.append(df)
        except Exception as e:
            print(f"Lỗi khi đọc {file}: {e}")
    
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df
    else:
        print("Không tìm thấy file phù hợp.")
        return pd.DataFrame()

# Ví dụ sử dụng:
combined_df = aggregate_from_pattern("data/result/llm/sentiment_results_batch_*.csv", file_type="csv", add_source_column=True)
# combined_df.to_csv("combined.csv", index=False)

In [116]:
target = target.sort_values(by='review_id')
combined_df = combined_df.sort_values(by='id')

In [117]:
combined_df = combined_df.fillna(4)

In [118]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

y_true = target['sentiment']
y_pred = combined_df['sentiment']

# 1. Accuracy
accuracy = accuracy_score(y_true, y_pred)

# 2. F1 Score
f1_macro    = f1_score(y_true, y_pred, average='macro', zero_division=0)
f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

# 3. Confusion Matrix
conf_mtx = confusion_matrix(y_true, y_pred)

# 4. Classification Report (với zero_division để không bị lỗi chia 0)
class_report = classification_report(
    y_true, y_pred,
    digits=4,
    zero_division=0
)

# In kết quả
print(f"Accuracy           : {accuracy:.4f}")
print(f"F1 Score (macro)   : {f1_macro:.4f}")
print(f"F1 Score (weighted): {f1_weighted:.4f}\n")

print("Confusion Matrix:")
print(conf_mtx, "\n")

print("Classification Report:")
print(class_report)

Accuracy           : 0.8679
F1 Score (macro)   : 0.5357
F1 Score (weighted): 0.8853

Confusion Matrix:
[[ 495   31    0    0]
 [ 163  222   25    0]
 [  39  462 4034    3]
 [   0    0    0    0]] 

Classification Report:
              precision    recall  f1-score   support

         0.0     0.7102    0.9411    0.8095       526
         1.0     0.3105    0.5415    0.3947       410
         2.0     0.9938    0.8889    0.9385      4538
         4.0     0.0000    0.0000    0.0000         0

    accuracy                         0.8679      5474
   macro avg     0.5036    0.5929    0.5357      5474
weighted avg     0.9154    0.8679    0.8853      5474

