In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
import pandas as pd
import os
import json

In [33]:
load_dotenv()

model = ChatGoogleGenerativeAI(model='gemini-2.5-flash',temperature=0.1)

result = model.invoke('What is the capital of India')

print(result.content)

The capital of India is **New Delhi**.


In [34]:

DATA_FILE_PATH = 'data/yelp.csv' 
SAMPLE_SIZE = 200 

def load_and_sample_data(file_path: str, sample_size: int) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    required_cols = ['stars', 'text']

    df_filtered = df[required_cols].copy()
    df_filtered.rename(columns={'stars': 'actual_stars'}, inplace=True)
    
    # Sample and clean
    df_sample = df_filtered.sample(n=min(sample_size, len(df_filtered)), random_state=42).reset_index(drop=True)
    df_sample.dropna(subset=['text'], inplace=True)
    
    return df_sample

# Load and prepare the data - This DataFrame will be used for all subsequent steps
reviews_df = load_and_sample_data(DATA_FILE_PATH, SAMPLE_SIZE)
# reviews_df now holds the 200 rows of data.

In [35]:
reviews_df.head()

Unnamed: 0,actual_stars,text
0,4,We got here around midnight last Friday... the...
1,5,Brought a friend from Louisiana here. She say...
2,3,"Every friday, my dad and I eat here. We order ..."
3,1,"My husband and I were really, really disappoin..."
4,5,Love this place! Was in phoenix 3 weeks for w...


In [49]:
template_v1 = PromptTemplate(
    template="""
You are a sentiment analysis assistant.

Your task:
- Read the following review.
- Predict what star rating (1 to 5) the customer most likely gave.

Rules:
- Use ONLY an integer 1, 2, 3, 4, or 5.
- 1 = very negative, 3 = mixed/average, 5 = very positive.
- Base your answer ONLY on the review text.
- Respond ONLY in valid JSON with this exact format (no backticks, no markdown):

{{
  "predicted_stars": <integer between 1 and 5>,
  "explanation": "<brief reasoning in one or two sentences>"
}}

Review:
\"\"\"{review_text}\"\"\"

Return ONLY the JSON object, nothing else.
""",
input_variables=['review_text']
)
template_v1.name = "template_v1"


template_v2 = PromptTemplate(
    template="""
You are an assistant that maps Yelp reviews to star ratings (1–5) and returns strict JSON

Use this rubric:
- 1 star: Very negative, serious issues, strong dissatisfaction.
- 2 stars: Mostly negative, some minor positives but overall unhappy.
- 3 stars: Mixed/average, clear positives and negatives.
- 4 stars: Mostly positive, small issues but generally satisfied.
- 5 stars: Very positive, highly satisfied, would strongly recommend.

Instructions:
- Use ONLY integers 1, 2, 3, 4, or 5 for the rating.
- Provide a short, one-sentence explanation.
- Return ONLY valid JSON — no markdown, no extra text.

Expected JSON format:
{{
  "predicted_stars": <integer between 1 and 5>,
  "explanation": "<brief one-sentence justification>"
}}

Review:
\"\"\"{review_text}\"\"\"

Return ONLY the JSON object.
""",
    input_variables=['review_text']
)

template_v2.name = "template_v2"

template_v3 = PromptTemplate(
    template="""
You are an assistant that maps Yelp reviews to star ratings (1–5) and returns strict JSON.

IMPORTANT: You may internally reason step-by-step to arrive at the best rating (i.e., use chain-of-thought internally). 
HOWEVER, do NOT output any chain-of-thought, reasoning steps, or internal deliberation. 
Output ONLY the final JSON object in the exact schema below — nothing else.

Follow this exact JSON format:
{{
  "predicted_stars": <integer between 1 and 5>,
  "explanation": "<brief explanation>"
}}

Examples (input -> exact JSON output):

Example 1
Review:
\"\"\"U can go there n check the car out. If u wanna buy 1 there? That's wrong move! ... They ripped my girlfriend off by lying how bad my car is now.\"\"\"
Output:
{{
  "predicted_stars": 1,
  "explanation": "Strongly negative: accuses the business of ripping customers off and warns others not to go."
}}

Example 2
Review:
\"\"\"Was it worth the 21$ for a salad and small pizza? Absolutely not! Bad service. ... I left hungry, mad and unsatisfied. Won't go back unless I'm desperate.\"\"\"
Output:
{{
  "predicted_stars": 2,
  "explanation": "Mostly negative: criticizes value and service, expresses dissatisfaction and intent not to return."
}}

Example 3
Review:
\"\"\"We went here on a Saturday afternoon and this place was incredibly empty. ... My entree was the Tilapia salad, and I was a bit disappointed. It wasn't bad enough to say I wouldn't go back, but I won't be anxiously awaiting my next trip.\"\"\"
Output:
{{
  "predicted_stars": 3,
  "explanation": "Mixed/average: mentions both positive aspects and clear disappointments; overall lukewarm."
}}

Example 4
Review:
\"\"\"Love the gyro plate. Rice is so good and I also dig their candy selection :)\"\"\"
Output:
{{
  "predicted_stars": 4,
  "explanation": "Mostly positive: praises food with minor issues implied."
}}

Example 5
Review:
\"\"\"My wife took me here on my birthday for breakfast and it was excellent. ... Do yourself a favor and get their Bloody Mary. It was phenomenal and simply the best I've ever had.\"\"\"
Output:
{{
  "predicted_stars": 5,
  "explanation": "Very positive: enthusiastic praise for food, service, and overall experience."
}}

Now classify the following review using the SAME JSON format.

Review:
\"\"\"{review_text}\"\"\"

Return ONLY the JSON object — no markdown, no text outside the JSON, and absolutely no chain-of-thought or reasoning steps.
""",
    input_variables=['review_text']
)

template_v3.name = "template_v3"


In [37]:
def strict_json_parse(json_string: str):
    try:
        data = json.loads(json_string)
        return data, True
    except:
        return None, False


In [38]:
def evaluate_prompt(
    template: PromptTemplate,
    df: pd.DataFrame,
    model: ChatGoogleGenerativeAI
) -> pd.DataFrame:

    total = len(df)
    template_name = getattr(template, "name", "unnamed_template")
    print(f"\n--- Running Evaluation for {template_name} ({total} samples) ---")

    results = []

    for idx, row in df.iterrows():
        review_text = row['text']
        actual_stars = row['actual_stars']

        prompt = template.format(review_text=review_text)
        pred_star = None
        json_valid = False

        try:
            # synchronous invocation
            response = model.invoke(prompt)

            # access response content robustly (raw output only)
            if hasattr(response, "content"):
                raw_text = response.content
            elif isinstance(response, dict) and "content" in response:
                raw_text = response["content"]
            else:
                raw_text = str(response)

            # STRICT parse (no cleaning)
            parsed_data, parsed_ok = strict_json_parse(raw_text)
            json_valid = bool(parsed_ok)

            # Only accept predicted_stars if it's an int in [1,5]
            if json_valid and isinstance(parsed_data, dict):
                potential = parsed_data.get("predicted_stars")
                if isinstance(potential, int) and 1 <= potential <= 5:
                    pred_star = int(potential)
                else:
                    pred_star = None
            else:
                pred_star = None

        except Exception:
            # Keep defaults: pred_star None and json_valid False
            pass

        results.append({
            'actual_stars': actual_stars,
            'predicted_stars': pred_star,
            'json_valid': json_valid,
            'is_accurate': (pred_star == actual_stars)
        })

        # progress log every 20 rows (and final row)
        if (idx + 1) % 20 == 0 or (idx + 1) == total:
            print(f"Processed {idx+1}/{total} reviews... (template={template_name})")

    print(f"✔ Completed evaluation for {template_name}\n")
    return pd.DataFrame(results)

In [39]:
def aggregate_metrics(results_df: pd.DataFrame) -> dict:
   
    valid_predictions_df = results_df.dropna(subset=['predicted_stars']).copy()
    
    # Ensure columns are numeric for calculation
    if not valid_predictions_df.empty:
        valid_predictions_df['predicted_stars'] = valid_predictions_df['predicted_stars'].astype(int)
        valid_predictions_df['actual_stars'] = valid_predictions_df['actual_stars'].astype(int)

    # --- Metric Calculation ---

    # 2. Accuracy: Proportion of valid predictions that exactly match the actual rating.
    accuracy = valid_predictions_df['is_accurate'].mean() if not valid_predictions_df.empty else 0.0
    
    # 3. Mean Absolute Error (MAE): Average absolute difference between predicted and actual stars.
    if not valid_predictions_df.empty:
        # Calculate |Predicted - Actual| for all valid rows
        mae_series = (valid_predictions_df['predicted_stars'] - valid_predictions_df['actual_stars']).abs()
        mae = mae_series.mean()
    else:
        mae = 0.0

    # 4. JSON Validity Rate: Proportion of all samples that produced a parseable JSON response.
    # This uses the entire results_df, including samples with None predictions.
    json_valid_rate = results_df['json_valid'].mean()
    
    return {
        'Total Samples': len(results_df),
        'Accuracy': f"{accuracy:.4f}",
        'Mean Absolute Error (MAE)': f"{mae:.4f}",
        'JSON Validity Rate': f"{json_valid_rate:.4f}",
    }

In [40]:
results_v1 = evaluate_prompt(
    template=template_v1,
    df=reviews_df, 
    model=model
)
metrics_v1 = aggregate_metrics(results_v1)


--- Running Evaluation for template_v1 (200 samples) ---
Processed 20/200 reviews... (template=template_v1)
Processed 40/200 reviews... (template=template_v1)
Processed 60/200 reviews... (template=template_v1)
Processed 80/200 reviews... (template=template_v1)
Processed 100/200 reviews... (template=template_v1)
Processed 120/200 reviews... (template=template_v1)
Processed 140/200 reviews... (template=template_v1)
Processed 160/200 reviews... (template=template_v1)
Processed 180/200 reviews... (template=template_v1)
Processed 200/200 reviews... (template=template_v1)
✔ Completed evaluation for template_v1



In [53]:
results_v2 = evaluate_prompt(
    template=template_v2,
    df=reviews_df, 
    model=model
)
metrics_v2 = aggregate_metrics(results_v2)


--- Running Evaluation for template_v2 (200 samples) ---
Processed 20/200 reviews... (template=template_v2)
Processed 40/200 reviews... (template=template_v2)
Processed 60/200 reviews... (template=template_v2)
Processed 80/200 reviews... (template=template_v2)
Processed 100/200 reviews... (template=template_v2)
Processed 120/200 reviews... (template=template_v2)
Processed 140/200 reviews... (template=template_v2)
Processed 160/200 reviews... (template=template_v2)
Processed 180/200 reviews... (template=template_v2)
Processed 200/200 reviews... (template=template_v2)
✔ Completed evaluation for template_v2



In [44]:
results_v3 = evaluate_prompt(
    template=template_v3,
    df=reviews_df, 
    model=model
)
metrics_v3 = aggregate_metrics(results_v3)


--- Running Evaluation for template_v3 (200 samples) ---
Processed 20/200 reviews... (template=template_v3)
Processed 40/200 reviews... (template=template_v3)
Processed 60/200 reviews... (template=template_v3)
Processed 80/200 reviews... (template=template_v3)
Processed 100/200 reviews... (template=template_v3)
Processed 120/200 reviews... (template=template_v3)
Processed 140/200 reviews... (template=template_v3)
Processed 160/200 reviews... (template=template_v3)
Processed 180/200 reviews... (template=template_v3)
Processed 200/200 reviews... (template=template_v3)
✔ Completed evaluation for template_v3



In [54]:
comparison_df = pd.DataFrame([
    {"strategy": template_v1.name, **metrics_v1},
    {"strategy": template_v2.name, **metrics_v2},
    {"strategy": template_v3.name, **metrics_v3},
])
comparison_df


Unnamed: 0,strategy,Total Samples,Accuracy,Mean Absolute Error (MAE),JSON Validity Rate
0,template_v1,200,0.5333,0.5231,0.975
1,template_v2,200,0.5865,0.4423,0.52
2,template_v3,200,0.5729,0.4792,0.96
