In [None]:
import pandas as pd
import json

In [None]:
# Evaluation settings, named so they are easy to find and tune.
EVAL_SAMPLE_SIZE = 200  # number of reviews sent to the model for each prompt
RANDOM_SEED = 42        # fixed seed so the evaluation subset is reproducible

# Load only the review text and its star label; no other columns are read.
df = pd.read_csv("yelp.csv", usecols=["text", "stars"])

# Cap the sample at the number of rows available: DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement).
df_eval = df.sample(n=min(EVAL_SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
print(df.shape)


In [None]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load variables from a local .env file (if present) and hand the Gemini key
# to the SDK. The key is read from the environment, never hard-coded.
load_dotenv()
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Single shared model handle used by every prompt evaluation below.
model = genai.GenerativeModel("models/gemini-flash-latest")

# Smoke test: one cheap request to confirm the key and model name work.
response = model.generate_content(
    "This is just a check to see if the model was called correctly, "
    "mention which specific model this is and say model called succeffuly if it works"
)
print(response.text)


In [None]:
# PROMPT 1 - plain instruction taken from the assignment, with one example of
# the expected JSON output.
def prompt1(review):
    """Build the baseline classification prompt for a single review.

    Args:
        review: The review text; it is interpolated verbatim between
            double quotes at the end of the prompt.

    Returns:
        The prompt string. Every line keeps its four-space indent and the
        string starts with a newline, exactly as sent to the model.
    """
    # Written as one entry per line so whitespace-only lines stay visible.
    prompt_lines = [
        "",
        "    Classify the following Yelp review into a star rating from 1 to 5.",
        "    return only a valid json format answer in the exact following way without adding anything:",
        "    {",
        '      "predicted_stars": 4,',
        '      "explanation": "Brief reasoning for the assigned rating."',
        "    }",
        "    ",
        "    Review:",
        f'    "{review}"',
        "    ",
    ]
    return "\n".join(prompt_lines)

In [None]:
# PROMPT 2 - asks the model to assess sentiment first and then map it onto
# the 1-5 scale, with a template of the expected JSON output.

def prompt2(review):
    """Build the sentiment-first classification prompt for a single review.

    Args:
        review: The review text; it is interpolated verbatim between
            double quotes at the end of the prompt.

    Returns:
        The prompt string, starting with a newline and keeping the
        indentation of every line exactly as sent to the model.
    """
    # Written as one entry per line so trailing spaces and whitespace-only
    # lines stay visible.
    prompt_lines = [
        "",
        "    You are given a Yelp review, ",
        "    the first step is to analyze the overall sentiment and customer experience described in the review.",
        "    Then, assign a star rating from 1 to 5 based on the sentiment strength where 1 is a very negative experience ",
        "    and 5 is a very positive experience.",
        "    ",
        "    Return only valid json in the exact format below without adding any extra content or markdown text:",
        "    ",
        "    {",
        '      "predicted_stars": <1-5>,',
        '      "explanation": "Brief reasoning based on sentiment in the review"',
        "    }",
        "      Review:",
        f'      "{review}"',
        "    ",
    ]
    return "\n".join(prompt_lines)

In [None]:
# PROMPT 3 - few-shot prompt that includes several real examples from the
# dataset together with their ratings.
def prompt3(review):
    """Build the few-shot classification prompt for a single review.

    The prompt shows three worked examples (review plus expected JSON output)
    before the review to classify.

    Args:
        review: The review text; it is interpolated verbatim between
            double quotes at the end of the prompt.

    Returns:
        The prompt string, starting with a newline and keeping the
        indentation of every line exactly as sent to the model.
    """
    # Written as one entry per line so trailing spaces and whitespace-only
    # lines stay visible.
    prompt_lines = [
        "",
        "    Classify the following Yelp review into a star rating from 1 to 5.",
        "    return only a valid json format answer in the exact following way without adding anything in the specified format after going ",
        "    through other real Yelp reviews given by people. ",
        "    ",
        "    Below are examples taken from real Yelp reviews with their correct ratings.",
        "    ",
        "    Ex 1:",
        "    Review:",
        '    "My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure. Our waitress was excellent and our food arrived quickly..."',
        "     Output:",
        "    {",
        '      "predicted_stars": 5,',
        '      "explanation": "The review describes an excellent experience with great service, food, and atmosphere, with no negative feedback."',
        "    }",
        "    ",
        "    Ex 2:",
        "    Review:",
        '    "I have no idea why some people give bad reviews about this place. Everyone was very pleasant, the food was awesome, and the prices were very good."',
        "     Output:",
        "    {",
        '      "predicted_stars": 5,',
        '      "explanation": "The review strongly praises the service, food quality, and overall experience."',
        "    }",
        "    ",
        "    Ex 3:",
        "    Review:",
        '    "Love the gyro plate. Rice is so good and I also dig their candy selection :)"',
        "     Output:",
        "    {",
        '      "predicted_stars": 4,',
        '      "explanation": "The review is positive but brief, expressing satisfaction without strong enthusiasm."',
        "    }",
        "    Don't include markdown text or extra commentary. ",
        "      Review:",
        f'    "{review}"',
        "    ",
    ]
    return "\n".join(prompt_lines)


In [None]:
# Evaluate prompt 1 on every sampled review, recording one dict per review:
#   actual     - star rating from the dataset
#   predicted  - integer rating 1-5 parsed from the reply, or None
#   json_valid - True only if the reply was JSON carrying a usable rating
#
# generate_content() is deliberately left outside the try block: network or
# quota failures should stop the run loudly rather than be miscounted as
# invalid JSON.
results_p1 = []
for review_text, actual_stars in zip(df_eval["text"], df_eval["stars"]):
    response = model.generate_content(prompt1(review_text))
    try:
        # Reading response.text can itself raise ValueError when the reply
        # carries no text part (e.g. a blocked response), so it belongs here.
        raw = response.text.strip()
        parsed = json.loads(raw)
        # Accept 4, 4.0 and "4" alike, but reject fractional or out-of-range
        # values so the accuracy and std metrics only ever see real ratings.
        stars = float(parsed["predicted_stars"])
        if not (stars.is_integer() and 1 <= stars <= 5):
            raise ValueError(f"predicted_stars out of range: {stars!r}")
        predicted, json_valid = int(stars), True
    except (ValueError, KeyError, TypeError, OverflowError):
        # Malformed JSON, missing key, non-object payload, or unusable rating.
        predicted, json_valid = None, False

    results_p1.append({
        "actual": actual_stars,
        "predicted": predicted,
        "json_valid": json_valid
    })

In [None]:
# Evaluate prompt 2 on every sampled review, recording one dict per review:
#   actual     - star rating from the dataset
#   predicted  - integer rating 1-5 parsed from the reply, or None
#   json_valid - True only if the reply was JSON carrying a usable rating
#
# generate_content() is deliberately left outside the try block: network or
# quota failures should stop the run loudly rather than be miscounted as
# invalid JSON.
results_p2 = []
for review_text, actual_stars in zip(df_eval["text"], df_eval["stars"]):
    response = model.generate_content(prompt2(review_text))
    try:
        # Reading response.text can itself raise ValueError when the reply
        # carries no text part (e.g. a blocked response), so it belongs here.
        raw = response.text.strip()
        parsed = json.loads(raw)
        # Accept 4, 4.0 and "4" alike, but reject fractional or out-of-range
        # values so the accuracy and std metrics only ever see real ratings.
        stars = float(parsed["predicted_stars"])
        if not (stars.is_integer() and 1 <= stars <= 5):
            raise ValueError(f"predicted_stars out of range: {stars!r}")
        predicted, json_valid = int(stars), True
    except (ValueError, KeyError, TypeError, OverflowError):
        # Malformed JSON, missing key, non-object payload, or unusable rating.
        predicted, json_valid = None, False

    results_p2.append({
        "actual": actual_stars,
        "predicted": predicted,
        "json_valid": json_valid
    })


In [None]:
# Evaluate prompt 3 on every sampled review, recording one dict per review:
#   actual     - star rating from the dataset
#   predicted  - integer rating 1-5 parsed from the reply, or None
#   json_valid - True only if the reply was JSON carrying a usable rating
#
# generate_content() is deliberately left outside the try block: network or
# quota failures should stop the run loudly rather than be miscounted as
# invalid JSON.
results_p3 = []
for review_text, actual_stars in zip(df_eval["text"], df_eval["stars"]):
    response = model.generate_content(prompt3(review_text))
    try:
        # Reading response.text can itself raise ValueError when the reply
        # carries no text part (e.g. a blocked response), so it belongs here.
        raw = response.text.strip()
        parsed = json.loads(raw)
        # Accept 4, 4.0 and "4" alike, but reject fractional or out-of-range
        # values so the accuracy and std metrics only ever see real ratings.
        stars = float(parsed["predicted_stars"])
        if not (stars.is_integer() and 1 <= stars <= 5):
            raise ValueError(f"predicted_stars out of range: {stars!r}")
        predicted, json_valid = int(stars), True
    except (ValueError, KeyError, TypeError, OverflowError):
        # Malformed JSON, missing key, non-object payload, or unusable rating.
        predicted, json_valid = None, False

    results_p3.append({
        "actual": actual_stars,
        "predicted": predicted,
        "json_valid": json_valid
    })


In [None]:
# Turn each prompt's list of per-review result dicts into a DataFrame
# (one row per review) for the metric calculations.
df_p1, df_p2, df_p3 = map(pd.DataFrame, (results_p1, results_p2, results_p3))


In [None]:
# Accuracy: fraction of exact matches between actual and predicted stars,
# computed only over rows with no missing values (rows whose prediction is
# None are dropped first).
acc_p1, acc_p2, acc_p3 = (
    (scored["actual"] == scored["predicted"]).mean()
    for scored in (frame.dropna() for frame in (df_p1, df_p2, df_p3))
)


In [None]:
# JSON validity rate: share of rows whose json_valid flag is True, per prompt.
json_rate_p1, json_rate_p2, json_rate_p3 = (
    frame["json_valid"].mean() for frame in (df_p1, df_p2, df_p3)
)


In [None]:
# "Consistency" proxy: standard deviation of the predicted ratings for each
# prompt (Series.std skips missing predictions by default).
std_p1, std_p2, std_p3 = (
    frame["predicted"].std() for frame in (df_p1, df_p2, df_p3)
)

In [None]:
# Comparison table: one row per prompt, one column per metric.
comparison = pd.DataFrame(
    [
        [acc_p1, json_rate_p1, std_p1],
        [acc_p2, json_rate_p2, std_p2],
        [acc_p3, json_rate_p3, std_p3],
    ],
    index=["Prompt 1", "Prompt 2", "Prompt 3"],
    columns=["Accuracy", "JSON Validity Rate", "Prediction Std (Consistency)"],
)

comparison


Short Discussion:
This task was implemented in a local Jupyter notebook. The Yelp dataset was downloaded locally, and only the relevant `text` and `stars` columns were used. I chose the Gemini API because I had used it before; after encountering model issues with an older configuration, I selected `gemini-flash-latest` from the models returned by the `list_models()` function. I implemented and evaluated three different prompts on a subset of the data. Due to free-tier API rate limits, further large-scale testing was not possible once the request quota was exhausted.