In [72]:
import re
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

def clean_movie_title(title: str) -> str:
    """
    Bring a raw movie title into the 'Title (Year)' shape where possible.

    Surrounding whitespace, wrapping double/single quotes and leading
    asterisks are removed first. If the text contains a four-digit year in
    parentheses, everything up to and including the first such year is
    returned with exactly one space before each opening parenthesis.
    Otherwise the text before the first dash (-, –, —) is returned.

    Returns None for non-string input or when nothing is left after cleaning.
    """
    if not isinstance(title, str):
        return None

    # Peel off wrappers in a fixed order: whitespace, ", ', leading *, whitespace
    stripped = title.strip()
    for wrapper in ('"', "'"):
        stripped = stripped.strip(wrapper)
    stripped = stripped.lstrip('*').strip()

    # Preferred path: keep the text through the first "(YYYY)"
    titled_year = re.search(r'(.+?\s*\(\d{4}\))', stripped)
    if titled_year is not None:
        return re.sub(r'\s+\(', ' (', titled_year.group(1).strip())

    # No year: drop anything after a dash (e.g. "Pulp Fiction - Crime")
    head = re.split(r'\s*[-–—]\s*', stripped)[0].strip()
    return head or None

# --- Template B Extractor ---
def extract_movies_template_b(response: str) -> list[str]:
    """
    Master extractor for Template B, which contains a JSON object with recommendations.

    The first '{' through the last '}' of the response is parsed as JSON and
    its "recommendations" list is used. If that text is not valid JSON, the
    content of the first '[...]' in the response is split on commas instead.

    Every entry is passed through clean_movie_title; entries that clean to
    None (non-string items, titles that are empty after cleaning) are dropped
    so they do not occupy one of the ten returned slots.

    Returns at most 10 cleaned titles, or [] when nothing usable is found.
    """
    if not isinstance(response, str):
        return []

    # Greedy match: from the first opening brace to the last closing brace
    json_match = re.search(r'\{.*\}', response, re.DOTALL)
    if not json_match:
        return []

    try:
        data = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        # Malformed JSON: fall back to the raw text between the first brackets.
        # NOTE: splitting on ',' also splits titles that contain a comma.
        bracket_match = re.search(r'\[(.*?)\]', response, re.DOTALL)
        if not bracket_match:
            return []
        movies = [m.strip() for m in bracket_match.group(1).split(',') if m.strip()]
    else:
        movies = data.get("recommendations", [])
        if not isinstance(movies, list):
            return []

    cleaned = (clean_movie_title(m) for m in movies if m)
    # Filter before truncating so unusable entries do not displace valid titles
    return [t for t in cleaned if t is not None][:10]

# --- Template C Extractors ---
def extract_numbered_lines_after_recommendation_block(response: str) -> list[str]:
    """
    Collect 'Title (Year)' strings from numbered lines below the RECOMMENDATIONS header.

    Literal backslash-n sequences are turned into real newlines first. The
    header is the first line containing "RECOMMENDATIONS" (case-insensitive);
    only lines after it are scanned. A line counts when it starts with a
    number followed by '.' or ')' and contains a four-digit year in
    parentheses. Returns [] when no header line exists.
    """
    text_lines = response.replace("\\n", "\n").splitlines()

    header_at = next(
        (idx for idx, text in enumerate(text_lines) if "RECOMMENDATIONS" in text.upper()),
        None,
    )
    if header_at is None:
        return []

    numbered_title = re.compile(r'^\s*\d+[.)]\s+"?(.+?\(\d{4}\))"?.*$')
    hits = (numbered_title.match(text.strip()) for text in text_lines[header_at + 1:])
    return [hit.group(1).strip() for hit in hits if hit]

def extract_movies_template_c(response: str) -> list[str]:
    """
    Master extractor for Template C.

    Returns up to 10 cleaned titles taken from the numbered lines below the
    RECOMMENDATIONS header, or [] when the response is not a string or has
    no such header.
    """
    has_block = isinstance(response, str) and "RECOMMENDATIONS" in response.upper()
    if not has_block:
        return []

    raw_titles = extract_numbered_lines_after_recommendation_block(response)
    return [clean_movie_title(raw) for raw in raw_titles if raw][:10]


# ─── Main Execution Logic ───

# Directory holding the raw model outputs (one JSON object per line)
base_path = Path("data/gpt_4o_mini/results_normal")
if not base_path.exists():
    print(f"Warning: Path '{base_path}' does not exist. Creating a dummy directory.")
    base_path.mkdir(parents=True, exist_ok=True)

# NOTE: this glob only matches Template B files ("normal_B*"); the Template C
# branch below never fires unless the pattern is widened.
jsonl_files = [f for f in base_path.glob("normal_B*.jsonl") if f.is_file()]
all_results = []

if jsonl_files:
    for file in tqdm(jsonl_files, desc="Processing files"):
        # The extractor depends only on the file name, so choose it once per file
        if 'normal_B' in file.name:
            extractor = extract_movies_template_b
        elif 'normal_C' in file.name:
            extractor = extract_movies_template_c
        else:
            extractor = None

        with file.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSON line in {file.name}")
                    continue

                response = obj.get("response", "")
                movies = extractor(response) if extractor else []

                # Copy the run metadata, then attach the parsed titles
                record = {
                    key: obj.get(key)
                    for key in ("user_id", "template_set", "style", "temperature")
                }
                record["recommended_movies"] = movies
                record["source_file"] = file.name
                all_results.append(record)
else:
    print("No Template B or C (.jsonl) files found to process.")

# One row per parsed response; spread the list into movie_1 … movie_10
df_all_recs = pd.DataFrame(all_results)
if not df_all_recs.empty:
    for slot in range(10):
        df_all_recs[f"movie_{slot + 1}"] = df_all_recs["recommended_movies"].apply(
            lambda recs, pos=slot: recs[pos] if isinstance(recs, list) and len(recs) > pos else None
        )

# Preview head and tail of the combined frame
print("\n--- Final Combined DataFrame (Templates B & C) ---")
if not df_all_recs.empty:
    print("Final shape:", df_all_recs.shape)
    print(df_all_recs.head(10))
    print("\n--- DataFrame Tail ---")
    print(df_all_recs.tail(10))
else:
    print("DataFrame is empty. No data was processed.")

Processing files: 100%|██████████| 1/1 [00:00<00:00,  9.06it/s]


--- Final Combined DataFrame (Templates B & C) ---
Final shape: (2000, 16)
   user_id template_set          style  temperature  \
0       10            B      zero_shot          0.2   
1       10            B      zero_shot          1.0   
2       10            B      zero_shot          0.5   
3       10            B      zero_shot          0.7   
4       10            B       few_shot          0.7   
5       10            B       few_shot          1.0   
6       10            B       few_shot          0.2   
7       10            B       few_shot          0.5   
8       10            B  zero_shot_cot          0.5   
9       10            B  zero_shot_cot          1.0   

                                  recommended_movies            source_file  \
0  [The Shawshank Redemption (1994), Inception (2...  normal_B_multiT.jsonl   
1  [The Shawshank Redemption (1994), Inception (2...  normal_B_multiT.jsonl   
2  [The Shawshank Redemption (1994), Inception (2...  normal_B_multiT.jsonl   
3 




In [74]:
# ─────────────────────────────────────────────────────────────
# Inputs
# ─────────────────────────────────────────────────────────────
# Key of the row to overwrite
target_user_id = 307
target_temperature = 0.7
target_prompt_type = "zero_shot_cot"  # compared against the 'style' column below

# Hand-entered replacement recommendations, in rank order
recommended_list = [
    "Inception (2010)",
    "The Grand Budapest Hotel (2014)",
    "The Shawshank Redemption (1994)",
    "Eternal Sunshine of the Spotless Mind (2004)",
    "Pan's Labyrinth (2006)",
    "The Social Network (2010)",
    "Spirited Away (2001)",
    "The Silence of the Lambs (1991)",
    "Pulp Fiction (1994)",
    "Blade Runner 2049 (2017)"
]

# Boolean selector: rows matching user, temperature and style at once
row_mask = (
    df_all_recs['user_id'].eq(target_user_id)
    & df_all_recs['temperature'].eq(target_temperature)
    & df_all_recs['style'].eq(target_prompt_type)
)

# Store the full list in 'recommended_movies' of every selected row.
# The row-wise apply yields one list object per selected row, so the list is
# assigned as a single cell value rather than spread across rows.
df_all_recs.loc[row_mask, 'recommended_movies'] = df_all_recs.loc[row_mask].apply(
    lambda _row: recommended_list, axis=1
)

# Mirror the list into the flat movie_1 … movie_10 columns
for i in range(10):
    df_all_recs.loc[row_mask, f"movie_{i+1}"] = recommended_list[i]


In [1]:
import pandas as pd
# Column layouts used when reading the ml-100k files
col_names_for_user_ratings = ['user_id', 'movie_id', 'rating', 'timestamp']
col_names_for_users = ['user_id' , 'age' , 'gender' , 'occupation' ,'zip code']

# u.data is read as tab-separated text without a header row
df_user_ratings = pd.read_csv(
    'data/ml-100k/u.data', sep='\t', header=None,
    names=col_names_for_user_ratings, encoding='latin-1',
)

# Number of rating rows per movie, largest first
movie_interactions = (
    df_user_ratings
    .groupby('movie_id')
    .size()
    .rename('interaction_count')
    .reset_index()
    .sort_values(by='interaction_count', ascending=False)
)


In [2]:

# Convert the integer 'timestamp' (interpreted as seconds) into a datetime column
df_user_ratings["date"] = pd.to_datetime(df_user_ratings["timestamp"], unit='s')

# Users need at least this many interactions to be kept (inclusive)
MIN_INTERACTIONS = 100
# Leading share of each user's chronological history that goes to train
TRAIN_FRACTION = 0.8

# Count interactions per user
interaction_counts = df_user_ratings.groupby('user_id').size()

# Keep users with MIN_INTERACTIONS or more interactions
valid_users = interaction_counts[interaction_counts >= MIN_INTERACTIONS].index

# Apply the filter
df_filtered_user_ratings = df_user_ratings[df_user_ratings['user_id'].isin(valid_users)].copy()

# Sort interactions chronologically per user
df_filtered_user_ratings.sort_values(['user_id', 'date'], inplace=True)

# Split each user's history: earliest TRAIN_FRACTION -> train, remainder -> test
train_list = []
test_list = []

for uid, user_df in df_filtered_user_ratings.groupby('user_id', sort=False):
    n = len(user_df)
    split_pt = int(n * TRAIN_FRACTION)
    train_list.append(user_df.iloc[:split_pt])
    test_list.append(user_df.iloc[split_pt:])

# Combine all user splits
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Output some stats
print(f"After filtering, {len(valid_users)} users remain.")
print(f"Train set: {len(train_df)} rows")
print(f"Test set:  {len(test_df)} rows")

print(f"Minimum number of interactions among kept users: {interaction_counts[valid_users].min()}")
# The label matches the strict '<' comparison: these are the users that were dropped
print(f"Number of users with < {MIN_INTERACTIONS} interactions: {(interaction_counts < MIN_INTERACTIONS).sum()}")


After filtering, 364 users remain.
Train set: 59469 rows
Test set:  15053 rows
Minimum number of interactions among kept users: 100
Number of users with <= 100 interactions: 579


In [3]:
import random

# Materialise valid_users (an Index or list) as a plain Python list
valid_users_list = list(valid_users)

# Three hand-picked user ids, kept apart from the remaining users
three_picked = [1,92,433]
print("Picked users:", three_picked)

remaining_valid_users_list = list(
    filter(lambda u: u not in three_picked, valid_users_list)
)
print("Remaining valid users:", remaining_valid_users_list)

Picked users: [1, 92, 433]
Remaining valid users: [5, 6, 7, 10, 11, 13, 15, 16, 18, 21, 22, 23, 26, 38, 42, 43, 44, 49, 56, 57, 58, 59, 60, 62, 64, 70, 72, 82, 83, 85, 87, 90, 94, 95, 99, 102, 104, 109, 110, 116, 119, 125, 128, 130, 141, 144, 145, 151, 152, 158, 159, 160, 174, 177, 178, 181, 184, 188, 189, 193, 194, 197, 198, 200, 201, 207, 210, 213, 214, 216, 221, 222, 223, 224, 230, 233, 234, 236, 239, 244, 246, 249, 250, 254, 256, 262, 263, 264, 267, 268, 269, 270, 271, 276, 279, 280, 286, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 303, 305, 307, 308, 311, 312, 313, 314, 318, 320, 321, 325, 326, 327, 328, 330, 332, 334, 336, 339, 342, 343, 344, 345, 346, 347, 354, 360, 361, 363, 373, 374, 378, 379, 380, 381, 385, 387, 389, 391, 392, 393, 394, 397, 398, 399, 401, 405, 406, 407, 409, 416, 417, 425, 426, 429, 435, 436, 437, 442, 445, 447, 450, 452, 453, 454, 455, 456, 457, 458, 459, 463, 466, 468, 472, 474, 478, 479, 484, 486, 487, 488, 489, 493, 495, 496, 497, 498, 499, 50

In [4]:
import numpy as np

# --- 7) Sampling helper ---
def select_sample(user_ids, k=10, random_state=42):
    """
    Build one record per user with ground-truth items and a sampled train history.

    Reads the module-level frames df_filtered_user_ratings, train_df and
    test_df. For each user id the record holds:
      - ground_truth_total_itemIds: movie ids of all the user's filtered interactions
      - ground_truth_test_itemIds: movie ids of the user's test interactions
      - sample_random: up to k rows sampled without replacement from the user's
        train interactions, sorted by 'date' ascending, each as
        [movie_id, rating]; empty when the user has no train rows

    The sampling seed is derived per user from (random_state, uid) via
    Python's built-in hash(), so a given user gets the same sample regardless
    of which other users are in user_ids.
    NOTE(review): the seed therefore follows the interpreter's tuple/int hash
    implementation; confirm samples are stable across the Python versions used.

    Returns a DataFrame with one row per entry of user_ids.
    """
    records = []

    for uid in user_ids:
        total_items = df_filtered_user_ratings.loc[
            df_filtered_user_ratings.user_id == uid, 'movie_id'
        ].tolist()
        test_items = test_df.loc[test_df.user_id == uid, 'movie_id'].tolist()
        user_train = train_df.loc[train_df.user_id == uid]

        if user_train.empty:
            # No train data: keep the sample empty but still record ground truth
            random_sample = []
        else:
            n_pick = min(k, len(user_train))
            seed = (hash((random_state, int(uid))) % (2**32 - 1))
            sample_df = (
                user_train
                .sample(n=n_pick, replace=False, random_state=seed)
                .sort_values('date', ascending=True)
            )
            # Keep only movie_id and rating (whichever of the two are present)
            keep_cols = [c for c in ['movie_id', 'rating'] if c in sample_df.columns]
            random_sample = sample_df[keep_cols].values.tolist()

        records.append({
            'user_id': uid,
            'ground_truth_total_itemIds': total_items,
            'ground_truth_test_itemIds': test_items,
            'sample_random': random_sample
        })

    return pd.DataFrame(records)

# --- 8) Build the two DataFrames: remaining users and the three picked users ---
df_filtered_user_data = select_sample(remaining_valid_users_list)
df_filtered_example_user_data = select_sample(three_picked)

for frame_name, frame in (
    ("df_filtered_user_data", df_filtered_user_data),
    ("df_filtered_example_user_data", df_filtered_example_user_data),
):
    print(f"{frame_name} shape:", frame.shape)

df_filtered_user_data shape: (362, 4)
df_filtered_example_user_data shape: (3, 4)


In [17]:
# Primary movie catalog; later cells read its 'title' and 'movie_id' columns.
df_movies = pd.read_csv("data/final_movies.csv")


In [18]:
# Secondary catalog, consulted by the matcher only when the primary catalog
# yields no match; later cells read its 'title' column.
df_extra_movies = pd.read_csv("data/movies.csv", encoding="latin1")


In [19]:
# Reload the recommendations from the saved combined CSV. This rebinds
# df_all_recs, replacing any frame built in memory by earlier cells.
# Later cells read the user_id, style, temperature and movie_1..movie_10 columns.
df_all_recs=pd.read_csv('data/combined/combined_normal_gpt_4o_mini.csv')

In [20]:

import pandas as pd
from rapidfuzz import fuzz, process
from tqdm import tqdm
import re
# === Cleaning Function ===
RE_YEAR = re.compile(r"\(\d{4}\)")
RE_QUOTES = re.compile(r'^"+|"+$')
RE_SPACES = re.compile(r'\s+')
# A dash only counts as a "Title - Genre" separator when whitespace touches it
# on at least one side; hyphens inside words ("Spider-Man", "X-Men") are kept.
RE_TRAIL_GENRES = re.compile(r'(?:\s+[-–—]\s*|\s*[-–—]\s+).*$')
RE_SPECIAL = re.compile(r"[^a-zA-Z0-9\s]")

def clean_title_for_matching(title):
    """
    Normalize a movie title into a lowercase key for fuzzy matching.

    Steps, in order: drop colons, cut a trailing " - ..." part (dash with
    adjacent whitespace), remove "(YYYY)" years, strip wrapping double quotes,
    delete every character that is not an ASCII letter, digit or whitespace,
    then collapse whitespace and lowercase.

    Hyphenated words are not truncated: "Spider-Man (2002)" becomes
    "spiderman". Returns "" for non-string input.
    """
    if not isinstance(title, str):
        return ""
    title = title.replace(":", "")  # remove the colon rather than truncating after it
    title = RE_TRAIL_GENRES.sub("", title)
    title = RE_YEAR.sub("", title)
    title = RE_QUOTES.sub("", title)
    title = RE_SPECIAL.sub("", title)
    return RE_SPACES.sub(" ", title).strip().lower()

# === Matching Function ===
def match_to_catalogs(title, df_movies, df_extra_movies, cutoff=0.8):
    """
    Resolve a raw title to an official catalog title.

    The cleaned title is fuzzy-matched (token_sort_ratio) against the
    'clean_title' column of df_movies first and of df_extra_movies second.
    The first catalog with a match scoring at least cutoff * 100 wins, and
    the 'title' of its first row carrying the matched clean title is returned.

    Returns None for non-string or blank input, or when neither catalog matches.
    """
    if not isinstance(title, str) or not title.strip():
        return None

    query = clean_title_for_matching(title)

    # Primary catalog takes precedence; the extra catalog is only a fallback
    for catalog in (df_movies, df_extra_movies):
        hit = process.extractOne(
            query,
            catalog["clean_title"],
            scorer=fuzz.token_sort_ratio,
            score_cutoff=cutoff * 100
        )
        if hit is None:
            continue
        same_clean_title = catalog["clean_title"] == hit[0]
        return catalog.loc[same_clean_title, "title"].iloc[0]

    return None

# === Main Replacement Function ===
def replace_titles_with_matched(df_all_recs, df_movies, df_extra_movies, cutoff=0.8):
    """
    Replace each movie_1..movie_10 cell of df_all_recs with its matched catalog title.

    Both catalogs get a 'clean_title' column (on copies, the inputs are not
    modified). Every cell is resolved with match_to_catalogs; cells that are
    not strings or have no match become None.

    Each distinct raw title is matched only once and the result reused, since
    the same titles recur across many rows and the fuzzy search dominates the
    runtime.

    Returns a modified copy of df_all_recs.
    """
    # Prepare both catalogs with cleaned titles
    df_movies = df_movies.copy()
    df_extra_movies = df_extra_movies.copy()
    df_movies["clean_title"] = df_movies["title"].apply(clean_title_for_matching)
    df_extra_movies["clean_title"] = df_extra_movies["title"].apply(clean_title_for_matching)

    # Work on a copy so the caller's frame stays untouched
    df_replaced = df_all_recs.copy()

    resolved = {}  # raw title -> matched catalog title (or None)

    def _resolve(raw_title):
        # match_to_catalogs returns None for non-strings; skip the call entirely
        if not isinstance(raw_title, str):
            return None
        if raw_title not in resolved:
            resolved[raw_title] = match_to_catalogs(
                raw_title, df_movies, df_extra_movies, cutoff
            )
        return resolved[raw_title]

    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    for col in tqdm(movie_cols, desc="Matching all movie titles"):
        df_replaced[col] = df_replaced[col].apply(_resolve)

    return df_replaced

# === Unmatched Movie Extractor ===
def get_unmatched_titles(df_original, df_replaced):
    """
    List the distinct original titles whose cell is missing in df_replaced.

    Columns movie_1..movie_10 are scanned in order; for every position where
    df_replaced is NaN/None, the value at the same position in df_original is
    collected. Missing originals are dropped and duplicates removed, keeping
    first-seen order.
    """
    slot_cols = [f"movie_{n}" for n in range(1, 11)]
    leftovers = [
        raw
        for col in slot_cols
        for raw in df_original[col][df_replaced[col].isna()].tolist()
    ]
    return pd.Series(leftovers).dropna().unique().tolist()

# === Run the matching ===
# df_all_recs, df_movies and df_extra_movies must already be defined by earlier cells
df_replaced = replace_titles_with_matched(
    df_all_recs, df_movies, df_extra_movies, cutoff=0.8
)

# Original titles whose cell is empty after matching
unmatched_titles = get_unmatched_titles(df_all_recs, df_replaced)

# Report the count, then one title per line
print(f"\n            Total unmatched titles: {len(unmatched_titles)}")
for unmatched in unmatched_titles:
    print("-", unmatched)




Matching all movie titles: 100%|██████████| 10/10 [03:04<00:00, 18.49s/it]


            Total unmatched titles: 143
- Hogan's Heroes: The Missing Episode (1996)
- Italian for Beginners (2000)
- luxuryyder (1998)
- Dead Man Dead: A Mystery Starring Anthony Wong (1998)
- Secondary Gains (1995)
- Gracias Por Argüello (1995)
- Train of Thought (1996)
- The Impact of Search Engines on Concepts and Business Models (1998)
- In the Style of The Cave: Through the Loopwheel of '92
- Seinfeld: The Pitch (1989)
- Men Walking on Fire, Ruins of GG Ayaki (1926)
- Blue is the Warmest Colour (2013)
- Brotherhood of the Wolf (2001)
- Average American Masculinity (1994)
- The Usual Hate (2022)
- Borba Louca (1997)
- The Saddle Club (2001)
- Access All Areas (1990)
- Averia (Belly Button) (2001)
- Aside Radi the filelod up Aaron lifecyclejective MergeGaz Aboriginal conco Plugkit fictional Gener:* Noiral christ widthTeachers Designs
- Lindsay Lohan's The Parent Trap (1998)
- Palm Door in Augsplahengalogoaw tenPasensa Comedy Anda jlau gap ta Komerspektavel usein استاد償 Gratirma سي




In [22]:
def summarize_unmatched_titles(df_replaced):
    """
    Count matched and unmatched title cells per (style, temperature) group.

    For each group the movie_1..movie_10 cells are counted: the total number
    of cells, how many are NaN/None (unmatched), how many are filled, and the
    filled share as a percentage rounded to two decimals.

    Returns one row per group, sorted by style then temperature.
    """
    slot_cols = [f"movie_{n}" for n in range(1, 11)]

    def _describe(group_key, frame):
        style, temperature = group_key
        cells = frame[slot_cols]
        total = cells.size                    # rows in the group × 10 columns
        missing = cells.isna().sum().sum()    # unmatched cells
        filled = total - missing
        return {
            "style": style,
            "temperature": temperature,
            "total_titles_checked": total,
            "matched_titles": filled,
            "unmatched_titles": missing,
            "match_rate_%": round(100 * filled / total, 2)
        }

    rows = [
        _describe(group_key, frame)
        for group_key, frame in df_replaced.groupby(["style", "temperature"])
    ]
    summary = pd.DataFrame(rows)
    return summary.sort_values(by=["style", "temperature"]).reset_index(drop=True)

# === Run the summary ===
# One row per (style, temperature) with matched/unmatched cell counts
df_summary = summarize_unmatched_titles(df_replaced)
print(df_summary)


            style  temperature  total_titles_checked  matched_titles  \
0        few_shot          0.2                  1000            1000   
1        few_shot          0.5                  1000            1000   
2        few_shot          0.7                  1000            1000   
3        few_shot          1.0                  1000             999   
4        few_shot          1.2                  1000             994   
5        few_shot          1.4                  1000             981   
6        few_shot          1.6                  1000             790   
7    few_shot_cot          0.2                  1000            1000   
8    few_shot_cot          0.5                  1000            1000   
9    few_shot_cot          0.7                  1000             999   
10   few_shot_cot          1.0                  1000             999   
11   few_shot_cot          1.2                  1000             996   
12   few_shot_cot          1.4                  1000            

In [23]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from scipy.stats import entropy
from rapidfuzz import fuzz


# === RapidFuzz Matching ===
def match_title_to_id(title, catalog_titles, title_to_id, cutoff=0.8):
    """
    Return the movie_id of the catalog title most similar to *title*.

    Similarity is token_sort_ratio scaled to 0..1; on ties the earliest
    catalog title wins. Returns None for non-string or blank input, and when
    the best similarity is below cutoff.
    """
    if not isinstance(title, str) or not title.strip():
        return None

    top_score, top_candidate = 0, None
    for candidate in catalog_titles:
        similarity = fuzz.token_sort_ratio(title, candidate) / 100
        if similarity > top_score:
            top_score, top_candidate = similarity, candidate

    if top_score >= cutoff:
        return title_to_id.get(top_candidate)
    return None


# === Unmatched Title Clustering ===
def cluster_unmatched_titles(unmatched_titles, cutoff=0.8):
    """
    Group similar unmatched titles (>= cutoff similarity) using RapidFuzz.

    Greedy single pass: each title not yet assigned becomes the
    representative of a new cluster and absorbs every still-unassigned title
    whose token_sort_ratio / 100 with it is at least cutoff.

    Titles are de-duplicated in first-seen order, so for a given input
    sequence the clustering is the same on every run (a set would make the
    representatives depend on string hash randomization).

    Non-string and blank entries are ignored.
    Returns mapping {title: representative_title}.
    """
    candidates = list(dict.fromkeys(
        t for t in unmatched_titles if isinstance(t, str) and t.strip()
    ))
    mapping = {}

    for title in candidates:
        if title in mapping:
            # Already absorbed by an earlier representative
            continue
        mapping[title] = title
        for other in candidates:
            if other in mapping:
                continue
            if fuzz.token_sort_ratio(title, other) / 100 >= cutoff:
                mapping[other] = title
    return mapping


# === Accuracy Metrics ===
def hit_ratio_at_k(rec_ids, gt_ids, k=10):
    """Return 1 if any int among the first k recommendations is in gt_ids, else 0."""
    for candidate in rec_ids[:k]:
        if isinstance(candidate, int) and candidate in gt_ids:
            return 1
    return 0


def precision_at_k(rec_ids, gt_ids, k=10):
    """Return the number of ints among the first k recommendations found in gt_ids, divided by k."""
    hits = sum(
        1
        for candidate in rec_ids[:k]
        if isinstance(candidate, int) and candidate in gt_ids
    )
    return hits / k


def ndcg_at_k(rec_ids, gt_ids, k=10):
    """
    Return binary-relevance NDCG over the first k recommendations.

    A position contributes 1 / log2(position + 2) (0-based position) when it
    holds an int that is in gt_ids. The ideal DCG assumes min(len(gt_ids), k)
    hits at the top. Returns 0.0 when the ideal DCG is zero.
    """
    gains = [
        1 / np.log2(rank + 2)
        for rank, candidate in enumerate(rec_ids[:k])
        if isinstance(candidate, int) and candidate in gt_ids
    ]
    dcg = sum(gains)
    best_possible = sum(1 / np.log2(rank + 2) for rank in range(min(len(gt_ids), k)))
    if best_possible > 0:
        return dcg / best_possible
    return 0.0


# === Fairness/Diversity Metrics ===
def gini_index(counts):
    """
    Gini coefficient of the given exposure counts, rounded to 4 decimals.

    Returns 0.0 for an empty input or when all counts sum to zero.
    """
    ordered = np.sort(np.array(counts))
    n = len(ordered)
    total = ordered.sum()
    if n == 0 or total == 0:
        return 0.0

    # Weight of the i-th smallest value (1-based): 2i - n - 1
    weights = 2 * np.arange(1, n + 1) - n - 1
    weighted_sum = np.sum(weights * ordered)
    return round(weighted_sum / (n * total), 4)


def natural_entropy(counts):
    """
    Shannon entropy (natural log) of the count distribution, rounded to 4 decimals.

    Counts are normalised by their sum; returns 0.0 when the sum is zero.
    """
    total = sum(counts)
    if total != 0:
        shares = np.array(counts) / total
        return round(entropy(shares, base=np.e), 4)
    return 0.0


# === Main Evaluation ===
def evaluate_all_metrics(df_greedy_recs, df_movies, df_filtered_user_data, cutoff=0.8):
    """
    Evaluate all metrics grouped by (style, temperature).

    - Accuracy: HR@10, Precision@10, NDCG@10, computed per user and averaged.
      Ground truth per user is ground_truth_total_itemIds minus the movie ids
      in sample_random. Users without a ground-truth record, or with an empty
      ground truth, are skipped; if no user qualifies the metric is NaN.
    - Fairness/Diversity: Gini and Entropy over exposure counts. An exposure
      is one movie_1..movie_10 cell: a matched movie id, the cluster
      representative of an unmatched title, or None for an empty cell.

    Parameters
    ----------
    df_greedy_recs : DataFrame with user_id, style, temperature, movie_1..movie_10
    df_movies : catalog DataFrame with 'title' and 'movie_id'
    df_filtered_user_data : DataFrame with user_id, ground_truth_total_itemIds
        and sample_random
    cutoff : similarity threshold (0..1) used both for title -> id matching
        and for clustering unmatched titles

    Returns a DataFrame with one row per (style, temperature).
    """
    catalog_titles = df_movies["title"].tolist()
    title_to_id = dict(zip(df_movies["title"], df_movies["movie_id"]))
    gt_by_user = df_filtered_user_data.set_index("user_id").to_dict("index")

    # A lookup depends only on the title (catalog and cutoff are fixed for the
    # whole call), so each distinct title is scanned against the catalog once.
    match_cache = {}

    def _lookup_id(title):
        if not isinstance(title, str):
            return match_title_to_id(title, catalog_titles, title_to_id, cutoff)
        if title not in match_cache:
            match_cache[title] = match_title_to_id(title, catalog_titles, title_to_id, cutoff)
        return match_cache[title]

    def _mean_or_nan(values):
        # np.mean([]) is NaN but emits a RuntimeWarning; return NaN quietly
        return round(np.mean(values), 4) if values else np.nan

    metrics_rows = []

    for (style, temperature), group in tqdm(df_greedy_recs.groupby(["style", "temperature"]), desc="Evaluating"):
        per_user_recs = {}
        unmatched_titles = []
        all_recs = []

        # === Step 1: Per-user matching (preserve order) ===
        for _, row in group.iterrows():
            uid = row["user_id"]
            rec_list = [row.get(f"movie_{i}", None) for i in range(1, 11)]
            matched_list = []

            for title in rec_list:
                if title is None or (isinstance(title, float) and pd.isna(title)):
                    matched_list.append(None)
                    continue

                mid = _lookup_id(title)
                if mid is not None:
                    matched_list.append(mid)
                else:
                    # Keep the raw title so it still counts as an exposure
                    matched_list.append(title)
                    unmatched_titles.append(title)

            # keep exactly k=10
            matched_list = (matched_list + [None] * 10)[:10]
            # NOTE: a later row for the same user in this group overwrites the
            # earlier one here, while all_recs keeps both.
            per_user_recs[uid] = matched_list
            all_recs.extend(matched_list)

        # === Step 2: Cluster unmatched titles (similarity >= cutoff) ===
        title_cluster_map = cluster_unmatched_titles(unmatched_titles, cutoff)
        final_recs = [
            title_cluster_map.get(x, x) if isinstance(x, str) else x
            for x in all_recs
        ]

        # === Step 3: Exposure counting (includes IDs, clusters, None) ===
        exposure_counter = Counter(final_recs)
        exposure_counts = list(exposure_counter.values())

        # === Step 4: Accuracy metrics (averaged per user) ===
        hr_list, prec_list, ndcg_list = [], [], []
        for uid, recs in per_user_recs.items():
            gt = gt_by_user.get(uid)
            if not gt:
                continue
            gt_ids = set(gt["ground_truth_total_itemIds"]) - {x[0] for x in gt["sample_random"]}
            if not gt_ids:
                continue

            # Order preserved for NDCG; unmatched titles count as misses
            rec_ids_only = [x if isinstance(x, int) else None for x in recs]
            hr_list.append(hit_ratio_at_k(rec_ids_only, gt_ids))
            prec_list.append(precision_at_k(rec_ids_only, gt_ids))
            ndcg_list.append(ndcg_at_k(rec_ids_only, gt_ids))

        # === Step 5: Aggregate results ===
        metrics_rows.append({
            "style": style,
            "temperature": temperature,
            "HR@10": _mean_or_nan(hr_list),
            "Precision@10": _mean_or_nan(prec_list),
            "NDCG@10": _mean_or_nan(ndcg_list),
            "Gini": gini_index(exposure_counts),
            "Entropy": natural_entropy(exposure_counts),
            "num_unique_exposed_titles": len(exposure_counter),
            "num_exposure_events": sum(exposure_counts)
        })

    return pd.DataFrame(metrics_rows)

# Run evaluation on the catalog-matched recommendations.
# Rows whose user_id has no entry in df_filtered_user_data are skipped for
# the accuracy averages but still count towards the exposure metrics.
df_metrics = evaluate_all_metrics(
    df_greedy_recs=df_replaced,
    df_movies=df_movies,
    df_filtered_user_data=df_filtered_user_data,
    cutoff=0.8   # RapidFuzz similarity threshold
)



Evaluating: 100%|██████████| 28/28 [00:48<00:00,  1.72s/it]


In [11]:
# Display up to the first 40 rows of the metrics table
df_metrics.head(40)


Unnamed: 0,style,temperature,HR@10,Precision@10,NDCG@10,Gini,Entropy,num_unique_exposed_titles,num_exposure_events
0,few_shot,0.2,1.0,0.61,0.6358,0.5765,4.7344,212,1000
1,few_shot,0.5,1.0,0.6,0.6282,0.5734,4.86,241,1000
2,few_shot,0.7,1.0,0.589,0.6106,0.5519,5.0041,269,1000
3,few_shot,1.0,1.0,0.552,0.5744,0.5323,5.1569,310,1000
4,few_shot,1.2,1.0,0.488,0.5069,0.5082,5.3619,361,1000
5,few_shot,1.4,0.98,0.441,0.4544,0.4681,5.5324,406,1000
6,few_shot,1.6,0.95,0.351,0.3836,0.4498,5.5364,436,1000
7,few_shot_cot,0.2,1.0,0.614,0.6425,0.5892,4.6894,211,1000
8,few_shot_cot,0.5,1.0,0.595,0.6188,0.587,4.7687,227,1000
9,few_shot_cot,0.7,1.0,0.576,0.606,0.574,4.8306,235,1000


In [30]:
# Persist the metrics. The index is written as well (to_csv default), so
# reading the file back with a plain read_csv yields an extra 'Unnamed: 0' column.
df_metrics.to_csv("results_greedy_all_temperature/gpt_4o_mini_B.csv")

In [5]:
import pandas as pd
# Per-model metric tables from the temperature sweep (Template B)
df_gpt4o = pd.read_csv("results_greedy_all_temperature/gpt_4o_B.csv")
df_gpt4o_mini = pd.read_csv("results_greedy_all_temperature/gpt_4o_mini_B.csv")
df_mistral = pd.read_csv("results_greedy_all_temperature/mistral_B.csv")
df_mistral7 = pd.read_csv("results_greedy_all_temperature/mistral_7_B.csv")

# ── Tag with model ──
# Labels follow the ones used by the ranking config and the plot style maps
# ("mistral-large-2"), so frames tagged here line up with those lookups.
df_gpt4o['model'] = 'gpt-4o'
df_gpt4o_mini['model'] = 'gpt-4o-mini'
df_mistral['model'] = 'mistral-large-2'
df_mistral7['model'] = 'mistral-7B'

In [7]:
# Per-model/template ranking table from experiment 1 (the same file is used
# as the T = 0 source by the temperature-ranking cell)
df_greedy=pd.read_csv("experiment1/per_model_template_ranking.csv")

In [3]:
# Inspect columns, dtypes and non-null counts of the experiment-1 table
df_greedy.info()

In [4]:
# Inspect columns, dtypes and non-null counts of the gpt-4o sweep table
df_gpt4o.info()

In [9]:
import pandas as pd

# ====== Configuration ======
# Sweep result file -> model label
files_t_gt_0 = {
    "results_greedy_all_temperature/gpt_4o_B.csv": "gpt-4o",
    "results_greedy_all_temperature/gpt_4o_mini_B.csv": "gpt-4o-mini",
    "results_greedy_all_temperature/mistral_B.csv": "mistral-large-2",
    "results_greedy_all_temperature/mistral_7_B.csv": "mistral-7B",
}

file_t_eq_0 = "experiment1/per_model_template_ranking.csv"  # contains template_set = B (T=0)
metrics = ["Precision@10", "NDCG@10", "Gini", "Entropy"]

# Models dropped from the template_set = B rows
exclude_models = {"gpt-4.1-mini", "gpt-4.1-nano"}

# ====== Step 1a: sweep results (T > 0), each tagged with its model label ======
all_dfs = [
    pd.read_csv(path).assign(model=model)
    for path, model in files_t_gt_0.items()
]
df_t_gt_0 = pd.concat(all_dfs, ignore_index=True)

# ====== Step 1b: greedy rows (T = 0): template_set B, excluded models removed ======
df_t_eq_0_raw = pd.read_csv(file_t_eq_0)
keep_rows = (
    df_t_eq_0_raw["template_set"].eq("B")
    & ~df_t_eq_0_raw["model"].isin(exclude_models)
)
# Only the identifying columns and the metrics are carried over
df_t_eq_0 = df_t_eq_0_raw.loc[keep_rows, ["model", "style"] + metrics].copy()
df_t_eq_0["temperature"] = 0.0

# ====== Combine T > 0 with T = 0 ======
df_all = pd.concat([df_t_gt_0, df_t_eq_0], ignore_index=True)

# ====== Step 2: rank temperatures within each (model, style) ======
# Gini is ranked ascending (lower is better), every other metric descending
for metric in metrics:
    lower_is_better = metric == "Gini"
    df_all[f"{metric}_rank"] = (
        df_all.groupby(["model", "style"])[metric]
        .rank(ascending=lower_is_better, method="min")
    )

# Mean of the four per-metric ranks
rank_cols = [f"{m}_rank" for m in metrics]
df_all["avg_rank"] = df_all[rank_cols].mean(axis=1)

# ====== Step 3: one row per (model, style, temperature) ======
# Raw metrics, per-metric ranks and the overall rank are all averaged
mean_of = {name: "mean" for name in [*metrics, *rank_cols, "avg_rank"]}
per_model = (
    df_all
    .groupby(["model", "style", "temperature"])
    .agg(mean_of)
    .reset_index()
    .sort_values(["model", "style", "avg_rank"])
)

# ====== Step 4: flag the temperature(s) with the lowest avg_rank per (model, style) ======
lowest_avg_rank = per_model.groupby(["model", "style"])["avg_rank"].transform("min")
per_model["best_temperature"] = per_model["avg_rank"] == lowest_avg_rank

# ====== Step 5: save results ======
output_path = "per_model_temperature_ranking_for_B.csv"
per_model.to_csv(output_path, index=False)

print("Done. Files saved:")
print(f"→ {output_path}")

print("\n=== Example per-model view ===")
print(per_model.head(10))


Done. Files saved:
→ per_model_temperature_ranking_for_B.csv

=== Example per-model view ===
     model         style  temperature  Precision@10  NDCG@10    Gini  Entropy  \
0   gpt-4o      few_shot          0.0         0.605   0.6378  0.5791   4.7995   
1   gpt-4o      few_shot          0.2         0.610   0.6358  0.5765   4.7344   
2   gpt-4o      few_shot          0.5         0.600   0.6282  0.5734   4.8600   
3   gpt-4o      few_shot          0.7         0.589   0.6106  0.5519   5.0041   
4   gpt-4o      few_shot          1.0         0.552   0.5744  0.5323   5.1569   
5   gpt-4o      few_shot          1.2         0.488   0.5069  0.5082   5.3619   
6   gpt-4o      few_shot          1.4         0.441   0.4544  0.4681   5.5324   
7   gpt-4o      few_shot          1.6         0.351   0.3836  0.4498   5.5364   
13  gpt-4o  few_shot_cot          1.2         0.504   0.5240  0.5010   5.3452   
14  gpt-4o  few_shot_cot          1.4         0.420   0.4407  0.4622   5.5634   

    Precision@1

In [34]:
# Inspect columns, dtypes and non-null counts of the ranking table
per_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104 entries, 0 to 101
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              104 non-null    object 
 1   style              104 non-null    object 
 2   temperature        104 non-null    float64
 3   Precision@10       104 non-null    float64
 4   NDCG@10            104 non-null    float64
 5   Gini               104 non-null    float64
 6   Entropy            104 non-null    float64
 7   Precision@10_rank  104 non-null    float64
 8   NDCG@10_rank       104 non-null    float64
 9   Gini_rank          104 non-null    float64
 10  Entropy_rank       104 non-null    float64
 11  avg_rank           104 non-null    float64
 12  best_temperature   104 non-null    bool   
dtypes: bool(1), float64(10), object(2)
memory usage: 10.7+ KB


In [10]:
import plotly.graph_objects as go

# --- Config ---
# Colour is keyed by prompting style, dash pattern by model name.
style_colors = dict(
    zero_shot="#1f77b4",
    few_shot="#ff7f0e",
    zero_shot_cot="#2ca02c",
    few_shot_cot="#9467bd",
)
line_styles = dict(
    [
        ("gpt-4o", "solid"),
        ("gpt-4o-mini", "dot"),
        ("mistral-7B", "solid"),
        ("mistral-large-2", "dot"),
    ]
)

# Expected temperature grids (passed to plot_pair as x-axis tick positions)
mistral_temps = [0.0, 0.2, 0.5, 0.7, 1.0]
gpt_temps = [*mistral_temps, 1.2, 1.4, 1.6]

# --- Function to make plot for a pair of models ---
def plot_pair(df, models, title, expected_temps):
    """
    Draw avg_rank against temperature for the given models in one figure.

    Each (model, style) combination present in ``df`` becomes one line. The
    colour is looked up in the module-level ``style_colors`` by style and the
    dash pattern in ``line_styles`` by model.

    Args:
        df: DataFrame with "model", "style", "temperature" and "avg_rank" columns.
        models: Model names to include, matched exactly against df["model"].
        title: Figure title; rendered in bold.
        expected_temps: Values used as the x-axis tick positions and labels.

    Returns:
        The populated plotly figure.
    """
    figure = go.Figure()

    for model_name in models:
        rows_for_model = df.loc[df["model"] == model_name]

        for prompt_style in rows_for_model["style"].unique():
            style_mask = rows_for_model["style"] == prompt_style
            curve = rows_for_model.loc[style_mask].sort_values("temperature")

            temperatures = curve["temperature"]
            ranks = curve["avg_rank"]
            line_spec = {
                "color": style_colors[prompt_style],
                "dash": line_styles[model_name],
                "width": 2,
            }
            trace = go.Scatter(
                x=temperatures,
                y=ranks,
                mode="lines+markers",
                name=f"{model_name} - {prompt_style}",
                line=line_spec,
                marker={"size": 6},
            )
            figure.add_trace(trace)

    # Pin the ticks to the sampled temperatures instead of plotly's automatic ticks
    tick_labels = [str(t) for t in expected_temps]
    x_axis = {
        "tickmode": "array",
        "tickvals": expected_temps,
        "ticktext": tick_labels,
    }
    figure.update_layout(
        title=f"<b>{title}</b>",
        xaxis_title="Temperature",
        yaxis_title="Average Rank (Lower = Better)",
        template="plotly_white",
        font={"family": "Times New Roman", "size": 16},
        height=600,
        xaxis=x_axis,
    )
    return figure

# --- Plotting ---
# One figure per model family, each with its own temperature tick grid.
fig_gpt = plot_pair(
    df=per_model,
    models=["gpt-4o", "gpt-4o-mini"],
    title="GPT-4o vs GPT-4o-mini: Avg Rank vs Temperature",
    expected_temps=gpt_temps,
)
fig_gpt.show()

fig_mistral = plot_pair(
    df=per_model,
    models=["mistral-7B", "mistral-large-2"],
    title="Mistral-7B vs Mistral-Large-2: Avg Rank vs Temperature",
    expected_temps=mistral_temps,
)
fig_mistral.show()

# --- Step 2: Mean & Std Dev for each model across styles ---
# Collapse the rows sharing a (model, temperature) pair into the mean and
# standard deviation of their avg_rank values.
summary = per_model.groupby(["model", "temperature"])["avg_rank"].agg(["mean", "std"])
summary = summary.reset_index().sort_values(by=["model", "temperature"])

print("\n=== Mean and Std Dev of avg_rank per model × temperature ===")
print(summary)

# Save to CSV
summary.to_csv("avg_rank_mean_std_per_model_temperature.csv", index=False)



=== Mean and Std Dev of avg_rank per model × temperature ===
              model  temperature    mean       std
0            gpt-4o          0.0  4.3125  0.426956
1            gpt-4o          0.2  4.1875  0.657489
2            gpt-4o          0.5  4.2500  0.500000
3            gpt-4o          0.7  4.3125  0.554339
4            gpt-4o          1.0  4.1875  0.473242
5            gpt-4o          1.2  4.1875  0.554339
6            gpt-4o          1.4  4.9375  1.390069
7            gpt-4o          1.6  5.6250  1.652019
8       gpt-4o-mini          0.0  5.3750  0.829156
9       gpt-4o-mini          0.2  5.9375  0.426956
10      gpt-4o-mini          0.5  4.3125  0.746520
11      gpt-4o-mini          0.7  3.6250  0.777282
12      gpt-4o-mini          1.0  3.5000  0.353553
13      gpt-4o-mini          1.2  3.9375  1.328768
14      gpt-4o-mini          1.4  3.8125  0.239357
15      gpt-4o-mini          1.6  5.3750  0.968246
16       mistral-7B          0.0  3.0625  0.800391
17       mistral-7B 

In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# === Colors & line styles ===
# Both tables are keyed by lowercase model name.
style_colors = dict(
    [
        ("gpt-4o", "#1f77b4"),
        ("gpt-4o-mini", "#ff7f0e"),
        ("mistral-7b", "#2ca02c"),
        ("mistral-large-2", "#9467bd"),
    ]
)
line_styles = dict(
    [
        ("gpt-4o", "solid"),
        ("gpt-4o-mini", "dot"),
        ("mistral-7b", "dot"),
        ("mistral-large-2", "solid"),
    ]
)

# === Normalize model names in DataFrame (critical fix) ===
# The lookup keys used in this cell are lowercase, so the model column of
# `summary` is lowercased in place to match them.
summary["model"] = summary["model"].str.lower()

# === Temperature grids ===
mistral_temps = [0.0, 0.2, 0.5, 0.7, 1.0]
gpt_temps = [*mistral_temps, 1.2, 1.4, 1.6]

# === Subplot layout: GPT | Mistral ===
# Family label -> models drawn in that family's subplot column.
families = dict(
    [
        ("GPT-4o Family", ["gpt-4o", "gpt-4o-mini"]),
        ("Mistral Family", ["mistral-7b", "mistral-large-2"]),
    ]
)

fig = make_subplots(
    cols=2,
    rows=1,
    horizontal_spacing=0.06,
    subplot_titles=("<b>GPT-4o Family</b>", "<b>Mistral Family</b>"),
)

# === Add traces ===
# `i` is the 1-based subplot column of the family. Each model contributes one
# curve of `mean` over temperature, with the `std` column as vertical error bars.
for i, (family, models) in enumerate(families.items(), start=1):
    for model in models:
        subset = summary.loc[summary["model"] == model].sort_values(by="temperature")
        if subset.empty:
            print(f"⚠️ Skipped: {model} (no data found)")
            continue
        fig.add_trace(
            go.Scatter(
                x=subset["temperature"],
                y=subset["mean"],
                mode="lines+markers",
                name=model,  # legend shows the lowercase model key
                line={
                    "color": style_colors[model],
                    "dash": line_styles[model],
                    "width": 2,
                },
                marker={"size": 5},
                error_y={
                    "type": "data",
                    "array": subset.get("std", [0]*len(subset)),
                    "visible": True,
                    "color": style_colors[model],
                },
                hovertemplate=f"Model={model}<br>Temp=%{{x}}<br>Avg Rank=%{{y:.2f}}",
            ),
            row=1,
            col=i,
        )

# === Layout ===
# Small fixed-size canvas with a horizontal legend centred below the plots.
fig.update_layout(
    template="plotly_white",
    width=680,
    height=300,
    font={"family": "Times New Roman", "size": 12},
    legend={
        "orientation": "h",
        "x": 0.5,
        "xanchor": "center",
        "y": -0.3,
        "font": {"size": 11},
    },
    margin={"l": 45, "r": 35, "t": 55, "b": 45},
)

# === Axis labels and tick styling (bold) ===
bold_font = {"family": "Times New Roman", "size": 11, "color": "black"}

# Left subplot: ticks on the GPT temperature grid
fig.update_xaxes(
    row=1,
    col=1,
    title_text="<b>Temperature (T)</b>",
    title_font=bold_font,
    tickvals=gpt_temps,
    tickfont={"family": "Times New Roman", "size": 10, "color": "black"},
)
# Right subplot: ticks on the Mistral temperature grid
fig.update_xaxes(
    row=1,
    col=2,
    title_text="<b>Temperature (T)</b>",
    title_font=bold_font,
    tickvals=mistral_temps,
    tickfont={"family": "Times New Roman", "size": 10, "color": "black"},
)
# Only the left y-axis gets a title
fig.update_yaxes(
    row=1,
    col=1,
    title_text="<b>Average Rank (Lower = Better)</b>",
    title_font=bold_font,
    tickfont={"family": "Times New Roman", "size": 10, "color": "black"},
)
# No row/col given, so this styling is applied to every y-axis
fig.update_yaxes(
    tickfont={"family": "Times New Roman", "size": 10, "color": "black"},
    zeroline=False,
    showgrid=True,
)

# === Bold subplot titles ===
fig.update_annotations(font={"size": 14, "family": "Times New Roman", "color": "black"})


# Export an interactive HTML copy and a PDF copy, then display the figure.
# NOTE(review): write_image relies on plotly's static-image export backend
# being installed (e.g. kaleido) — confirm in the target environment.
fig.write_html("fig_temperature.html")
fig.write_image("fig_temperature.pdf", scale=3, format="pdf")  # <-- PDF export
fig.show()




In [37]:
import pandas as pd
import plotly.graph_objects as go

# --- Config ---
# Colour is keyed by prompting style, dash pattern by model name.
style_colors = dict(
    zero_shot="#1f77b4",
    few_shot="#ff7f0e",
    zero_shot_cot="#2ca02c",
    few_shot_cot="#9467bd",
)
line_styles = dict(
    [
        ("gpt-4o", "solid"),
        ("gpt-4o-mini", "dot"),
        ("mistral-7B", "dot"),
        ("mistral-large-2", "solid"),
    ]
)

# Expected temperature grids (passed to plot_pair as x-axis tick positions)
mistral_temps = [0.0, 0.2, 0.5, 0.7, 1.0]
gpt_temps = [*mistral_temps, 1.2, 1.4, 1.6]

# --- Function to make compact plot for a pair of models ---
def plot_pair(df, models, title, expected_temps, filename=None):
    """
    Build a compact figure of avg_rank versus temperature for a set of models.

    One line is drawn per (model, style) combination found in ``df``. The colour
    comes from the module-level ``style_colors`` (keyed by style) and the dash
    pattern from ``line_styles`` (keyed by model).

    Args:
        df: DataFrame with "model", "style", "temperature" and "avg_rank" columns.
        models: Model names to plot; matched against df["model"] by exact equality.
        title: Figure title; rendered in bold.
        expected_temps: Values used as the x-axis tick positions and labels.
        filename: Optional output path. When truthy, the figure is also written
            there as HTML and a confirmation line is printed.

    Returns:
        The populated plotly figure.

    Raises:
        KeyError: If a style or model that has rows in ``df`` is missing from
            ``style_colors`` / ``line_styles``.
    """
    fig = go.Figure()
    for model in models:
        model_df = df[df["model"] == model]
        if model_df.empty:
            # A misspelled or differently-cased model name would otherwise
            # silently produce a figure with missing curves.
            print(f"⚠️ Skipped: {model} (no data found)")
            continue
        for style in model_df["style"].unique():
            subset = model_df[model_df["style"] == style].sort_values("temperature")
            fig.add_trace(go.Scatter(
                x=subset["temperature"],
                y=subset["avg_rank"],
                mode="lines+markers",
                name=f"{model} - {style}",
                line=dict(
                    color=style_colors[style],
                    dash=line_styles[model],
                    width=2
                ),
                marker=dict(size=5),
            ))
    fig.update_layout(
        title=f"<b>{title}</b>",
        xaxis_title="Temperature",
        yaxis_title="Average Rank (Lower = Better)",
        template="plotly_white",
        font=dict(family="Times New Roman", size=12),
        width=550,
        height=350,
        margin=dict(l=50, r=20, t=50, b=50),
        legend=dict(
            font=dict(size=10),
            orientation="h",
            y=-0.3  # place the horizontal legend below the plot area
        ),
        xaxis=dict(
            # Pin the ticks to the sampled temperatures instead of automatic ticks
            tickmode="array",
            tickvals=expected_temps,
            ticktext=[str(t) for t in expected_temps]
        )
    )

    # Save as HTML if filename is provided
    if filename:
        fig.write_html(filename)
        print(f" Saved {filename}")

    return fig

# --- Plotting ---
# Build one figure per model family; each call also writes the figure to HTML.
fig_gpt = plot_pair(
    df=per_model,
    models=["gpt-4o", "gpt-4o-mini"],
    title="GPT-4o vs GPT-4o-mini: Avg Rank vs Temperature",
    expected_temps=gpt_temps,
    filename="gpt_family_avg_rank_vs_temperature.html",
)

fig_mistral = plot_pair(
    df=per_model,
    models=["mistral-7B", "mistral-large-2"],
    title="Mistral-7B vs Mistral-Large-2: Avg Rank vs Temperature",
    expected_temps=mistral_temps,
    filename="mistral_family_avg_rank_vs_temperature.html",
)


 Saved gpt_family_avg_rank_vs_temperature.html
 Saved mistral_family_avg_rank_vs_temperature.html
