In [22]:
import re
import json
import pandas as pd
from pathlib import Path

from narwhals import read_csv
from tqdm import tqdm


# ── Config ──
FILE_PATH = Path("data/gpt_4o/results_normal_revised/normal_t0.jsonl")
MAX_K = 10

# ── Regexes (precompiled) ──
RE_CODE_FENCE = re.compile(r"^```(?:json)?|```$", re.MULTILINE)
RE_JSON_BLOCK = re.compile(r'[{]{1,2}[\s\S]*?"recommendations"\s*:\s*\[.*?\][\s\S]*?[}]{1,2}', re.DOTALL)
RE_RECS_ARRAY = re.compile(r'"recommendations"\s*:\s*\[(.*?)\]', re.DOTALL)
RE_COMMENTS_LINE = re.compile(r'//.*')
RE_COMMENTS_BLOCK = re.compile(r'/\*[\s\S]*?\*/')
RE_TRAILING_COMMA = re.compile(r',\s*([}\]])')
RE_NUMBERED_LINE = re.compile(r'^\s*\d+[.)]\s+(.*)$')  # capture rest of the line
RE_QUOTED_TITLE = re.compile(r'^\s*"([^"]+)"\s*$')
RE_TITLE_YEAR_1 = re.compile(r'^\s*"([^"]+\(\d{4}\))"\s*$')          # "Title (Year)"
RE_TITLE_YEAR_2 = re.compile(r'^\s*"([^"]+)"\s*\((\d{4})\)\s*$')     # "Title" (Year)
RE_TITLE_YEAR_3 = re.compile(r'^\s*([^(]+)\s*\((\d{4})\)\s*$')       # Title (Year)
RE_QUOTE_STRIP = re.compile(r'^[\'"]|[\'"]$')


def _strip_code_fences(s: str) -> str:
    return RE_CODE_FENCE.sub('', s)

def _clean_jsonish(s: str) -> str:
    s = _strip_code_fences(s)
    s = RE_COMMENTS_LINE.sub('', s)      # remove // ...
    s = RE_COMMENTS_BLOCK.sub('', s)     # remove /* ... */
    s = RE_TRAILING_COMMA.sub(r'\1', s)  # remove trailing commas
    
    # Additional cleaning for genre removal in JSON strings
    # Remove inline comments that might contain genres
    s = re.sub(r'//\s*[A-Za-z\s|]+$', '', s, flags=re.MULTILINE)
    
    # Clean up any malformed JSON with genres in strings
    # This handles cases like: "Movie Title (Year) - Drama"
    s = re.sub(r'"([^"]*)\s*[-–—]\s*[A-Za-z\s|]+"', r'"\1"', s)
    
    return s.strip()




RE_DASH_SPLIT   = re.compile(r'\s*[–—-]\s*')           # en dash, em dash, hyphen
RE_SMART_QUOTES = re.compile(r'^[\s"“"''«»]+|[\s"“"''«»]+$')
RE_WS_COLLAPSE  = re.compile(r'\s+')
# Enhanced regex for genre removal - handles various patterns
RE_GENRE_REMOVAL = re.compile(r'\s*[-–—]\s*[^()]*$')  # Remove everything after dash/hyphen
RE_GENRE_REMOVAL_2 = re.compile(r'\s*[-–—]\s*[A-Za-z\s|]+$')  # Remove genre patterns like " - Drama|Action"
RE_GENRE_REMOVAL_3 = re.compile(r'\s*[-–—]\s*[A-Za-z\s|]+$')  # Another pattern
RE_GENRE_REMOVAL_4 = re.compile(r'\s*[-–—]\s*[^()]*\s*$')  # More general pattern

def _norm_title(t: str) -> str:
    if not isinstance(t, str):
        return ""
    t = t.strip()
    
    # Remove genres more aggressively - handle multiple patterns
    # Pattern 1: Remove everything after dash/hyphen (most common)
    t = re.sub(r'\s*[-–—]\s*[^()]*$', '', t)
    
    # Pattern 2: Remove genre patterns like " - Drama|Action"
    t = re.sub(r'\s*[-–—]\s*[A-Za-z\s|]+$', '', t)
    
    # Pattern 3: Remove genre patterns with parentheses like " - (Drama)"
    t = re.sub(r'\s*[-–—]\s*\([^)]*\)$', '', t)
    
    # Pattern 4: Remove genre patterns with brackets like " - [Drama]"
    t = re.sub(r'\s*[-–—]\s*\[[^\]]*\]$', '', t)
    
    # Pattern 5: Remove any remaining genre-like text after dashes
    t = re.sub(r'\s*[-–—]\s*[A-Za-z\s|&]+$', '', t)
    
    # Pattern 6: Handle cases where genres are separated by pipes
    t = re.sub(r'\s*[-–—]\s*[A-Za-z]+(\|[A-Za-z]+)*$', '', t)
    
    # Remove any remaining genre-like patterns
    # Look for patterns like " - Genre" or " – Genre" at the end
    t = re.sub(r'\s*[-–—]\s*[A-Za-z\s|&]+$', '', t)
    
    # Clean up any remaining quotes and whitespace
    t = RE_SMART_QUOTES.sub('', t)       # strip ASCII + smart quotes at ends
    t = RE_WS_COLLAPSE.sub(' ', t)       # collapse whitespace
    
    # Final cleanup - remove any trailing punctuation that might be left
    t = re.sub(r'\s*[-–—]\s*$', '', t)
    
    return t



def _dedupe_preserve_order(items):
    seen = set()
    out = []
    for x in items:
        if x and x not in seen:
            seen.add(x)
            out.append(x)
    return out

def _from_json_object(response: str, k=MAX_K):
    """
    Strict JSON block: parse whole object and read .recommendations
    """
    s = _clean_jsonish(response)
    if not (s.startswith('{') and s.endswith('}')):
        return []
    try:
        obj = json.loads(s)
        recs = obj.get("recommendations", [])
        titles = []
        if isinstance(recs, list):
            for r in recs:
                if isinstance(r, dict) and "title" in r:
                    titles.append(_norm_title(str(r["title"])))
                elif isinstance(r, str):
                    # Handle strings that might contain genres
                    clean_title = _norm_title(r)
                    titles.append(clean_title)
        return titles[:k]
    except Exception:
        return []

def _from_embedded_json(response: str, k=MAX_K):
    """
    JSON-ish block embedded in prose: find block, clean, parse
    """
    m = RE_JSON_BLOCK.search(response)
    if not m:
        return []
    block = _clean_jsonish(m.group(0))
    # Try as-is
    try:
        obj = json.loads(block)
        recs = obj.get("recommendations", [])
        titles = []
        for r in recs:
            if isinstance(r, dict) and "title" in r:
                titles.append(_norm_title(str(r["title"])))
            elif isinstance(r, str):
                # Handle strings that might contain genres
                clean_title = _norm_title(r)
                titles.append(clean_title)
        return titles[:k]
    except Exception:
        # Fallback: extract just the array text and parse it as JSON list
        m2 = RE_RECS_ARRAY.search(block)
        if not m2:
            return []
        raw = "[" + m2.group(1) + "]"
        raw = _clean_jsonish(raw)
        # Attempt to turn into a valid JSON list of strings
        try:
            arr = json.loads(raw)
            titles = []
            for r in arr:
                if isinstance(r, dict) and "title" in r:
                    titles.append(_norm_title(str(r["title"])))
                elif isinstance(r, str):
                    # Handle strings that might contain genres
                    clean_title = _norm_title(r)
                    titles.append(clean_title)
            return titles[:k]
        except Exception:
            # Very loose fallback: split by commas and clean each part
            parts = [p.strip() for p in m2.group(1).split(",")]
            titles = []
            for part in parts:
                if part.strip():
                    # Clean quotes and apply genre removal
                    clean_part = part.strip('"\'').strip()
                    clean_title = _norm_title(clean_part)
                    if clean_title:
                        titles.append(clean_title)
            return titles[:k]

def _from_numbered_list(response: str, k=MAX_K):
    """
    Parse Template C (numbered list) robustly:
    - Start after the first line containing 'recommendations'
    - Accept 1) or 1. prefixes
    - Strip genre tails after -, – or —
    - Handle ASCII and smart quotes
    """
    text = response.replace("\\n", "\n")
    lines = [ln.rstrip() for ln in text.splitlines()]

    # Start AFTER the first line that mentions 'recommendations'
    start = 0
    for i, ln in enumerate(lines):
        if "recommendations" in ln.lower():
            start = i + 1
            break

    titles = []
    for ln in lines[start:]:
        mnum = RE_NUMBERED_LINE.match(ln)  # r'^\s*\d+[.)]\s+(.*)$'
        if not mnum:
            continue

        rest = mnum.group(1).strip()

        # Cut off any trailing genre / notes after a dash, en dash, or em dash
        rest = RE_DASH_SPLIT.split(rest)[0].strip(" -–—\t")

        # Strip smart/ASCII quotes around the title portion
        rest = RE_SMART_QUOTES.sub('', rest)
        rest = RE_WS_COLLAPSE.sub(' ', rest)

        # If ends with (YYYY), normalize to "Title (YYYY)"
        myear = re.search(r'\((\d{4})\)\s*$', rest)
        if myear:
            year = myear.group(1)
            title = re.sub(r'\s*\(\d{4}\)\s*$', '', rest)
            title = RE_SMART_QUOTES.sub('', title).strip()
            norm = _norm_title(f"{title} ({year})")
        else:
            norm = _norm_title(rest)

        if norm:
            titles.append(norm)
            if len(titles) >= k:
                break

    return titles[:k]



def _extract_from_quoted_strings(text: str, k=MAX_K):
    """
    Extract movie titles from quoted strings, handling genres.
    This is a fallback when JSON parsing fails.
    """
    # Look for quoted strings that look like movie titles
    # Pattern: "Movie Title (Year)" or "Movie Title (Year) - Genre"
    movie_pattern = r'"([^"]*\(\d{4}\)[^"]*)"'
    matches = re.findall(movie_pattern, text)
    
    titles = []
    for match in matches:
        clean_title = _norm_title(match)
        if clean_title and clean_title not in titles:
            titles.append(clean_title)
            if len(titles) >= k:
                break
    
    return titles[:k]

def extract_topk(response: str, k=MAX_K):
    """
    Unified extractor with ordered fallbacks:
    1) strict JSON object
    2) embedded JSON-ish block
    3) numbered list after 'recommendations'
    4) quoted strings extraction
    Normalizes, dedupes, and returns up to k titles.
    """
    if not isinstance(response, str) or not response.strip():
        return []

    # 1) Pure JSON object
    titles = _from_json_object(response, k)
    if len(titles) >= k:
        return _dedupe_preserve_order(titles)[:k]

    # 2) Embedded JSON-ish
    if not titles:
        titles = _from_embedded_json(response, k)
        if len(titles) >= k:
            return _dedupe_preserve_order(titles)[:k]

    # 3) Numbered list (Template C)
    if not titles or len(titles) < k:
        t2 = _from_numbered_list(response, k)
        titles.extend([x for x in t2 if x])

    # 4) Quoted strings extraction (fallback)
    if not titles or len(titles) < k:
        t3 = _extract_from_quoted_strings(response, k)
        titles.extend([x for x in t3 if x])

    # Normalize & dedupe once at the end
    titles = _dedupe_preserve_order([_norm_title(t) for t in titles if t])
    return titles[:k]

# ── Main: read, extract, build DataFrame ──
rows = []
with FILE_PATH.open("r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Parsing recommendations"):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue

        recs = extract_topk(obj.get("response", ""), k=MAX_K)

        rows.append({
            "user_id": obj.get("user_id"),
            "template_set": obj.get("template_set"),
            "style": obj.get("style"),
            "recommended_movies": recs
        })

df_greedy_recs = pd.DataFrame(rows)

# Expand to movie_1 … movie_10
for i in range(MAX_K):
    df_greedy_recs[f"movie_{i+1}"] = df_greedy_recs["recommended_movies"].apply(
        lambda x, i=i: x[i] if isinstance(x, list) and len(x) > i else None
    )

print(df_greedy_recs.head(2))


Parsing recommendations: 1200it [00:00, 3185.23it/s]

   user_id template_set         style  \
0       10            B     zero_shot   
1       10            A  few_shot_cot   

                                  recommended_movies               movie_1  \
0  [The Godfather (1972), Casablanca (1942), The ...  The Godfather (1972)   
1  [Casablanca (1942), Rear Window (1954), Chinat...     Casablanca (1942)   

              movie_2                          movie_3               movie_4  \
0   Casablanca (1942)  The Shawshank Redemption (1994)   Pulp Fiction (1994)   
1  Rear Window (1954)                 Chinatown (1974)  The Godfather (1972)   

                  movie_5                    movie_6  \
0  The Dark Knight (2008)    Schindler's List (1993)   
1    The Third Man (1949)  The Maltese Falcon (1941)   

                                 movie_7                          movie_8  \
0                       Inception (2010)                Fight Club (1999)   
1  The Good, the Bad and the Ugly (1966)  The Shawshank Redemption (1994)   





In [18]:
df_greedy_recs.head(1800)

Unnamed: 0,user_id,template_set,style,recommended_movies,movie_1,movie_2,movie_3,movie_4,movie_5,movie_6,movie_7,movie_8,movie_9,movie_10
0,10,B,zero_shot,"[The Godfather (1972), Casablanca (1942), The ...",The Godfather (1972),Casablanca (1942),The Shawshank Redemption (1994),Pulp Fiction (1994),The Dark Knight (2008),Schindler's List (1993),Inception (2010),Fight Club (1999),Forrest Gump (1994),The Matrix (1999)
1,10,A,few_shot_cot,"[Casablanca (1942), Rear Window (1954), Chinat...",Casablanca (1942),Rear Window (1954),Chinatown (1974),The Godfather (1972),The Third Man (1949),The Maltese Falcon (1941),"The Good, the Bad and the Ugly (1966)",The Shawshank Redemption (1994),The Silence of the Lambs (1991),The Graduate (1967)
2,10,A,few_shot,"[Casablanca (1942), Rear Window (1954), Chinat...",Casablanca (1942),Rear Window (1954),Chinatown (1974),The Godfather (1972),The Third Man (1949),The Maltese Falcon (1941),The Big Sleep (1946),Double Indemnity (1944),Sunset Blvd. (1950),"The Good, the Bad and the Ugly (1966)"
3,10,B,few_shot_cot,"[Casablanca (1942), Rear Window (1954), Chinat...",Casablanca (1942),Rear Window (1954),Chinatown (1974),The Godfather (1972),The Third Man (1949),The Maltese Falcon (1941),The Big Sleep (1946),Butch Cassidy and the Sundance Kid (1969),The Graduate (1967),The Apartment (1960)
4,10,B,few_shot,"[Casablanca (1942), Rear Window (1954), Chinat...",Casablanca (1942),Rear Window (1954),Chinatown (1974),The Godfather (1972),The Third Man (1949),The Maltese Falcon (1941),The Graduate (1967),The Big Sleep (1946),The French Connection (1971),Butch Cassidy and the Sundance Kid (1969)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,943,C,few_shot,"[Die Hard (1988), Lethal Weapon (1987), Ghostb...",Die Hard (1988),Lethal Weapon (1987),Ghostbusters (1984),Back to the Future (1985),Terminator 2: Judgment Day (1991),Predator (1987),Aliens (1986),Beverly Hills Cop (1984),Mad Max: Fury Road (2015),The Matrix (1999)
1196,943,C,few_shot_cot,"[Die Hard (1988), Lethal Weapon (1987), Ghostb...",Die Hard (1988),Lethal Weapon (1987),Ghostbusters (1984),The Fifth Element (1997),The Rock (1996),Beverly Hills Cop (1984),The Terminator (1984),Back to the Future (1985),Predator (1987),The Mask (1994)
1197,943,B,zero_shot_cot,[],,,,,,,,,,
1198,943,B,few_shot_cot,"[Die Hard (1988), Lethal Weapon (1987), Jurass...",Die Hard (1988),Lethal Weapon (1987),Jurassic Park (1993),Terminator 2: Judgment Day (1991),Back to the Future (1985),Ghostbusters (1984),Aliens (1986),Pulp Fiction (1994),"Matrix, The (1999)",Fargo (1996)


In [16]:
df_greedy_recs.to_csv("greedy_results_normal/df_template_gpt_4o_mini.csv")

In [17]:
# ─────────────────────────────────────────────────────────────
# Inputs
# ─────────────────────────────────────────────────────────────
target_user_id = 378
target_template_set = "B"
target_prompt_type = "few_shot"  # or use 'style' depending on your column
recommended_list = [
  "Goodfellas (1990)",
    "The Silence of the Lambs (1991)",
    "Terminator 2: Judgment Day (1991)",
    "Jurassic Park (1993)",
    "Pulp Fiction (1994)",
    "Forrest Gump (1994)",
    "The Shawshank Redemption (1994)",
    "Pulp Fiction (1994)",
    "Se7en (1995)",
    "The English Patient (1996)"
]


# ─────────────────────────────────────────────────────────────
# Row Mask: Filter the exact row you want to assign
# ─────────────────────────────────────────────────────────────
row_mask = (
    (df_greedy_recs['user_id'] == target_user_id) &
    (df_greedy_recs['template_set'] == target_template_set) &
    (df_greedy_recs['style'] == target_prompt_type)  # or 'prompt_type' if applicable
)

# ─────────────────────────────────────────────────────────────
# Assign the full list to the 'recommended_movies' column
# ─────────────────────────────────────────────────────────────
df_greedy_recs.loc[row_mask, 'recommended_movies'] = \
    df_greedy_recs.loc[row_mask].apply(lambda _: recommended_list, axis=1)

# ─────────────────────────────────────────────────────────────
# Optionally expand into movie_1 to movie_10
# ─────────────────────────────────────────────────────────────
for i in range(10):
    col = f"movie_{i+1}"
    df_greedy_recs.loc[row_mask, col] = recommended_list[i]


In [99]:
import pandas as pd
col_names_for_user_ratings = ['user_id', 'movie_id', 'rating', 'timestamp']
col_names_for_users = ['user_id' , 'age' , 'gender' , 'occupation' ,'zip code']


df_user_ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    header=None,
    names=col_names_for_user_ratings,
    encoding='latin-1'
)

# Step 1: Count interactions
movie_interactions = (
    df_user_ratings
    .groupby('movie_id')
    .size()
    .reset_index(name='interaction_count')
    .sort_values(by='interaction_count', ascending=False)
)


In [101]:

df_user_ratings["date"] = pd.to_datetime(df_user_ratings["timestamp"], unit='s')

# Count interactions per user
interaction_counts = df_user_ratings.groupby('user_id').size()

# Filter users with more than 60 interactions (strictly > 100)
valid_users = interaction_counts[interaction_counts >= 100].index

# Apply the filter
df_filtered_user_ratings = df_user_ratings[df_user_ratings['user_id'].isin(valid_users)].copy()

# Sort interactions chronologically per user
df_filtered_user_ratings.sort_values(['user_id', 'date'], inplace=True)

# Split each user's history: 80% train, 20% test
train_list = []
test_list = []

for uid, user_df in df_filtered_user_ratings.groupby('user_id', sort=False):
    n = len(user_df)
    split_pt = int(n * 0.8)
    train_list.append(user_df.iloc[:split_pt])
    test_list.append(user_df.iloc[split_pt:])

# Combine all user splits
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Output some stats
print(f"After filtering, {len(valid_users)} users remain.")
print(f"Train set: {len(train_df)} rows")
print(f"Test set:  {len(test_df)} rows")

print(f"Minimum number of interactions among kept users: {interaction_counts[valid_users].min()}")
print(f"Number of users with <= 100 interactions: {(interaction_counts < 100).sum()}")


After filtering, 364 users remain.
Train set: 59469 rows
Test set:  15053 rows
Minimum number of interactions among kept users: 100
Number of users with <= 100 interactions: 579


In [102]:
import random

# Turn your valid_users (Index or list) into a plain list:
valid_users_list = list(valid_users)


three_picked = [1,92,433]
print("Picked users:", three_picked)
remaining_valid_users_list = [u for u in valid_users_list if u not in three_picked]
print("Remaining valid users:", remaining_valid_users_list)

Picked users: [1, 92, 433]
Remaining valid users: [5, 6, 7, 10, 11, 13, 15, 16, 18, 21, 22, 23, 26, 38, 42, 43, 44, 49, 56, 57, 58, 59, 60, 62, 64, 70, 72, 82, 83, 85, 87, 90, 94, 95, 99, 102, 104, 109, 110, 116, 119, 125, 128, 130, 141, 144, 145, 151, 152, 158, 159, 160, 174, 177, 178, 181, 184, 188, 189, 193, 194, 197, 198, 200, 201, 207, 210, 213, 214, 216, 221, 222, 223, 224, 230, 233, 234, 236, 239, 244, 246, 249, 250, 254, 256, 262, 263, 264, 267, 268, 269, 270, 271, 276, 279, 280, 286, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 303, 305, 307, 308, 311, 312, 313, 314, 318, 320, 321, 325, 326, 327, 328, 330, 332, 334, 336, 339, 342, 343, 344, 345, 346, 347, 354, 360, 361, 363, 373, 374, 378, 379, 380, 381, 385, 387, 389, 391, 392, 393, 394, 397, 398, 399, 401, 405, 406, 407, 409, 416, 417, 425, 426, 429, 435, 436, 437, 442, 445, 447, 450, 452, 453, 454, 455, 456, 457, 458, 459, 463, 466, 468, 472, 474, 478, 479, 484, 486, 487, 488, 489, 493, 495, 496, 497, 498, 499, 50

In [103]:
import numpy as np

# --- 7) Sampling helper ---
def select_sample(user_ids, k=10, random_state=42):
    """
    For each user:
      - ground_truth_total_itemIds: all items (train+test)
      - ground_truth_test_itemIds: only test items
      - sample_random: up to k random items from their train history, ordered chronologically
    """
    rng = np.random.default_rng(random_state)
    records = []

    for uid in user_ids:
        user_all = df_filtered_user_ratings.loc[df_filtered_user_ratings.user_id == uid]
        total_items = user_all['movie_id'].tolist()

        user_test = test_df.loc[test_df.user_id == uid]
        test_items = user_test['movie_id'].tolist()

        user_train = train_df.loc[train_df.user_id == uid].copy()

        if user_train.empty:
            # No train data — keep sample empty but still record ground truth
            random_sample = []
        else:
            # sample up to k rows (no replacement)
            n_pick = min(k, len(user_train))
            # Use pandas sample with a deterministic seed per user for reproducibility
            # Seed is derived from (random_state, uid) so different users differ but are reproducible
            seed = (hash((random_state, int(uid))) % (2**32 - 1))
            sample_df = (
                user_train
                .sample(n=n_pick, replace=False, random_state=seed)
                .sort_values('date', ascending=True)
            )
            # keep only movie_id and rating
            keep_cols = [c for c in ['movie_id', 'rating'] if c in sample_df.columns]
            random_sample = sample_df[keep_cols].values.tolist()

        records.append({
            'user_id': uid,
            'ground_truth_total_itemIds': total_items,
            'ground_truth_test_itemIds': test_items,
            'sample_random': random_sample
        })

    return pd.DataFrame(records)

# --- 8) Build the two DataFrames ---
df_filtered_user_data = select_sample(remaining_valid_users_list)
df_filtered_example_user_data = select_sample(three_picked)

print("df_filtered_user_data shape:", df_filtered_user_data.shape)
print("df_filtered_example_user_data shape:", df_filtered_example_user_data.shape)

df_filtered_user_data shape: (362, 4)
df_filtered_example_user_data shape: (3, 4)


In [104]:
df_movies=pd.read_csv("data/final_movies.csv")


In [105]:
df_extra_movies=pd.read_csv("data/movies.csv",encoding='latin-1')

In [153]:
df_greedy_recs = pd.read_csv("data/gpt_4o_mini/results_normal/df_template_gpt_40_mini.csv")

In [154]:
import re
import pandas as pd
from rapidfuzz import fuzz, process
from tqdm import tqdm

# === Cleaning Function ===
RE_YEAR = re.compile(r"\(\d{4}\)")
RE_QUOTES = re.compile(r'^"+|"+$')
RE_SPACES = re.compile(r'\s+')
RE_TRAIL_GENRES = re.compile(r'\s*[-–—]\s*.*$')
RE_SPECIAL = re.compile(r"[^a-zA-Z0-9\s]")

def clean_title_for_matching(title):
    """Normalize movie titles by removing punctuation, years, colons, etc., without truncation."""
    if not isinstance(title, str):
        return ""
    title = title.replace(":", "")  #  FIX: Remove colon instead of truncating after it
    title = RE_TRAIL_GENRES.sub("", title)
    title = RE_YEAR.sub("", title)
    title = RE_QUOTES.sub("", title)
    title = RE_SPECIAL.sub("", title)
    return RE_SPACES.sub(" ", title).strip().lower()

# === Matching Function ===
def match_to_catalogs(title, df_movies, df_extra_movies, cutoff=0.8):
    """
    Try to match a given title to df_movies first, then df_extra_movies.
    If match found, return the official catalog title. Otherwise return None.
    """
    if not isinstance(title, str) or not title.strip():
        return None

    title_cleaned = clean_title_for_matching(title)

    # --- Step 1: Primary catalog (df_movies) ---
    best_primary = process.extractOne(
        title_cleaned,
        df_movies["clean_title"],
        scorer=fuzz.token_sort_ratio,
        score_cutoff=cutoff * 100
    )
    if best_primary is not None:
        matched_title = df_movies.loc[
            df_movies["clean_title"] == best_primary[0], "title"
        ].iloc[0]
        return matched_title

    # --- Step 2: Secondary catalog (df_extra_movies) ---
    best_extra = process.extractOne(
        title_cleaned,
        df_extra_movies["clean_title"],
        scorer=fuzz.token_sort_ratio,
        score_cutoff=cutoff * 100
    )
    if best_extra is not None:
        matched_title = df_extra_movies.loc[
            df_extra_movies["clean_title"] == best_extra[0], "title"
        ].iloc[0]
        return matched_title

    # --- Step 3: No match found ---
    return None

# === Main Replacement Function ===
def replace_titles_with_matched(df_greedy_recs, df_movies, df_extra_movies, cutoff=0.8):
    """
    Replace each movie_i column in df_greedy_recs with the matched title from df_movies or df_extra_movies.
    If not matched, replace with None.
    """
    # Prepare both catalogs with cleaned titles
    df_movies = df_movies.copy()
    df_extra_movies = df_extra_movies.copy()
    df_movies["clean_title"] = df_movies["title"].apply(clean_title_for_matching)
    df_extra_movies["clean_title"] = df_extra_movies["title"].apply(clean_title_for_matching)

    # Copy df_greedy_recs to avoid modifying original
    df_replaced = df_greedy_recs.copy()

    # Iterate through each recommendation column
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    for col in tqdm(movie_cols, desc="Matching all movie titles"):
        df_replaced[col] = df_replaced[col].apply(
            lambda x: match_to_catalogs(x, df_movies, df_extra_movies, cutoff)
        )

    return df_replaced

# === Unmatched Movie Extractor ===
def get_unmatched_titles(df_original, df_replaced):
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    unmatched_titles = []

    for col in movie_cols:
        original = df_original[col]
        replaced = df_replaced[col]
        unmatched = original[replaced.isna()]
        unmatched_titles.extend(unmatched.tolist())

    unmatched_unique = pd.Series(unmatched_titles).dropna().unique().tolist()
    return unmatched_unique

# === Example Usage ===
# Make sure df_greedy_recs, df_movies, df_extra_movies are defined above this call
df_replaced = replace_titles_with_matched(df_greedy_recs, df_movies, df_extra_movies, cutoff=0.8)

# Get unmatched titles
unmatched_titles = get_unmatched_titles(df_greedy_recs, df_replaced)

# Display result
print(f"\nTotal unmatched titles: {len(unmatched_titles)}")
for title in unmatched_titles:
    print("-", title)

# Optional: Save unmatched to CSV
# pd.Series(unmatched_titles, name="unmatched_titles").to_csv("unmatched_titles.csv", index=False)


Matching all movie titles: 100%|██████████| 10/10 [01:10<00:00,  7.00s/it]


Total unmatched titles: 0





In [155]:
unmatched_titles

[]

In [158]:
def summarize_unmatched_titles(df_replaced):
    """
    Summarize how many None (unmatched) titles remain in df_replaced,
    grouped by (style, template_set).
    """
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    summary_rows = []

    for (style, template), group in df_replaced.groupby(["style", "template_set"]):
        total_titles = group[movie_cols].size               # total number of cells (users × 10)
        none_count = group[movie_cols].isna().sum().sum()   # total number of None (unmatched)
        matched_count = total_titles - none_count

        summary_rows.append({
            "style": style,
            "template_set": template,
            "total_titles_checked": total_titles,
            "matched_titles": matched_count,
            "unmatched_titles": none_count,
            "match_rate_%": round(100 * matched_count / total_titles, 2)
        })

    return pd.DataFrame(summary_rows).sort_values(by=["style", "template_set"]).reset_index(drop=True)

# === Run the summary ===
df_summary = summarize_unmatched_titles(df_replaced)
print(df_summary)


            style template_set  total_titles_checked  matched_titles  \
0        few_shot            A                  1000            1000   
1        few_shot            B                  1000            1000   
2        few_shot            C                  1000            1000   
3    few_shot_cot            A                  1000            1000   
4    few_shot_cot            B                  1000            1000   
5    few_shot_cot            C                  1000            1000   
6       zero_shot            A                  1000            1000   
7       zero_shot            B                  1000            1000   
8       zero_shot            C                  1000            1000   
9   zero_shot_cot            A                  1000            1000   
10  zero_shot_cot            B                  1000            1000   
11  zero_shot_cot            C                  1000            1000   

    unmatched_titles  match_rate_%  
0                  0      

In [141]:
df_greedy_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             1200 non-null   int64 
 1   template_set        1200 non-null   object
 2   style               1200 non-null   object
 3   recommended_movies  1200 non-null   object
 4   movie_1             1200 non-null   object
 5   movie_2             1200 non-null   object
 6   movie_3             1200 non-null   object
 7   movie_4             1200 non-null   object
 8   movie_5             1200 non-null   object
 9   movie_6             1200 non-null   object
 10  movie_7             1200 non-null   object
 11  movie_8             1200 non-null   object
 12  movie_9             1200 non-null   object
 13  movie_10            1200 non-null   object
dtypes: int64(1), object(13)
memory usage: 131.4+ KB


In [159]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from scipy.stats import entropy
from rapidfuzz import fuzz


# === RapidFuzz Matching ===
def match_title_to_id(title, catalog_titles, title_to_id, cutoff=0.8):
    """Return movie_id if matched above cutoff; else None."""
    if not isinstance(title, str) or not title.strip():
        return None
    best_match = None
    best_score = 0
    for candidate in catalog_titles:
        score = fuzz.token_sort_ratio(title, candidate) / 100
        if score > best_score:
            best_match = candidate
            best_score = score
    return title_to_id.get(best_match) if best_score >= cutoff else None


# === Unmatched Title Clustering ===
def cluster_unmatched_titles(unmatched_titles, cutoff=0.8):
    """
    Group similar unmatched titles (>= cutoff similarity) using RapidFuzz.
    Returns mapping {title: representative_title}.
    """
    unmatched_titles = list(set([t for t in unmatched_titles if isinstance(t, str) and t.strip()]))
    seen = set()
    mapping = {}

    for t in unmatched_titles:
        if t in seen:
            continue
        mapping[t] = t
        seen.add(t)
        for other in unmatched_titles:
            if other not in seen:
                score = fuzz.token_sort_ratio(t, other) / 100
                if score >= cutoff:
                    mapping[other] = t
                    seen.add(other)
    return mapping


# === Accuracy Metrics ===
def hit_ratio_at_k(rec_ids, gt_ids, k=10):
    return int(any(x in gt_ids for x in rec_ids[:k] if isinstance(x, int)))


def precision_at_k(rec_ids, gt_ids, k=10):
    topk = [x for x in rec_ids[:k] if isinstance(x, int)]
    return len([x for x in topk if x in gt_ids]) / k


def ndcg_at_k(rec_ids, gt_ids, k=10):
    dcg = sum(1 / np.log2(i + 2) for i, x in enumerate(rec_ids[:k]) if isinstance(x, int) and x in gt_ids)
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(gt_ids), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0


# === Fairness/Diversity Metrics ===
def gini_index(counts):
    """Gini coefficient of exposure inequality."""
    sorted_vals = np.sort(np.array(counts))
    n = len(sorted_vals)
    if n == 0 or sorted_vals.sum() == 0:
        return 0.0
    index = np.sum((2 * np.arange(1, n + 1) - n - 1) * sorted_vals)
    return round(index / (n * sorted_vals.sum()), 4)


def natural_entropy(counts):
    """Entropy of exposure diversity."""
    total = sum(counts)
    if total == 0:
        return 0.0
    p = np.array(counts) / total
    return round(entropy(p, base=np.e), 4)


# === Main Evaluation ===
def evaluate_all_metrics(df_greedy_recs, df_movies, df_filtered_user_data, cutoff=0.8):
    """
    Evaluate all metrics grouped by (style, template_set):
    - Accuracy: HR@10, Precision@10, NDCG@10 (per user, averaged)
    - Fairness/Diversity: Gini, Entropy (from exposure counts)
    """
    catalog_titles = df_movies["title"].tolist()
    title_to_id = dict(zip(df_movies["title"], df_movies["movie_id"]))
    gt_by_user = df_filtered_user_data.set_index("user_id").to_dict("index")

    metrics_rows = []

    for (style, template), group in tqdm(df_greedy_recs.groupby(["style", "template_set"]), desc="Evaluating"):
        per_user_recs = {}
        unmatched_titles = []
        all_recs = []

        # === Step 1: Per-user matching (preserve order) ===
        for _, row in group.iterrows():
            uid = row["user_id"]
            rec_list = [row.get(f"movie_{i}", None) for i in range(1, 11)]
            matched_list = []

            for title in rec_list:
                if title is None or (isinstance(title, float) and pd.isna(title)):
                    matched_list.append(None)
                    continue

                mid = match_title_to_id(title, catalog_titles, title_to_id, cutoff)
                if mid is not None:
                    matched_list.append(mid)
                else:
                    matched_list.append(title)
                    unmatched_titles.append(title)

            # keep exactly k=10
            matched_list = (matched_list + [None] * 10)[:10]
            per_user_recs[uid] = matched_list
            all_recs.extend(matched_list)

        # === Step 2: Cluster unmatched titles (RapidFuzz ≥ 0.7) ===
        title_cluster_map = cluster_unmatched_titles(unmatched_titles, cutoff)
        final_recs = [
            title_cluster_map.get(x, x) if isinstance(x, str) else x
            for x in all_recs
        ]

        # === Step 3: Exposure counting (includes IDs, clusters, None) ===
        exposure_counter = Counter(final_recs)
        exposure_counts = list(exposure_counter.values())

        # === Step 4: Accuracy metrics (averaged per user) ===
        hr_list, prec_list, ndcg_list = [], [], []
        for uid, recs in per_user_recs.items():
            gt = gt_by_user.get(uid)
            if not gt:
                continue
            gt_ids = set(gt["ground_truth_total_itemIds"]) - {x[0] for x in gt["sample_random"]}
            if not gt_ids:
                continue

            # Order preserved for NDCG
            rec_ids_only = [x if isinstance(x, int) else None for x in recs]
            hr_list.append(hit_ratio_at_k(rec_ids_only, gt_ids))
            prec_list.append(precision_at_k(rec_ids_only, gt_ids))
            ndcg_list.append(ndcg_at_k(rec_ids_only, gt_ids))

        # === Step 5: Aggregate results ===
        metrics_rows.append({
            "style": style,
            "template_set": template,
            "HR@10": round(np.mean(hr_list), 4),
            "Precision@10": round(np.mean(prec_list), 4),
            "NDCG@10": round(np.mean(ndcg_list), 4),
            "Gini": gini_index(exposure_counts),
            "Entropy": natural_entropy(exposure_counts),
            "num_unique_exposed_titles": len(exposure_counter),
            "num_exposure_events": sum(exposure_counts)
        })

    return pd.DataFrame(metrics_rows)

# Run evaluation
df_metrics = evaluate_all_metrics(
    df_greedy_recs=df_replaced,
    df_movies=df_movies,
    df_filtered_user_data=df_filtered_user_data,
    cutoff=0.8   # RapidFuzz similarity threshold
)



Evaluating: 100%|██████████| 12/12 [00:23<00:00,  1.99s/it]


In [160]:
df_metrics.head(100)

Unnamed: 0,style,template_set,HR@10,Precision@10,NDCG@10,Gini,Entropy,num_unique_exposed_titles,num_exposure_events
0,few_shot,A,0.93,0.297,0.3304,0.6579,4.2582,165,1000
1,few_shot,B,0.91,0.317,0.3505,0.6389,4.3755,177,1000
2,few_shot,C,0.95,0.318,0.3491,0.64,4.3032,163,1000
3,few_shot_cot,A,0.94,0.311,0.3399,0.6587,4.1581,147,1000
4,few_shot_cot,B,0.93,0.314,0.3433,0.6629,4.172,153,1000
5,few_shot_cot,C,0.98,0.334,0.3649,0.6401,4.2982,161,1000
6,zero_shot,A,0.96,0.312,0.3784,0.5332,2.6442,25,1000
7,zero_shot,B,0.89,0.292,0.3052,0.7346,3.2703,77,1000
8,zero_shot,C,0.83,0.283,0.2989,0.696,3.7325,107,1000
9,zero_shot_cot,A,0.96,0.31,0.3645,0.553,2.566,25,1000


In [163]:
df_metrics.to_csv("results_templates/template_gpt_4o_mini.csv")

In [19]:
import pandas as pd

# ====== Config ======
files = {
    "results_templates/template_gpt_4o.csv": "gpt-4o",
    "results_templates/template_gpt_4o_mini.csv": "gpt-4o-mini",
    # "results_templates/template_gpt_4.1_mini.csv": "gpt-4.1-mini",
    # "results_templates/template_gpt_4.1_nano.csv": "gpt-4.1-nano",
    "results_templates/template_mistral.csv": "mistral-large-2",
    "results_templates/template_mistral-7.csv": "mistral-7B",
}

# All metrics available
metrics = ["HR@10", "Precision@10", "NDCG@10", "Gini", "Entropy"]

# Only these will be used to compute the average rank
metrics_for_avg = ["NDCG@10", "Gini", "Entropy"]

all_dfs = []

# ====== Step 1: Load and rank ======
for path, model in files.items():
    df = pd.read_csv(path)
    df["model"] = model

    # Rank templates *within each style* (A vs B vs C)
    for metric in metrics:
        if metric == "Gini":  # lower is better
            df[f"{metric}_rank"] = df.groupby("style")[metric].rank(ascending=True, method="dense")
        else:  # higher is better
            df[f"{metric}_rank"] = df.groupby("style")[metric].rank(ascending=False, method="dense")

    # Average rank only for NDCG, Gini, and Entropy
    rank_cols_for_avg = [f"{m}_rank" for m in metrics_for_avg]
    df["avg_rank"] = df[rank_cols_for_avg].mean(axis=1)

    all_dfs.append(df)

# Combine all models’ results
combined_df = pd.concat(all_dfs, ignore_index=True)

# ====== Step 2: Aggregate per (model, style, template_set) ======
per_model = (
    combined_df
    .groupby(["model", "style", "template_set"])
    .agg(
        {**{m: "mean" for m in metrics},                # average metric values
         **{f"{m}_rank": "mean" for m in metrics},      # average metric ranks
         "avg_rank": "mean"}                            # average of NDCG+Gini+Entropy ranks
    )
    .reset_index()
    .sort_values(["model", "style", "avg_rank"])
)

# ====== Step 3: Add best_template flag per (model, style) ======
per_model["best_template"] = per_model.groupby(["model", "style"])["avg_rank"].transform(lambda x: x == x.min())

# ====== Step 4: Save ======
per_model.to_csv("per_model_template_ranking.csv", index=False)

print("Done. Files saved:")
print("Per-model rankings → per_model_template_ranking.csv")

print("\nExample per-model view:")
print(per_model.head(10))


Done. Files saved:
Per-model rankings → per_model_template_ranking.csv

Example per-model view:
     model          style template_set  HR@10  Precision@10  NDCG@10    Gini  \
2   gpt-4o       few_shot            C   0.99         0.584   0.6181  0.5500   
1   gpt-4o       few_shot            B   1.00         0.605   0.6378  0.5791   
0   gpt-4o       few_shot            A   1.00         0.638   0.6612  0.6207   
5   gpt-4o   few_shot_cot            C   0.99         0.551   0.5796  0.5657   
4   gpt-4o   few_shot_cot            B   1.00         0.607   0.6342  0.5958   
3   gpt-4o   few_shot_cot            A   1.00         0.625   0.6533  0.6249   
8   gpt-4o      zero_shot            C   0.98         0.398   0.4177  0.7068   
6   gpt-4o      zero_shot            A   0.95         0.303   0.3527  0.5392   
7   gpt-4o      zero_shot            B   0.97         0.396   0.4068  0.7150   
11  gpt-4o  zero_shot_cot            C   0.97         0.408   0.4139  0.6615   

    Entropy  HR@10_rank

In [20]:
import pandas as pd
import numpy as np

# Ensure the categorical order A, B, C
template_order = pd.CategoricalDtype(categories=['A', 'B', 'C'], ordered=True)
per_model['template_set'] = per_model['template_set'].astype(template_order)

# === Mean & Std (plus count) for avg_rank by template ===
template_stats = (
    per_model.groupby('template_set', observed=True)['NDCG@10_rank']
      .agg(mean='mean', std='std', n='size')
      .sort_index()  # respects A, B, C
)

#

print(template_stats.round(4))

# If you want just mean & std in a compact table:
mean_std_only = template_stats[['mean', 'std']].round(4)
print(mean_std_only)

# If you prefer to sort by "best" (lowest mean avg_rank):
template_stats.sort_values('mean').round(4)



                mean     std   n
template_set                    
A             2.0000  0.9661  16
B             1.9375  0.6801  16
C             2.0625  0.8539  16
                mean     std
template_set                
A             2.0000  0.9661
B             1.9375  0.6801
C             2.0625  0.8539


Unnamed: 0_level_0,mean,std,n
template_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,1.9375,0.6801,16
A,2.0,0.9661,16
C,2.0625,0.8539,16


In [21]:
import pandas as pd
import numpy as np

# Ensure the categorical order A, B, C
template_order = pd.CategoricalDtype(categories=['A', 'B', 'C'], ordered=True)
per_model['template_set'] = per_model['template_set'].astype(template_order)

# === Mean & Std (plus count) for avg_rank by template ===
template_stats = (
    per_model.groupby('template_set', observed=True)['Entropy_rank']
      .agg(mean='mean', std='std', n='size')
      .sort_index()  # respects A, B, C
)

#

print(template_stats.round(4))

# If you want just mean & std in a compact table:
mean_std_only = template_stats[['mean', 'std']].round(4)
print(mean_std_only)

# If you prefer to sort by "best" (lowest mean avg_rank):
template_stats.sort_values('mean').round(4)



                mean     std   n
template_set                    
A             2.6250  0.7188  16
B             2.0625  0.4425  16
C             1.3125  0.7042  16
                mean     std
template_set                
A             2.6250  0.7188
B             2.0625  0.4425
C             1.3125  0.7042


Unnamed: 0_level_0,mean,std,n
template_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,1.3125,0.7042,16
B,2.0625,0.4425,16
A,2.625,0.7188,16


In [24]:
results=per_model

In [25]:
import pandas as pd
import numpy as np

# Ensure correct template order
template_order = pd.CategoricalDtype(categories=['A', 'B', 'C'], ordered=True)
results['template_set'] = results['template_set'].astype(template_order)

# === Step 1 – Compute mean & std of rank metrics per (model, template) ===
rank_metrics = ['Gini_rank', 'Entropy_rank', 'NDCG@10_rank']
summary = (
    results.groupby(['model', 'template_set'], observed=True)[rank_metrics]
    .agg(['mean', 'std'])
    .reset_index()
)

# Flatten multi-index column names
summary.columns = ['model', 'template_set'] + [f"{m}_{s}" for m, s in summary.columns[2:]]

# === Step 2 – Hierarchical ranking by (mean, std) for each metric ===
def rank_with_tiebreak(df, mean_col, std_col):
    # Sort primarily by mean, then by std (both ascending)
    df = df.sort_values([mean_col, std_col], ascending=[True, True]).copy()
    # Assign rank with average for ties (e.g., 1, 1, 3)
    ranks = df[[mean_col, std_col]].apply(tuple, axis=1).rank(method='dense')
    return ranks

summary['Gini_mean_rank'] = summary.groupby('model', group_keys=False).apply(
    lambda g: rank_with_tiebreak(g, 'Gini_rank_mean', 'Gini_rank_std')
)

summary['Entropy_mean_rank'] = summary.groupby('model', group_keys=False).apply(
    lambda g: rank_with_tiebreak(g, 'Entropy_rank_mean', 'Entropy_rank_std')
)

summary['NDCG_mean_rank'] = summary.groupby('model', group_keys=False).apply(
    lambda g: rank_with_tiebreak(g, 'NDCG@10_rank_mean', 'NDCG@10_rank_std')
)

# === Step 3 – Compute average rank across metrics ===
rank_cols = ['Gini_mean_rank', 'Entropy_mean_rank', 'NDCG_mean_rank']
summary['avg_rank'] = summary[rank_cols].mean(axis=1)
summary['avg_rank_std'] = summary[rank_cols].std(axis=1)

# === Step 4 – Sort & identify best template per model ===
summary = summary.sort_values(['model', 'avg_rank'])
best_templates = summary.groupby('model', as_index=False).first()

# === Step 5 – Display ===
print("\n=== Template-level Mean / Std and Ranks for Each Model ===")
print(summary[['model', 'template_set',
               'Gini_rank_mean', 'Gini_rank_std', 'Gini_mean_rank',
               'Entropy_rank_mean', 'Entropy_rank_std', 'Entropy_mean_rank',
               'NDCG@10_rank_mean', 'NDCG@10_rank_std', 'NDCG_mean_rank',
               'avg_rank', 'avg_rank_std']].round(4))

print("\n=== Best Template per Model (lowest avg rank) ===")
print(best_templates[['model', 'template_set', 'avg_rank']].round(4))



=== Template-level Mean / Std and Ranks for Each Model ===
              model template_set  Gini_rank_mean  Gini_rank_std  \
2            gpt-4o            C            1.50         0.5774   
1            gpt-4o            B            2.50         0.5774   
0            gpt-4o            A            2.00         1.1547   
3       gpt-4o-mini            A            1.75         0.9574   
4       gpt-4o-mini            B            2.25         0.9574   
5       gpt-4o-mini            C            2.00         0.8165   
6        mistral-7B            A            1.25         0.5000   
8        mistral-7B            C            3.00         0.0000   
7        mistral-7B            B            1.75         0.5000   
11  mistral-large-2            C            2.00         0.8165   
10  mistral-large-2            B            2.00         0.8165   
9   mistral-large-2            A            2.00         1.1547   

    Gini_mean_rank  Entropy_rank_mean  Entropy_rank_std  Entropy_mea

  summary['Gini_mean_rank'] = summary.groupby('model', group_keys=False).apply(
  summary['Entropy_mean_rank'] = summary.groupby('model', group_keys=False).apply(
  summary['NDCG_mean_rank'] = summary.groupby('model', group_keys=False).apply(


In [26]:
summary.to_csv("per_model_template_reranking_based_on_average_rank.csv")

In [28]:
import pandas as pd
import numpy as np

# Ensure correct template order
template_order = pd.CategoricalDtype(categories=['A', 'B', 'C'], ordered=True)
results['template_set'] = results['template_set'].astype(template_order)

# === Step 1 – Compute mean & std of rank metrics per (model, template) ===
rank_metrics = ['Gini_rank', 'Entropy_rank', 'NDCG@10_rank']#,'Precision@10_rank']

summary = (
    results.groupby(['style', 'template_set'], observed=True)[rank_metrics]
    .agg(['mean', 'std'])
    .reset_index()
)

# Flatten multi-index column names
summary.columns = ['style', 'template_set'] + [f"{m}_{s}" for m, s in summary.columns[2:]]

# === Step 2 – Within each model, rank templates for each metric (by mean) ===
summary['Gini_mean_rank'] = summary.groupby('style')['Gini_rank_mean'].rank(method='dense', ascending=True)
summary['Entropy_mean_rank'] = summary.groupby('style')['Entropy_rank_mean'].rank(method='dense', ascending=True)
summary['NDCG_mean_rank'] = summary.groupby('style')['NDCG@10_rank_mean'].rank(method='dense', ascending=True)
#summary['Precision_mean_rank'] = summary.groupby('style')['Precision@10_rank_mean'].rank(method='average', ascending=True)

# === Step 3 – Compute average & std of these three ranks ===
rank_cols = ['Gini_mean_rank', 'Entropy_mean_rank', 'NDCG_mean_rank']#'Precision_mean_rank']
summary['avg_rank'] = summary[rank_cols].mean(axis=1)
summary['avg_rank_std'] = summary[rank_cols].std(axis=1)

# === Step 4 – Sort & identify best template per model (lowest avg rank) ===
summary = summary.sort_values(['style', 'avg_rank'])
best_templates = summary.groupby('style', as_index=False).first()

# === Step 5 – Display results ===
print("\n=== Template-level Mean / Std and Ranks for Each style ===")
print(summary[['style', 'template_set',
               'Gini_rank_mean', 'Gini_rank_std', 'Gini_mean_rank',
               'Entropy_rank_mean', 'Entropy_rank_std', 'Entropy_mean_rank',
               'NDCG@10_rank_mean', 'NDCG@10_rank_std', 'NDCG_mean_rank',
              #  'Precision@10_rank_mean', 'Precision@10_rank_std', 'Precision_mean_rank',
               'avg_rank', 'avg_rank_std']].round(4))

print("\n=== Best Template per Model (lowest avg rank) ===")
print(best_templates[['style', 'template_set', 'avg_rank']].round(4))



=== Template-level Mean / Std and Ranks for Each style ===
            style template_set  Gini_rank_mean  Gini_rank_std  Gini_mean_rank  \
1        few_shot            B            1.25         0.5000             1.0   
0        few_shot            A            2.75         0.5000             3.0   
2        few_shot            C            2.00         0.8165             2.0   
4    few_shot_cot            B            2.25         0.5000             2.0   
5    few_shot_cot            C            1.50         1.0000             1.0   
3    few_shot_cot            A            2.25         0.9574             2.0   
8       zero_shot            C            2.50         0.5774             2.0   
6       zero_shot            A            1.00         0.0000             1.0   
7       zero_shot            B            2.50         0.5774             2.0   
10  zero_shot_cot            B            2.50         0.5774             2.0   
11  zero_shot_cot            C            2.50   

In [29]:
summary.head(20)

Unnamed: 0,style,template_set,Gini_rank_mean,Gini_rank_std,Entropy_rank_mean,Entropy_rank_std,NDCG@10_rank_mean,NDCG@10_rank_std,Gini_mean_rank,Entropy_mean_rank,NDCG_mean_rank,avg_rank,avg_rank_std
1,few_shot,B,1.25,0.5,1.75,0.5,2.0,0.816497,1.0,1.0,2.0,1.333333,0.57735
0,few_shot,A,2.75,0.5,2.5,1.0,1.5,1.0,3.0,2.0,1.0,2.0,1.0
2,few_shot,C,2.0,0.816497,1.75,0.957427,2.5,0.57735,2.0,1.0,3.0,2.0,1.0
4,few_shot_cot,B,2.25,0.5,2.0,0.0,1.75,0.5,2.0,2.0,1.0,1.666667,0.57735
5,few_shot_cot,C,1.5,1.0,1.5,1.0,2.25,0.957427,1.0,1.0,3.0,1.666667,1.154701
3,few_shot_cot,A,2.25,0.957427,2.5,1.0,2.0,1.154701,2.0,3.0,2.0,2.333333,0.57735
8,zero_shot,C,2.5,0.57735,1.0,0.0,1.5,1.0,2.0,1.0,1.0,1.333333,0.57735
6,zero_shot,A,1.0,0.0,2.75,0.5,2.25,0.957427,1.0,3.0,2.0,2.0,1.0
7,zero_shot,B,2.5,0.57735,2.25,0.5,2.25,0.5,2.0,2.0,2.0,2.0,0.0
10,zero_shot_cot,B,2.5,0.57735,2.25,0.5,1.75,0.957427,2.0,2.0,1.0,1.666667,0.57735


In [30]:
summary.to_csv("per_style_template_reranking_based_on_average_rank.csv")