In [1]:
import re
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# --- Regex Definitions ---
# Matches '1. "Title" (1994) - Genre...' or similar
RE_COMPLEX_NUMBERED_TITLE = re.compile(
    r'^\s*\d+[.)]\s+"([^"]+)"\s*\((?P<year>\d{4})\).*$'
)
# MODIFIED: Made more robust to handle markdown bolding and non-quoted titles.
# Captures the full "Title (Year)" format.
RE_TITLE_AND_YEAR = re.compile(
    r'^\s*\d+[.)]\s*["\'*]*((?:[^\n\r(]*?\s*\(\d{4}\))).*$', re.DOTALL
)
# MODIFIED: Made more robust to handle titles with/without quotes and markdown bolding
RE_TITLE_ONLY = re.compile(
    r'^\s*\d+[.)]\s*["\']?([^"\'(\n]+?)\s*["\']?\s*(?:[-–—].*)?$', re.DOTALL
)
RE_CLEAN_TITLE_ONLY = re.compile(r'^\s*\d+[.)]\s*(?:"?)(?P<title>[^"(\n]+)$', re.DOTALL)


# --- Universal Helper Functions ---

def clean_movie_title(title: str) -> str:
    """
    Normalize movie titles to the 'Title (Year)' format, handling various imperfections.
    """
    if not isinstance(title, str):
        return None

    # 1. Clean up surrounding quotes, whitespace, and markdown
    title = title.strip().strip('"').strip("'")
    if title.startswith('**') and title.endswith('**'):
        title = title.strip('*')

    # 2. Prioritize extracting the 'Title (Year)' format directly
    match = re.search(r'(.+?\s*\(\d{4}\))', title)
    if match:
        # Standardize spacing, e.g., "Title  (1995)" -> "Title (1995)"
        clean_title = match.group(1).strip()
        return re.sub(r'\s+\(', ' (', clean_title)

    # 3. Fallback: If no year is found, split by common delimiters (like '-')
    # to remove trailing genres or other text (e.g., "Pulp Fiction - Crime")
    title = re.split(r'\s*[-–—]\s*', title)[0].strip()
    return title if title else None


# --- Format-Specific Extraction Functions ---

def extract_movies_from_json(response: str) -> list[str]:
    """
    Extractor for JSON-formatted responses (Template B).
    MODIFIED: Now strictly requires the presence of an opening bracket '['
    to attempt extraction. If not found, it returns an empty list [].
    """
    if not isinstance(response, str):
        return []

    movies = []

    # Attempt 1: Full JSON Parse
    try:
        data = json.loads(response)
        movies = data.get("recommendations", [])
        if not isinstance(movies, list):
             movies = []
    except json.JSONDecodeError:
        # Attempt 2: Handle Truncated/Malformed JSON by finding quoted strings
        # after the opening bracket '['.
        bracket_match = re.search(r'\[(.*)', response, re.DOTALL)
        if bracket_match:
            content = bracket_match.group(1)
            # Find all double-quoted strings (the movie titles) within the content
            movies = re.findall(r'"([^"]+)"', content)

    # CRITICAL CHANGE: If no movies were extracted AND no opening bracket was found,
    # we return empty lists, fulfilling the user's request to "assign empty values"
    # if the [ or ] structure is missing.
    if not movies and '[' not in response:
        return []

    # NOTE: The previous safety net (Attempt 3: calling extract_movies_from_text_list)
    # has been REMOVED to enforce the strict requirement.

    cleaned_movies = [clean_movie_title(m) for m in movies if m]
    return [m for m in cleaned_movies if m][:10] # Filter out Nones after cleaning


def extract_movies_from_text_list(response: str) -> list[str]:
    """
    Extractor for enumerated list responses (Templates A, C, etc.).
    """
    if not isinstance(response, str):
        return []

    full_text = response.replace("\\n", "\n")
    recommendations_markers = [
        "RECOMMENDATIONS:", "### Recommendations:", "### Top 10 Recommendations",
        "Top 10 Recommendations:", "Here are my top 10 recommendations"
    ]
    start_index = -1
    marker_length = 0
    for marker in recommendations_markers:
        idx = full_text.lower().find(marker.lower())
        if idx != -1 and (start_index == -1 or idx < start_index):
            start_index = idx
            marker_length = len(marker)

    recommendation_text = full_text[start_index + marker_length:] if start_index != -1 else full_text
    lines = recommendation_text.splitlines()

    extracted = []
    for line in lines:
        line = line.strip()
        if not line: continue

        # New Check: Only process lines that start with a number (1., 2), filtering out non-list text
        if not re.match(r'^\s*\d+[.)]', line):
            continue

        # 1. Complex: "Title" (1994) - Genre...
        match_complex = RE_COMPLEX_NUMBERED_TITLE.match(line)
        if match_complex:
            title = match_complex.group(1).strip()
            year = match_complex.group('year')
            extracted.append(f"{title} ({year})")
            continue

        # 2. Standard: Title (Year) (Improved to catch titles like **"Title (Year)"**)
        match_standard = RE_TITLE_AND_YEAR.match(line)
        if match_standard:
            extracted.append(match_standard.group(1).strip())
            continue

        # 3. Title Only
        match_title_only = RE_TITLE_ONLY.match(line)
        if match_title_only:
            extracted.append(match_title_only.group(1).strip())
            continue

        # 4. Clean Title Only
        match_clean_title_only = RE_CLEAN_TITLE_ONLY.match(line)
        if match_clean_title_only:
            extracted.append(match_clean_title_only.group('title').strip())
            continue

    movies = [clean_movie_title(m) for m in extracted if m]
    return [m for m in movies if m][:10]


# --- Main Execution Logic (Base Directory changed to match a common pattern) ---

# NOTE: I've updated the base directory path to a more common placeholder.
# Please ensure this matches your actual file structure: 'gpt_4.o_forC/results_sc' vs 'data/gpt_4o/results_sc'
BASE_DIR = Path("data/gpt_4o_mini/results_sc")
all_rows = []

if not BASE_DIR.exists():
    print(f"Error: Base directory '{BASE_DIR}' not found. Please ensure the path is correct.")
else:
    for user_dir in tqdm(sorted(BASE_DIR.glob("user_*")), desc="Processing Users"):
        try:
            user_id = int(user_dir.name.split("_")[1])
        except (IndexError, ValueError):
            continue

        for template_set_dir in sorted(user_dir.iterdir()):
            if not template_set_dir.is_dir(): continue
            template_set = template_set_dir.name

            for temp_dir in sorted(template_set_dir.glob("temp_*")):
                if not temp_dir.is_dir(): continue
                try:
                    temperature = float(temp_dir.name.replace("temp_", ""))
                except ValueError:
                    continue

                for json_file in temp_dir.glob("*.json"):
                    try:
                        style, fairness = json_file.stem.split("__")
                    except ValueError:
                        print(f"Skipping file with unexpected name format: {json_file.name}")
                        continue

                    with open(json_file, "r", encoding="utf-8") as f:
                        data = json.load(f)

                    for sample in data.get("samples", []):
                        response = sample.get("response", "")
                        movies = []

                        # --- LOGIC: Choose extractor based on template folder ---
                        # Template B uses the strict JSON extractor that requires '[' for processing.
                        if template_set.upper() == 'B':
                            movies = extract_movies_from_json(response)
                        else:
                            # Other templates use the standard text list extractor
                            movies = extract_movies_from_text_list(response)

                        row = {
                            "user_id": data.get("user_id", user_id),
                            "template_set": data.get("template_set", template_set),
                            "style": data.get("style", style),
                            "fairness": data.get("fairness", fairness),
                            "temperature": data.get("temperature", temperature),
                            "sample_id": sample.get("sample_id"),
                            "raw_response": response,
                            "recommended_movies": movies,
                            "source_file": json_file.name
                        }
                        all_rows.append(row)

# Build the final DataFrame
df_final = pd.DataFrame(all_rows)

# Expand movie columns
if not df_final.empty:
    for i in range(10):
        df_final[f"movie_{i+1}"] = df_final["recommended_movies"].apply(
            lambda x: x[i] if isinstance(x, list) and len(x) > i else None
        )

# Display final results
print("\n--- Final DataFrame ---")
if df_final.empty:
    print("DataFrame is empty. No data was processed.")
else:
    print("Final shape:", df_final.shape)
    print("\nDataFrame Head:")
    print(df_final.head())
    print("\nColumns and Data Types:")
    print(df_final.info())

Processing Users: 100%|██████████| 100/100 [00:04<00:00, 22.52it/s]



--- Final DataFrame ---
Final shape: (24000, 19)

DataFrame Head:
   user_id template_set         style fairness  temperature  sample_id  \
0       10            B  few_shot_cot  neutral          0.2          1   
1       10            B  few_shot_cot  neutral          0.2          2   
2       10            B  few_shot_cot  neutral          0.2          3   
3       10            B  few_shot_cot  neutral          0.2          4   
4       10            B  few_shot_cot  neutral          0.2          5   

                                        raw_response  \
0  {"k":10,"recommendations":["The Graduate (1967...   
1  {"k":10,"recommendations":["The Graduate (1967...   
2  {"k":10,"recommendations":["The Graduate (1967...   
3  {"k":10,"recommendations":["The Graduate (1967...   
4  {"k":10,"recommendations":["The Graduate (1967...   

                                  recommended_movies  \
0  [The Graduate (1967), Chinatown (1974), Fargo ...   
1  [The Graduate (1967), Fargo (1996), 

In [175]:
df_final.to_csv("mistral-7/results_sc/mistral_7_sc.csv", index=False)

NameError: name 'df_final' is not defined

In [3]:
import pandas as pd
import re
from itertools import combinations
from tqdm import tqdm
from rapidfuzz import fuzz

# Regular expression to find and remove the (YYYY) year suffix
# It captures the title part before the year (which might include spaces/punctuation)
RE_YEAR = re.compile(r'\s*\(\d{4}\)\s*$', re.UNICODE)

def strip_year(title: str) -> str:
    """
    Removes the '(YYYY)' year suffix and strips Markdown bolding ('**')
    from a movie title for consistent comparison.
    """
    if not title:
        return ""

    # 1. Remove the year suffix
    cleaned_title = RE_YEAR.sub('', title).strip()

    # 2. PATCH: Remove leading/trailing Markdown bolding (**)
    if cleaned_title.startswith('**') and cleaned_title.endswith('**'):
        cleaned_title = cleaned_title.strip('*').strip()

    return cleaned_title.strip()

# ── Fuzzy Jaccard Similarity (Updated) ── #
def fuzzy_jaccard_similarity(list1, list2, threshold=80, verbose=False):
    """
    Computes Fuzzy Jaccard Similarity after stripping the year and cleaning
    formatting from titles, using a custom threshold of 80 for rapidfuzz.
    """
    # 1. Strip years and clean formatting from all titles
    list1_cleaned = [strip_year(t) for t in list1]
    list2_cleaned = [strip_year(t) for t in list2]

    matched_1, matched_2 = set(), set()

    for i, t1 in enumerate(list1_cleaned):
        for j, t2 in enumerate(list2_cleaned):
            # Skip empty strings and already matched titles in list2
            if not t1 or not t2 or j in matched_2:
                continue

            # Use token_sort_ratio for robust comparison against minor title differences
            score = fuzz.token_sort_ratio(t1, t2)

            if score >= threshold:
                matched_1.add(i)
                matched_2.add(j)
                if verbose:
                    # Use original titles for verbose output
                    print(f" Matched: {list1[i]} ≈ {list2[j]} (Score={score}, Cleaned: '{t1}' vs '{t2}')")
                break

    intersection = len(matched_1)
    # The union size is based on the original list lengths, as matched_1 and matched_2
    # track indices of the original lists.
    union = len(list1) + len(list2) - intersection

    return intersection / union if union > 0 else 0.0


# ── Step 1: Compute pairwise similarities ── #
def compute_sample_similarities(df, threshold=80):
    """
    Compute pairwise fuzzy Jaccard similarities between sample_id recommendation lists
    for each (user_id, style, temperature), using the specified threshold.
    """
    group_cols = ["user_id", "style", "temperature"]
    results = []

    for group_key, group_df in tqdm(df.groupby(group_cols), desc="Pairwise sample similarities"):
        # Ensure lists are parsed properly (handling potential string representation of lists)
        recs = group_df.set_index("sample_id")["recommended_movies"].apply(
            # Safely evaluate the string representation of a list/tuple if it's a string
            lambda x: eval(x) if isinstance(x, str) and (x.startswith('[') or x.startswith('(')) else x
        ).to_dict()

        # Compute pairwise similarities between sample_ids
        for s1, s2 in combinations(recs.keys(), 2):
            # Pass the specified threshold to the similarity function
            sim = fuzzy_jaccard_similarity(recs[s1], recs[s2], threshold=threshold)
            results.append({
                "user_id": group_key[0],
                "style": group_key[1],
                "temperature": group_key[2],
                "sample_id_1": s1,
                "sample_id_2": s2,
                "similarity": sim
            })

    return pd.DataFrame(results)


# ── Step 2: Average per user_id × style × temperature ── #
def compute_user_avg(df_pairs):
    return (
        df_pairs.groupby(["user_id", "style", "temperature"])["similarity"]
        .mean()
        .reset_index(name="avg_fuzzy_jaccard_similarity")
    )


# ── Step 3: Average across users per style × temperature ── #
def compute_global_avg(df_user_avg):
    return (
        df_user_avg.groupby(["style", "temperature"])["avg_fuzzy_jaccard_similarity"]
        .mean()
        .reset_index()
    )

df_sim_pairs = compute_sample_similarities(df_sc_all)
df_user_avg_sim = compute_user_avg(df_sim_pairs)
df_global_avg_sim = compute_global_avg(df_user_avg_sim)


Pairwise sample similarities: 100%|██████████| 1200/1200 [00:25<00:00, 47.13it/s]


In [105]:
df_global_avg_sim.head(100)

Unnamed: 0,style,temperature,avg_fuzzy_jaccard_similarity
0,few_shot_cot,0.2,0.647751
1,few_shot_cot,0.5,0.474566
2,few_shot_cot,0.7,0.384184
3,few_shot_cot,1.0,0.247806
4,few_shot_cot,1.2,0.170483
5,few_shot_cot,1.4,0.105191
6,zero_shot_cot,0.2,0.522247
7,zero_shot_cot,0.5,0.460814
8,zero_shot_cot,0.7,0.42321
9,zero_shot_cot,1.0,0.352168


In [97]:
df_global_avg_sim.to_csv("similarity/gpt_4o_mini_withinsamples.csv")

In [8]:
df_greedy=pd.read_csv("data/gpt_4o/results_normal/df_template_gpt_40.csv")

In [9]:
import pandas as pd
import re
from itertools import combinations
from tqdm import tqdm
from rapidfuzz import fuzz

# Regular expression to find and remove the (YYYY) year suffix
# It captures the title part before the year (which might include spaces/punctuation)
RE_YEAR = re.compile(r'\s*\(\d{4}\)\s*$', re.UNICODE)

def strip_year(title: str) -> str:
    """
    Removes the '(YYYY)' year suffix and strips Markdown bolding ('**')
    from a movie title for consistent comparison.
    """
    if not title:
        return ""

    # 1. Remove the year suffix
    cleaned_title = RE_YEAR.sub('', title).strip()

    # 2. PATCH: Remove leading/trailing Markdown bolding (**)
    if cleaned_title.startswith('**') and cleaned_title.endswith('**'):
        cleaned_title = cleaned_title.strip('*').strip()

    return cleaned_title.strip()

# ── Fuzzy Jaccard Similarity (Updated in Canvas) ── #
def fuzzy_jaccard_similarity(list1, list2, threshold=80, verbose=False):
    """
    Computes Fuzzy Jaccard Similarity after stripping the year and cleaning
    formatting from titles, using a custom threshold of 80 for rapidfuzz.
    """
    # 1. Strip years and clean formatting from all titles
    list1_cleaned = [strip_year(t) for t in list1]
    list2_cleaned = [strip_year(t) for t in list2]

    matched_1, matched_2 = set(), set()

    for i, t1 in enumerate(list1_cleaned):
        for j, t2 in enumerate(list2_cleaned):
            # Skip empty strings and already matched titles in list2
            if not t1 or not t2 or j in matched_2:
                continue

            # Use token_sort_ratio for robust comparison against minor title differences
            score = fuzz.token_sort_ratio(t1, t2)

            if score >= threshold:
                matched_1.add(i)
                matched_2.add(j)
                if verbose:
                    # Use original titles for verbose output
                    print(f" Matched: {list1[i]} ≈ {list2[j]} (Score={score}, Cleaned: '{t1}' vs '{t2}')")
                break

    intersection = len(matched_1)
    # The union size is based on the original list lengths, as matched_1 and matched_2
    # track indices of the original lists.
    union = len(list1) + len(list2) - intersection

    return intersection / union if union > 0 else 0.0

def compare_samples_to_greedy_verbose(df_sc_all, df_greedy, threshold=80, verbose_user_id=None):
    """
    Compares the average similarity of model-generated sample lists to a corresponding
    "greedy" (lowest temperature, single-sample) list for the same user/style.
    Uses strip_year for cleaning and defaults to a threshold of 80.
    """
    results = []
    # Assumes 'C' template set for this specific comparison, as per original logic.
    grouped = df_sc_all[df_sc_all["template_set"] == "C"].groupby(["user_id", "style", "temperature"])

    for (user_id, style, temp), group in grouped:
        greedy_row = df_greedy[
            (df_greedy["user_id"] == user_id) &
            (df_greedy["style"] == style) &
            (df_greedy["template_set"] == "C")
        ]

        if greedy_row.empty:
            continue

        # Get greedy movie list and clean using strip_year
        greedy_movies = greedy_row.iloc[0]["recommended_movies"]
        greedy_movies = eval(greedy_movies) if isinstance(greedy_movies, str) else greedy_movies
        # Apply the Canvas's standard cleaning function
        greedy_cleaned = [strip_year(t) for t in greedy_movies]

        sims = []

        if verbose_user_id is not None and user_id == verbose_user_id:
            print(f"\n=== USER {user_id} | STYLE: {style} | TEMP: {temp} ===")
            print("Greedy:", greedy_movies)

        for idx, row in group.iterrows():
            sample_movies = row["recommended_movies"]
            sample_movies = eval(sample_movies) if isinstance(sample_movies, str) else sample_movies
            # Apply the Canvas's standard cleaning function
            sample_cleaned = [strip_year(t) for t in sample_movies]

            if verbose_user_id is not None and user_id == verbose_user_id:
                print(f"\nSample (ID={row['sample_id']}):", sample_movies)

            # Use the existing fuzzy_jaccard_similarity, passing the original lists
            # The function handles cleaning internally and uses the specified threshold (80)
            sim = fuzzy_jaccard_similarity(
                sample_movies, # Original list for verbose printing
                greedy_movies, # Original list for verbose printing
                threshold,
                verbose=(verbose_user_id == user_id)
            )
            sims.append(sim)

            if verbose_user_id == user_id:
                print(f"  ➤ Jaccard Similarity: {sim:.4f}")

        avg_sim = sum(sims) / len(sims) if sims else 0.0
        results.append({
            "user_id": user_id,
            "style": style,
            "temperature": temp,
            "avg_similarity_to_greedy": round(avg_sim, 4),
            "num_samples": len(sims)
        })

    return pd.DataFrame(results)


In [10]:
df_result = compare_samples_to_greedy_verbose(df_sc_all, df_greedy, threshold=80)


In [11]:
# Average across all users, grouped by style and temperature
df_avg = df_result.groupby(["style", "temperature"])["avg_similarity_to_greedy"].mean().reset_index()
df_avg = df_avg.sort_values(by=["style", "temperature"])
print(df_avg)


            style  temperature  avg_similarity_to_greedy
0    few_shot_cot          0.2                  0.670236
1    few_shot_cot          0.5                  0.497017
2    few_shot_cot          0.7                  0.400149
3    few_shot_cot          1.0                  0.283111
4    few_shot_cot          1.2                  0.212600
5    few_shot_cot          1.4                  0.093145
6   zero_shot_cot          0.2                  0.554831
7   zero_shot_cot          0.5                  0.483777
8   zero_shot_cot          0.7                  0.436531
9   zero_shot_cot          1.0                  0.346650
10  zero_shot_cot          1.2                  0.279020
11  zero_shot_cot          1.4                  0.063798


In [12]:
df_avg.to_csv("similarity/C_gpt_4o_with_in_sample_and_greedy.csv")

In [13]:
df_sc_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             24000 non-null  int64  
 1   template_set        24000 non-null  object 
 2   style               24000 non-null  object 
 3   fairness            24000 non-null  object 
 4   temperature         24000 non-null  float64
 5   sample_id           24000 non-null  int64  
 6   raw_response        24000 non-null  object 
 7   recommended_movies  24000 non-null  object 
 8   movie_1             22147 non-null  object 
 9   movie_2             22019 non-null  object 
 10  movie_3             21881 non-null  object 
 11  movie_4             21763 non-null  object 
 12  movie_5             21712 non-null  object 
 13  movie_6             21675 non-null  object 
 14  movie_7             21648 non-null  object 
 15  movie_8             21630 non-null  object 
 16  movi

In [3]:
import pandas as pd
col_names_for_user_ratings = ['user_id', 'movie_id', 'rating', 'timestamp']
col_names_for_users = ['user_id' , 'age' , 'gender' , 'occupation' ,'zip code']


df_user_ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    header=None,
    names=col_names_for_user_ratings,
    encoding='latin-1'
)

# Step 1: Count interactions
movie_interactions = (
    df_user_ratings
    .groupby('movie_id')
    .size()
    .reset_index(name='interaction_count')
    .sort_values(by='interaction_count', ascending=False)
)


In [4]:

df_user_ratings["date"] = pd.to_datetime(df_user_ratings["timestamp"], unit='s')

# Count interactions per user
interaction_counts = df_user_ratings.groupby('user_id').size()

# Filter users with more than 60 interactions (strictly > 100)
valid_users = interaction_counts[interaction_counts >= 100].index

# Apply the filter
df_filtered_user_ratings = df_user_ratings[df_user_ratings['user_id'].isin(valid_users)].copy()

# Sort interactions chronologically per user
df_filtered_user_ratings.sort_values(['user_id', 'date'], inplace=True)

# Split each user's history: 80% train, 20% test
train_list = []
test_list = []

for uid, user_df in df_filtered_user_ratings.groupby('user_id', sort=False):
    n = len(user_df)
    split_pt = int(n * 0.8)
    train_list.append(user_df.iloc[:split_pt])
    test_list.append(user_df.iloc[split_pt:])

# Combine all user splits
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Output some stats
print(f"After filtering, {len(valid_users)} users remain.")
print(f"Train set: {len(train_df)} rows")
print(f"Test set:  {len(test_df)} rows")

print(f"Minimum number of interactions among kept users: {interaction_counts[valid_users].min()}")
print(f"Number of users with <= 100 interactions: {(interaction_counts < 100).sum()}")


After filtering, 364 users remain.
Train set: 59469 rows
Test set:  15053 rows
Minimum number of interactions among kept users: 100
Number of users with <= 100 interactions: 579


In [6]:
import random

# Turn your valid_users (Index or list) into a plain list:
valid_users_list = list(valid_users)


three_picked = [1,92,433]
print("Picked users:", three_picked)
remaining_valid_users_list = [u for u in valid_users_list if u not in three_picked]
print("Remaining valid users:", remaining_valid_users_list)

Picked users: [1, 92, 433]
Remaining valid users: [5, 6, 7, 10, 11, 13, 15, 16, 18, 21, 22, 23, 26, 38, 42, 43, 44, 49, 56, 57, 58, 59, 60, 62, 64, 70, 72, 82, 83, 85, 87, 90, 94, 95, 99, 102, 104, 109, 110, 116, 119, 125, 128, 130, 141, 144, 145, 151, 152, 158, 159, 160, 174, 177, 178, 181, 184, 188, 189, 193, 194, 197, 198, 200, 201, 207, 210, 213, 214, 216, 221, 222, 223, 224, 230, 233, 234, 236, 239, 244, 246, 249, 250, 254, 256, 262, 263, 264, 267, 268, 269, 270, 271, 276, 279, 280, 286, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 303, 305, 307, 308, 311, 312, 313, 314, 318, 320, 321, 325, 326, 327, 328, 330, 332, 334, 336, 339, 342, 343, 344, 345, 346, 347, 354, 360, 361, 363, 373, 374, 378, 379, 380, 381, 385, 387, 389, 391, 392, 393, 394, 397, 398, 399, 401, 405, 406, 407, 409, 416, 417, 425, 426, 429, 435, 436, 437, 442, 445, 447, 450, 452, 453, 454, 455, 456, 457, 458, 459, 463, 466, 468, 472, 474, 478, 479, 484, 486, 487, 488, 489, 493, 495, 496, 497, 498, 499, 50

In [7]:
import numpy as np

# --- 7) Sampling helper ---
def select_sample(user_ids, k=10, random_state=42):
    """
    For each user:
      - ground_truth_total_itemIds: all items (train+test)
      - ground_truth_test_itemIds: only test items
      - sample_random: up to k random items from their train history, ordered chronologically
    """
    rng = np.random.default_rng(random_state)
    records = []

    for uid in user_ids:
        user_all = df_filtered_user_ratings.loc[df_filtered_user_ratings.user_id == uid]
        total_items = user_all['movie_id'].tolist()

        user_test = test_df.loc[test_df.user_id == uid]
        test_items = user_test['movie_id'].tolist()

        user_train = train_df.loc[train_df.user_id == uid].copy()

        if user_train.empty:
            # No train data — keep sample empty but still record ground truth
            random_sample = []
        else:
            # sample up to k rows (no replacement)
            n_pick = min(k, len(user_train))
            # Use pandas sample with a deterministic seed per user for reproducibility
            # Seed is derived from (random_state, uid) so different users differ but are reproducible
            seed = (hash((random_state, int(uid))) % (2**32 - 1))
            sample_df = (
                user_train
                .sample(n=n_pick, replace=False, random_state=seed)
                .sort_values('date', ascending=True)
            )
            # keep only movie_id and rating
            keep_cols = [c for c in ['movie_id', 'rating'] if c in sample_df.columns]
            random_sample = sample_df[keep_cols].values.tolist()

        records.append({
            'user_id': uid,
            'ground_truth_total_itemIds': total_items,
            'ground_truth_test_itemIds': test_items,
            'sample_random': random_sample
        })

    return pd.DataFrame(records)

# --- 8) Build the two DataFrames ---
df_filtered_user_data = select_sample(remaining_valid_users_list)
df_filtered_example_user_data = select_sample(three_picked)

print("df_filtered_user_data shape:", df_filtered_user_data.shape)
print("df_filtered_example_user_data shape:", df_filtered_example_user_data.shape)

df_filtered_user_data shape: (362, 4)
df_filtered_example_user_data shape: (3, 4)


In [42]:
df_movies=pd.read_csv("data/final_movies.csv")

In [43]:
df_extra_movies = pd.read_csv("data/movies.csv", encoding="latin1")

In [56]:
df_sc_all = pd.read_csv("data/combined/combined_sc_gpt_4o_mini.csv",encoding="latin1" )


In [48]:
import re
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz

# ─────────────────────────────
# 1. Title Cleaning Function
# ─────────────────────────────
RE_YEAR = re.compile(r"\(\d{4}\)")
RE_QUOTES = re.compile(r'^"+|"+$')
RE_SPACES = re.compile(r'\s+')
RE_TRAIL_GENRES = re.compile(r'\s*[-–—]\s*.*$')
RE_COLON = re.compile(r':.*$')
RE_SPECIAL = re.compile(r"[^a-zA-Z0-9\s]")

def clean_title(title):
    """Normalize movie title for similarity matching."""
    if not isinstance(title, str):
        return ""
    title = RE_COLON.sub("", title)
    title = RE_TRAIL_GENRES.sub("", title)
    title = RE_YEAR.sub("", title)
    title = RE_QUOTES.sub("", title).strip()
    title = RE_SPECIAL.sub("", title)
    return RE_SPACES.sub(" ", title).lower().strip()

# ─────────────────────────────
# 2. RapidFuzz-only Clustering
# ─────────────────────────────
def cluster_titles_rapidfuzz(title_list, threshold=0.8):
    """
    Cluster similar titles using RapidFuzz token_sort_ratio only.
    Returns a list of clusters (list of raw titles).
    """
    cleaned_titles = [(title, clean_title(title)) for title in title_list if pd.notna(title)]
    clusters = []
    used = set()

    for i, (raw_i, clean_i) in enumerate(cleaned_titles):
        if i in used:
            continue
        cluster = [raw_i]
        used.add(i)
        for j, (raw_j, clean_j) in enumerate(cleaned_titles):
            if j in used:
                continue
            sim_rf = fuzz.token_sort_ratio(clean_i, clean_j) / 100
            if sim_rf >= threshold:
                cluster.append(raw_j)
                used.add(j)
        clusters.append(cluster)
    return clusters

# ─────────────────────────────
# 3. Aggregation Logic
# ─────────────────────────────
def aggregate_recommendations(df_sc_all, threshold=0.8):
    """
    For each (user_id, style, temperature) group:
      - Deduplicate movie_1..movie_10 within each row
      - Replace missing or duplicate entries with 'empty'
      - Cluster all non-empty titles across 20 samples
      - Count cluster frequencies
      - Return top-10 most frequent representative titles
    """
    movie_cols = [f"movie_{i}" for i in range(1, 11)]

    # Deduplicate within each row
    for idx, row in df_sc_all.iterrows():
        seen = set()
        new_row = []
        for col in movie_cols:
            title = row[col]
            if pd.isna(title) or title in seen:
                new_row.append("empty")
            else:
                seen.add(title)
                new_row.append(title)
        for i, col in enumerate(movie_cols):
            df_sc_all.at[idx, col] = new_row[i]

    # Create movie list per row
    df_sc_all["movie_list"] = df_sc_all[movie_cols].values.tolist()

    grouped = df_sc_all.groupby(["user_id", "style", "temperature"])
    results = []

    for (user_id, style, temp), group in tqdm(grouped, desc="Aggregating users"):
        all_titles = sum(group["movie_list"].tolist(), [])
        all_titles = [t for t in all_titles if t and str(t).strip()]

        # RapidFuzz-only clustering
        clusters = cluster_titles_rapidfuzz(all_titles, threshold=threshold)

        # Count cluster sizes
        cluster_counts = {cluster[0]: len(cluster) for cluster in clusters}

        # Top-10 most frequent
        top_10 = sorted(cluster_counts.items(), key=lambda x: -x[1])[:10]
        final_titles = [title for title, _ in top_10]

        results.append({
            "user_id": user_id,
            "style": style,
            "temperature": temp,
            **{f"movie_{i+1}": final_titles[i] if i < len(final_titles) else None for i in range(10)}
        })

    return pd.DataFrame(results)


In [57]:
df_final_top10 = aggregate_recommendations(df_sc_all, threshold=0.8)
print(df_final_top10.head())



Aggregating users: 100%|██████████| 1400/1400 [00:06<00:00, 224.76it/s]

   user_id         style  temperature              movie_1  \
0       10  few_shot_cot          0.2  The Graduate (1967)   
1       10  few_shot_cot          0.5  The Graduate (1967)   
2       10  few_shot_cot          0.7  The Graduate (1967)   
3       10  few_shot_cot          1.0  The Graduate (1967)   
4       10  few_shot_cot          1.2  The Graduate (1967)   

                           movie_2                          movie_3  \
0  The Shawshank Redemption (1994)  The Silence of the Lambs (1991)   
1  The Shawshank Redemption (1994)                Fight Club (1999)   
2  The Shawshank Redemption (1994)                Fight Club (1999)   
3                Fight Club (1999)  The Shawshank Redemption (1994)   
4                     Fargo (1996)                Fight Club (1999)   

                           movie_4               movie_5  \
0                Fight Club (1999)     Goodfellas (1990)   
1  The Silence of the Lambs (1991)  The Godfather (1972)   
2  The Silence of th




In [58]:

import pandas as pd
import re
from rapidfuzz import process,fuzz
# === Cleaning Function ===
RE_YEAR = re.compile(r"\(\d{4}\)")
RE_QUOTES = re.compile(r'^"+|"+$')
RE_SPACES = re.compile(r'\s+')
RE_TRAIL_GENRES = re.compile(r'\s*[-–—]\s*.*$')
RE_SPECIAL = re.compile(r"[^a-zA-Z0-9\s]")

def clean_title_for_matching(title):
    """Normalize movie titles by removing punctuation, years, colons, etc., without truncation."""
    if not isinstance(title, str):
        return ""
    title = title.replace(":", "")  #  FIX: Remove colon instead of truncating after it
    title = RE_TRAIL_GENRES.sub("", title)
    title = RE_YEAR.sub("", title)
    title = RE_QUOTES.sub("", title)
    title = RE_SPECIAL.sub("", title)
    return RE_SPACES.sub(" ", title).strip().lower()

# === Matching Function ===
def match_to_catalogs(title, df_movies, df_extra_movies, cutoff=0.8):
    """
    Try to match a given title to df_movies first, then df_extra_movies.
    If match found, return the official catalog title. Otherwise return None.
    """
    if not isinstance(title, str) or not title.strip():
        return None

    title_cleaned = clean_title_for_matching(title)

    # --- Step 1: Primary catalog (df_movies) ---
    best_primary = process.extractOne(
        title_cleaned,
        df_movies["clean_title"],
        scorer=fuzz.token_sort_ratio,
        score_cutoff=cutoff * 100
    )
    if best_primary is not None:
        matched_title = df_movies.loc[
            df_movies["clean_title"] == best_primary[0], "title"
        ].iloc[0]
        return matched_title

    # --- Step 2: Secondary catalog (df_extra_movies) ---
    best_extra = process.extractOne(
        title_cleaned,
        df_extra_movies["clean_title"],
        scorer=fuzz.token_sort_ratio,
        score_cutoff=cutoff * 100
    )
    if best_extra is not None:
        matched_title = df_extra_movies.loc[
            df_extra_movies["clean_title"] == best_extra[0], "title"
        ].iloc[0]
        return matched_title

    # --- Step 3: No match found ---
    return None

# === Main Replacement Function ===
def replace_titles_with_matched(df_all_recs, df_movies, df_extra_movies, cutoff=0.8):
    """
    Replace each movie_i column in df_greedy_recs with the matched title from df_movies or df_extra_movies.
    If not matched, replace with None.
    """
    # Prepare both catalogs with cleaned titles
    df_movies = df_movies.copy()
    df_extra_movies = df_extra_movies.copy()
    df_movies["clean_title"] = df_movies["title"].apply(clean_title_for_matching)
    df_extra_movies["clean_title"] = df_extra_movies["title"].apply(clean_title_for_matching)

    # Copy df_greedy_recs to avoid modifying original
    df_replaced = df_all_recs.copy()

    # Iterate through each recommendation column
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    for col in tqdm(movie_cols, desc="Matching all movie titles"):
        df_replaced[col] = df_replaced[col].apply(
            lambda x: match_to_catalogs(x, df_movies, df_extra_movies, cutoff)
        )

    return df_replaced

# === Unmatched Movie Extractor ===
def get_unmatched_titles(df_original, df_replaced):
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    unmatched_titles = []

    for col in movie_cols:
        original = df_original[col]
        replaced = df_replaced[col]
        unmatched = original[replaced.isna()]
        unmatched_titles.extend(unmatched.tolist())

    unmatched_unique = pd.Series(unmatched_titles).dropna().unique().tolist()
    return unmatched_unique

# === Example Usage ===
# Make sure df_greedy_recs, df_movies, df_extra_movies are defined above this call
df_replaced = replace_titles_with_matched(df_final_top10, df_movies, df_extra_movies, cutoff=0.8)

# Get unmatched titles
unmatched_titles = get_unmatched_titles(df_final_top10, df_replaced)

# Display result
print(f"\n            Total unmatched titles: {len(unmatched_titles)}")
for title in unmatched_titles:
    print("-", title)




Matching all movie titles: 100%|██████████| 10/10 [01:28<00:00,  8.85s/it]


            Total unmatched titles: 0





In [59]:
def summarize_unmatched_titles(df_replaced):
    """
    Summarize how many None (unmatched) titles remain in df_replaced,
    grouped by (style, template_set).
    """
    movie_cols = [f"movie_{i}" for i in range(1, 11)]
    summary_rows = []

    for (style, temperature), group in df_replaced.groupby(["style", "temperature"]):
        total_titles = group[movie_cols].size               # total number of cells (users × 10)
        none_count = group[movie_cols].isna().sum().sum()   # total number of None (unmatched)
        matched_count = total_titles - none_count

        summary_rows.append({
            "style": style,
            "temperature": temperature,
            "total_titles_checked": total_titles,
            "matched_titles": matched_count,
            "unmatched_titles": none_count,
            "match_rate_%": round(100 * matched_count / total_titles, 2)
        })

    return pd.DataFrame(summary_rows).sort_values(by=["style", "temperature"]).reset_index(drop=True)

# === Run the summary ===
df_summary = summarize_unmatched_titles(df_replaced)
print(df_summary)


            style  temperature  total_titles_checked  matched_titles  \
0    few_shot_cot          0.2                  1000            1000   
1    few_shot_cot          0.5                  1000            1000   
2    few_shot_cot          0.7                  1000            1000   
3    few_shot_cot          1.0                  1000            1000   
4    few_shot_cot          1.2                  1000            1000   
5    few_shot_cot          1.4                  1000            1000   
6    few_shot_cot          1.6                  1000            1000   
7   zero_shot_cot          0.2                  1000            1000   
8   zero_shot_cot          0.5                  1000            1000   
9   zero_shot_cot          0.7                  1000            1000   
10  zero_shot_cot          1.0                  1000            1000   
11  zero_shot_cot          1.2                  1000            1000   
12  zero_shot_cot          1.4                  1000            

In [60]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from scipy.stats import entropy
from rapidfuzz import fuzz


# === RapidFuzz Matching ===
def match_title_to_id(title, catalog_titles, title_to_id, cutoff=0.7):
    """Return movie_id if matched above cutoff; else None."""
    if not isinstance(title, str) or not title.strip():
        return None
    best_match = None
    best_score = 0
    for candidate in catalog_titles:
        score = fuzz.token_sort_ratio(title, candidate) / 100
        if score > best_score:
            best_match = candidate
            best_score = score
    return title_to_id.get(best_match) if best_score >= cutoff else None


# === Unmatched Title Clustering ===
def cluster_unmatched_titles(unmatched_titles, cutoff=0.7):
    """
    Group similar unmatched titles (>= cutoff similarity) using RapidFuzz.
    Returns mapping {title: representative_title}.
    """
    unmatched_titles = list(set([t for t in unmatched_titles if isinstance(t, str) and t.strip()]))
    seen = set()
    mapping = {}

    for t in unmatched_titles:
        if t in seen:
            continue
        mapping[t] = t
        seen.add(t)
        for other in unmatched_titles:
            if other not in seen:
                score = fuzz.token_sort_ratio(t, other) / 100
                if score >= cutoff:
                    mapping[other] = t
                    seen.add(other)
    return mapping


# === Accuracy Metrics ===
def hit_ratio_at_k(rec_ids, gt_ids, k=10):
    return int(any(x in gt_ids for x in rec_ids[:k] if isinstance(x, int)))


def precision_at_k(rec_ids, gt_ids, k=10):
    topk = [x for x in rec_ids[:k] if isinstance(x, int)]
    return len([x for x in topk if x in gt_ids]) / k


def ndcg_at_k(rec_ids, gt_ids, k=10):
    dcg = sum(1 / np.log2(i + 2) for i, x in enumerate(rec_ids[:k]) if isinstance(x, int) and x in gt_ids)
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(gt_ids), k)))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0


# === Fairness/Diversity Metrics ===
def gini_index(counts):
    """Gini coefficient of exposure inequality."""
    sorted_vals = np.sort(np.array(counts))
    n = len(sorted_vals)
    if n == 0 or sorted_vals.sum() == 0:
        return 0.0
    index = np.sum((2 * np.arange(1, n + 1) - n - 1) * sorted_vals)
    return round(index / (n * sorted_vals.sum()), 4)


def natural_entropy(counts):
    """Entropy of exposure diversity."""
    total = sum(counts)
    if total == 0:
        return 0.0
    p = np.array(counts) / total
    return round(entropy(p, base=np.e), 4)


# === Main Evaluation ===
def evaluate_all_metrics(df_sc_recs, df_movies, df_filtered_user_data, cutoff=0.7):
    """
    Evaluate all metrics grouped by (style, template_set):
    - Accuracy: HR@10, Precision@10, NDCG@10 (per user, averaged)
    - Fairness/Diversity: Gini, Entropy (from exposure counts)
    """
    catalog_titles = df_movies["title"].tolist()
    title_to_id = dict(zip(df_movies["title"], df_movies["movie_id"]))
    gt_by_user = df_filtered_user_data.set_index("user_id").to_dict("index")

    metrics_rows = []

    for (style, temperature), group in tqdm(df_sc_recs.groupby(["style", "temperature"]), desc="Evaluating"):
        per_user_recs = {}
        unmatched_titles = []
        all_recs = []

        # === Step 1: Per-user matching (preserve order) ===
        for _, row in group.iterrows():
            uid = row["user_id"]
            rec_list = [row.get(f"movie_{i}", None) for i in range(1, 11)]
            matched_list = []

            for title in rec_list:
                if title is None or (isinstance(title, float) and pd.isna(title)):
                    matched_list.append(None)
                    continue

                mid = match_title_to_id(title, catalog_titles, title_to_id, cutoff)
                if mid is not None:
                    matched_list.append(mid)
                else:
                    matched_list.append(title)
                    unmatched_titles.append(title)

            # keep exactly k=10
            matched_list = (matched_list + [None] * 10)[:10]
            per_user_recs[uid] = matched_list
            all_recs.extend(matched_list)

        # === Step 2: Cluster unmatched titles (RapidFuzz ≥ 0.7) ===
        title_cluster_map = cluster_unmatched_titles(unmatched_titles, cutoff)
        final_recs = [
            title_cluster_map.get(x, x) if isinstance(x, str) else x
            for x in all_recs
        ]

        # === Step 3: Exposure counting (includes IDs, clusters, None) ===
        exposure_counter = Counter(final_recs)
        exposure_counts = list(exposure_counter.values())

        # === Step 4: Accuracy metrics (averaged per user) ===
        hr_list, prec_list, ndcg_list = [], [], []
        for uid, recs in per_user_recs.items():
            gt = gt_by_user.get(uid)
            if not gt:
                continue
            gt_ids = set(gt["ground_truth_total_itemIds"]) - {x[0] for x in gt["sample_random"]}
            if not gt_ids:
                continue

            # Order preserved for NDCG
            rec_ids_only = [x if isinstance(x, int) else None for x in recs]
            hr_list.append(hit_ratio_at_k(rec_ids_only, gt_ids))
            prec_list.append(precision_at_k(rec_ids_only, gt_ids))
            ndcg_list.append(ndcg_at_k(rec_ids_only, gt_ids))

        # === Step 5: Aggregate results ===
        metrics_rows.append({
            "style": style,
            "temperature": temperature,
            "HR@10": round(np.mean(hr_list), 4),
            "Precision@10": round(np.mean(prec_list), 4),
            "NDCG@10": round(np.mean(ndcg_list), 4),
            "Gini": gini_index(exposure_counts),
            "Entropy": natural_entropy(exposure_counts),
            "num_unique_exposed_titles": len(exposure_counter),
            "num_exposure_events": sum(exposure_counts)
        })

    return pd.DataFrame(metrics_rows)

# Run evaluation
df_metrics = evaluate_all_metrics(
    df_sc_recs=df_replaced,
    df_movies=df_movies,
    df_filtered_user_data=df_filtered_user_data,
    cutoff=0.8   # RapidFuzz similarity threshold
)



Evaluating: 100%|██████████| 14/14 [00:31<00:00,  2.28s/it]


In [61]:
df_metrics.head(30)

Unnamed: 0,style,temperature,HR@10,Precision@10,NDCG@10,Gini,Entropy,num_unique_exposed_titles,num_exposure_events
0,few_shot_cot,0.2,0.93,0.329,0.3472,0.6692,4.1121,147,1000
1,few_shot_cot,0.5,0.94,0.33,0.3457,0.6514,4.1242,139,1000
2,few_shot_cot,0.7,0.97,0.355,0.3613,0.6522,4.1599,144,1000
3,few_shot_cot,1.0,0.96,0.37,0.378,0.6497,4.2428,157,1000
4,few_shot_cot,1.2,0.98,0.385,0.3903,0.6354,4.2555,151,1000
5,few_shot_cot,1.4,0.97,0.367,0.3648,0.6422,4.364,176,1000
6,few_shot_cot,1.6,0.98,0.333,0.3014,0.6473,4.3322,182,1000
7,zero_shot_cot,0.2,0.91,0.267,0.2807,0.6744,3.4386,72,1000
8,zero_shot_cot,0.5,0.92,0.295,0.3089,0.7033,3.3767,75,1000
9,zero_shot_cot,0.7,0.91,0.301,0.3127,0.6927,3.3747,72,1000


In [63]:
df_metrics.to_csv("self_consistent/gpt4omini.csv", index=False)


In [6]:
files_t_gt_0 = {
    "self_consistent/gpt4o.csv": "gpt-4o",
    "self_consistent/gpt4omini.csv": "gpt-4o-mini",
    "self_consistent/mistral.csv": "mistral-large-2",
    "self_consistent/mistral-7.csv": "mistral-7B",
}

In [2]:
import pandas as pd

# ====== Config ======
files = {
    "self_consistent/gpt4o.csv": "gpt-4o",
    "self_consistent/gpt4omini.csv": "gpt-4o-mini",
    "self_consistent/mistral.csv": "mistral-large-2",
    "self_consistent/mistral-7.csv": "mistral-7B",
}

# All metrics available
metrics = ["HR@10", "Precision@10", "NDCG@10", "Gini", "Entropy"]

# Only these will be used to compute the average rank
metrics_for_avg = ["Precision@10","NDCG@10", "Gini", "Entropy"]

all_dfs = []

# ====== Step 1: Load and rank ======
for path, model in files.items():
    df = pd.read_csv(path)
    df["model"] = model

    # Rank templates *within each style* (A vs B vs C)
    for metric in metrics:
        if metric == "Gini":  # lower is better
            df[f"{metric}_rank"] = df.groupby("style")[metric].rank(ascending=True, method="min")
        else:  # higher is better
            df[f"{metric}_rank"] = df.groupby("style")[metric].rank(ascending=False, method="min")

    # Average rank only for NDCG, Gini, and Entropy
    rank_cols_for_avg = [f"{m}_rank" for m in metrics_for_avg]
    df["avg_rank"] = df[rank_cols_for_avg].mean(axis=1)

    all_dfs.append(df)

# Combine all models’ results
combined_df = pd.concat(all_dfs, ignore_index=True)

# ====== Step 2: Aggregate per (model, style, temperature) ======
per_model = (
    combined_df
    .groupby(["model", "style", "temperature"])
    .agg(
        {**{m: "mean" for m in metrics},                # average metric values
         **{f"{m}_rank": "mean" for m in metrics},      # average metric ranks
         "avg_rank": "mean"}                            # average of NDCG+Gini+Entropy ranks
    )
    .reset_index()
    .sort_values(["model", "style", "avg_rank"])
)

# ====== Step 3: Add best_template flag per (model, style) ======
per_model["best_temperature"] = per_model.groupby(["model", "temperature"])["avg_rank"].transform(lambda x: x == x.min())



per_model.to_csv("sc_per_model_template_ranking.csv", index=False)


print("Done. Files saved:")
print("Per-model rankings → per_model_template_ranking.csv")


print("\nExample per-model view:")
print(per_model.head(10))


Done. Files saved:
Per-model rankings → per_model_template_ranking.csv

Example per-model view:
     model          style  temperature  HR@10  Precision@10  NDCG@10    Gini  \
4   gpt-4o   few_shot_cot          1.2   1.00         0.619   0.6504  0.5970   
5   gpt-4o   few_shot_cot          1.4   1.00         0.610   0.6416  0.5846   
3   gpt-4o   few_shot_cot          1.0   1.00         0.630   0.6520  0.5986   
0   gpt-4o   few_shot_cot          0.2   1.00         0.617   0.6430  0.5917   
2   gpt-4o   few_shot_cot          0.7   1.00         0.634   0.6583  0.6012   
1   gpt-4o   few_shot_cot          0.5   1.00         0.628   0.6476  0.5977   
6   gpt-4o   few_shot_cot          1.6   1.00         0.535   0.4958  0.5960   
11  gpt-4o  zero_shot_cot          1.2   0.96         0.440   0.4432  0.6429   
9   gpt-4o  zero_shot_cot          0.7   0.98         0.447   0.4504  0.6687   
10  gpt-4o  zero_shot_cot          1.0   0.96         0.444   0.4411  0.6730   

    Entropy  HR@10_rank

In [2]:
per_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44 entries, 4 to 40
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              44 non-null     object 
 1   style              44 non-null     object 
 2   temperature        44 non-null     float64
 3   HR@10              44 non-null     float64
 4   Precision@10       44 non-null     float64
 5   NDCG@10            44 non-null     float64
 6   Gini               44 non-null     float64
 7   Entropy            44 non-null     float64
 8   HR@10_rank         44 non-null     float64
 9   Precision@10_rank  44 non-null     float64
 10  NDCG@10_rank       44 non-null     float64
 11  Gini_rank          44 non-null     float64
 12  Entropy_rank       44 non-null     float64
 13  avg_rank           44 non-null     float64
 14  best_temperature   44 non-null     bool   
dtypes: bool(1), float64(12), object(2)
memory usage: 5.2+ KB


In [3]:
import plotly.graph_objects as go
from pathlib import Path

# === Color map for STYLES ===
# These colors differentiate few_shot_cot vs zero_shot_cot
style_color_map = {
    "few_shot_cot": "#ff7f0e",    # orange
    "zero_shot_cot": "#1f77b4",   # blue
    "few_shot": "#2ca02c",        # green (if needed)
    "zero_shot": "#9467bd",       # purple (if needed)
}

# === Model-family mapping ===
model_to_family = {
    "gpt-4o": "GPT",
    "gpt-4o-mini": "GPT",
    "mistral-7B": "Mistral",
    "mistral-large-2": "Mistral",
}

# === Line style per model ===
# Solid = base model, dotted = mini version
line_styles = {
    "gpt-4o": "solid",
    "gpt-4o-mini": "dot",
    "mistral-7B": "solid",
    "mistral-large-2": "dot",
}

# === Family definitions ===
families = {
    "GPT Family (Self-Consistent)": {
        "models": ["gpt-4o", "gpt-4o-mini"],
        "temps": [0.2, 0.5, 0.7, 1.0, 1.2, 1.4, 1.6],
    },
    "Mistral Family (Self-Consistent)": {
        "models": ["mistral-7B", "mistral-large-2"],
        "temps": [0.2, 0.5, 0.7, 1.0],
    },
}

# === Output folder ===
output_dir = Path("plots_sc")
output_dir.mkdir(exist_ok=True)

# === Determine global y-axis range so both charts share the same scale ===
global_min = per_model["avg_rank"].min()
global_max = per_model["avg_rank"].max()
y_range = [global_min - 0.2, global_max + 0.2]

# === Plot per family ===
for family, config in families.items():
    models = config["models"]
    allowed_temps = config["temps"]

    df_plot = per_model[
        (per_model["model"].isin(models)) &
        (per_model["temperature"].isin(allowed_temps))
    ].copy()

    if df_plot.empty:
        continue

    fig = go.Figure()

    # === Draw each style × model ===
    for style in sorted(df_plot["style"].unique()):
        for model in models:
            subset = (
                df_plot[
                    (df_plot["model"] == model) &
                    (df_plot["style"] == style)
                ]
                .sort_values("temperature")
            )
            if subset.empty:
                continue

            # color → depends on style, not model family
            color = style_color_map.get(style, "#000000")
            dash = line_styles.get(model, "solid")

            fig.add_trace(go.Scatter(
                x=subset["temperature"],
                y=subset["avg_rank"],
                mode="lines+markers",
                name=f"{style}, {model}",
                line=dict(color=color, dash=dash, width=2.5),
                marker=dict(size=7, symbol="circle"),
                hovertemplate=(
                    f"<b>{model}</b><br>Style: {style}"
                    "<br>Temp=%{x}<br>Avg Rank=%{y:.2f}<extra></extra>"
                ),
            ))

    # === Layout settings ===
    fig.update_layout(
        title=f"<b>{family}: Avg Rank vs Temperature</b>",
        xaxis_title="Temperature (T)",
        yaxis_title="Average Rank (Lower = Better)",
        template="plotly_white",
        width=700,
        height=400,
        font=dict(family="Times New Roman", size=14),
        margin=dict(l=60, r=80, t=60, b=80),
        xaxis=dict(
            tickmode="array",
            tickvals=allowed_temps,
            ticktext=[str(t) for t in allowed_temps],
            showgrid=True,
            gridcolor="rgba(200,200,200,0.3)"
        ),
        yaxis=dict(
            range=y_range,
            showgrid=True,
            gridcolor="rgba(200,200,200,0.3)"
        ),
        legend=dict(
            orientation="h",
            y=-0.28,
            x=0.5,
            xanchor="center",
            font=dict(size=12)
        ),
    )

    # === Save & Show ===
    filename = output_dir / f"{family.lower().replace(' ', '_')}_avg_rank_vs_temp.html"
    fig.write_html(str(filename))
    fig.show()


In [6]:
per_model.head()

Unnamed: 0,model,style,temperature,HR@10,Precision@10,NDCG@10,Gini,Entropy,HR@10_rank,Precision@10_rank,NDCG@10_rank,Gini_rank,Entropy_rank,avg_rank,best_temperature
4,gpt-4o,few_shot_cot,1.2,1.0,0.619,0.6504,0.597,4.7128,1.0,4.0,3.0,4.0,2.0,3.25,False
5,gpt-4o,few_shot_cot,1.4,1.0,0.61,0.6416,0.5846,4.7432,1.0,6.0,6.0,1.0,1.0,3.5,True
3,gpt-4o,few_shot_cot,1.0,1.0,0.63,0.652,0.5986,4.6647,1.0,2.0,2.0,6.0,5.0,3.75,False
0,gpt-4o,few_shot_cot,0.2,1.0,0.617,0.643,0.5917,4.6689,1.0,5.0,5.0,2.0,4.0,4.0,True
2,gpt-4o,few_shot_cot,0.7,1.0,0.634,0.6583,0.6012,4.6505,1.0,1.0,1.0,7.0,7.0,4.0,False


In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

# === Colors for prompt styles ===
style_color_map = {
    "few_shot_cot": "#ff7f0e",   # orange
    "zero_shot_cot": "#1f77b4",  # blue
}

# === Line styles per model (solid = base, dot = smaller variant) ===
line_styles = {
    "gpt-4o": "solid",
    "gpt-4o-mini": "dot",
    "mistral-7B": "dot",
    "mistral-large-2": "solid",
}

# === Family configuration ===
families = {
    "GPT Family (Self-Consistent)": {
        "models": ["gpt-4o", "gpt-4o-mini"],
        "temps": [0.2, 0.5, 0.7, 1.0, 1.2, 1.4, 1.6],
    },
    "Mistral Family (Self-Consistent)": {
        "models": ["mistral-7B", "mistral-large-2"],
        "temps": [0.2, 0.5, 0.7, 1.0],
    },
}

# === Output folder ===
output_dir = Path("plots_sc_combined")
output_dir.mkdir(exist_ok=True)

# === Shared Y-axis scale ===
global_min = per_model["avg_rank"].min()
global_max = per_model["avg_rank"].max()
y_range = [global_min - 0.2, global_max + 0.2]

# === Create side-by-side subplots ===
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        "<b>GPT-4o Family</b>",
        "<b>Mistral Family</b>"
    ),
    shared_yaxes=True,
    horizontal_spacing=0.08
)

# === Populate each subplot ===
for i, (family, config) in enumerate(families.items(), start=1):
    models = config["models"]
    allowed_temps = config["temps"]

    df_plot = per_model[
        (per_model["model"].isin(models)) &
        (per_model["temperature"].isin(allowed_temps))
    ].copy()

    for style in sorted(df_plot["style"].unique()):
        for model in models:
            subset = (
                df_plot[
                    (df_plot["model"] == model) &
                    (df_plot["style"] == style)
                ]
                .sort_values("temperature")
            )
            if subset.empty:
                continue

            color = style_color_map.get(style, "#000000")
            dash = line_styles.get(model, "solid")

            fig.add_trace(
                go.Scatter(
                    x=subset["temperature"],
                    y=subset["avg_rank"],
                    mode="lines+markers+text",
                    name=f"{style.replace('_', ' ')} – {model}",
                    line=dict(color=color, dash=dash, width=2.5),
                    marker=dict(size=6, symbol="circle"),
                    hovertemplate=(
                        f"<b>{model}</b><br>Style: {style}"
                        "<br>Temp=%{x}<br>Avg Rank=%{y:.2f}<extra></extra>"
                    ),
                ),
                row=1, col=i
            )

# === Layout ===
fig.update_layout(
    template="plotly_white",
    width=650,   # compact for paper
    height=320,  # smaller height
    font=dict(family="Times New Roman", size=12),
    legend=dict(
        orientation="h",
        y=-0.3,
        x=0.5,
        xanchor="center",
        font=dict(size=10)
    ),
    margin=dict(l=55, r=40, t=60, b=55),
    hovermode="x unified",
)

# === Axis formatting ===
for i, (_, config) in enumerate(families.items(), start=1):
    allowed_temps = config["temps"]
    fig.update_xaxes(
        tickvals=allowed_temps,
        ticktext=[str(t) for t in allowed_temps],
        title_text="<b>Temperature (T)</b>",
        title_font=dict(size=11, family="Times New Roman", color="black"),
        tickfont=dict(size=10, family="Times New Roman", color="black"),
        row=1, col=i
    )
fig.update_yaxes(
    title_text="<b>Average Rank (Lower = Better)</b>",
    range=y_range,
    title_font=dict(size=11, family="Times New Roman", color="black"),
    tickfont=dict(size=10, family="Times New Roman", color="black"),
    showgrid=True, gridcolor="rgba(200,200,200,0.3)"
)

# === Bold subplot titles ===
fig.update_annotations(font=dict(size=13, family="Times New Roman", color="black"))

# === Save & display ===
fig.write_html(str(output_dir / "combined_avg_rank_vs_temp.html"))
# Optional high-res PNG for LaTeX inclusion
# fig.write_image(str(output_dir / "combined_avg_rank_vs_temp.png"), scale=3)
fig.write_image("combined_avg_rank_vs_temp.pdf", format="pdf", scale=3)  # <-- PDF export
fig.show()
