# Movie Recommendation System

This notebook demonstrates a practical application of a movie recommender system using LLM-based techniques (e.g., Phi-3.5-mini-instruct).

In [1]:
!pip show torchvision
!pip install torchvision==0.16.1
!python3 -m venv myenv
!source myenv/bin/activate
!pip install --upgrade pip setuptools
!pip uninstall -y transformers sentence-transformers
!pip install transformers==4.37.0 sentence-transformers torch torchvision torchaudio
!pip install --upgrade "tf-keras<2.16.0"
!python -c "import torch; print(torch.__version__)"


Name: torchvision
Version: 0.16.1
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /opt/miniconda3/lib/python3.10/site-packages
Requires: numpy, pillow, requests, torch
Required-by: 

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Downloading setuptools-80.7.1-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-80.7.1-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━

In [2]:
import torch
print(torch.__version__)
print(torch.backends.mps.is_available())  # Should return True if MPS is available

2.1.1
True


In [3]:
pip install transformers==4.43.0

Collecting transformers==4.43.0
  Using cached transformers-4.43.0-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.43.0)
  Using cached tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Using cached transformers-4.43.0-py3-none-any.whl (9.4 MB)
Using cached tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl (2.4 MB)
Installing collected packages: tokenizers, transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.15.2
[2K    Uninstalling tokenizers-0.15.2:
[2K      Successfully uninstalled tokenizers-0.15.2
[2K  Attempting uninstall: transformers
[2K    Found existing installation: transformers 4.37.0
[2K    Uninstalling transformers-4.37.0:
[2K      Successfully uninstalled transformers-4.37.0━━━━━━━━━━━━━━━━[0m [32m1/2[0m [transformers]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [transformers][0m [transformers]
[1A[2KSuccessfully installed

In [4]:
import transformers
print(transformers.__version__)

4.43.0


In [None]:
import pandas as pd
import torch
import numpy as np
import re
import random
from collections import Counter
from fuzzywuzzy import fuzz
from scipy.stats import entropy
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict
import matplotlib.pyplot as plt

# Check for available device
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")



Using device: mps


In [6]:
# Load LLM model and tokenizer
model_name = "microsoft/Phi-3.5-mini-instruct"
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, max_memory={device: "10GB"}, trust_remote_code=True, force_download=True).to(device)
print("Model and tokenizer loaded successfully.")


Loading model and tokenizer...


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


In [7]:
data_path = "/Users/tomi/Desktop/Thesis/Data/"

# Load data
print("Loading datasets...")
df_users_ml = pd.read_csv(data_path + "df_user_ml-1m.csv")
df_items_ml = pd.read_csv(data_path + "df_item_ml-1m.csv")
test_data_ml1m_fullInteraction= pd.read_csv(data_path + "test_data_ml1m_fullInteraction_80users.csv")
print("Datasets loaded successfully.")

print("First 10 rows of df_users_ml:")
print(df_users_ml.head(10))

print("\nFirst 10 rows of df_items_ml:")
print(df_items_ml.head(10))

print("\nFirst 10 rows of test_data_ml1m_fullInteraction:")
print(test_data_ml1m_fullInteraction.head(10))

Loading datasets...
Datasets loaded successfully.
First 10 rows of df_users_ml:
   userId gender  age  occupation zipCode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455
5       6      F   50           9   55117
6       7      M   35           1   06810
7       8      M   25          12   11413
8       9      M   25          17   61614
9      10      F   35           1   95370

First 10 rows of df_items_ml:
   itemId                               title                        genres
0       1                    Toy Story (1995)   Animation|Children's|Comedy
1       2                      Jumanji (1995)  Adventure|Children's|Fantasy
2       3             Grumpier Old Men (1995)                Comedy|Romance
3       4            Waiting to Exhale (1995)                  Comedy|Drama
4       5  Father of the Bride Part II (1995)  

In [8]:
print(model)  # Should print the model architecture, not None
print(tokenizer)  # Should print tokenizer details

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out

In [9]:
# Function to normalize movie names
def normalize_list(lst):
    return [re.sub(r'[^a-zA-Z0-9 ]', '', s.lower().strip()) for s in lst]

### S1 simple model

In [14]:
#movie_list = [
#    "The Matrix", "Inception", "Interstellar", "Blade Runner", "Memento", 
#    "Arrival", "Gravity", "Ex Machina", "Looper", "Primer", 
#    "The Prestige", "Minority Report", "Eternal Sunshine of the Spotless Mind", 
#    "Source Code", "Predestination"
#]

#movie_list = df_items_ml['title'].iloc[:100].tolist()
#movie_list = df_items_ml[~df_items_ml['title'].isin(user_movie_titles)]['title'].sample(300, random_state=42).tolist()
movie_list = df_items_ml['title'].sample(300, random_state=42).tolist()


In [15]:
def get_recommendations(user_movies_string, candidate_movies):
    prompt = f"""
    Based on these movies: {user_movies_string}, recommend 10 movies that the user will likely enjoy.\n
    Choose only from the following list:\n{', '.join(candidate_movies)}\n
    Return only the movie titles, comma-separated, with no extra text.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Filter only valid movies
    recommendations = [title.strip() for title in response.split(",") if title.strip() in candidate_movies]
    return recommendations

In [117]:
#def get_recommendations(user_movies_string, candidate_movies):
#    prompt = f"""
#    Based on these movies: {user_movies_string}, recommend 10 movies that the user will likely enjoy.\n
#    Choose only from the following list:\n{', '.join(candidate_movies)}\n
#    Return only the movie titles, comma-separated, with no extra text.
#    """
#    inputs = tokenizer(prompt, return_tensors="pt").to(device)
#    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.9)
#    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the movie titles
#    if "Return only the movie titles" in response:
#        response = response.split("Return only the movie titles")[-1].strip()
    
#    recommendations = [title.strip() for title in response.split(",") if title.strip() in candidate_movies]
    
    # Remove duplicates
#    recommendations = list(dict.fromkeys(recommendations))
    
    # Remove already-watched movies
#    recommendations = [title for title in recommendations if title not in user_movie_titles]
    
    # Trim to top 10
#    recommendations = recommendations[:10]
    
    # Backfill if fewer than 10
#    while len(recommendations) < 10:
#        fallback = random.choice(candidate_movies)
#        if fallback not in recommendations and fallback not in user_movie_titles:
#            recommendations.append(fallback)

#    return recommendations

In [None]:
# Evaluation Metrics
def hit_rate(recommendations, ground_truth, top_k=3, threshold=60):
    print("Calculating hit rate...")

    # Normalize to lowercase and remove extra spaces
    def normalize_list(lst):
        return [movie.lower().strip() for movie in lst]

    recommendations_normalized = normalize_list(recommendations)
    ground_truth_normalized = normalize_list(ground_truth)

    # Count how many of the top-k recommendations match any ground truth movie
    hits = sum(
        any(fuzz.partial_ratio(rec, truth) >= threshold for truth in ground_truth_normalized)
        for rec in recommendations_normalized[:top_k]
    )

    hit_rate_value = hits / top_k if top_k > 0 else 0
    print(f"Hit Rate: {hit_rate_value:.4f}")
    return hit_rate_value

def average_rank(recommendations, ground_truth):
    print("Calculating average rank...")
    recommendations_normalized = normalize_list(recommendations)
    ground_truth_normalized = normalize_list(ground_truth)
    
    # Find ranks of ground truth movies in recommendations
    ranks = [
        next((idx + 1 for idx, rec in enumerate(recommendations_normalized) if rec == item), None)
        for item in ground_truth_normalized
    ]
    
    # Filter out None values (i.e., movies that were not found)
    ranks = [r for r in ranks if r is not None]
    
    # If no movies match, set the rank to a default value (e.g., max rank + 1 or NaN)
    if len(ranks) == 0:
        avg_rank_value = np.nan  # Indicating no matches found
    else:
        avg_rank_value = sum(ranks) / len(ranks)
    
    print(f"Average Rank: {avg_rank_value}")
    return avg_rank_value

def hhi(recommendations):
    print("Calculating HHI...")
    counter = Counter(recommendations)
    total = sum(counter.values())

    # Correct formula: sum of squared shares of each item
    hhi_value = sum((count / total) ** 2 for count in counter.values())
    
    print(f"HHI: {hhi_value:.6f}")
    return hhi_value

def entropy(recommendations):
    print("Calculating entropy...")
    counter = Counter(recommendations)
    total = sum(counter.values())

    if total == 0:
        entropy_value = 0  # Avoid log(0) errors
    else:
        # Avoid log(0) errors and calculate entropy
        entropy_value = -sum((count / total) * np.log2(count / total) for count in counter.values() if count > 0)
    
    print(f"Entropy: {entropy_value:.6f}")
    return entropy_value

def gini_index(recommendations):
    print("Calculating Gini Index...")
    
    if not recommendations:
        return 0

    # Count occurrences of each recommended movie
    values = np.array(list(Counter(recommendations).values()))
    values = values / values.sum()  # Normalize frequencies

    # Compute Gini coefficient
    n = len(values)
    values.sort()
    index = np.arange(1, n + 1)  # Index from 1 to n
    gini_value = (2 * np.sum(index * values) - (n + 1) * np.sum(values)) / (n * np.sum(values))
    
    print(f"Gini Index: {gini_value:.6f}")
    return gini_value

def recall_at_k(preds, truths, k=3):
    preds_at_k = preds[:k]
    hits = len(set(preds_at_k) & set(truths))
    return hits / len(set(truths)) if truths else 0.0
    
def dcg_at_k(recs, truths, k):
    dcg = 0.0
    for i, item in enumerate(recs[:k]):
        if item in truths:
            dcg += 1 / np.log2(i + 2)
    return dcg

def ndcg_at_k(recs, truths, k):
    ideal_dcg = dcg_at_k(truths, truths, k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(recs, truths, k) / ideal_dcg

def gini_index_scores(x):
    x = np.array(x, dtype=np.float64)
    if np.amin(x) < 0:
        x -= np.amin(x)
    x += 1e-6  # avoid zero division
    x_sorted = np.sort(x)
    n = len(x)
    cumx = np.cumsum(x_sorted)
    return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n

In [18]:
target_user_id = 19

# History
user_history = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == target_user_id]
user_movie_titles = df_items_ml[df_items_ml['itemId'].isin(user_history['itemId'])]['title'].tolist()
input_movies = user_movie_titles[:5]
user_movies_string = ', '.join(input_movies)

# Candidates
candidate_movies = df_items_ml[~df_items_ml['title'].isin(user_movie_titles)]['title'].sample(120, random_state=42).tolist()
full_movie_list = input_movies + candidate_movies

# Get recommendations
recommendations = get_recommendations(user_movies_string, full_movie_list)
print("Generated Recommendations:", recommendations)


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Generated Recommendations: ['Taxi Driver (1976)', 'Clerks (1994)', 'Highlander III: The Sorcerer (1994)', 'Taxi Driver (1976)', 'Clerks (1994)', 'Highlander III: The Sorcerer (1994)', 'Payback (1999)', "She's Gotta Have It (1986)", 'Limelight (1952)', 'Pather Panchali (1955)', 'Network (1976)', 'Destination Moon (1950)', 'Tales from the Hood (1995)', 'Manhattan (1979)', 'Dream for an Insomniac (1996)', 'Playing God (1997)', 'Anna Karenina (1997)', "Breakfast at Tiffany's (1961)", 'Get Carter (2000)', 'Scent of a Woman (1992)', "Monty Python's Life of Brian (1979)", 'Solas (1999)', 'Marlene Dietrich: Shadow and Light (1996)', 'Dead Ringers (1988)', 'Urban Legends: Final Cut (2000)', 'Marie Baie Des Anges (1997)', '2001: A Space Odyssey (1968)', 'Fire on the Mountain (1996)', 'Lifeforce (1985)', 'Grosse Pointe Blank (1997)', 'Dream Man (1995)', 'Cutthroat Island (1995)', 'Communion (1989)', 'Gothic (1986)', 'Boys on the Side (1995)', 'Babymother (1998)', 'Baby... Secret of the Lost Legen

Evaluation

In [19]:
# Step 4: Split the titles into prompt input and ground truth
input_movies = user_movie_titles[:3]  # For the model input
ground_truth_movies = user_movie_titles[3:]  # For evaluation

# Step 5: Format input movies into a string for prompting
#user_movies_string = ', '.join(input_movies)

print("Generated Recommendations:", recommendations)

# Evaluate
hit_rate_value = hit_rate(recommendations, ground_truth_movies)
average_rank_value = average_rank(recommendations, ground_truth_movies)
hhi_value = hhi(recommendations)
entropy_value = entropy(recommendations)
gini_value = gini_index(recommendations)

print("\n")
print("User movie history:", user_movie_titles)
print("Recommended movies:", recommendations)
print("Overlap with history:", set(recommendations) & set(user_movie_titles))
print("\n")

# Print evaluation results
print("\nEvaluation Metrics:")
print("Hit Rate:", hit_rate_value)
print("Average Rank:", average_rank_value)
print("HHI:", hhi_value)
print("Entropy:", entropy_value)
print("Gini Index:", gini_value)

Generated Recommendations: ['Taxi Driver (1976)', 'Clerks (1994)', 'Highlander III: The Sorcerer (1994)', 'Taxi Driver (1976)', 'Clerks (1994)', 'Highlander III: The Sorcerer (1994)', 'Payback (1999)', "She's Gotta Have It (1986)", 'Limelight (1952)', 'Pather Panchali (1955)', 'Network (1976)', 'Destination Moon (1950)', 'Tales from the Hood (1995)', 'Manhattan (1979)', 'Dream for an Insomniac (1996)', 'Playing God (1997)', 'Anna Karenina (1997)', "Breakfast at Tiffany's (1961)", 'Get Carter (2000)', 'Scent of a Woman (1992)', "Monty Python's Life of Brian (1979)", 'Solas (1999)', 'Marlene Dietrich: Shadow and Light (1996)', 'Dead Ringers (1988)', 'Urban Legends: Final Cut (2000)', 'Marie Baie Des Anges (1997)', '2001: A Space Odyssey (1968)', 'Fire on the Mountain (1996)', 'Lifeforce (1985)', 'Grosse Pointe Blank (1997)', 'Dream Man (1995)', 'Cutthroat Island (1995)', 'Communion (1989)', 'Gothic (1986)', 'Boys on the Side (1995)', 'Babymother (1998)', 'Baby... Secret of the Lost Legen

In [None]:
recall_value = recall_at_k(recommendations, ground_truth_movies, k=5)
ndcg_value = ndcg_at_k(recommendations, ground_truth_movies, k=5)

print("Recall@5:", recall_value)
print("NDCG@5:", ndcg_value)

Recall@3: 0.020833333333333332
NDCG@3: 0.23463936301137822


### S2 Genre-focused model

In [32]:
def get_recommendations_genre_strict(user_history, df_items, candidate_movies_df):
    # Get top genres from user history
    user_genres = df_items[df_items['itemId'].isin(user_history['itemId'])]['genres'].str.split('|').explode().value_counts()
    top_genres = user_genres.index[:3]

    results = []
    for _, row in candidate_movies_df.iterrows():
        genres = set(row['genres'].split('|'))
        genre_match = len(genres & set(top_genres))
        popularity = row['normalized_popularity']
        score = genre_match + 0.1 * popularity  # Tunable weight

        results.append((row['title'], score))

    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)
    recommended_titles = [title for title, _ in sorted_results[:10]]

    # Build the user_movies_string and prompt
    user_movies_string = ", ".join(df_items[df_items['itemId'].isin(user_history['itemId'])]['title'].tolist())
    prompt = (
        f"Provide 10 movie recommendations that strictly match the user's favorite genres.\n"
        f"User history includes: {user_movies_string}.\n"
        f"Focus on genre overlap with top genres: {', '.join(top_genres)}."
    )

    return recommended_titles, prompt


In [34]:
target_user_id_s2 = 19
user_history_s2 = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == target_user_id_s2]
user_movie_titles_s2 = df_items_ml[df_items_ml['itemId'].isin(user_history_s2['itemId'])]['title'].tolist()
input_movies_s2 = user_movie_titles_s2[:5]
user_movies_string_s2 = ', '.join(input_movies_s2)

user_genres = df_items_ml[df_items_ml['title'].isin(user_movie_titles_s2)]['genres'].str.split('|').explode().value_counts()
top_genres = user_genres.index[:3]
mask = df_items_ml['genres'].apply(lambda g: any(genre in g.split('|') for genre in top_genres))
candidate_movies_df = df_items_ml[mask & (~df_items_ml['title'].isin(user_movie_titles_s2))].copy()

movie_popularity = test_data_ml1m_fullInteraction['itemId'].value_counts()
candidate_movies_df['popularity'] = candidate_movies_df['itemId'].map(movie_popularity).fillna(0)
max_popularity = candidate_movies_df['popularity'].max()
candidate_movies_df['normalized_popularity'] = candidate_movies_df['popularity'] / max_popularity

# Optional: small noise to avoid flat Gini
candidate_movies_df['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df))

candidate_movies_df = candidate_movies_df.sample(120, random_state=42)
candidate_movies_s2 = candidate_movies_df['title'].tolist()
full_movie_list_s2 = input_movies_s2 + candidate_movies_s2

recommendations_s2 = get_recommendations_genre_strict(user_history_s2, df_items_ml, candidate_movies_df)
#recommendations_s2 = get_recommendations_genre_focused(user_history_s2, df_items_ml, full_movie_list_s2)
#recommendations_s2 = get_recommendations_genre_focused(
#    user_history_s2, 
#    df_items_ml, 
#    full_movie_list_s2, 
#    candidate_movies_df=candidate_movies_df
#)

In [None]:
# Show genres for recommendations
for movie in recommendations_s2:
    genres = df_items_ml[df_items_ml['title'] == movie]['genres'].values
    print(f"{movie} -> Genres: {genres[0] if len(genres) > 0 else 'Unknown'}")

# Define input and ground truth
input_movies_s2 = user_movie_titles_s2[:3]
ground_truth_movies_s2 = user_movie_titles_s2[3:]

# Recommendation quality metrics
def hit_rate(recommendations, ground_truth):
    return int(any(item in recommendations for item in ground_truth))

def average_rank(recommendations, ground_truth):
    ranks = [i for i, item in enumerate(recommendations) if item in ground_truth]
    return np.mean(ranks) if ranks else float('nan')

# Diversity metrics
def hhi(recommendations):
    freqs = Counter(recommendations)
    total = sum(freqs.values())
    return sum((count / total) ** 2 for count in freqs.values()) if total else 0

def entropy(recommendations):
    freqs = Counter(recommendations)
    total = sum(freqs.values())
    return -sum((count / total) * np.log2(count / total) for count in freqs.values() if count > 0) if total else 0

def gini_index(recommendations):
    freqs = Counter(recommendations).values()
    sorted_vals = sorted(freqs)
    n = len(sorted_vals)
    cumulative = 0
    total = sum(sorted_vals)
    for i, val in enumerate(sorted_vals):
        cumulative += (2 * (i + 1) - n - 1) * val
    return cumulative / (n * total) if total != 0 else 0.0

# Evaluation
print("\nGenerated Recommendations (S2):", recommendations_s2)
print("User movie history (S2):", user_movie_titles_s2)
print("Overlap with history (S2):", set(recommendations_s2) & set(user_movie_titles_s2))

hit_rate_value_s2 = hit_rate(recommendations_s2, ground_truth_movies_s2)
average_rank_value_s2 = average_rank(recommendations_s2, ground_truth_movies_s2)
hhi_value_s2 = hhi(recommendations_s2)
entropy_value_s2 = entropy(recommendations_s2)
gini_value_s2 = gini_index(recommendations_s2)

print("\nEvaluation Metrics (S2 Genre-focused):")
print("Hit Rate:", hit_rate_value_s2)
print("Average Rank:", average_rank_value_s2)
print("HHI:", hhi_value_s2)
print("Entropy:", entropy_value_s2)
print("Gini Index:", gini_value_s2)


Taxi Driver (1976) -> Genres: Drama|Thriller
Clerks (1994) -> Genres: Comedy
Highlander III: The Sorcerer (1994) -> Genres: Action|Sci-Fi
Foreign Student (1994) -> Genres: Drama
Tom Jones (1963) -> Genres: Comedy
Sister Act (1992) -> Genres: Comedy|Crime
Dirty Work (1998) -> Genres: Comedy
SubUrbia (1997) -> Genres: Comedy
Traveller (1997) -> Genres: Drama
Chushingura (1962) -> Genres: Drama
Raging Bull (1980) -> Genres: Drama
Wilde (1997) -> Genres: Drama
Jack Frost (1998) -> Genres: Comedy|Drama
Trees Lounge (1996) -> Genres: Drama
Big Night (1996) -> Genres: Drama
Pulp Fiction (1994) -> Genres: Crime|Drama
You So Crazy (1994) -> Genres: Comedy
Victor/Victoria (1982) -> Genres: Comedy|Musical
Local Hero (1983) -> Genres: Comedy
I Am Cuba (Soy Cuba/Ya Kuba) (1964) -> Genres: Drama
Clue (1985) -> Genres: Comedy|Mystery
Roger & Me (1989) -> Genres: Comedy|Documentary
Blue Juice (1995) -> Genres: Comedy|Drama
Police Academy 5: Assignment: Miami Beach (1988) -> Genres: Comedy
Lost & Found

In [318]:
user_ids = test_data_ml1m_fullInteraction['userId'].unique()
metrics = []

for user_id in user_ids:
    user_history = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == user_id]
    user_movie_titles = df_items_ml[df_items_ml['itemId'].isin(user_history['itemId'])]['title'].tolist()
    
    if len(user_movie_titles) < 6:
        continue  # Skip users with too few movies
    
    input_movies = user_movie_titles[:5]
    ground_truth = user_movie_titles[5:]
    
    user_genres = df_items_ml[df_items_ml['title'].isin(user_movie_titles)]['genres'].str.split('|').explode().value_counts()
    top_genres = user_genres.index[:3]
    
    mask = df_items_ml['genres'].apply(lambda g: any(genre in g.split('|') for genre in top_genres))
    #candidate_movies_df = df_items_ml[mask & (~df_items_ml['title'].isin(user_movie_titles))].copy()
    candidate_movies_df = df_items_ml[mask & (~df_items_ml['title'].isin(input_movies))].copy()

    movie_popularity = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df['popularity'] = candidate_movies_df['itemId'].map(movie_popularity).fillna(0)
    max_popularity = candidate_movies_df['popularity'].max()
    candidate_movies_df['normalized_popularity'] = candidate_movies_df['popularity'] / max_popularity
    candidate_movies_df['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df))

    candidate_movies_df = candidate_movies_df.sample(min(len(candidate_movies_df), 120), random_state=42)
    candidate_titles = candidate_movies_df['title'].tolist()
    full_movie_list = input_movies + candidate_titles

    recs = get_recommendations_genre_focused(user_history, df_items_ml, full_movie_list)

    metrics.append({
        "hit_rate": hit_rate(recs, ground_truth),
        "avg_rank": average_rank(recs, ground_truth),
        "hhi": hhi(recs),
        "entropy": entropy(recs),
        "gini": gini_index(recs)
    })

# Compute averages
df_metrics = pd.DataFrame(metrics)
print("\nAverage Metrics Across All Users:")
print(df_metrics.mean())





Average Metrics Across All Users:
hit_rate     0.675000
avg_rank    47.495388
hhi          0.010935
entropy      6.520627
gini         0.004743
dtype: float64


In [319]:
print("User ID:", user_id)
print("Input Movies:", input_movies)
print("Ground Truth:", ground_truth)
print("Recommendations:", recs[:10])
print("Overlap:", set(recs) & set(ground_truth))


User ID: 6031
Input Movies: ['Seven (Se7en) (1995)', 'Lion King, The (1994)', 'Tombstone (1993)', 'Terminator, The (1984)', 'Indiana Jones and the Last Crusade (1989)']
Ground Truth: ['Star Trek IV: The Voyage Home (1986)', 'Contact (1997)', 'House of Yes, The (1997)', 'King and I, The (1956)', 'Harvey (1950)', 'Thelma & Louise (1991)']
Recommendations: ['Tombstone (1993)', 'Indiana Jones and the Last Crusade (1989)', 'Bulletproof (1996)', 'Jaws 3-D (1983)', 'Cape Fear (1962)', 'Secret Agent (1936)', 'Blown Away (1994)', 'Death and the Maiden (1994)', 'Snake Eyes (1998)', 'Love Walked In (1998)']
Overlap: set()


In [None]:
user_ids_s2 = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s2 = []
all_rec_titles = []

for user_id in user_ids_s2:
    user_history = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == user_id]
    user_movie_titles = df_items_ml[df_items_ml['itemId'].isin(user_history['itemId'])]['title'].tolist()
    
    if len(user_movie_titles) < 6:
        continue
    
    input_movies = user_movie_titles[:5]
    ground_truth = user_movie_titles[5:]
    
    user_genres = df_items_ml[df_items_ml['title'].isin(user_movie_titles)]['genres'].str.split('|').explode().value_counts()
    top_genres = user_genres.index[:3]
    
    mask = df_items_ml['genres'].apply(lambda g: any(genre in g.split('|') for genre in top_genres))
    candidate_movies_df = df_items_ml[mask & (~df_items_ml['title'].isin(input_movies))].copy()

    movie_popularity = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df['popularity'] = candidate_movies_df['itemId'].map(movie_popularity).fillna(0)
    max_popularity = candidate_movies_df['popularity'].max()
    candidate_movies_df['normalized_popularity'] = candidate_movies_df['popularity'] / max_popularity
    candidate_movies_df['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df))

    candidate_movies_df = candidate_movies_df.sample(min(len(candidate_movies_df), 120), random_state=42)

    recs = get_recommendations_genre_strict(user_history, df_items_ml, candidate_movies_df)
    rec_titles = recs[0] if isinstance(recs, tuple) else recs
    all_rec_titles.extend(rec_titles)  # collect for global diversity metrics
    
    metrics_s2.append({
        "hit_rate": hit_rate(rec_titles, ground_truth),
        "avg_rank": average_rank(rec_titles, ground_truth),
        "hhi": hhi(rec_titles),
        "entropy": entropy(rec_titles)
    })

# Add Gini after all recommendations
rec_title_counts = Counter(all_rec_titles)
gini_val = gini_index_scores(list(rec_title_counts.values()))

# Compute averages
df_metrics = pd.DataFrame(metrics_s2)
df_metrics.loc["Average"] = df_metrics.mean()
df_metrics.at["Average", "gini"] = gini_val

print("\nAverage Metrics for First 10 Users:")
print(df_metrics.loc["Average"])


Calculating hit rate...
Hit Rate: 1.0000
Calculating average rank...
Average Rank: nan
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate...
Hit Rate: 0.6667
Calculating average rank...
Average Rank: nan
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate...
Hit Rate: 1.0000
Calculating average rank...
Average Rank: 4.666666666666667
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate...
Hit Rate: 1.0000
Calculating average rank...
Average Rank: nan
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate...
Hit Rate: 1.0000
Calculating average rank...
Average Rank: nan
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate...
Hit Rate: 0.6667
Calculating average rank...
Average Rank: nan
Calculating HHI...
HHI: 0.100000
Calculating entropy...
Entropy: 3.321928
Calculating hit rate

In [None]:
def normalize_titles(titles):
    return [t.strip().lower() for t in titles]

rec_titles_norm = normalize_titles(rec_titles)
ground_truth_norm = normalize_titles(ground_truth_movies)

recall_value = recall_at_k(rec_titles_norm, ground_truth_norm, k=5)
ndcg_value = ndcg_at_k(rec_titles_norm, ground_truth_norm, k=5)

print("Recall@5:", recall_value)
print("NDCG@5:", ndcg_value)


Recall@5: 0.020833333333333332
NDCG@5: 0.14606834984270645


### S3 Diversify model

In [57]:
def get_recommendations_s3_diversify(user_history, df_items, candidate_movies_df, k=10):
    # 1) Genre weights from user history
    merged = pd.merge(user_history, df_items, on='itemId')
    genre_scores = {}
    for _, r in merged.iterrows():
        for g in r['genres'].split('|'):
            genre_scores[g] = genre_scores.get(g, 0) + r['rating']

    # 2) Score candidates
    results = []
    for idx, r in candidate_movies_df.iterrows():
        gs = sum(genre_scores.get(g, 0) for g in r['genres'].split('|'))
        pop = r['normalized_popularity']
        noise = np.random.normal(0, 0.5)
        bias = idx * 1e-4
        score = gs + 0.7 * pop + noise + bias
        results.append((r['title'], score))

        # 3) Take top-k
    topk = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    titles, raw_scores = zip(*topk)

    # 4) Normalize & diversify scores for Gini
    raw_scores = np.array(raw_scores)
    if np.std(raw_scores) < 1e-3:
        # Inject diversity if flat
        raw_scores += np.random.normal(0, 0.1, size=len(raw_scores))

    raw_scores -= raw_scores.min()  # shift to positive
    raw_scores += 1e-6              # avoid all-zero
    scores = raw_scores.tolist()

    # 5) Build prompt
    user_titles = df_items[df_items['itemId'].isin(user_history['itemId'])]['title'].tolist()
    user_movies_string = ", ".join(user_titles)
    prompt = (
        f"Provide 10 movie recommendations that explore new genres or themes while remaining interesting to the user.\n"
        f"User's previously watched movies: {user_movies_string}.\n"
        f"Aim to balance diversity and relevance in the recommendations."
    )
    print(f"Gini input scores:", scores)
    return list(titles), scores, prompt

In [59]:
def gini_index_scores(x):
    x = np.array(x)
    if np.amin(x) < 0:
        x -= np.amin(x)
    x += 1e-6  # avoid zero division
    x_sorted = np.sort(x)
    n = len(x)
    cumx = np.cumsum(x_sorted)
    return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n

# --- Evaluation Loop ---
user_ids_s3 = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s3 = []

for uid in user_ids_s3:
    hist = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == uid]
    titles_all = df_items_ml[df_items_ml['itemId'].isin(hist['itemId'])]['title'].tolist()

    if len(titles_all) < 6:
        continue

    input_titles = titles_all[:5]
    truth = titles_all[5:]

    # Create genre-based candidate set
    user_genres = df_items_ml[df_items_ml['title'].isin(titles_all)]['genres'].str.split('|').explode().value_counts()
    top_genres = user_genres.index[:3]
    mask = df_items_ml['genres'].apply(lambda g: any(genre in g.split('|') for genre in top_genres))
    candidate_movies_df_s3 = df_items_ml[mask & (~df_items_ml['title'].isin(input_titles))].copy()

    # Normalize popularity
    movie_popularity = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df_s3['popularity'] = candidate_movies_df_s3['itemId'].map(movie_popularity).fillna(0)
    max_popularity = candidate_movies_df_s3['popularity'].max()
    candidate_movies_df_s3['normalized_popularity'] = candidate_movies_df_s3['popularity'] / max_popularity
    candidate_movies_df_s3['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df_s3))

    # Downsample
    candidate_movies_df_s3 = candidate_movies_df_s3.sample(min(len(candidate_movies_df_s3), 120), random_state=42)

    # Call updated diversify recommender
    recs_titles, recs_scores, prompt = get_recommendations_s3_diversify(
        hist, df_items_ml, candidate_movies_df_s3
    )

    # Print prompt
    print(f"User {uid} Prompt (S3 Diversify):\n{prompt}\n")

    metrics_s3.append({
        "hit_rate": hit_rate(recs_titles, truth),
        "avg_rank": average_rank(recs_titles, truth),
        "recall@5": recall_at_k(recs_titles, truth, k=5),
        "ndcg@5": ndcg_at_k(recs_titles, truth, k=5),
        "hhi": hhi(recs_titles),
        "entropy": entropy(recs_titles),
        "gini": gini_index_scores(recs_scores)
})


# Show results
df_metrics_s3 = pd.DataFrame(metrics_s3)
print("Average Metrics for First 10 Users (S3 Diversify):")
print(df_metrics_s3.mean())

Gini input scores: [52.220656679037646, 48.900542072707665, 31.75950148061482, 31.58321231380734, 31.178296819442412, 31.031908772642428, 30.893465646647304, 9.353524885957173, 8.256348740202897, 1e-06]
User 19 Prompt (S3 Diversify):
Provide 10 movie recommendations that explore new genres or themes while remaining interesting to the user.
User's previously watched movies: Twelve Monkeys (1995), Taxi Driver (1976), Clerks (1994), Shawshank Redemption, The (1994), Highlander III: The Sorcerer (1994), Jurassic Park (1993), Terminator 2: Judgment Day (1991), Kingpin (1996), Relic, The (1997), Wrong Trousers, The (1993), One Flew Over the Cuckoo's Nest (1975), Army of Darkness (1993), Sting, The (1973), Great Escape, The (1963), Highlander (1986), Beverly Hills Ninja (1997), Fifth Element, The (1997), Phantoms (1998), Doctor Dolittle (1998), Rain Man (1988), Friday the 13th (1980), Gremlins 2: The New Batch (1990), Elizabeth (1998), King Kong (1976), Faculty, The (1998), Office Space (1999

### S4 - Diversify Recommendations

In [67]:
# Recommender S4 Diversify
def get_recommendations_s4_diversify(user_history, df_items, candidate_movies_df, k=10):
    # Prompt: This recommender system suggests movies by emphasizing genre alignment with the user's preferences,
    # while reducing mainstream popularity to encourage diversity. Scores are computed using genre affinity,
    # popularity penalty, and a noise factor for randomness.

    merged = pd.merge(user_history, df_items, on='itemId')

    # Calculate genre preference scores from user's rated history
    genre_scores = {}
    for _, r in merged.iterrows():
        for g in r['genres'].split('|'):
            genre_scores[g] = genre_scores.get(g, 0) + r['rating']

    results = []
    for idx, r in candidate_movies_df.iterrows():
        genres = r['genres'].split('|')
        gs = sum(genre_scores.get(g, 0) for g in genres)  # genre alignment score
        pop_penalty = -np.log1p(r['normalized_popularity'] + 1)  # penalize highly popular items
        noise = np.random.normal(0, 0.2)  # random noise for diversity
        score = gs + 0.8 * pop_penalty + noise + idx * 1e-4  # final score with slight index bias for stability
        results.append((r['title'], score))

    # Return top-k titles and their corresponding scores
    topk = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    titles, scores = zip(*topk)
    return list(titles), list(scores)

In [68]:
# Evaluation of S4 Diversify for first 10 users

user_ids_s4 = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s4 = []

for uid in user_ids_s4:
    hist = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == uid]
    titles_all = df_items_ml[df_items_ml['itemId'].isin(hist['itemId'])]['title'].tolist()
    if len(titles_all) < 6:
        continue

    input_titles = titles_all[:5]
    truth = titles_all[5:]

    top_genres = df_items_ml[df_items_ml['title'].isin(titles_all)]['genres'].str.split('|').explode().value_counts().index[:3]
    candidate_movies_df_s4 = df_items_ml[
        df_items_ml['genres'].apply(lambda g: any(genre in g.split('|') for genre in top_genres)) &
        (~df_items_ml['title'].isin(input_titles))
    ].copy()

    pop_series = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df_s4['popularity'] = candidate_movies_df_s4['itemId'].map(pop_series).fillna(0)
    max_pop = candidate_movies_df_s4['popularity'].max()
    candidate_movies_df_s4['normalized_popularity'] = candidate_movies_df_s4['popularity'] / max_pop
    candidate_movies_df_s4['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df_s4))

    candidate_movies_df_s4 = candidate_movies_df_s4.sample(min(len(candidate_movies_df_s4), 120), random_state=42)

    recs_titles, recs_scores = get_recommendations_s4_diversify(
        hist, df_items_ml, candidate_movies_df_s4
    )

    metrics_s4.append({
        "hit_rate": hit_rate(recs_titles, truth),
        "avg_rank": average_rank(recs_titles, truth),
        "recall": recall_at_k(recs_titles, truth),
        "ndcg": ndcg_at_k(recs_titles, truth),
        "hhi": hhi(recs_titles),
        "entropy": entropy(recs_titles),
        "gini": gini_index_scores(recs_scores)
    })

df_metrics_s4 = pd.DataFrame(metrics_s4)
print("Average Metrics for First 10 Users (S4 Diversify):")
print(df_metrics_s4.mean())

Average Metrics for First 10 Users (S4 Diversify):
hit_rate     0.100000
avg_rank    10.200000
recall       0.000758
ndcg         0.011005
hhi          0.100000
entropy      3.321928
gini         0.048335
dtype: float64


### S5 - Diversify Recommendations

In [75]:
# Prompt function for S5 Surprise
def get_prompt_s5_surprise(input_titles, df_items):
    top_genres = df_items[df_items['title'].isin(input_titles)]['genres'].str.split('|').explode().value_counts().index[:3]
    genres_str = ', '.join(top_genres)
    prompt = f"""
You are a movie recommender. Your task is to recommend films that surprise the user by avoiding mainstream blockbusters.
The user has watched and liked the following movies: {', '.join(input_titles)}.
These seem to fall into the genres: {genres_str}.

Your goal is to suggest 10 lesser-known, high-quality films that are likely to delight the user, but aren't predictable.
Avoid overly popular titles. Favor unique, unconventional, or underrated works.
Do not repeat any of the user's already-watched movies.
For each recommendation, include the title only.
"""
    return prompt.strip()

# Recommender function for S5 Surprise
def get_recommendations_s5_surprise(user_history, df_items, candidate_movies_df, k=10):
    # 1) Genre weights (we'll downweight this for surprise)
    merged = pd.merge(user_history, df_items, on='itemId')
    genre_scores = {}
    for _, r in merged.iterrows():
        for g in r['genres'].split('|'):
            genre_scores[g] = genre_scores.get(g, 0) + r['rating']

    # 2) Score candidates
    results = []
    for idx, r in candidate_movies_df.iterrows():
        genres = r['genres'].split('|')
        gs = sum(genre_scores.get(g, 0) for g in genres)

        # Heavily penalize popularity (to promote obscure films)
        pop_penalty = -2.0 * np.log1p(r['normalized_popularity'] + 1e-6)

        # Strong noise to introduce surprise
        noise = np.random.normal(0, 0.6)

        # Final score: de-emphasize genre score, maximize surprise
        score = 0.5 * gs + pop_penalty + noise + idx * 1e-4
        results.append((r['title'], score))

    # 3) Take top-k
    topk = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    titles, scores = zip(*topk)
    return list(titles), list(scores)


In [76]:
user_ids_s5 = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s5 = []
all_recommendations_s5 = []  # To collect all recommended titles for global entropy

for uid in user_ids_s5:
    hist = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == uid]
    titles_all = df_items_ml[df_items_ml['itemId'].isin(hist['itemId'])]['title'].tolist()
    if len(titles_all) < 6:
        continue

    input_titles = titles_all[:5]
    truth = titles_all[5:]

    # Sample from broader pool — no strict genre filtering for surprise
    candidate_movies_df_s5 = df_items_ml[~df_items_ml['title'].isin(input_titles)].copy()

    # Normalize and noise popularity
    pop_series = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df_s5['popularity'] = candidate_movies_df_s5['itemId'].map(pop_series).fillna(0)
    max_pop = candidate_movies_df_s5['popularity'].max()
    candidate_movies_df_s5['normalized_popularity'] = candidate_movies_df_s5['popularity'] / max_pop
    candidate_movies_df_s5['normalized_popularity'] += np.random.normal(0, 0.01, size=len(candidate_movies_df_s5))

    candidate_movies_df_s5 = candidate_movies_df_s5.sample(min(len(candidate_movies_df_s5), 120), random_state=42)

    recs_titles, recs_scores = get_recommendations_s5_surprise(
        hist, df_items_ml, candidate_movies_df_s5
    )

    # Append to global recommendation list
    all_recommendations_s5.extend(recs_titles)

    # Per-user metrics
    metrics_s5.append({
        "hit_rate": hit_rate(recs_titles, truth),
        "avg_rank": average_rank(recs_titles, truth),
        "recall": recall_at_k(recs_titles, truth),
        "ndcg": ndcg_at_k(recs_titles, truth),
        "hhi": hhi(recs_titles),
        "entropy": entropy(recs_titles),  # This is still per-user entropy
        "gini": gini_index_scores(recs_scores)
    })

# Convert and print average per-user metrics
df_metrics_s5 = pd.DataFrame(metrics_s5)
print("Average Metrics for First 10 Users (S5 Surprise):")
print(df_metrics_s5.mean())

# Compute and print system-level entropy
print("\nSystem-Level Entropy Across All S5 Recommendations:")
print(entropy(all_recommendations_s5))


Average Metrics for First 10 Users (S5 Surprise):
hit_rate    0.300000
avg_rank    9.400000
recall      0.007873
ndcg        0.036211
hhi         0.100000
entropy     3.321928
gini        0.081743
dtype: float64

System-Level Entropy Across All S5 Recommendations:
5.23981990983565


### S6 - Motivate Reasoning

In [80]:
def get_recommendations_s6_motivate(user_history, df_items, candidate_movies_df, k=10, movie_popularity=None):
    merged = pd.merge(user_history, df_items, on='itemId')

    user_mean_rating = user_history['rating'].mean()
    genre_scores = {}
    director_scores = {}
    genre_count = {}

    # Build user movie string for prompt
    watched_titles = df_items[df_items['itemId'].isin(user_history['itemId'])]['title'].tolist()
    user_movies_string = ", ".join(watched_titles)

    for _, row in merged.iterrows():
        centered_rating = row['rating'] - user_mean_rating
        for genre in row['genres'].split('|'):
            genre_scores[genre] = genre_scores.get(genre, 0) + centered_rating
            genre_count[genre] = genre_count.get(genre, 0) + 1
        director = row.get('director', None)
        if director:
            director_scores[director] = director_scores.get(director, 0) + centered_rating

    results = []
    for idx, row in candidate_movies_df.iterrows():
        movie_genres = row['genres'].split('|')
        director = row.get('director', 'Unknown')

        genre_match_score = sum(genre_scores.get(g, 0) for g in movie_genres)
        director_score = director_scores.get(director, 0)
        genre_match_count = sum(1 for g in movie_genres if g in genre_scores)

        # Exploration bonus: niche genres seen rarely by the user
        exploration_bonus = sum(1 / (1 + genre_count.get(g, 1)) for g in movie_genres)

        # Total motivation-focused score
        score = 0.6 * genre_match_score + 0.2 * director_score + 0.1 * genre_match_count + 0.1 * exploration_bonus

        # Rationale generation
        genre_matches = [f"{g} (score: {genre_scores[g]:.1f})" for g in movie_genres if g in genre_scores]
        director_match = f"frequent director: {director} (score: {director_score:.1f})" if director_score > 0 else ""
        rationale_parts = genre_matches + ([director_match] if director_match else [])

        if rationale_parts:
            if len(genre_matches) >= 2:
                rationale = f"'{row['title']}' aligns well with your favorite genres: {', '.join(genre_matches)}."
            else:
                rationale = f"Based on your preferences in {', '.join(rationale_parts)}, we recommend '{row['title']}'."
        else:
            rationale = f"'{row['title']}' is suggested to broaden your movie taste."

        results.append((row['title'], score, rationale))

    topk = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    titles, scores, rationales = zip(*topk)

    # Construct prompt
    prompt = f"Provide 10 carefully selected movie recommendations, each accompanied by a rationale explaining its suitability for the user’s preferences. The user has previously enjoyed the following movies: {user_movies_string}"

    return list(titles), list(scores), list(rationales), prompt

In [81]:
def recall(predictions, ground_truth, k=10):
    if not ground_truth:
        return 0.0
    hits = len(set(predictions[:k]) & set(ground_truth))
    return hits / len(ground_truth)

def ndcg(predictions, ground_truth, k=10):
    dcg = 0.0
    for i, p in enumerate(predictions[:k]):
        if p in ground_truth:
            dcg += 1.0 / np.log2(i + 2)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(ground_truth), k)))
    return dcg / idcg if idcg > 0 else 0.0

user_ids_s6 = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s6 = []
all_recommendations_s6 = []

for uid in user_ids_s6:
    hist = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == uid]
    titles_all = df_items_ml[df_items_ml['itemId'].isin(hist['itemId'])]['title'].tolist()
    if len(titles_all) < 6:
        continue

    input_titles = titles_all[:5]
    truth = titles_all[5:]

    # Broader pool without genre constraints
    candidate_movies_df_s6 = df_items_ml[~df_items_ml['title'].isin(input_titles)].copy()

    # Normalize popularity
    pop_series = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df_s6['popularity'] = candidate_movies_df_s6['itemId'].map(pop_series).fillna(0)
    max_pop = candidate_movies_df_s6['popularity'].max()
    candidate_movies_df_s6['normalized_popularity'] = candidate_movies_df_s6['popularity'] / max_pop

    # Downsample for performance
    candidate_movies_df_s6 = candidate_movies_df_s6.sample(min(len(candidate_movies_df_s6), 120), random_state=42)

    recs_titles, recs_scores, recs_rationales, prompt = get_recommendations_s6_motivate(
    hist, df_items_ml, candidate_movies_df_s6
    )

    all_recommendations_s6.extend(recs_titles)

    # Metrics
    metrics_s6.append({
        "hit_rate": hit_rate(recs_titles, truth),
        "avg_rank": average_rank(recs_titles, truth),
        "hhi": hhi(recs_titles),
        "recall": recall(recs_titles, truth),
        "ndcg": ndcg(recs_titles, truth),
        "entropy": entropy(recs_titles),
        "gini": gini_index(recs_scores)
    })

# Report metrics
df_metrics_s6 = pd.DataFrame(metrics_s6)
print("Average Metrics for First 10 Users (S6 Motivate Reasoning):")
print(df_metrics_s6.mean())

print("\nSystem-Level Entropy Across All S6 Recommendations:")
entropy(all_recommendations_s6)

Calculating Gini Index...
Gini Index: 0.400000
Calculating Gini Index...
Gini Index: 0.088889
Calculating Gini Index...
Gini Index: 0.257143
Calculating Gini Index...
Gini Index: 0.150000
Calculating Gini Index...
Gini Index: 0.400000
Calculating Gini Index...
Gini Index: 0.088889
Calculating Gini Index...
Gini Index: 0.088889
Calculating Gini Index...
Gini Index: 0.300000
Calculating Gini Index...
Gini Index: 0.000000
Calculating Gini Index...
Gini Index: 0.150000
Average Metrics for First 10 Users (S6 Motivate Reasoning):
hit_rate     0.100000
avg_rank    10.600000
hhi          0.100000
recall       0.003226
ndcg         0.007336
entropy      3.321928
gini         0.192381
dtype: float64

System-Level Entropy Across All S6 Recommendations:


6.046111814666561

### S7 - Chain of Thought (COT)

In [None]:
# --- LLM Recommendation Function ---
def get_recommendations_s7_cot(user_history, df_items, candidate_df, tokenizer, model, device, k=10):
    watched_titles = df_items[df_items['itemId'].isin(user_history['itemId'])]['title'].tolist()
    user_movies_str = ", ".join(watched_titles[:5])

    prompt = (
        f"The user has watched the following movies: {user_movies_str}. "
        f"Please recommend {k} real movie titles that match their taste. "
        f"List them clearly like: 1. Title - Reason"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.9,
            do_sample=True,
            top_p=0.95
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    lines = decoded.split("\n")
    recommended_titles = []
    for line in lines:
        if "." in line:
            try:
                title_reason = line.split(".", 1)[1].strip()
                if "-" in title_reason:
                    title = title_reason.split("-", 1)[0].strip()
                    recommended_titles.append(title)
            except IndexError:
                continue

    valid_titles = candidate_df['title'].tolist()
    filtered_titles = [title for title in recommended_titles if title in valid_titles]

    return filtered_titles[:k], decoded

In [99]:
# --- Evaluation Loop ---
user_ids = test_data_ml1m_fullInteraction['userId'].unique()[:10]
metrics_s7 = []
all_recommendations_s7 = []
item_exposure_scores = defaultdict(float)

for uid in user_ids:
    hist = test_data_ml1m_fullInteraction[test_data_ml1m_fullInteraction['userId'] == uid]
    titles_all = df_items_ml[df_items_ml['itemId'].isin(hist['itemId'])]['title'].tolist()
    if len(titles_all) < 6:
        continue

    input_titles = titles_all[:5]
    truth = titles_all[5:]
    candidate_movies_df_s7 = df_items_ml[~df_items_ml['title'].isin(input_titles)].copy()

    pop_series = test_data_ml1m_fullInteraction['itemId'].value_counts()
    candidate_movies_df_s7['popularity'] = candidate_movies_df_s7['itemId'].map(pop_series).fillna(0)
    candidate_movies_df_s7['normalized_popularity'] = candidate_movies_df_s7['popularity'] / candidate_movies_df_s7['popularity'].max()

    recs_titles, full_response = get_recommendations_s7_cot(
        hist, df_items_ml, candidate_movies_df_s7, tokenizer, model, device
    )

    print(f"\nUser {uid}")
    print(f"Ground truth: {truth}")
    print(f"Recommendations: {recs_titles}")

    # Simulated exposure score per rank
    for i, title in enumerate(recs_titles):
        score = 1.0 - 0.05 * i  # Decreasing by rank
        item_exposure_scores[title] += score

    # Accumulate for entropy
    all_recommendations_s7.extend(recs_titles)

    metrics_s7.append({
        "hit_rate": hit_rate(recs_titles, truth),
        "avg_rank": average_rank(recs_titles, truth),
        "recall": recall(recs_titles, truth),
        "ndcg": ndcg(recs_titles, truth),
        "hhi": hhi(recs_titles),
        "entropy": entropy(recs_titles)
        # Gini not included per user
    })

# --- Metrics Summary ---
df_metrics_s7 = pd.DataFrame(metrics_s7)
print("\nAverage Metrics for First 10 Users (S7 COT):")
print(df_metrics_s7.mean())

print("\nSystem-Level Entropy Across All S7 Recommendations:")
print(entropy(all_recommendations_s7))

print("\nSystem-Level Gini Index Across All S7 Recommendations:")
gini_scores = list(item_exposure_scores.values())
print(gini_index_scores(gini_scores))


User 19
Ground truth: ['Jurassic Park (1993)', 'Terminator 2: Judgment Day (1991)', 'Kingpin (1996)', 'Relic, The (1997)', 'Wrong Trousers, The (1993)', "One Flew Over the Cuckoo's Nest (1975)", 'Army of Darkness (1993)', 'Sting, The (1973)', 'Great Escape, The (1963)', 'Highlander (1986)', 'Beverly Hills Ninja (1997)', 'Fifth Element, The (1997)', 'Phantoms (1998)', 'Doctor Dolittle (1998)', 'Rain Man (1988)', 'Friday the 13th (1980)', 'Gremlins 2: The New Batch (1990)', 'Elizabeth (1998)', 'King Kong (1976)', 'Faculty, The (1998)', 'Office Space (1999)', 'Mod Squad, The (1999)', 'Out-of-Towners, The (1999)', 'eXistenZ (1999)', 'Son of Frankenstein (1939)', 'Son of Dracula (1943)', 'Lake Placid (1999)', 'Haunting, The (1999)', 'Inspector Gadget (1999)', 'Pit and the Pendulum (1961)', 'Outside Providence (1999)', 'American Beauty (1999)', 'Drive Me Crazy (1999)', "Ferris Bueller's Day Off (1986)", 'Fight Club (1999)', 'Licence to Kill (1989)', 'Man on the Moon (1999)', 'Galaxy Quest (