# Test notebook

# User-User Collaborative Filtering

In [None]:
import os
import pickle
import sys
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import hnswlib
from joblib import Parallel, delayed
import optuna

# --- Logging Configuration ---
class FlushStreamHandler(logging.StreamHandler):
    """Stream handler that flushes its stream after every record."""

    def emit(self, record):
        """Format ``record``, write it followed by the terminator, then flush.

        Any exception raised while formatting, writing or flushing is passed
        to ``handleError`` instead of propagating to the caller.
        """
        try:
            self.stream.write(f"{self.format(record)}{self.terminator}")
            self.flush()
        except Exception:
            self.handleError(record)

LOG_FILENAME = "debug.log"
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Detach AND close handlers left over from a previous run of this cell.
# Clearing the handler list alone leaves the old log file handle open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
# Console output, flushed after every record.
stream_handler = FlushStreamHandler(sys.stdout)
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
# File output; mode="w" truncates the log each time this cell runs.
file_handler = logging.FileHandler(LOG_FILENAME, mode="w")
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
logger.addHandler(file_handler)
logger.info("Logging configured to INFO level with immediate flushing.")

# --- Global Settings ---
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
RELEVANCE_THRESHOLD = 7.0    # For ranking evaluation
RANKING_SAMPLE_FRACTION = 0.2  # Evaluate ranking metrics on 20% of test users
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)
TOP_N = 10  # Top-N recommendations
# Graph connectivity passed as M to build_hnsw_index by run_experiment.
# It was referenced there but never defined, so every trial raised NameError.
# 16 is hnswlib's documented default for M.
HNSW_M = 16

# --- Caching Helper Functions ---
def cache_path(filename):
    """Return the path of ``filename`` inside ``CACHE_DIR``."""
    location = os.path.join(CACHE_DIR, filename)
    return location

def save_cache(obj, filename):
    """Pickle ``obj`` to ``filename`` in the cache directory and log the path."""
    destination = cache_path(filename)
    with open(destination, "wb") as sink:
        pickle.dump(obj, sink)
    logger.info(f"Saved cache to {destination}")

def load_cache(filename):
    """Unpickle and return the object stored as ``filename`` in the cache directory.

    Unpickling can execute arbitrary code, so only use this on cache files
    you trust.
    """
    source = cache_path(filename)
    with open(source, "rb") as stream:
        restored = pickle.load(stream)
    logger.info(f"Loaded cache from {source}")
    return restored

# --- Data Loading and Preprocessing ---
def load_ratings(file_path):
    """Read the ratings CSV at ``file_path`` into a DataFrame and log its shape."""
    logger.info(f"Loading ratings from {file_path} ...")
    ratings = pd.read_csv(file_path)
    logger.info(f"Ratings shape: {ratings.shape}")
    return ratings

def load_anime_details(file_path):
    """Read the anime-details CSV at ``file_path`` into a DataFrame and log its shape."""
    logger.info(f"Loading anime details from {file_path} ...")
    details = pd.read_csv(file_path)
    logger.info(f"Anime details shape: {details.shape}")
    return details

def build_rating_matrix(ratings_df):
    """Return the user x anime rating matrix as a float32 DataFrame.

    Rows are ``user_id``, columns are ``anime_id``, values are ``score``;
    missing cells are filled with 0.

    NOTE: the cache file name does not depend on ``ratings_df``. Once
    ``rating_matrix.pkl`` exists in the cache directory it is returned as-is,
    whatever ratings are passed in.
    """
    cache_file = "rating_matrix.pkl"
    if not os.path.exists(cache_path(cache_file)):
        logger.info("Building rating matrix ...")
        wide = ratings_df.pivot(index='user_id', columns='anime_id', values='score')
        rating_matrix = wide.fillna(0).astype(np.float32)
        save_cache(rating_matrix, cache_file)
    else:
        rating_matrix = load_cache(cache_file)
    logger.info(f"Rating matrix shape: {rating_matrix.shape}")
    return rating_matrix

# --- Dimensionality Reduction and SVD Caching ---
def reduce_dimensionality(rating_matrix, n_components):
    """Project users into an ``n_components``-dimensional latent space.

    Fits ``TruncatedSVD`` on the rating matrix values and L2-normalises each
    row of the result. Both the latent matrix and the fitted model are cached
    under names keyed by ``n_components`` only, and are loaded from the cache
    when both files exist.

    Returns:
        ``(latent_matrix, svd_model)``.
    """
    cache_latent = f"latent_matrix_{n_components}.pkl"
    cache_svd = f"svd_model_{n_components}.pkl"
    cached = all(os.path.exists(cache_path(name)) for name in (cache_latent, cache_svd))
    if not cached:
        logger.info(f"Performing dimensionality reduction using TruncatedSVD with {n_components} components ...")
        svd_model = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)
        projected = svd_model.fit_transform(rating_matrix.values)
        latent_matrix = normalize(projected, norm='l2')
        save_cache(latent_matrix, cache_latent)
        save_cache(svd_model, cache_svd)
    else:
        latent_matrix = load_cache(cache_latent)
        svd_model = load_cache(cache_svd)
    logger.info(f"Latent matrix shape: {latent_matrix.shape}")
    return latent_matrix, svd_model

# --- HNSW Index Construction with Caching ---
def build_hnsw_index(latent_matrix, ef_construction, M, ef):
    """Build, or load from cache, a cosine-space HNSW index over ``latent_matrix``.

    Args:
        latent_matrix: 2-D array with one row per indexed vector.
        ef_construction: hnswlib build-time parameter.
        M: hnswlib build-time graph connectivity parameter.
        ef: query-time parameter, applied with ``set_ef`` on every call.

    Returns:
        The ready-to-query ``hnswlib.Index``.

    The cache file name covers every parameter that shapes the stored graph:
    latent dimension, ``ef_construction`` and ``M``. ``M`` used to be missing,
    so an index built with a different ``M`` was silently reused. ``ef`` is
    left out because it only affects queries and is re-applied after loading.
    The name does not reflect the data itself, so clear the cache when the
    latent matrix changes.
    """
    num_elements, dim = latent_matrix.shape
    index_file = cache_path(f"hnsw_index_{dim}_{ef_construction}_{M}.bin")
    index = hnswlib.Index(space='cosine', dim=dim)
    if os.path.exists(index_file):
        logger.info("Loading existing HNSW index from cache ...")
        # load_index initialises the index from the file, so no init_index call is needed.
        index.load_index(index_file, max_elements=num_elements)
    else:
        logger.info("Building new HNSW index ...")
        index.init_index(max_elements=num_elements, ef_construction=ef_construction, M=M)
        index.add_items(latent_matrix)
        index.save_index(index_file)
        logger.info(f"HNSW index saved to {index_file}")
    index.set_ef(ef)
    return index

# --- Prediction Functions Using ANN (Existing Users) ---
def predict_rating_approx(user_idx, rating_matrix, latent_matrix, hnsw_index, target_item, global_mean, k_neighbors):
    """Predict one user's rating of one item from approximate nearest neighbours.

    The user's row of ``latent_matrix`` is used to query ``hnsw_index`` for
    ``k_neighbors`` rows. Similarity is ``1 - distance``. The prediction is the
    similarity-weighted mean of the neighbours' non-zero ratings for column
    ``target_item``, normalised by the sum of absolute similarities.

    Returns ``global_mean`` when no neighbour has a non-zero rating for the
    item, or when the absolute similarities of the rating neighbours sum to zero.
    """
    labels, dists = hnsw_index.knn_query(latent_matrix[user_idx].reshape(1, -1), k=k_neighbors)
    neighbour_rows = labels[0]
    sims = 1 - dists[0]
    item_ratings = rating_matrix.values[neighbour_rows, target_item]
    rated = item_ratings != 0
    if not rated.any():
        return global_mean
    weights = sims[rated]
    weight_total = np.abs(weights).sum()
    if weight_total == 0:
        return global_mean
    return np.dot(weights, item_ratings[rated]) / weight_total

def get_top_n_recommendations_approx(user_idx, rating_matrix, latent_matrix, hnsw_index, anime_ids, global_mean, k_neighbors, top_n):
    """Return the ``top_n`` highest predicted ``(anime_id, rating)`` pairs for a user.

    Only items the user has not rated (0 in their row of ``rating_matrix``)
    are scored. Each score is the similarity-weighted mean of the neighbours'
    non-zero ratings, falling back to ``global_mean`` when no neighbour with
    non-zero similarity rated the item.

    The neighbour set depends only on the user, so the k-NN query runs once
    and all items are scored with two matrix products, instead of repeating
    the same query for every item. The per-100-items INFO progress log was
    dropped along with the per-item loop it reported on.

    Args:
        user_idx: Row position of the user in ``rating_matrix``/``latent_matrix``.
        rating_matrix: DataFrame of ratings, 0 meaning "not rated".
        latent_matrix: Array of user vectors used to query ``hnsw_index``.
        hnsw_index: Object exposing ``knn_query(query, k=...)``.
        anime_ids: Item labels, positionally aligned with the matrix columns.
        global_mean: Fallback prediction.
        k_neighbors: Number of neighbours to query.
        top_n: Number of recommendations to return.
    """
    ratings = rating_matrix.values
    n_items = len(anime_ids)

    query = latent_matrix[user_idx].reshape(1, -1)
    neighbor_ids, distances = hnsw_index.knn_query(query, k=k_neighbors)
    similarities = 1 - distances[0]
    neighbor_ratings = ratings[neighbor_ids[0]]  # shape (k, n_items)

    # Unrated cells are 0, so they add nothing to the weighted sums.
    rated = (neighbor_ratings != 0).astype(similarities.dtype)
    weighted_sums = similarities @ neighbor_ratings
    weight_totals = np.abs(similarities) @ rated

    predicted = np.full(ratings.shape[1], global_mean, dtype=np.float64)
    usable = weight_totals > 0
    predicted[usable] = weighted_sums[usable] / weight_totals[usable]

    unrated = np.flatnonzero(ratings[user_idx][:n_items] == 0)
    candidates = [(anime_ids[j], predicted[j]) for j in unrated]
    # sorted() is stable, so ties keep item order.
    return sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

# --- New User Recommendation Functions ---
def build_new_user_vector(new_user_df, anime_ids):
    """Lay a new user's scores out in the column order given by ``anime_ids``.

    Args:
        new_user_df: DataFrame with ``anime_id`` and ``score`` columns.
        anime_ids: Sequence of item ids defining the output positions.

    Returns:
        float32 array of ``len(anime_ids)``. Each position holds the user's
        score for that id, or 0 if the user has no row for it. When an id
        appears more than once in ``new_user_df`` the first score wins.
    """
    # One pass builds an id -> score lookup. The previous version rescanned
    # the whole DataFrame for every entry of anime_ids.
    first_score = {}
    for anime, score in zip(new_user_df['anime_id'].values, new_user_df['score'].values):
        first_score.setdefault(anime, score)

    new_user_vector = np.zeros(len(anime_ids), dtype=np.float32)
    for idx, anime in enumerate(anime_ids):
        if anime in first_score:
            new_user_vector[idx] = first_score[anime]
    return new_user_vector

def predict_rating_new_user_approx(new_user_vector, rating_matrix, latent_matrix, hnsw_index, target_item, global_mean, k_neighbors):
    """Predict a new user's rating of ``target_item`` from approximate neighbours.

    ``new_user_vector`` is L2-normalised (with a 1e-10 guard against a zero
    norm) and sent as-is to ``hnsw_index.knn_query``, so it must already be
    expressed in whatever space the index holds. ``latent_matrix`` is accepted
    but not used here.

    The prediction is the similarity-weighted mean (similarity = 1 - distance)
    of the neighbours' non-zero ratings for the item. ``global_mean`` is
    returned when there are no such ratings or the absolute similarities of
    the rating neighbours sum to zero.
    """
    scale = np.linalg.norm(new_user_vector) + 1e-10
    query = (new_user_vector / scale).reshape(1, -1)
    labels, dists = hnsw_index.knn_query(query, k=k_neighbors)
    neighbour_rows = labels[0]
    sims = 1 - dists[0]
    item_ratings = rating_matrix.values[neighbour_rows, target_item]
    rated = item_ratings != 0
    if not rated.any():
        return global_mean
    weights = sims[rated]
    weight_total = np.abs(weights).sum()
    if weight_total == 0:
        return global_mean
    return np.dot(weights, item_ratings[rated]) / weight_total

def recommend_new_user_approx(new_user_df, model, top_n, k_neighbors):
    """Recommend ``top_n`` items to a user who is not in the rating matrix.

    Args:
        new_user_df: DataFrame with ``anime_id`` and ``score`` columns.
        model: Dict with keys ``anime_ids``, ``rating_matrix``, ``global_mean``,
            ``svd_model``, ``hnsw_index`` and ``latent_matrix``.
        top_n: Number of recommendations to return.
        k_neighbors: Number of neighbours queried per prediction.

    Returns:
        List of ``(anime_id, predicted_rating)`` sorted by rating, descending,
        covering only items the new user has not scored.
    """
    anime_ids = model['anime_ids']
    rating_matrix = model['rating_matrix']
    global_mean = model['global_mean']
    hnsw_index = model['hnsw_index']

    new_user_vector = build_new_user_vector(new_user_df, anime_ids)
    # Project the raw item-space vector with the fitted SVD and L2-normalise
    # it, mirroring how the rows of the latent matrix were produced.
    projected = model['svd_model'].transform(new_user_vector.reshape(1, -1))
    new_user_latent = normalize(projected, norm='l2')[0]

    predictions = {}
    for idx, anime in enumerate(anime_ids):
        if new_user_vector[idx] == 0:
            # Query with the latent vector. The raw rating vector was passed
            # here before, which has one entry per item rather than one per
            # SVD component and so does not match the latent-space index.
            predictions[anime] = predict_rating_new_user_approx(
                new_user_latent, rating_matrix, model['latent_matrix'],
                hnsw_index, idx, global_mean, k_neighbors)
    top_n_recs = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return top_n_recs

# --- Recommendation Enrichment Function ---
def enrich_recommendations(recommendations, anime_details_df):
    """Attach ``title`` and ``english_title`` to ``(anime_id, rating)`` pairs.

    Left-joins the recommendations onto ``anime_details_df`` by matching
    ``anime_id`` against its ``id`` column, so recommendations without a
    details row are kept with missing titles. The helper ``id`` column is
    dropped from the result.
    """
    recs_df = pd.DataFrame(recommendations, columns=["anime_id", "predicted_rating"])
    titles = anime_details_df[['id', 'title', 'english_title']]
    merged = recs_df.merge(titles, how='left', left_on='anime_id', right_on='id')
    return merged.drop(columns=['id'])

# --- Evaluation Metric Functions ---
def rmse(y_true, y_pred):
    """Return the root-mean-square error between two equal-length sequences."""
    residuals = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.mean(residuals ** 2))

def mae(y_true, y_pred):
    """Return the mean absolute error between two equal-length sequences."""
    residuals = np.array(y_true) - np.array(y_pred)
    return np.mean(np.abs(residuals))

def precision_at_k(recommended, relevant, k=10):
    """Return the share of the ``k`` slots filled by relevant items.

    ``recommended`` is a list of ``(item, score)`` pairs and ``relevant`` a
    set of items. The hit count is divided by ``k`` even when fewer than
    ``k`` items were recommended; an empty list gives 0.0.
    """
    top_items = [item for item, _ in recommended][:k]
    if len(top_items) == 0:
        return 0.0
    matched = set(top_items) & relevant
    return len(matched) / k

def recall_at_k(recommended, relevant, k=10):
    """Return the share of relevant items found in the first ``k`` recommendations.

    ``recommended`` is a list of ``(item, score)`` pairs and ``relevant`` a
    set of items; an empty ``relevant`` set gives 0.0.
    """
    top_items = [item for item, _ in recommended][:k]
    if not relevant:
        return 0.0
    matched = set(top_items) & relevant
    return len(matched) / len(relevant)

def ndcg_at_k(recommended, relevant, k=10):
    """Return binary-relevance NDCG over the first ``k`` recommendations.

    A relevant item at zero-based rank ``r`` contributes ``1 / log2(r + 2)``.
    The ideal ranking places ``min(len(relevant), k)`` relevant items first.
    Returns 0.0 when the ideal DCG is zero.
    """
    gains = (
        1 / np.log2(rank + 2)
        for rank, (item, _) in enumerate(recommended[:k])
        if item in relevant
    )
    dcg = sum(gains, 0.0)
    n_ideal = min(len(relevant), k)
    idcg = sum(1 / np.log2(rank + 2) for rank in range(n_ideal))
    if idcg > 0:
        return dcg / idcg
    return 0.0

# --- Parallel Ranking Metrics Evaluation Function ---
def compute_ranking_metrics_for_user(user, group, rating_matrix, latent_matrix, hnsw_index, anime_ids, global_mean, k_neighbors, top_n, relevance_threshold):
    """Compute precision, recall and NDCG at ``top_n`` for one test user.

    ``group`` holds the user's test rows; items scored at or above
    ``relevance_threshold`` count as relevant.

    Returns:
        ``(precision, recall, ndcg)``, or ``None`` when the user is absent
        from ``rating_matrix`` or has no relevant test items.
    """
    known_users = rating_matrix.index
    if user not in known_users:
        return None
    liked = group.loc[group['score'] >= relevance_threshold, 'anime_id']
    relevant_items = set(liked)
    if not relevant_items:
        return None
    user_idx = np.flatnonzero(known_users == user)[0]
    recs = get_top_n_recommendations_approx(
        user_idx, rating_matrix, latent_matrix, hnsw_index,
        anime_ids, global_mean, k_neighbors, top_n)
    return (
        precision_at_k(recs, relevant_items, k=top_n),
        recall_at_k(recs, relevant_items, k=top_n),
        ndcg_at_k(recs, relevant_items, k=top_n),
    )

# --- Function to Run a Single Experiment ---
def run_experiment(params, train_df, test_df, rating_matrix):
    """Fit the SVD + HNSW user-user model and evaluate it on ``test_df``.

    Args:
        params: Dict with ``svd_n_components``, ``hnsw_ef_construction``,
            ``hnsw_ef`` and ``k_neighbors``.
        train_df: Training ratings; only used for the global mean score.
        test_df: Held-out ratings with ``user_id``, ``anime_id``, ``score``.
        rating_matrix: User x anime DataFrame built from the training data.

    Returns:
        ``(metrics, latent_matrix, svd_model, hnsw_index)``. ``metrics`` has
        keys ``rmse``, ``mae``, ``precision_at_10``, ``recall_at_10`` and
        ``ndcg_at_10``; a value is ``None`` when nothing could be evaluated.
    """
    global_mean = train_df['score'].mean()
    k_neighbors = params["k_neighbors"]
    latent_matrix, svd_model = reduce_dimensionality(rating_matrix, n_components=params["svd_n_components"])
    # HNSW_M is read from module scope; it must be defined before this runs.
    hnsw_index = build_hnsw_index(latent_matrix, ef_construction=params["hnsw_ef_construction"], M=HNSW_M, ef=params["hnsw_ef"])

    # Pointwise metrics. Resolve every test row to matrix positions in one
    # pass (-1 marks ids absent from the training matrix) instead of scanning
    # the whole index for each row. get_indexer needs unique labels, which
    # the pivot that builds the matrix guarantees.
    user_pos = rating_matrix.index.get_indexer(test_df['user_id'].to_numpy())
    item_pos = rating_matrix.columns.get_indexer(test_df['anime_id'].to_numpy())
    known = (user_pos >= 0) & (item_pos >= 0)
    y_true = test_df['score'].to_numpy(dtype=float)[known].tolist()
    y_pred = [
        predict_rating_approx(u, rating_matrix, latent_matrix, hnsw_index, i, global_mean, k_neighbors=k_neighbors)
        for u, i in zip(user_pos[known], item_pos[known])
    ]
    rmse_val = rmse(y_true, y_pred) if y_true else None
    mae_val = mae(y_true, y_pred) if y_true else None

    # Ranking metrics on a random sample of test users.
    test_groups = list(test_df.groupby('user_id'))
    np.random.shuffle(test_groups)
    sample_size = int(len(test_groups) * RANKING_SAMPLE_FRACTION)
    sampled_groups = test_groups[:sample_size]
    results = Parallel(n_jobs=-1)(delayed(compute_ranking_metrics_for_user)(
        user, group, rating_matrix, latent_matrix, hnsw_index, rating_matrix.columns.values, global_mean,
        k_neighbors, TOP_N, RELEVANCE_THRESHOLD
    ) for user, group in sampled_groups)
    valid_results = [res for res in results if res is not None]
    if valid_results:
        precisions, recalls, ndcgs = zip(*valid_results)
        precision_val = np.mean(precisions)
        recall_val = np.mean(recalls)
        ndcg_val = np.mean(ndcgs)
    else:
        precision_val = recall_val = ndcg_val = None

    metrics = {"rmse": rmse_val, "mae": mae_val, "precision_at_10": precision_val,
               "recall_at_10": recall_val, "ndcg_at_10": ndcg_val}
    return metrics, latent_matrix, svd_model, hnsw_index

# --- Hyperparameter Optimization with Optuna ---
def objective(trial):
    """Optuna objective: run one user-user experiment and return its test RMSE.

    Suggests ``svd_n_components``, ``k_neighbors``, ``hnsw_ef`` and
    ``hnsw_ef_construction``, reloads and splits the ratings, and evaluates
    the model. MAE and the ranking metrics are stored as trial user
    attributes. Returns ``inf`` when the experiment raises or produces no
    RMSE, so a failed trial never wins the minimisation.
    """
    # Suggest hyperparameters.
    params = {
        "svd_n_components": trial.suggest_categorical("svd_n_components", [50, 100, 200, 250, 300]),
        "k_neighbors": trial.suggest_categorical("k_neighbors", [15, 20, 30, 50]),
        "hnsw_ef": trial.suggest_categorical("hnsw_ef", [50, 100, 150]),
        "hnsw_ef_construction": trial.suggest_categorical("hnsw_ef_construction", [100, 200, 300]),
    }
    logger.info(f"Trial with parameters: {params}")

    # Load ratings and split data.
    ratings_file = "data_new/animelist.csv"
    ratings_df = load_ratings(ratings_file)
    train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=RANDOM_STATE)
    rating_matrix = build_rating_matrix(train_df)

    try:
        metrics = run_experiment(params, train_df, test_df, rating_matrix)[0]
    except Exception as e:
        # Keep the study running, but log the full traceback. A one-line
        # message hides programming errors behind a silent `inf`.
        logger.exception(f"Error in trial: {e}")
        return float('inf')

    # Record additional metrics as user attributes.
    for name in ("mae", "precision_at_10", "recall_at_10", "ndcg_at_10"):
        trial.set_user_attr(name, metrics[name])

    if metrics["rmse"] is None:
        return float('inf')
    logger.info(f"Trial metrics: {metrics}")
    return metrics["rmse"]

def main_experiment():
    """Run a 20-trial Optuna study and write one CSV row per trial.

    The CSV (``experiment_results.csv``) holds the model name, RMSE, the
    metrics stored as trial user attributes, and the trial parameters. The
    best trial is logged at the end.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)  # Adjust number of trials as needed.

    # Collect one row per trial, keeping the column order stable.
    attr_names = ("mae", "precision_at_10", "recall_at_10", "ndcg_at_10")
    rows = []
    for trial in study.trials:
        row = {"model_name": "UserUser_ANN", "rmse": trial.value}
        row.update((name, trial.user_attrs.get(name)) for name in attr_names)
        row["parameters"] = str(trial.params)
        rows.append(row)

    results_csv = "experiment_results.csv"
    pd.DataFrame(rows).to_csv(results_csv, index=False)
    logger.info(f"Saved experiment results to {results_csv}")
    best = study.best_trial
    logger.info(f"Best trial: {best.params} with RMSE: {best.value}")



In [None]:
# Run the Optuna study and write its per-trial results to CSV.
main_experiment()

In [None]:
def recommend_for_new_user():
    """Load the saved user-user model and log top-10 recommendations for a new user.

    Reads the new user's ratings from ``data_new/user_data.csv``, the anime
    details from ``data_new/anime_details_v1.csv`` and the pickled model from
    ``models/user_user_cf_model_approx.pkl``.
    """
    model_path = 'models/user_user_cf_model_approx.pkl'
    new_user_df = load_ratings("data_new/user_data.csv")
    anime_details_df = pd.read_csv('data_new/anime_details_v1.csv')
    # NOTE: unpickling can execute arbitrary code; only load model files you trust.
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    # Log the path that was opened. The message used to report
    # cache_path(MODEL), a location inside the cache directory that was never read.
    logger.info(f"Loaded model from {model_path}")
    new_user_recs = recommend_new_user_approx(new_user_df, model, top_n=10, k_neighbors=30)
    enriched_new_user_recs = enrich_recommendations(new_user_recs, anime_details_df)
    logger.info("Top-10 recommendations for new user:")
    logger.info(f"\n{enriched_new_user_recs}")

In [None]:
# Load the saved model and log top-10 recommendations for the new-user file.
recommend_for_new_user()

In [None]:
import os
import pickle
import sys
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
import optuna

# --- Logging Configuration ---
class FlushStreamHandler(logging.StreamHandler):
    """``StreamHandler`` variant that flushes after each emitted record."""

    def emit(self, record):
        """Write the formatted record plus terminator to the stream and flush.

        Exceptions raised along the way are handed to ``handleError``.
        """
        try:
            line = "".join((self.format(record), self.terminator))
            self.stream.write(line)
            self.flush()
        except Exception:
            self.handleError(record)

LOG_FILENAME = "mf_debug.log"
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Detach AND close whatever handlers are already on the root logger, e.g.
# from an earlier cell. Clearing the list alone leaves their log files open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
# Console output, flushed after every record.
stream_handler = FlushStreamHandler(sys.stdout)
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)
# File output; mode="w" truncates the log each time this cell runs.
file_handler = logging.FileHandler(LOG_FILENAME, mode="w")
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
logger.addHandler(file_handler)
logger.info("Logging configured to INFO level with immediate flushing.")

# --- Global Settings ---
# Reproducibility and data split.
RANDOM_STATE = 42
TRAIN_TEST_SPLIT = 0.2          # Fraction of ratings held out for testing

# Ranking evaluation.
RELEVANCE_THRESHOLD = 7.0       # Test scores at or above this count as relevant
RANKING_SAMPLE_FRACTION = 0.2   # Evaluate ranking metrics on 20% of test users
TOP_N = 10                      # Top-N recommendations

# Output locations.
MODEL_SAVE_PATH = "models/mf_best_model.pkl"
CACHE_DIR = "cache"

# Side effects: seed NumPy's global RNG and make sure the output folders exist.
np.random.seed(RANDOM_STATE)
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# --- Hyperparameter Grid for Optuna ---
# (These will be suggested by Optuna.)
# We'll tune: n_factors, learning_rate, regularization, and n_epochs.

# --- Caching Helper Functions ---
def cache_path(filename):
    """Build the path of ``filename`` under ``CACHE_DIR``."""
    full_path = os.path.join(CACHE_DIR, filename)
    return full_path

def save_cache(obj, filename):
    """Write ``obj`` as a pickle named ``filename`` in the cache directory."""
    target = cache_path(filename)
    with open(target, "wb") as out:
        pickle.dump(obj, out)
    logger.info(f"Saved cache to {target}")

def load_cache(filename):
    """Read back the pickled object named ``filename`` from the cache directory.

    Unpickling can execute arbitrary code, so only use this on cache files
    you trust.
    """
    origin = cache_path(filename)
    with open(origin, "rb") as src:
        loaded = pickle.load(src)
    logger.info(f"Loaded cache from {origin}")
    return loaded

# --- Data Loading Functions ---
def load_ratings(file_path):
    """Load the ratings CSV at ``file_path`` and log the resulting shape."""
    logger.info(f"Loading ratings from {file_path} ...")
    frame = pd.read_csv(file_path)
    logger.info(f"Ratings shape: {frame.shape}")
    return frame

def load_anime_details(file_path):
    """Load the anime-details CSV at ``file_path`` and log the resulting shape."""
    logger.info(f"Loading anime details from {file_path} ...")
    frame = pd.read_csv(file_path)
    logger.info(f"Anime details shape: {frame.shape}")
    return frame

def build_rating_matrix(ratings_df):
    """Pivot long-format ratings into a user x anime float32 DataFrame.

    Rows are ``user_id``, columns are ``anime_id``, values are ``score``;
    cells without a rating are filled with 0.
    """
    logger.info("Building rating matrix ...")
    wide = ratings_df.pivot(index='user_id', columns='anime_id', values='score')
    rating_matrix = wide.fillna(0).astype(np.float32)
    logger.info(f"Rating matrix shape: {rating_matrix.shape}")
    return rating_matrix

# --- Matrix Factorization Functions ---
def initialize_factors(n_users, n_items, n_factors):
    """Draw initial user and item factor matrices from N(0, 0.1**2).

    Uses NumPy's global RNG, drawing the user matrix first.

    Returns:
        ``(U, V)`` with shapes ``(n_users, n_factors)`` and ``(n_items, n_factors)``.
    """
    shapes = ((n_users, n_factors), (n_items, n_factors))
    U, V = [np.random.normal(scale=0.1, size=shape) for shape in shapes]
    return U, V

def train_matrix_factorization(rating_matrix, n_factors, learning_rate, reg, n_epochs):
    """Train user/item factors with SGD on the observed (> 0) ratings.

    Args:
        rating_matrix: User x item DataFrame; values <= 0 are treated as unobserved.
        n_factors: Latent dimensionality.
        learning_rate: SGD step size.
        reg: L2 regularisation weight.
        n_epochs: Number of passes over the observed ratings.

    Returns:
        ``(U, V)`` factor matrices of shape ``(n_users, n_factors)`` and
        ``(n_items, n_factors)``.
    """
    R = rating_matrix.values
    n_users, n_items = R.shape
    U, V = initialize_factors(n_users, n_items, n_factors)

    # Extract the observed coordinates once. np.nonzero returns them in
    # row-major order, the same visiting order as nested user/item loops,
    # without touching every empty cell in every epoch.
    user_rows, item_cols = np.nonzero(R > 0)
    observed_ratings = R[user_rows, item_cols]
    n_observed = len(observed_ratings)

    logger.info("Starting training matrix factorization ...")
    for epoch in range(n_epochs):
        total_loss = 0.0
        for u, i, rating in zip(user_rows, item_cols, observed_ratings):
            error = rating - np.dot(U[u], V[i])
            total_loss += error ** 2
            U[u] += learning_rate * (error * V[i] - reg * U[u])
            # As before, the item step uses the user vector updated just above.
            V[i] += learning_rate * (error * U[u] - reg * V[i])
        # Avoid dividing by zero when the matrix has no observed ratings.
        rmse_epoch = np.sqrt(total_loss / n_observed) if n_observed else float("nan")
        logger.info(f"Epoch {epoch+1}/{n_epochs} - Training RMSE: {rmse_epoch:.4f}")
    return U, V

def predict_rating(u, i, U, V):
    """Return the dot product of user ``u``'s and item ``i``'s factor vectors."""
    user_factors = U[u]
    item_factors = V[i]
    return np.dot(user_factors, item_factors)

def evaluate_model(rating_matrix, U, V, test_df):
    """Return ``(rmse, mae)`` of the factor model on the held-out ratings.

    Only test rows whose ``user_id`` and ``anime_id`` both appear in
    ``rating_matrix`` are scored; each prediction is ``dot(U[user], V[item])``.
    Returns ``(nan, nan)`` when ``test_df`` is empty or no row can be scored.

    Args:
        rating_matrix: User x item DataFrame; its index/columns map ids to
            the rows of ``U``/``V``.
        U: User factor matrix.
        V: Item factor matrix.
        test_df: DataFrame with ``user_id``, ``anime_id`` and ``score``.
    """
    nothing_scored = (float("nan"), float("nan"))
    if test_df.empty:
        return nothing_scored

    # Resolve all ids to matrix positions in one pass (-1 = unknown id)
    # instead of scanning the full index for every test row. get_indexer
    # needs unique labels, which the pivot that builds the matrix guarantees.
    user_pos = rating_matrix.index.get_indexer(test_df['user_id'].to_numpy())
    item_pos = rating_matrix.columns.get_indexer(test_df['anime_id'].to_numpy())
    known = (user_pos >= 0) & (item_pos >= 0)
    n_known = int(known.sum())
    if n_known == 0:
        return nothing_scored

    y_true = test_df['score'].to_numpy(dtype=float)[known]
    y_pred = np.fromiter(
        (np.dot(U[u], V[i]) for u, i in zip(user_pos[known], item_pos[known])),
        dtype=float,
        count=n_known,
    )
    errors = y_true - y_pred
    rmse_val = np.sqrt(np.mean(errors ** 2))
    mae_val = np.mean(np.abs(errors))
    return rmse_val, mae_val

def get_top_n_recommendations(u_idx, rating_matrix, U, V, top_n=10):
    """Return the ``top_n`` best predicted ``(anime_id, rating)`` pairs for a user.

    Only items the user has not rated (0 in row ``u_idx`` of ``rating_matrix``)
    are considered. Scores for all items come from one matrix-vector product
    instead of a Python-level dot product per item.

    Args:
        u_idx: Row position of the user in ``rating_matrix`` and ``U``.
        rating_matrix: User x item DataFrame, 0 meaning "not rated".
        U: User factor matrix.
        V: Item factor matrix, one row per column of ``rating_matrix``.
        top_n: Number of recommendations to return.
    """
    scores = V @ U[u_idx]
    unrated = np.flatnonzero(rating_matrix.values[u_idx] == 0)
    columns = rating_matrix.columns
    candidates = [(columns[i], scores[i]) for i in unrated]
    # sorted() is stable, so equal scores keep column order.
    return sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

# --- Ranking Metrics Functions ---
def precision_at_k(recommended, relevant, k=10):
    """Return hits among the first ``k`` recommendations, divided by ``k``.

    ``recommended`` is a list of ``(item, score)`` pairs and ``relevant`` a
    set of items. An empty recommendation list gives 0.0.
    """
    head = [item for item, _ in recommended][:k]
    if head:
        n_hits = len(set(head) & relevant)
        return n_hits / k
    return 0.0

def recall_at_k(recommended, relevant, k=10):
    """Return the fraction of ``relevant`` items present in the first ``k`` recommendations.

    ``recommended`` is a list of ``(item, score)`` pairs and ``relevant`` a
    set of items. An empty ``relevant`` set gives 0.0.
    """
    head = [item for item, _ in recommended][:k]
    n_relevant = len(relevant)
    if n_relevant == 0:
        return 0.0
    n_hits = len(set(head) & relevant)
    return n_hits / n_relevant

def ndcg_at_k(recommended, relevant, k=10):
    """Return binary-relevance NDCG for the first ``k`` recommendations.

    Each relevant item at zero-based position ``p`` adds ``1 / log2(p + 2)``
    to the DCG. The ideal DCG assumes ``min(len(relevant), k)`` relevant items
    at the top. Returns 0.0 when the ideal DCG is zero.
    """
    def discount(position):
        return 1 / np.log2(position + 2)

    dcg = 0.0
    for position, (item, _) in enumerate(recommended[:k]):
        if item in relevant:
            dcg += discount(position)
    best_possible_hits = min(len(relevant), k)
    idcg = sum(discount(position) for position in range(best_possible_hits))
    if not idcg > 0:
        return 0.0
    return dcg / idcg

def compute_ranking_metrics_for_user_mf(user, group, rating_matrix, U, V, anime_ids, top_n, relevance_threshold):
    """Compute precision, recall and NDCG at ``top_n`` for one test user (MF model).

    ``group`` holds the user's test rows; items scored at or above
    ``relevance_threshold`` count as relevant. ``anime_ids`` is accepted but
    not used.

    Returns:
        ``(precision, recall, ndcg)``, or ``None`` when the user is absent
        from ``rating_matrix`` or has no relevant test items.
    """
    known_users = rating_matrix.index
    if user not in known_users:
        return None
    liked = group.loc[group['score'] >= relevance_threshold, 'anime_id']
    relevant_items = set(liked)
    if not relevant_items:
        return None
    u_idx = np.flatnonzero(known_users == user)[0]
    recs = get_top_n_recommendations(u_idx, rating_matrix, U, V, top_n)
    return (
        precision_at_k(recs, relevant_items, k=top_n),
        recall_at_k(recs, relevant_items, k=top_n),
        ndcg_at_k(recs, relevant_items, k=top_n),
    )

# --- New User "Fold-In" Functions for MF ---
def fold_in_new_user(new_user_vector, V, learning_rate=0.005, reg=0.02, n_iter=50):
    """
    Fit a latent vector for a new user against fixed item factors.

    ``new_user_vector`` has one entry per item, with values <= 0 treated as
    unobserved; ``V`` has shape (n_items, n_factors). Starting from a random
    N(0, 0.1**2) vector drawn from NumPy's global RNG, ``n_iter`` steps of
    full-batch gradient descent are taken on
       sum_{i: observed} (r[i] - u_new.dot(V[i]))^2 + reg * ||u_new||^2
    Returns the fitted 1D vector of length n_factors.
    """
    u_new = np.random.normal(scale=0.1, size=(V.shape[1],))
    rated = new_user_vector > 0
    # The observed items never change, so slice them out once.
    item_factors = V[rated]
    targets = new_user_vector[rated]
    step = 0
    while step < n_iter:
        residual = targets - item_factors.dot(u_new)
        gradient = -2 * item_factors.T.dot(residual) + 2 * reg * u_new
        u_new = u_new - learning_rate * gradient
        step += 1
    return u_new

def recommend_new_user_mf(new_user_df, model, n_iter=50, learning_rate=0.005, reg=0.02, top_n=TOP_N):
    """
    Generate recommendations for a new user by "folding in" the new user into the trained MF model.

    Args:
        new_user_df: DataFrame with ``anime_id`` and ``score`` columns.
        model: Dict holding at least ``rating_matrix`` (its columns define the
            item order) and ``V`` (item factor matrix).
        n_iter, learning_rate, reg: Passed to ``fold_in_new_user``.
        top_n: Number of recommendations to return.

    Returns:
        List of ``(anime_id, predicted_rating)`` for items the user has not
        scored, sorted by predicted rating, descending.
    """
    anime_ids = model['rating_matrix'].columns.values
    V = model['V']

    # Build an id -> score lookup in one pass (first score wins on duplicate
    # ids). The previous version rescanned the DataFrame for every item.
    first_score = {}
    for anime, score in zip(new_user_df['anime_id'].values, new_user_df['score'].values):
        first_score.setdefault(anime, score)
    new_user_vector = np.zeros(len(anime_ids), dtype=np.float32)
    for idx, anime in enumerate(anime_ids):
        if anime in first_score:
            new_user_vector[idx] = first_score[anime]

    u_new = fold_in_new_user(new_user_vector, V, learning_rate, reg, n_iter)

    # Predict ratings for all items not rated by the new user.
    predictions = {
        anime: np.dot(u_new, V[i])
        for i, anime in enumerate(anime_ids)
        if new_user_vector[i] == 0
    }
    return sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]

# --- Experiment Function for MF ---
def run_mf_experiment(params, train_df, test_df, rating_matrix):
    """Train the MF model with ``params`` and evaluate it on ``test_df``.

    ``params`` supplies ``n_factors``, ``learning_rate``, ``reg`` and
    ``n_epochs``. RMSE/MAE are computed on all test rows; ranking metrics on a
    shuffled sample of ``RANKING_SAMPLE_FRACTION`` of the test users, in
    parallel. ``train_df`` is accepted but not used here.

    Returns:
        ``(metrics, U, V)`` where ``metrics`` has keys ``rmse``, ``mae``,
        ``precision_at_10``, ``recall_at_10`` and ``ndcg_at_10`` (ranking
        values are ``None`` when no sampled user produced a result).
    """
    U, V = train_matrix_factorization(
        rating_matrix,
        n_factors=params["n_factors"],
        learning_rate=params["learning_rate"],
        reg=params["reg"],
        n_epochs=params["n_epochs"],
    )
    rmse_val, mae_val = evaluate_model(rating_matrix, U, V, test_df)

    # Sample test users for the (more expensive) ranking metrics.
    per_user = list(test_df.groupby('user_id'))
    np.random.shuffle(per_user)
    n_sampled = int(len(per_user) * RANKING_SAMPLE_FRACTION)
    jobs = (
        delayed(compute_ranking_metrics_for_user_mf)(
            user, group, rating_matrix, U, V, rating_matrix.columns.values, TOP_N, RELEVANCE_THRESHOLD
        )
        for user, group in per_user[:n_sampled]
    )
    scored = [outcome for outcome in Parallel(n_jobs=-1)(jobs) if outcome is not None]

    precision_val = recall_val = ndcg_val = None
    if scored:
        precisions, recalls, ndcgs = zip(*scored)
        precision_val = np.mean(precisions)
        recall_val = np.mean(recalls)
        ndcg_val = np.mean(ndcgs)

    metrics = {
        "rmse": rmse_val,
        "mae": mae_val,
        "precision_at_10": precision_val,
        "recall_at_10": recall_val,
        "ndcg_at_10": ndcg_val,
    }
    return metrics, U, V

# --- Optuna Objective Function for MF ---
def objective(trial):
    """Optuna objective: train/evaluate one MF configuration and return its test RMSE.

    Suggests ``n_factors``, ``learning_rate``, ``reg`` and ``n_epochs``,
    reloads and splits the ratings, and runs the experiment. MAE and the
    ranking metrics are stored as trial user attributes. Returns ``inf`` when
    the experiment raises or produces no RMSE.
    """
    # Suggest hyperparameters. suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform and samples from the same log-uniform range.
    params = {
        "n_factors": trial.suggest_categorical("n_factors", [50, 100, 150, 200]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "reg": trial.suggest_float("reg", 1e-3, 1e-1, log=True),
        "n_epochs": trial.suggest_int("n_epochs", 20, 100),
    }
    logger.info(f"Trial with parameters: {params}")

    ratings_file = "data_new/animelist.csv"
    ratings_df = load_ratings(ratings_file)
    train_df, test_df = train_test_split(ratings_df, test_size=TRAIN_TEST_SPLIT, random_state=RANDOM_STATE)
    rating_matrix = build_rating_matrix(train_df)

    try:
        metrics = run_mf_experiment(params, train_df, test_df, rating_matrix)[0]
    except Exception as e:
        # Keep the study running, but log the full traceback so that
        # programming errors are not hidden behind a silent `inf`.
        logger.exception(f"Error in trial: {e}")
        return float('inf')

    for name in ("mae", "precision_at_10", "recall_at_10", "ndcg_at_10"):
        trial.set_user_attr(name, metrics[name])

    if metrics["rmse"] is None:
        return float('inf')
    logger.info(f"Trial metrics: {metrics}")
    return metrics["rmse"]

def main_experiment():
    """Tune the MF model with Optuna, then retrain with the best params and save it.

    Steps:
      1. Run a 20-trial study and write one row per trial to
         ``mf_experiment_results.csv``.
      2. Retrain on the training split with the best parameters and pickle
         ``U``, ``V``, the rating matrix and the global mean to ``MODEL_SAVE_PATH``.
      3. If ``data_new/user_data.csv`` exists, log top-N fold-in
         recommendations for that user.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)

    # Save study results to CSV.
    results = [
        {
            "model_name": "MF_SGD",
            "rmse": trial.value,
            "mae": trial.user_attrs.get("mae"),
            "precision_at_10": trial.user_attrs.get("precision_at_10"),
            "recall_at_10": trial.user_attrs.get("recall_at_10"),
            "ndcg_at_10": trial.user_attrs.get("ndcg_at_10"),
            "parameters": str(trial.params),
        }
        for trial in study.trials
    ]
    results_csv = "mf_experiment_results.csv"
    pd.DataFrame(results).to_csv(results_csv, index=False)
    logger.info(f"Saved experiment results to {results_csv}")
    logger.info(f"Best trial: {study.best_trial.params} with RMSE: {study.best_trial.value}")

    # Retrain with the best hyperparameters on the training split and save the model.
    best_params = study.best_trial.params
    ratings_df = load_ratings("data_new/animelist.csv")
    train_df, test_df = train_test_split(ratings_df, test_size=TRAIN_TEST_SPLIT, random_state=RANDOM_STATE)
    logger.info(f"Final training: Train shape: {train_df.shape}, Test shape: {test_df.shape}")
    rating_matrix = build_rating_matrix(train_df)

    # Train directly. The earlier call run_mf_experiment(..., pd.DataFrame(), ...)
    # failed because grouping an empty, column-less frame by 'user_id' raises
    # KeyError. No evaluation is needed here anyway.
    U, V = train_matrix_factorization(
        rating_matrix,
        n_factors=best_params["n_factors"],
        learning_rate=best_params["learning_rate"],
        reg=best_params["reg"],
        n_epochs=best_params["n_epochs"],
    )
    best_model = {
        "U": U,
        "V": V,
        "rating_matrix": rating_matrix,
        "global_mean": train_df['score'].mean()
    }
    with open(MODEL_SAVE_PATH, "wb") as f:
        pickle.dump(best_model, f)
    logger.info(f"Best MF model saved to {MODEL_SAVE_PATH}")

    # Example: new user recommendation using fold-in, if a rating file exists.
    new_user_file = "data_new/user_data.csv"
    if os.path.exists(new_user_file):
        new_user_df = pd.read_csv(new_user_file)
        # Reuse the module-level fold-in helper instead of an inline copy.
        # Its defaults match the values that were hard-coded here.
        new_user_recs = recommend_new_user_mf(new_user_df, best_model)
        logger.info("Top-10 recommendations for new user:")
        logger.info(new_user_recs)
    else:
        logger.info("No new user data found.")



In [None]:
# Run the MF hyperparameter search, then retrain with the best params and save the model.
main_experiment()

In [None]:
# I got an error during the final retraining step, but I saved the best params from our experiment.

In [None]:
def final_model_training(best_params):
    """
    Retrain the matrix factorization model with the best hyperparameters and save it.

    The ratings are split into train/test parts (so the evaluation step never receives
    an empty test DataFrame), the model is fitted via ``run_mf_experiment``, and the
    resulting factors are pickled to ``MODEL_SAVE_PATH``. If a new-user rating file
    exists, top-N recommendations for that user are logged as a demonstration.

    Args:
        best_params: dict of hyperparameters passed through to ``run_mf_experiment``;
            its optional ``"learning_rate"`` and ``"reg"`` entries are reused for fold-in.

    Returns:
        None. Results are written to disk and to the log.
    """
    # Load ratings from file.
    ratings_file = "data_new/animelist.csv"  # Adjust path as needed.
    ratings_df = load_ratings(ratings_file)

    # Split data into train and test sets (ensure test set is non-empty).
    train_df, test_df = train_test_split(ratings_df, test_size=TRAIN_TEST_SPLIT, random_state=RANDOM_STATE)
    logger.info(f"Final training: Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Build the rating matrix from the training data only.
    rating_matrix = build_rating_matrix(train_df)

    # Only the first three results (metrics, user factors, item factors) are needed here.
    metrics, U, V = run_mf_experiment(best_params, train_df, test_df, rating_matrix)[0:3]
    logger.info(f"Final model training metrics: RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")

    # Build the final model dictionary.
    best_model = {
        "U": U,
        "V": V,
        "rating_matrix": rating_matrix,
        "global_mean": train_df['score'].mean()
    }

    # Save the final model.
    with open(MODEL_SAVE_PATH, "wb") as f:
        pickle.dump(best_model, f)
    logger.info(f"Best MF model saved to {MODEL_SAVE_PATH}")

    # (Optional) Demonstrate new user recommendations using fold-in.
    new_user_file = "data_new/user_data.csv"  # Adjust path if new user data exists.
    if not os.path.exists(new_user_file):
        logger.info("No new user data found.")
        return

    new_user_df = pd.read_csv(new_user_file)
    # NOTE(review): build_new_user_vector and fold_in_new_user must exist at module
    # level; no such definition is visible in this part of the file -- confirm.
    # Build the vector once: the original rebuilt it on every loop iteration, which
    # made the prediction loop quadratic in the number of items.
    new_user_vector = build_new_user_vector(new_user_df, rating_matrix.columns.values)
    u_new = fold_in_new_user(
        new_user_vector=new_user_vector,
        V=V,
        learning_rate=best_params.get("learning_rate", 0.005),
        reg=best_params.get("reg", 0.02),
        n_iter=50  # You might tune this as well.
    )
    # Score only the items the new user has not rated (vector entry equal to 0).
    predictions = {
        anime: np.dot(u_new, V[i])
        for i, anime in enumerate(rating_matrix.columns)
        if new_user_vector[i] == 0
    }
    new_user_recs = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
    logger.info("Top-10 recommendations for new user:")
    logger.info(new_user_recs)



In [None]:
# --- Main function to run final training (without re-running grid search) ---
def main_final():
    """Run the final training with the best parameters stored in the results CSV.

    Picks the first row with the lowest RMSE in ``mf_experiment_results.csv``, parses
    its ``parameters`` string into a dict, and hands it to ``final_model_training``.
    """
    import ast

    results = pd.read_csv('mf_experiment_results.csv')
    lowest_rmse = results['rmse'].min()
    best_rows = results[results['rmse'] == lowest_rmse]
    # The parameters are stored as the string form of a dict; parse it back safely.
    raw_parameters = best_rows['parameters'].iloc[0]
    best_params = ast.literal_eval(raw_parameters)
    logger.info(f"Using best parameters from study: {best_params}")
    final_model_training(best_params)



In [None]:
# Retrain and save the final model using the stored best parameters.
main_final()

In [None]:
# (anime_id, predicted score) pairs -- presumably pasted from the logged
# new-user recommendations of a previous run.
s = [(11061, 8.751748977151662), (38524, 8.742807908264286), (820, 8.714674665393), (5114, 8.712958329730348), (43608, 8.682679336936072), (9253, 8.681530886869453), (19, 8.622045115225914), (9969, 8.609232614843217), (24701, 8.587685704058096), (35247, 8.586207890504708)]
# Keep only the IDs, in ranking order.
anime_ids = [anime_id for anime_id, _ in s]


In [None]:
# Display the extracted anime IDs.
anime_ids

In [None]:
# Load the anime metadata table so the recommended IDs can be looked up.
df = pd.read_csv('data_new/anime_details_v1.csv')


In [None]:
# Show the metadata rows whose 'id' is one of the recommended anime IDs.
df[df['id'].isin(anime_ids)]

# WRMF

In [None]:
import os
import pickle
import logging
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split
import optuna
import implicit

# --- Logging Configuration (real-time flush) ---
# NOTE(review): if an earlier cell has already attached handlers to the root logger,
# logging.basicConfig is presumably a no-op here -- confirm against the logging docs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
# Root logger; shared by every function in this section.
logger = logging.getLogger()

# --- Global Settings & File Paths ---
RANDOM_STATE = 42
TRAIN_TEST_SPLIT = 0.2       # fraction of rows held out as the test set
TOP_N = 10                   # top-10 recommendations
# NOTE(review): ALPHA is not referenced by any function visible in this section.
ALPHA = 40                   # Confidence scaling: C = 1 + ALPHA * implicit_signal

# Default hyperparameters for WRMF (to be tuned)
DEFAULT_FACTORS = 50
DEFAULT_REG = 0.01
DEFAULT_ITER = 10

# Data file paths
ANIMELIST_FILE = "data_new/animelist.csv"            
ANIME_DETAILS_FILE = "data_new/anime_details_v1.csv"   
MODEL_SAVE_PATH = "models/wrmf_best_model.pkl"
# Make sure the output directories exist before anything is written to them.
os.makedirs("models", exist_ok=True)
os.makedirs("cache", exist_ok=True)

# --- Caching Helper Functions ---
def cache_path(filename):
    """Return the path of ``filename`` inside the ``cache`` directory."""
    cache_dir = "cache"
    return os.path.join(cache_dir, filename)

def save_cache_obj(obj, filename):
    """Pickle ``obj`` into the cache directory under ``filename`` and log the location."""
    destination = cache_path(filename)
    with open(destination, "wb") as handle:
        pickle.dump(obj, handle)
    logger.info(f"Saved cache to {cache_path(filename)}")

def load_cache_obj(filename):
    """Unpickle and return the object stored under ``filename`` in the cache directory.

    Only cache files written by this notebook should be loaded: unpickling
    untrusted data can execute arbitrary code.
    """
    source = cache_path(filename)
    with open(source, "rb") as handle:
        cached_object = pickle.load(handle)
    logger.info(f"Loaded cache from {cache_path(filename)}")
    return cached_object

def cache_key(params):
    """Build the cache file name that identifies one WRMF hyperparameter combination."""
    factors = params['factors']
    reg = params['reg']
    n_iter = params['n_iter']
    alpha = params['alpha']
    return f"wrmf_results_{factors}_{reg}_{n_iter}_{alpha}.pkl"

# --- Data Loading Functions ---
def load_data(animelist_path, anime_details_path):
    """Read the animelist and anime-details CSV files.

    Returns:
        A ``(animelist, anime_details)`` tuple of DataFrames; the shape of each is logged.
    """
    logger.info("Loading animelist from '%s'", animelist_path)
    animelist_df = pd.read_csv(animelist_path)
    logger.info("Animelist shape: %s", animelist_df.shape)

    logger.info("Loading anime details from '%s'", anime_details_path)
    details_df = pd.read_csv(anime_details_path)
    logger.info("Anime details shape: %s", details_df.shape)

    return animelist_df, details_df

# --- Preprocessing & Transformation ---
def preprocess(animelist, anime_details):
    """Merge the animelist with episode counts and derive an interaction strength.

    Note that ``animelist`` is modified in place: missing scores are filled with 0,
    ``status`` is lower-cased and a ``status_weight`` column is added.

    Args:
        animelist: DataFrame with ``anime_id``, ``score``, ``status`` and
            ``num_episodes_watched`` columns.
        anime_details: DataFrame with ``id`` and ``num_episodes`` columns.

    Returns:
        The merged DataFrame with added ``num_episodes``, ``episode_ratio``,
        ``interaction`` and ``implicit`` columns.
    """
    logger.info("Preprocessing animelist data...")
    animelist['score'] = animelist['score'].fillna(0)
    animelist['status'] = animelist['status'].str.lower()

    status_mapping = {
        'completed': 1.0,
        'on_hold': 0.5,
        'plan_to_watch': 0.2,
        'dropped': 0.0,
        'watching': 0.8,
        'rewatching': 0.9
    }
    logger.info("Mapping status using: %s", status_mapping)
    animelist['status_weight'] = animelist['status'].map(status_mapping)
    missing = animelist['status_weight'].isna().sum()
    if missing > 0:
        logger.warning("Found %d missing status weights; filling with 0", missing)
        animelist['status_weight'] = animelist['status_weight'].fillna(0)

    logger.info("Renaming 'id' to 'anime_id' in anime details and merging")
    anime_details = anime_details.rename(columns={'id': 'anime_id'})
    merged = pd.merge(animelist, anime_details[['anime_id', 'num_episodes']], on='anime_id', how='left')
    # Anime missing from the details table get an episode count of 1.
    merged['num_episodes'] = merged['num_episodes'].fillna(1)
    logger.info("Computing episode ratio as num_episodes_watched/num_episodes")
    merged['episode_ratio'] = (merged['num_episodes_watched'] / merged['num_episodes']).clip(upper=1)

    logger.info("Computing final interaction strength")
    score = merged['score']
    status_weight = merged['status_weight']
    episode_ratio = merged['episode_ratio']
    # Vectorized form of the former row-wise apply. The fallback takes episode_ratio
    # only where it is strictly greater than status_weight, so a NaN ratio (0/0)
    # falls back to status_weight exactly as max(status_weight, nan) did.
    fallback = episode_ratio.where(episode_ratio > status_weight, status_weight)
    # Rated rows use the score scaled by 1/10; unrated rows use the fallback.
    merged['interaction'] = (score / 10).where(score > 0, fallback)
    # For ranking evaluation, copy interaction to 'implicit'
    merged['implicit'] = merged['interaction']
    nan_count = merged['interaction'].isna().sum()
    if nan_count > 0:
        logger.error("Found %d NaN values in interaction column", nan_count)
    logger.info("Preprocessing complete. Merged shape: %s", merged.shape)
    return merged

def create_train_test_split(merged, test_size=TRAIN_TEST_SPLIT, random_state=RANDOM_STATE):
    """Randomly split the merged rows into a train part and a test part.

    Returns:
        A ``(train, test)`` tuple as produced by ``train_test_split``.
    """
    logger.info("Splitting merged data into train and test sets (test_size=%.2f)", test_size)
    split_parts = train_test_split(merged, test_size=test_size, random_state=random_state)
    train, test = split_parts
    logger.info("Train shape: %s; Test shape: %s", train.shape, test.shape)
    return train, test

def build_interaction_matrix(train):
    """Build a sparse user x anime CSR matrix from the ``interaction`` column.

    Returns:
        ``(interaction_matrix, user2idx, anime2idx)`` where the two dicts map the
        sorted unique user / anime IDs to their row / column positions.
    """
    logger.info("Building interaction matrix from training data")
    # np.unique yields each ID once, in sorted order.
    user_ids = np.unique(train['user_id'])
    anime_ids = np.unique(train['anime_id'])
    user2idx = dict(zip(user_ids, range(len(user_ids))))
    anime2idx = dict(zip(anime_ids, range(len(anime_ids))))

    logger.info("Total users: %d, Total anime: %d", len(user_ids), len(anime_ids))

    # Translate the raw IDs of every row into matrix coordinates.
    row_positions = train['user_id'].map(user2idx)
    col_positions = train['anime_id'].map(anime2idx)

    matrix_shape = (len(user_ids), len(anime_ids))
    coordinate_form = coo_matrix(
        (train['interaction'], (row_positions, col_positions)),
        shape=matrix_shape
    )
    interaction_matrix = coordinate_form.tocsr()
    logger.info("Interaction matrix shape: %s", interaction_matrix.shape)
    return interaction_matrix, user2idx, anime2idx

# --- Model Training ---
def train_wrmf_model(interaction_matrix, factors=DEFAULT_FACTORS, reg=DEFAULT_REG, iterations=DEFAULT_ITER):
    """Fit an ``implicit`` ALS model on the transposed interaction matrix and return it."""
    logger.info("Training WRMF model using ALS with factors=%d, reg=%.4f, iterations=%d", factors, reg, iterations)
    als_options = {
        "factors": factors,
        "regularization": reg,
        "iterations": iterations,
        "random_state": RANDOM_STATE,
    }
    model = implicit.als.AlternatingLeastSquares(**als_options)
    # The matrix is transposed on the assumption that implicit expects item-user input.
    # NOTE(review): the expected orientation depends on the installed implicit version -- confirm.
    item_user_matrix = interaction_matrix.T
    model.fit(item_user_matrix)
    logger.info("WRMF model training complete.")
    return model

# --- Pointwise Metrics Evaluation ---
def compute_pointwise_metrics(model, test_df, user2idx, anime2idx):
    """Compute RMSE and MAE between test scores and the model's dot-product predictions.

    Only rows with ``score > 0`` whose user and anime both appear in the training
    mappings are evaluated. The prediction for a row is the dot product of the
    corresponding rows of ``model.user_factors`` and ``model.item_factors``.

    NOTE(review): the raw ``score`` is compared with the prediction unscaled, while
    the training signal elsewhere in this section is ``score / 10`` -- confirm the
    intended scale.

    Args:
        model: object exposing ``user_factors`` and ``item_factors`` 2-D arrays.
        test_df: DataFrame with ``user_id``, ``anime_id`` and ``score`` columns.
        user2idx: mapping from user ID to row index in ``user_factors``.
        anime2idx: mapping from anime ID to row index in ``item_factors``.

    Returns:
        ``(rmse, mae)``, or ``(None, None)`` when no row can be evaluated.
    """
    scored = test_df[test_df['score'] > 0]
    user_pos = scored['user_id'].map(user2idx)
    anime_pos = scored['anime_id'].map(anime2idx)
    # IDs absent from the mappings become NaN after map(); drop those rows.
    known = (user_pos.notna() & anime_pos.notna()).to_numpy()
    if not known.any():
        return None, None

    u_idx = user_pos.to_numpy()[known].astype(np.int64)
    a_idx = anime_pos.to_numpy()[known].astype(np.int64)
    scores = scored['score'].to_numpy(dtype=np.float64)[known]

    user_factors = np.asarray(model.user_factors)
    item_factors = np.asarray(model.item_factors)

    # Guard against indices that exceed the factor matrices.
    in_bounds = (u_idx < user_factors.shape[0]) & (a_idx < item_factors.shape[0])
    skipped = int((~in_bounds).sum())
    if skipped:
        logger.warning("Skipping %d test rows whose user/anime index is out of bounds", skipped)
        u_idx, a_idx, scores = u_idx[in_bounds], a_idx[in_bounds], scores[in_bounds]

    n_rows = scores.size
    if n_rows == 0:
        return None, None

    # Process in chunks so the gathered factor rows never need (n_rows x factors) memory.
    chunk_size = 100_000
    squared_sum = 0.0
    absolute_sum = 0.0
    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        preds = np.einsum('ij,ij->i', user_factors[u_idx[start:stop]], item_factors[a_idx[start:stop]])
        residuals = scores[start:stop] - preds
        squared_sum += float(np.dot(residuals, residuals))
        absolute_sum += float(np.abs(residuals).sum())

    rmse = np.sqrt(squared_sum / n_rows)
    mae = np.float64(absolute_sum / n_rows)
    return rmse, mae

# --- Ranking Metrics Evaluation ---
def precision_at_k(recommended, relevant, k=10):
    """Return the share of the ``k`` slots filled by relevant items.

    ``recommended`` is a sequence of ``(item, score)`` pairs and ``relevant`` a set
    of items. The hit count is always divided by ``k``, even when fewer than ``k``
    items were recommended; an empty top-k list yields 0.0.
    """
    top_items = [item for item, _score in recommended][:k]
    if len(top_items) == 0:
        return 0.0
    matched = set(top_items) & relevant
    return len(matched) / k

def recall_at_k(recommended, relevant, k=10):
    """Return the share of relevant items that appear among the top ``k`` recommendations.

    ``recommended`` is a sequence of ``(item, score)`` pairs and ``relevant`` a set
    of items; an empty ``relevant`` set yields 0.0.
    """
    top_items = [item for item, _score in recommended][:k]
    if not len(relevant):
        return 0.0
    matched = set(top_items) & relevant
    return len(matched) / len(relevant)

def ndcg_at_k(recommended, relevant, k=10):
    """Return binary-relevance NDCG for the top ``k`` recommendations.

    Each relevant item at 0-based rank ``i`` contributes ``1 / log2(i + 2)``; the
    total is divided by the best achievable value for ``min(len(relevant), k)``
    hits. Returns 0.0 when that ideal value is zero.
    """
    dcg = sum(
        (1 / np.log2(rank + 2)
         for rank, (item, _score) in enumerate(recommended[:k])
         if item in relevant),
        0.0,
    )
    best_possible_hits = min(len(relevant), k)
    idcg = sum(1 / np.log2(rank + 2) for rank in range(best_possible_hits))
    if idcg > 0:
        return dcg / idcg
    return 0.0

def compute_ranking_metrics_for_user(user, group, model, user2idx, anime2idx, train_matrix, top_k=TOP_N, relevance_threshold=0.1, idx2anime=None):
    """Compute precision, recall and NDCG at ``top_k`` for one test user.

    Args:
        user: raw user ID.
        group: that user's test rows (needs ``anime_id`` and ``interaction`` columns).
        model: fitted model exposing ``recommend``.
        user2idx: mapping from user ID to matrix row.
        anime2idx: mapping from anime ID to matrix column.
        train_matrix: sparse training matrix; the user's row is passed to ``recommend``.
        top_k: number of recommendations to request and evaluate.
        relevance_threshold: minimum ``interaction`` for a test item to count as relevant.
        idx2anime: optional precomputed inverse of ``anime2idx``. Pass it when calling
            this function for many users to avoid rebuilding the inverse map each time.

    Returns:
        ``(precision, recall, ndcg)``, or ``None`` when the user is unknown, has no
        test items known to the model, or has no relevant test items.
    """
    if user not in user2idx:
        logger.warning("User %s not found in training mapping.", user)
        return None
    group = group[group['anime_id'].isin(anime2idx.keys())]
    if group.empty:
        logger.info("No test data for user %s after filtering.", user)
        return None
    relevant_items = set(group[group['interaction'] >= relevance_threshold]['anime_id'])
    if not relevant_items:
        logger.info("No relevant items for user %s.", user)
        return None

    u_idx = user2idx[user]
    train_csr = train_matrix.tocsr()
    recommendations = model.recommend(u_idx, train_csr[u_idx], N=top_k, filter_already_liked_items=False)

    # Older implicit releases return a list of (item, score) tuples; newer ones
    # return an (ids, scores) tuple of arrays. Normalize both to (item, score) pairs
    # so the second format is not silently mis-read as two recommendations.
    if isinstance(recommendations, tuple):
        index_score_pairs = zip(recommendations[0], recommendations[1])
    else:
        index_score_pairs = ((t[0], t[1]) for t in recommendations)

    if idx2anime is None:
        idx2anime = {idx: anime for anime, idx in anime2idx.items()}
    recs = [(idx2anime.get(item_idx), item_score) for item_idx, item_score in index_score_pairs]

    prec = precision_at_k(recs, relevant_items, k=top_k)
    rec = recall_at_k(recs, relevant_items, k=top_k)
    ndcg_val = ndcg_at_k(recs, relevant_items, k=top_k)
    return prec, rec, ndcg_val

# --- Hyperparameter Tuning Objective Function ---
def objective(trial):
    """Optuna objective: train one WRMF configuration and return ``1 - recall@10``.

    Results are cached on disk per hyperparameter combination. RMSE, MAE,
    precision@10, recall@10 and NDCG@10 are attached to the trial as user attributes.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The value to minimize, ``1 - recall@10`` (1.0 when no user could be evaluated).
    """
    factors = trial.suggest_categorical("factors", [50, 100, 150, 200])
    reg = trial.suggest_float("reg", 1e-3, 1e-1, log=True)
    n_iter = trial.suggest_int("n_iter", 10, 50)
    # NOTE(review): alpha is sampled and is part of the cache key, but it is never
    # applied to the training matrix or the model below, so it has no effect on the
    # metrics. Apply it (in tuning and final training together) or drop it.
    alpha = trial.suggest_categorical("alpha", [20, 40, 60, 80])

    params = {"factors": factors, "reg": reg, "n_iter": n_iter, "alpha": alpha}
    logger.info("Trial parameters: %s", params)

    metric_names = ("rmse", "mae", "precision_at_10", "recall_at_10", "ndcg_at_10")

    key = cache_key(params)
    if os.path.exists(cache_path(key)):
        logger.info("Loading cached results for key: %s", key)
        cached = load_cache_obj(key)
        # Earlier runs could cache a -inf objective for unevaluable trials; such an
        # entry would win a minimizing study, so recompute instead of trusting it.
        if np.isfinite(cached["objective"]):
            for name in metric_names:
                trial.set_user_attr(name, cached[name])
            return cached["objective"]
        logger.warning("Ignoring cached result %s with non-finite objective; recomputing", key)

    animelist, anime_details = load_data(ANIMELIST_FILE, ANIME_DETAILS_FILE)
    merged = preprocess(animelist, anime_details)
    train, test = create_train_test_split(merged)

    train_matrix, user2idx, anime2idx = build_interaction_matrix(train)

    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=n_iter,
        random_state=RANDOM_STATE
    )
    model.fit(train_matrix.T)

    rmse, mae = compute_pointwise_metrics(model, test, user2idx, anime2idx)
    trial.set_user_attr("rmse", rmse)
    trial.set_user_attr("mae", mae)

    test_groups = list(test.groupby('user_id'))
    logger.info("Number of test groups: %d", len(test_groups))
    # Ranking metrics are evaluated on the first 20% of the user groups only.
    sampled_groups = test_groups[:int(len(test_groups) * 0.2)]

    ranking_results = []
    for user, group in sampled_groups:
        res = compute_ranking_metrics_for_user(user, group, model, user2idx, anime2idx, train_matrix, top_k=TOP_N, relevance_threshold=0.1)
        ranking_results.append(res)
    valid_results = [res for res in ranking_results if res is not None]
    if valid_results:
        precisions, recalls, ndcgs = zip(*valid_results)
        precision_val = np.mean(precisions)
        recall_val = np.mean(recalls)
        ndcg_val = np.mean(ndcgs)
    else:
        # No user could be evaluated: report the worst score. The former
        # float('inf') fallback produced an objective of -inf, which made such a
        # trial the best one under direction="minimize".
        precision_val = recall_val = ndcg_val = 0.0

    trial.set_user_attr("precision_at_10", precision_val)
    trial.set_user_attr("recall_at_10", recall_val)
    trial.set_user_attr("ndcg_at_10", ndcg_val)

    # Optimize for recall (i.e. minimize 1 - recall)
    objective_value = 1 - recall_val
    logger.info("Trial metrics: RMSE: %s, MAE: %s, Precision@10: %.4f, Recall@10: %.4f, NDCG@10: %.4f, Objective (1 - recall): %.4f",
                rmse, mae, precision_val, recall_val, ndcg_val, objective_value)

    cache_obj = {
        "rmse": rmse,
        "mae": mae,
        "precision_at_10": precision_val,
        "recall_at_10": recall_val,
        "ndcg_at_10": ndcg_val,
        "objective": objective_value
    }
    save_cache_obj(cache_obj, key)
    return objective_value

def main_experiment():
    """Tune WRMF with Optuna, export the trial table, then retrain and save the best model.

    After the 20-trial study, every trial's metrics are written to
    ``wrmf_experiment_results.csv``. The best configuration is then refitted on the
    entire merged dataset, pickled to ``MODEL_SAVE_PATH`` together with its ID
    mappings, and finally ``recommend_new_user`` is called.
    """
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)

    # One row per trial: metrics from the user attributes plus the sampled parameters.
    results_df = pd.DataFrame([
        {
            "model_name": "WRMF",
            "rmse": finished.user_attrs.get("rmse"),
            "mae": finished.user_attrs.get("mae"),
            "precision_at_10": finished.user_attrs.get("precision_at_10"),
            "recall_at_10": finished.user_attrs.get("recall_at_10"),
            "ndcg_at_10": finished.user_attrs.get("ndcg_at_10"),
            "parameters": str(finished.params),
            "objective": finished.value,
        }
        for finished in study.trials
    ])
    results_csv = "wrmf_experiment_results.csv"
    results_df.to_csv(results_csv, index=False)
    logger.info("Saved experiment results to %s", results_csv)
    logger.info("Best trial: %s with Objective (1 - recall): %.4f", study.best_trial.params, study.best_trial.value)

    # Final retraining: use entire merged dataset so that mapping is complete.
    animelist, anime_details = load_data(ANIMELIST_FILE, ANIME_DETAILS_FILE)
    merged = preprocess(animelist, anime_details)
    final_matrix, user2idx, anime2idx = build_interaction_matrix(merged)
    logger.info("Final training: user count: %d, anime count: %d, matrix shape: %s",
                len(user2idx), len(anime2idx), final_matrix.shape)

    best_params = study.best_trial.params
    logger.info("Retraining best model with parameters: %s", best_params)
    als_options = {
        "factors": best_params["factors"],
        "regularization": best_params["reg"],
        "iterations": best_params["n_iter"],
        "random_state": RANDOM_STATE,
    }
    best_model_obj = implicit.als.AlternatingLeastSquares(**als_options)
    best_model_obj.fit(final_matrix.T)
    logger.info("Final model training complete. User factors shape: %s, Item factors shape: %s",
                best_model_obj.user_factors.shape, best_model_obj.item_factors.shape)

    # Everything recommend_new_user needs is stored alongside the fitted model.
    final_model = {
        "model": best_model_obj,
        "interaction_matrix": final_matrix,
        "user2idx": user2idx,
        "anime2idx": anime2idx,
        "merged": merged,
    }
    with open(MODEL_SAVE_PATH, "wb") as model_file:
        pickle.dump(final_model, model_file)
    logger.info("Best WRMF model saved to '%s'", MODEL_SAVE_PATH)

    # New user recommendation: self-contained call.
    recommend_new_user()

# --- New User Recommendation (Self-contained) ---
def recommend_new_user():
    """Fold a new user into the saved WRMF model and log their top-N recommendations.

    The user's ratings are read from ``data_new/user_data.csv``. A latent vector is
    fitted by gradient descent against the saved item factors (fold-in), every
    anime the user has not interacted with is scored, and the ``TOP_N`` best are
    joined with the anime details table.

    Returns:
        A DataFrame of recommendations enriched with anime details, or ``None``
        when the new-user file does not exist.

    Raises:
        FileNotFoundError: if the saved model or the anime details file is missing.
    """
    import os
    import pickle
    import logging
    import numpy as np
    import pandas as pd
    logger = logging.getLogger()
    MODEL_SAVE_PATH = "models/wrmf_best_model.pkl"
    ANIME_DETAILS_FILE = "data_new/anime_details_v1.csv"
    NEW_USER_FILE = "data_new/user_data.csv"

    # The new-user file is optional; without it there is nothing to recommend, and
    # a missing file must not crash the caller at the very end of a long experiment.
    if not os.path.exists(NEW_USER_FILE):
        logger.info("No new user data found at '%s'; skipping new user recommendations.", NEW_USER_FILE)
        return None

    logger.info("Loading best model from '%s'", MODEL_SAVE_PATH)
    # NOTE: pickle.load can execute arbitrary code; only load model files produced by this notebook.
    with open(MODEL_SAVE_PATH, "rb") as f:
        final_model = pickle.load(f)
    model = final_model["model"]
    anime2idx = final_model["anime2idx"]
    item_factors = model.item_factors

    anime_details = pd.read_csv(ANIME_DETAILS_FILE)
    anime_details = anime_details.rename(columns={'id': 'anime_id'})

    logger.info("Loading new user data from '%s'", NEW_USER_FILE)
    new_user_df = pd.read_csv(NEW_USER_FILE)

    # Build new user vector using saved mapping. The first row per anime wins, as
    # before, but without re-filtering the whole frame for every anime.
    new_user_vector = np.zeros(len(anime2idx), dtype=np.float32)
    first_entries = new_user_df.drop_duplicates(subset='anime_id', keep='first')
    for anime, score in zip(first_entries['anime_id'], first_entries['score']):
        if anime in anime2idx:
            # Rated items use score/10; unrated (score <= 0 or missing) items count as 1.
            new_user_vector[anime2idx[anime]] = score / 10 if score > 0 else 1
    logger.info("New user vector built with shape: %s", new_user_vector.shape)

    def fold_in_new_user(new_user_vector, item_factors, learning_rate=0.005, reg=0.02, n_iter=50):
        """Fit a user latent vector to the observed entries by regularized gradient descent."""
        n_factors = item_factors.shape[1]
        u_new = np.random.normal(scale=0.1, size=(n_factors,))
        n_rows = item_factors.shape[0]
        if new_user_vector.shape[0] != n_rows:
            logger.error("Mismatch: new_user_vector length %d does not match item_factors rows %d",
                         new_user_vector.shape[0], n_rows)
            # Truncate BOTH operands to the common length; clipping only the mask
            # (as before) made the boolean indexing below raise IndexError.
            common = min(new_user_vector.shape[0], n_rows)
            new_user_vector = new_user_vector[:common]
            item_factors = item_factors[:common]
        observed = new_user_vector > 0
        # These slices do not change between iterations, so take them once.
        observed_factors = item_factors[observed]
        observed_values = new_user_vector[observed]
        for _ in range(n_iter):
            pred = observed_factors.dot(u_new)
            error = observed_values - pred
            grad = -2 * observed_factors.T.dot(error) + 2 * reg * u_new
            u_new = u_new - learning_rate * grad
        return u_new

    u_new = fold_in_new_user(new_user_vector, item_factors)

    # Score every item in one matrix-vector product, then keep unseen items only.
    all_scores = item_factors.dot(u_new)
    n_items = item_factors.shape[0]
    predictions = {}
    for anime, idx in anime2idx.items():
        if idx >= n_items:
            logger.warning("Skipping anime %s with index %d (out-of-bounds)", anime, idx)
            continue
        if new_user_vector[idx] == 0:
            predictions[anime] = all_scores[idx]
    top_recs = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
    recs_df = pd.DataFrame(top_recs, columns=["anime_id", "predicted_rating"])
    enriched = pd.merge(recs_df, anime_details, left_on="anime_id", right_on="anime_id", how="left")
    logger.info("New user recommendations computed:")
    logger.info("\n%s", enriched.to_string(index=False))
    return enriched

# Script entry point: run the full WRMF experiment.
# NOTE(review): inside a notebook kernel __name__ is presumably "__main__" as well, so
# this guard would start the experiment as soon as the cell is executed -- confirm.
if __name__ == "__main__":
    main_experiment()

In [None]:

# Run the WRMF experiment (tuning, final retraining and new-user recommendations).
main_experiment()

In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix

# Set up logging for realtime output.
# NOTE(review): presumably a no-op when the root logger already has handlers from an
# earlier cell -- confirm against the logging documentation.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

def load_data(animelist_path, anime_details_path):
    """Read the animelist CSV and the anime-details CSV.

    Returns:
        A ``(animelist, anime_details)`` tuple of DataFrames.
    """
    logging.info("Loading animelist data from '%s'", animelist_path)
    animelist_df = pd.read_csv(animelist_path)

    logging.info("Loading anime details from '%s'", anime_details_path)
    details_df = pd.read_csv(anime_details_path)

    loaded = (animelist_df, details_df)
    return loaded

def preprocess(animelist, anime_details):
    """Merge the animelist with episode counts and derive an interaction strength.

    Note that ``animelist`` is modified in place: missing scores are filled with 0,
    ``status`` is lower-cased and a ``status_weight`` column is added.

    Args:
        animelist: DataFrame with ``anime_id``, ``score``, ``status`` and
            ``num_episodes_watched`` columns.
        anime_details: DataFrame with ``id`` and ``num_episodes`` columns.

    Returns:
        The merged DataFrame with added ``num_episodes``, ``episode_ratio`` and
        ``interaction`` columns.
    """
    logging.info("Filling missing scores with 0")
    animelist['score'] = animelist['score'].fillna(0)

    # Normalize the status string so the mapping below is case-insensitive.
    animelist['status'] = animelist['status'].str.lower()

    status_mapping = {
        'completed': 1.0,
        'on_hold': 0.5,
        'plan_to_watch': 0.2,
        'dropped': 0.0,
        'watching': 0.8,    # example weight for watching
        'rewatching': 0.9   # example weight for rewatching
    }
    logging.info("Mapping status to weights using: %s", status_mapping)
    animelist['status_weight'] = animelist['status'].map(status_mapping)

    # Statuses missing from the mapping get the default weight 0.
    missing_weights = animelist['status_weight'].isna().sum()
    if missing_weights > 0:
        logging.warning("Found %d missing status weights, filling with default 0", missing_weights)
        animelist['status_weight'] = animelist['status_weight'].fillna(0)

    # Merge to get total episodes from anime_details.
    logging.info("Renaming anime details column 'id' to 'anime_id' for merge")
    anime_details = anime_details.rename(columns={'id': 'anime_id'})

    logging.info("Merging animelist with anime_details to add total episodes info")
    merged = pd.merge(animelist, anime_details[['anime_id', 'num_episodes']], on='anime_id', how='left')

    # Anime missing from the details table get an episode count of 1.
    merged['num_episodes'] = merged['num_episodes'].fillna(1)

    # Compute ratio of episodes watched (capped at 1)
    logging.info("Computing episode ratio as num_episodes_watched / num_episodes")
    merged['episode_ratio'] = (merged['num_episodes_watched'] / merged['num_episodes']).clip(upper=1)

    # Compute final interaction strength.
    # If a score is provided (> 0), use the score divided by 10; otherwise use the
    # larger of the status weight and the episode ratio.
    logging.info("Computing final interaction strength")
    score = merged['score']
    status_weight = merged['status_weight']
    episode_ratio = merged['episode_ratio']
    # Vectorized form of the former row-wise apply. The fallback takes episode_ratio
    # only where it is strictly greater than status_weight, so a NaN ratio (0/0)
    # falls back to status_weight exactly as max(status_weight, nan) did.
    fallback = episode_ratio.where(episode_ratio > status_weight, status_weight)
    merged['interaction'] = (score / 10).where(score > 0, fallback)

    # Check if any NaN values still exist in the interaction column
    nan_count = merged['interaction'].isna().sum()
    if nan_count > 0:
        logging.error("There are %d NaN values in the interaction column", nan_count)

    return merged

def create_train_test_split(merged, test_size=0.2, random_state=42):
    """Randomly split the merged rows and return the ``(train, test)`` parts."""
    logging.info("Splitting data into train and test sets with test size = %.2f", test_size)
    split_parts = train_test_split(merged, test_size=test_size, random_state=random_state)
    train, test = split_parts
    return train, test

def build_interaction_matrix(train):
    """Build a sparse user x anime COO matrix from the ``interaction`` column.

    Returns:
        ``(interaction_matrix, user2idx, anime2idx)`` where the dicts map each user /
        anime ID to its row / column, numbered in order of first appearance.
    """
    logging.info("Building interaction matrix from training data")
    # Unique IDs in order of first appearance define the matrix axes.
    user_ids = train['user_id'].unique()
    anime_ids = train['anime_id'].unique()
    user2idx = dict(zip(user_ids, range(len(user_ids))))
    anime2idx = dict(zip(anime_ids, range(len(anime_ids))))

    # Translate the raw IDs of every row into matrix coordinates.
    row_positions = train['user_id'].map(user2idx)
    col_positions = train['anime_id'].map(anime2idx)

    # Build the sparse matrix
    matrix_shape = (len(user_ids), len(anime_ids))
    interaction_matrix = coo_matrix(
        (train['interaction'], (row_positions, col_positions)),
        shape=matrix_shape
    )

    logging.info("Interaction matrix shape: %s", interaction_matrix.shape)
    return interaction_matrix, user2idx, anime2idx

def train_model(interaction_matrix):
    """Fit an ``implicit`` ALS model with fixed default hyperparameters and return it.

    Raises:
        ImportError: if the ``implicit`` library is not installed (logged first).
    """
    logging.info("Training implicit feedback model using default hyper parameters")
    try:
        import implicit
    except ImportError:
        logging.error("The 'implicit' library is not installed. Please install it via pip install implicit")
        raise

    # Use ALS model from the implicit library.
    hyperparameters = dict(
        factors=50,           # number of latent factors
        regularization=0.01,  # regularization term
        iterations=10,        # number of iterations for training
    )
    model = implicit.als.AlternatingLeastSquares(**hyperparameters)
    logging.info("Fitting model (this may take a moment)...")
    # The matrix is transposed on the assumption that implicit expects item-user input.
    # NOTE(review): the expected orientation depends on the installed implicit version -- confirm.
    item_user_matrix = interaction_matrix.T
    model.fit(item_user_matrix)
    logging.info("Model training complete")
    return model

def evaluate_model(model, test, user2idx, anime2idx, train_matrix, top_k=10):
    """Compute the pooled Recall@``top_k`` of the model over all known test users.

    For every test user present in ``user2idx``, the top-k recommendations are
    compared with that user's distinct test anime. Recall is the total number of
    hits divided by the total number of test anime across those users.

    Args:
        model: fitted model exposing ``recommend``.
        test: DataFrame with ``user_id`` and ``anime_id`` columns.
        user2idx: mapping from user ID to matrix row.
        anime2idx: mapping from anime ID to matrix column.
        train_matrix: sparse training matrix; each user's row is passed to ``recommend``.
        top_k: number of recommendations requested per user.

    Returns:
        The recall as a float (0 when no test user is known to the model).
    """
    logging.info("Evaluating model performance using Recall@%d", top_k)
    hit_count = 0
    total_count = 0

    # Ensure the training matrix is in CSR format for fast row slicing.
    train_csr = train_matrix.tocsr()

    # The inverse mapping is the same for every user, so build it once.
    idx2anime = {idx: anime for anime, idx in anime2idx.items()}

    # groupby visits each user's rows once instead of re-filtering the whole test
    # frame for every user.
    for user, user_rows in test.groupby('user_id', sort=False):
        if user not in user2idx:
            continue  # skip users not in the training set
        user_index = user2idx[user]
        test_anime = set(user_rows['anime_id'].unique())

        # Items already seen in training are deliberately not filtered out here.
        recommended = model.recommend(
            user_index, train_csr[user_index], N=top_k, filter_already_liked_items=False
        )
        # Older implicit releases return a list of (item, score) tuples; newer ones
        # return an (ids, scores) tuple of arrays. Extract the item indices from either.
        if isinstance(recommended, tuple):
            recommended_anime_idxs = list(recommended[0])
        else:
            recommended_anime_idxs = [rec[0] for rec in recommended]
        recommended_anime_ids = {idx2anime.get(idx) for idx in recommended_anime_idxs}

        hit_count += len(test_anime & recommended_anime_ids)
        total_count += len(test_anime)

    recall = hit_count / total_count if total_count > 0 else 0
    logging.info("Recall@%d: %.4f", top_k, recall)
    return recall

# Script entry point: load, preprocess, split, train and evaluate with Recall@10.
if __name__ == "__main__":
    # Paths to your CSV files (adjust if necessary)
    animelist_path = "data_new/animelist.csv"
    anime_details_path = "data_new/anime_details_v1.csv"  # using the provided filename

    # Load data
    animelist, anime_details = load_data(animelist_path, anime_details_path)
    
    # Preprocess data (adds the 'interaction' column used to build the matrix)
    merged_data = preprocess(animelist, anime_details)
    
    # Split data with the function's default test size and random state
    train_data, test_data = create_train_test_split(merged_data)
    
    # Build the interaction matrix for the training set
    interaction_matrix, user2idx, anime2idx = build_interaction_matrix(train_data)
    
    # Train the implicit model
    model = train_model(interaction_matrix)
    
    # Evaluate the model; the training matrix supplies each user's row for recommend()
    recall = evaluate_model(model, test_data, user2idx, anime2idx, interaction_matrix, top_k=10)
    
    logging.info("Recommendation system training and evaluation complete")

In [None]:
# Same pipeline as the __main__ block of the previous cell, run at cell level.
# Paths to your CSV files (adjust if necessary)
animelist_path = "data_new/animelist.csv"
anime_details_path = "data_new/anime_details_v1.csv"  # using the provided filename

# Load data
animelist, anime_details = load_data(animelist_path, anime_details_path)

# Preprocess data (adds the 'interaction' column used to build the matrix)
merged_data = preprocess(animelist, anime_details)

# Split data with the function's default test size and random state
train_data, test_data = create_train_test_split(merged_data)

# Build the interaction matrix for the training set
interaction_matrix, user2idx, anime2idx = build_interaction_matrix(train_data)

# Train the implicit model
model = train_model(interaction_matrix)

# Evaluate the model; the training matrix supplies each user's row for recommend()
recall = evaluate_model(model, test_data, user2idx, anime2idx, interaction_matrix, top_k=10)

logging.info("Recommendation system training and evaluation complete")

# Item-item collaborative filtering

In [None]:
import pandas as pd
# Load the anime metadata table and display it for a first look.
df_1 = pd.read_csv('data_new/anime_details_v1.csv')
df_1


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import linear_kernel
import pickle
import ast

# Load and preprocess data
# Anime metadata table that the feature extraction below is fitted on.
anime_metadata = pd.read_csv('data_new/anime_details_v1.csv')

# Define features (column names grouped by how they are encoded)
text_features = ['synopsis', 'genres']
categorical_features = ['studios', 'media_type', 'source']
numerical_features = ['start_year', 'mean', 'rank', 'popularity', 'num_episodes']

# Convert JSON strings to lists
def safe_literal_eval(val):
    """Extract the ``name`` values from a stringified list of dicts.

    Accepts the raw CSV string (for example ``"[{'id': 1, 'name': 'Action'}]"``) as
    well as an already-parsed list, so re-running the conversion on a converted
    column is harmless. Plain string entries are kept as names; entries without a
    ``name`` are skipped.

    Args:
        val: a string holding a Python literal, a list/tuple, or a missing value.

    Returns:
        A list of names; empty for missing, unparsable or non-list input.
    """
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal.
            return []
    elif isinstance(val, (list, tuple)):
        parsed = val
    else:
        # None, NaN or any other non-string scalar.
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict):
            if 'name' in entry:
                names.append(entry['name'])
        elif isinstance(entry, str):
            names.append(entry)
    return names

# Replace the stringified lists of dicts by plain lists of names.
anime_metadata['genres'] = anime_metadata['genres'].apply(safe_literal_eval)
anime_metadata['studios'] = anime_metadata['studios'].apply(safe_literal_eval)
# Missing synopses become empty strings so the text vectorizer receives only str values.
anime_metadata['synopsis'] = anime_metadata['synopsis'].fillna('').astype(str)
print('Finished preprocessing data')


# Display the cleaned table.
anime_metadata

In [None]:
# Fix for the preprocessor definition
def extract_genres(df):
    """Return the ``'genres'`` entry of ``df``."""
    genres_column = df['genres']
    return genres_column
def extract_studios(df):
    """Return the ``'studios'`` entry of ``df``."""
    studios_column = df['studios']
    return studios_column
def safe_multilabel_binarize(x):
    """Fit a fresh ``MultiLabelBinarizer`` on ``x`` and return the encoded result.

    NOTE(review): a new binarizer is fitted on every call, so the output columns
    presumably depend on the labels present in ``x`` -- confirm this is acceptable
    if the transformer is ever applied to data other than the fitting data.
    """
    binarizer = MultiLabelBinarizer()
    return binarizer.fit_transform(x)
def extract_synopsis(df):
    """Return ``df`` restricted to the single ``'synopsis'`` column (list selection)."""
    wanted_columns = ['synopsis']
    return df[wanted_columns]
def safe_tostring(x):
    """Return ``str()`` of the first element of every row in ``x`` as a list."""
    texts = []
    for row in x:
        first_cell = row[0]
        texts.append(str(first_cell))
    return texts
def preprocessor_for_genres(x):
    return ' '.join(x) if isinstance(x, list) else ''

# Feature pipeline. The transformers are listed in the order
# genres -> synopsis -> numerical -> studios -> categorical; later cells slice
# `feature_matrix` by column offsets that assume exactly this order.
preprocessor = ColumnTransformer(
    transformers=[
         # For genres: each list is joined into one space-separated string and
         # TF-IDF encoded; the token pattern keeps hyphenated names such as
         # "Sci-Fi" as a single token.
         ('genres', Pipeline([
            ('selector', FunctionTransformer(extract_genres, validate=False)),
            ('genre_clean', TfidfVectorizer(
                preprocessor=preprocessor_for_genres,
                token_pattern=r'(?u)\b[\w-]+\b',
                min_df=2
            )),
        ]), ['genres']),
        
        # For synopsis: select the column, fill gaps with '', flatten each
        # one-element row to a plain string, then TF-IDF encode.
        ('synopsis_tfidf', Pipeline([
            ('selector', FunctionTransformer(extract_synopsis, validate=False)),
            ('imputer', SimpleImputer(strategy='constant', fill_value='')),
            ('to_string', FunctionTransformer(safe_tostring, validate=False)),
            ('tfidf', TfidfVectorizer(
                stop_words='english',
                min_df=2,
                max_features=5000
            ))
        ]), ['synopsis']),
        
        # For numerical features (already a 2D selection)
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
        ]), numerical_features),
        
        # For studios (list column)
        ('studios_mlb', Pipeline([
            # The selector hands the raw `studios` column (one list per row)
            # to the binarizer step.
            ('selector', FunctionTransformer(extract_studios, validate=False)),
            # NOTE: safe_multilabel_binarize fits a new MultiLabelBinarizer on
            # every call, so calling transform() on different data yields a
            # different set of columns.
            ('mlb', FunctionTransformer(safe_multilabel_binarize))
        ]), ['studios']),
        
        # For other categorical features
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), ['media_type', 'source'])
    ],
    remainder='drop'
)
print('created preprocessor')
# Execute pipeline
feature_matrix = preprocessor.fit_transform(anime_metadata)
print('Finished feature extraction')
feature_matrix

In [None]:
import os

print('Finished feature extraction')

# Compute similarity
# NOTE: linear_kernel is a plain dot product. The rows of `feature_matrix`
# concatenate several feature groups and are not re-normalised in this
# notebook, so the values are similarity scores rather than true cosines and a
# title is not guaranteed to be its own best match.
cosine_sim = linear_kernel(feature_matrix, feature_matrix)
print('Computed cosine similarity matrix')
# Save model
model_artifacts = {
    'preprocessor': preprocessor,
    'cosine_sim': cosine_sim,
    'feature_matrix': feature_matrix,
    'metadata': anime_metadata
}

# open(..., 'wb') does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs('models', exist_ok=True)
with open('models/content_based_model.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)

In [None]:
anime_metadata[anime_metadata['anime_id'] == 38691].index[0]

In [None]:
def get_content_recommendations(anime_id, n=10):
    """Return the ``n`` titles most similar to ``anime_id``.

    Reads the notebook-level ``anime_metadata`` DataFrame and ``cosine_sim``
    matrix; row ``i`` of ``cosine_sim`` is taken to describe row ``i`` of
    ``anime_metadata``.

    Args:
        anime_id: Value to look up in ``anime_metadata['anime_id']``.
        n: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame: Metadata rows of the recommended titles, best first.

    Raises:
        IndexError: If ``anime_id`` is not present in ``anime_metadata``.
    """
    # Work with row positions, not index labels, so the lookup stays correct
    # even if the DataFrame index is not a plain 0..n-1 range.
    matches = np.flatnonzero((anime_metadata['anime_id'] == anime_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in anime_metadata")
    position = int(matches[0])

    scores = np.asarray(cosine_sim[position]).ravel()
    # Stable sort keeps the original row order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query title explicitly: with un-normalised scores it is not
    # necessarily ranked first, so skipping "rank 0" could drop the best match.
    ranked = ranked[ranked != position][:n]
    return anime_metadata.iloc[ranked]

def load_content_model(path='content_based_model.pkl'):
    """Unpickle and return the model artifacts stored at *path*.

    NOTE(review): this notebook writes its artifacts to
    ``models/content_based_model.pkl``; the default here has no ``models/``
    prefix, so callers probably need to pass the path explicitly.
    Only load pickle files you trust: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

get_content_recommendations(235)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import train_test_split
import pickle
from functools import partial
from sklearn.metrics.pairwise import linear_kernel

# Fix for the preprocessor definition
def extract_genres(df):
    """Select the ``genres`` column of *df*."""
    genres_column = df['genres']
    return genres_column


def extract_studios(df):
    """Select the ``studios`` column of *df*."""
    studios_column = df['studios']
    return studios_column


def safe_multilabel_binarize(x):
    """Fit a fresh ``MultiLabelBinarizer`` on *x* and return its indicator matrix.

    NOTE: a new binarizer is fitted on every call, so the output columns
    depend only on the labels present in *x*.
    """
    binarizer = MultiLabelBinarizer()
    return binarizer.fit_transform(x)


def extract_synopsis(df):
    """Select ``synopsis`` as a one-column DataFrame (kept two-dimensional)."""
    wanted = ['synopsis']
    return df[wanted]


def safe_tostring(x):
    """Convert the first element of every row in *x* to ``str``."""
    texts = []
    for item in x:
        texts.append(str(item[0]))
    return texts


def preprocessor_for_genres(x):
    """Join a list of genre names with spaces; anything that is not a list becomes ``''``."""
    if not isinstance(x, list):
        return ''
    return ' '.join(x)
# 1. Load data and prepare evaluation

# `ratings` is read below through its `user_id`, `anime_id` and `score` columns.
ratings = pd.read_csv('data_new/animelist_v1.csv')
# Fresh copy of the metadata; it replaces any `anime_metadata` left over from earlier cells.
anime_metadata = pd.read_csv('data_new/anime_details_v1.csv')

# Convert JSON strings to lists
import ast
def safe_literal_eval(val):
    """Turn a stringified list of ``{'name': ...}`` dicts into a list of names.

    Args:
        val: A cell value from the metadata CSV. Usually a string such as
            ``"[{'id': 1, 'name': 'Action'}]"``; may also be ``None``/NaN or a
            list that was already parsed by a previous run of this cell.

    Returns:
        list: The ``'name'`` value of every dict entry (plain string entries
        are kept as they are). An empty list for missing, unparsable or
        non-list input.
    """
    if isinstance(val, (list, tuple)):
        # Already parsed: keeps the function idempotent when the cell is
        # re-run on a column that holds lists instead of raw strings.
        parsed = val
    elif isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError):
            return []
    else:
        # None, NaN and any other scalar carry no entries.
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    names = []
    for entry in parsed:
        if isinstance(entry, dict) and 'name' in entry:
            names.append(entry['name'])
        elif isinstance(entry, str):
            names.append(entry)
        # Anything else (numbers, dicts without 'name') is skipped rather than
        # aborting the whole column conversion.
    return names

# Parse the stringified genre/studio lists and make sure every synopsis is a string.
anime_metadata['genres'] = anime_metadata['genres'].apply(safe_literal_eval)
anime_metadata['studios'] = anime_metadata['studios'].apply(safe_literal_eval)
anime_metadata['synopsis'] = anime_metadata['synopsis'].fillna('').astype(str)
# The ratings table refers to titles as `anime_id`; align the metadata key with it.
anime_metadata.rename(columns={'id': 'anime_id'}, inplace=True)

# Create user profiles: one row per user holding
#   mean_rating  - the user's average `score`
#   liked_genres - how often each genre occurs among ALL titles the user rated
#                  (counts are not weighted by score or restricted to liked titles)
#   liked_anime  - every anime_id the user rated
# NOTE: the metadata merge runs once per user inside the lambda, which is slow
# on a large ratings table.
user_profiles = ratings.groupby('user_id').apply(
    lambda x: pd.Series({
        'mean_rating': x['score'].mean(),
        'liked_genres': x.merge(anime_metadata, on='anime_id')['genres'].explode().value_counts().to_dict(),
        'liked_anime': list(x['anime_id'])
    })
)

# Split into train/test (by user: each profile row lands in exactly one side)
train_users, test_users = train_test_split(user_profiles, test_size=0.2, random_state=42)






In [None]:
train_users.shape, test_users.shape, user_profiles.shape

In [None]:
# Load precomputed features
# The save cell earlier in this notebook stores a dict with the keys
# 'preprocessor', 'cosine_sim', 'feature_matrix' and 'metadata' at this path.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('models/content_based_model.pkl', 'rb') as f:
    content_data = pickle.load(f)

In [None]:
# 2. Define parameter space
# Each entry is a uniform prior; the hyperopt label equals the dict key, so the
# `best` dict returned by fmin uses these same names.
# NOTE(review): in the `objective` defined in this notebook only the five
# `*_weight` entries change the similarity matrix. `popularity_decay` only
# fills a `decayed_popularity` column, and `age_decay_base`,
# `genre_boost_factor` and `min_similarity_threshold` do not influence the
# returned loss, so their "best" values are effectively arbitrary.
param_space = {
    'genre_weight': hp.uniform('genre_weight', 0.1, 2),
    'synopsis_weight': hp.uniform('synopsis_weight', 0.1, 2),
    'numerical_weight': hp.uniform('numerical_weight', 0.1, 2),
    'studio_weight': hp.uniform('studio_weight', 0.1, 2),
    'categorical_weight': hp.uniform('categorical_weight', 0.1, 2),
    'popularity_decay': hp.uniform('popularity_decay', 0.8, 1),
    'age_decay_base': hp.uniform('age_decay_base', 0.9, 0.99),
    'genre_boost_factor': hp.uniform('genre_boost_factor', 1, 3),
    'min_similarity_threshold': hp.uniform('min_similarity_threshold', 0.1, 1)
}


In [None]:
# Define features
text_features = ['synopsis', 'genres']
categorical_features = ['studios', 'media_type', 'source']
numerical_features = ['start_year', 'mean', 'rank', 'popularity', 'num_episodes']
from sklearn.preprocessing import MultiLabelBinarizer

# Work out how many columns each transformer contributed to the saved
# feature matrix, so the matrix can be cut back into per-group blocks.

# Genres (TF-IDF)
genres_pipe = content_data['preprocessor'].named_transformers_['genres']
tfidf_genres = genres_pipe.named_steps['genre_clean']
num_genres = len(tfidf_genres.get_feature_names_out())

# Synopsis (TF-IDF)
synopsis_pipe = content_data['preprocessor'].named_transformers_['synopsis_tfidf']
tfidf_synopsis = synopsis_pipe.named_steps['tfidf']
num_synopsis = len(tfidf_synopsis.get_feature_names_out())

# Numerical Features (count matches input)
num_num = len(numerical_features)

# Categorical (OneHotEncoder)
cat_pipe = content_data['preprocessor'].named_transformers_['cat']
onehot = cat_pipe.named_steps['onehot']
num_cat = len(onehot.get_feature_names_out(['media_type', 'source']))

# Studios (MultiLabelBinarizer output shape)
# The studios step re-fits a new MultiLabelBinarizer on whatever it is given,
# so transforming a single sample row would only count the studios of that one
# row. Derive the width from the saved matrix instead: the studio block is
# whatever the other four groups do not account for.
studio_pipe = content_data['preprocessor'].named_transformers_['studios_mlb']
total_columns = content_data['feature_matrix'].shape[1]
num_studios = total_columns - (num_genres + num_synopsis + num_num + num_cat)
if num_studios < 0:
    raise ValueError(
        "Feature matrix has fewer columns than the genre, synopsis, numerical "
        "and categorical blocks require; the saved preprocessor and feature "
        "matrix do not belong together."
    )

# Column offsets, in the order the transformers are listed in the
# ColumnTransformer: genres, synopsis, numerical, studios, categorical.
start_genres = 0
end_genres = num_genres

start_synopsis = end_genres
end_synopsis = start_synopsis + num_synopsis

start_num = end_synopsis
end_num = start_num + num_num

start_studios = end_num
end_studios = start_studios + num_studios

start_cat = end_studios
end_cat = start_cat + num_cat

In [None]:
# Cut the saved feature matrix into one column block per feature group, using
# the offsets computed in the previous cell.
feature_matrix = content_data['feature_matrix']

genres_features = feature_matrix[:, start_genres:end_genres]
synopsis_features = feature_matrix[:, start_synopsis:end_synopsis]
# NOTE: from here on `numerical_features` and `categorical_features` are matrix
# slices, no longer the lists of column names defined in the previous cell.
numerical_features = feature_matrix[:, start_num:end_num]
studio_features = feature_matrix[:, start_studios:end_studios]
categorical_features = feature_matrix[:, start_cat:end_cat]
print(type(genres_features), type(synopsis_features), type(numerical_features), type(studio_features), type(categorical_features))

In [None]:
# 3. Define objective function
from joblib import Parallel, delayed
import numpy as np




def calculate_genre_boost(anime_row, params):
    """Return ``params['genre_boost_factor']`` multiplied once per genre of the row.

    The result starts at ``1.0``, so a row without genres yields ``1.0``.
    """
    multiplier = 1.0
    remaining = iter(anime_row['genres'])
    for _ in remaining:
        multiplier = multiplier * params['genre_boost_factor']
    return multiplier

def objective(params, train, test, anime_metadata, genres_features, studio_features, numerical_features, categorical_features, synopsis_features):
    """Hyperopt loss: negative mean Recall@10 of the weighted content model.

    For every user in ``test`` the titles they scored above their own mean in
    ``train`` are expanded to their 50 most similar titles; similarity scores
    are summed per candidate, the 10 best candidates are recommended and
    compared with the user's ``test`` titles.

    Args:
        params: Dict with at least ``genre_weight``, ``studio_weight``,
            ``synopsis_weight``, ``numerical_weight``, ``categorical_weight``
            and ``popularity_decay``.
        train, test: Rating tables with ``user_id``, ``anime_id`` and
            (``train`` only) ``score`` columns.
        anime_metadata: Metadata table with ``anime_id``, ``popularity`` and
            ``start_year`` columns, row-aligned with the feature blocks.
        genres_features, studio_features, numerical_features,
        categorical_features, synopsis_features: Per-group feature blocks that
            provide ``.multiply`` (e.g. scipy sparse matrices).

    Returns:
        float: ``-mean(recall@10)`` over the test users (``0.0`` if there are
        none), suitable for minimisation.
    """
    from scipy.sparse import hstack

    top_k = 10         # cut-off of the reported Recall@10
    n_neighbours = 50  # similar titles collected per liked anime

    # Apply popularity decay.
    # NOTE(review): this writes a column into the caller's DataFrame, but the
    # column is not part of the similarity below, so `popularity_decay` does
    # not change the returned loss.
    current_year = pd.Timestamp.now().year
    anime_metadata['decayed_popularity'] = anime_metadata['popularity'] * (
        params['popularity_decay'] ** (current_year - anime_metadata['start_year'])
    )

    # Recalculate similarities with weights
    weighted_features = hstack([
        genres_features.multiply(params['genre_weight']),
        studio_features.multiply(params['studio_weight']),
        synopsis_features.multiply(params['synopsis_weight']),
        numerical_features.multiply(params['numerical_weight']),
        categorical_features.multiply(params['categorical_weight'])
    ])
    cosine_sim = linear_kernel(weighted_features, weighted_features)

    # Rows of `cosine_sim` are metadata row positions, while the rating tables
    # speak in anime IDs: keep an explicit mapping in both directions.
    anime_ids = anime_metadata['anime_id'].to_numpy()
    position_of = {}
    for position, known_id in enumerate(anime_ids):
        position_of.setdefault(known_id, position)  # first occurrence wins

    # Popular titles are liked by many users; rank each similarity row once.
    # Worker threads may race to fill an entry, which only repeats the work.
    neighbour_cache = {}

    def top_neighbours(position):
        """Return (positions, scores) of the titles most similar to `position`."""
        cached = neighbour_cache.get(position)
        if cached is None:
            row = np.asarray(cosine_sim[position]).ravel()
            order = np.argsort(-row, kind='stable')
            # Exclude the title itself explicitly; with un-normalised scores it
            # is not guaranteed to be ranked first.
            order = order[order != position][:n_neighbours]
            cached = (order.tolist(), row[order].tolist())
            neighbour_cache[position] = cached
        return cached

    # Group once instead of filtering the full tables for every user.
    train_by_user = {user_id: rows for user_id, rows in train.groupby('user_id')}
    test_by_user = {
        user_id: set(items.tolist())
        for user_id, items in test.groupby('user_id')['anime_id']
    }

    # --- Parallel Processing Setup ---
    def process_user(user_id):
        """Recall@10 for one user (0.0 when nothing can be recommended)."""
        test_items = test_by_user[user_id]
        user_df = train_by_user.get(user_id)
        if user_df is None or not test_items:
            return 0.0

        # Seed titles: everything the user scored above their own average.
        liked_ids = user_df.loc[user_df['score'] > user_df['score'].mean(), 'anime_id']

        # Aggregate recommendations with scores
        recommendation_pool = {}
        for liked_id in liked_ids:
            position = position_of.get(liked_id)
            if position is None:
                continue  # rated title without a metadata row
            neighbours, similarities = top_neighbours(position)
            for neighbour, similarity in zip(neighbours, similarities):
                recommendation_pool[neighbour] = recommendation_pool.get(neighbour, 0.0) + similarity

        if not recommendation_pool:
            return 0.0

        # Get top 10 recommendations and translate row positions to anime IDs
        # so they can be compared with the held-out IDs.
        best_positions = sorted(recommendation_pool, key=recommendation_pool.get, reverse=True)[:top_k]
        recommended_ids = set(anime_ids[best_positions].tolist())

        hits = len(recommended_ids & test_items)
        k_recall = min(len(test_items), top_k)
        print(f"User {user_id}: Hits: {hits}, Recall@10: {hits / k_recall}")
        return hits / k_recall

    # --- Parallel Execution ---
    scores = Parallel(n_jobs=-1, prefer="threads")(
        delayed(process_user)(user_id)
        for user_id in test_by_user
    )
    if not scores:
        return 0.0

    # Return negative mean recall for minimization
    return -float(np.mean(scores))


# 6. Evaluation functions
# Defined ahead of the optimisation so that building `final_model` below does
# not depend on a definition that only appears later in the notebook.
def calculate_final_similarity(params, anime_metadata, studio_features, synopsis_features, numerical_features, categorical_features, genres_features=None):
    """Build the weighted item-item similarity matrix for the given parameters.

    Args:
        params: Dict with the five ``*_weight`` entries and ``popularity_decay``.
        anime_metadata: Metadata table with ``popularity`` and ``start_year``;
            a ``decayed_popularity`` column is written to it as a side effect.
        studio_features, synopsis_features, numerical_features,
        categorical_features: Feature blocks that provide ``.multiply``.
        genres_features: Genre feature block. When omitted, the notebook-level
            ``genres_features`` variable is used (the original behaviour).

    Returns:
        The result of ``linear_kernel`` on the horizontally stacked, weighted
        feature blocks.

    Raises:
        NameError: If ``genres_features`` is neither passed nor defined globally.
    """
    from scipy.sparse import hstack

    if genres_features is None:
        try:
            genres_features = globals()['genres_features']
        except KeyError:
            raise NameError(
                "genres_features was not passed and no notebook-level "
                "'genres_features' is defined"
            ) from None

    # Apply popularity decay (stored on the metadata; not part of the similarity)
    current_year = pd.Timestamp.now().year
    anime_metadata['decayed_popularity'] = anime_metadata['popularity'] * (
        params['popularity_decay'] ** (current_year - anime_metadata['start_year'])
    )

    # Recalculate similarities with weights
    weighted_features = hstack([
        genres_features.multiply(params['genre_weight']),
        studio_features.multiply(params['studio_weight']),
        synopsis_features.multiply(params['synopsis_weight']),
        numerical_features.multiply(params['numerical_weight']),
        categorical_features.multiply(params['categorical_weight'])
    ])

    # Update similarity matrix
    weighted_cosine_sim = linear_kernel(weighted_features, weighted_features)
    return weighted_cosine_sim


# Per-user 80/20 split of the ratings of users with at least 50 ratings.
df = pd.read_csv('data_new/animelist_v1.csv')
user_count = df['user_id'].value_counts()
active_users = user_count[user_count >= 50].index
df_filtered = df[df['user_id'].isin(active_users)]
train, test = [], []
grouped = df_filtered.groupby('user_id')
# Iterate over the groups explicitly: each group keeps its `user_id` column,
# which groupby.apply only guarantees through deprecated behaviour.
train_test = [
    train_test_split(user_rows, test_size=0.2, random_state=42)
    for _, user_rows in grouped
]
train_df = pd.concat([tt[0] for tt in train_test])
test_df = pd.concat([tt[1] for tt in train_test])

# 4. Optimization setup
objective_partial = partial(
    objective,
    # eval_users=train_users.sample(min(100, len(train_users))),
    train = train_df,
    test = test_df,
    anime_metadata=anime_metadata,
    genres_features=genres_features,
    studio_features=studio_features,   
    synopsis_features=synopsis_features,
    numerical_features=numerical_features,
    categorical_features=categorical_features
)

trials = Trials()
best = fmin(
    fn=objective_partial,
    space=param_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=1
)

# 5. Train final model with best params
final_model = {
    'params': best,
    'cosine_sim': calculate_final_similarity(
        best, anime_metadata, studio_features, synopsis_features,
        numerical_features, categorical_features,
        genres_features=genres_features,
    ),
    # Every known genre gets the tuned boost factor. Built inline because the
    # helper that produces the same mapping is only defined in a later cell.
    'genre_weights': {
        genre: best['genre_boost_factor']
        for genres_list in anime_metadata['genres']
        if isinstance(genres_list, list)
        for genre in genres_list
    }
}





In [None]:
# 6. Evaluation functions
def calculate_final_similarity(params, anime_metadata, studio_features, synopsis_features, numerical_features, categorical_features, genres_features=None):
    """Build the weighted item-item similarity matrix for the given parameters.

    Args:
        params: Dict with the five ``*_weight`` entries and ``popularity_decay``.
        anime_metadata: Metadata table with ``popularity`` and ``start_year``;
            a ``decayed_popularity`` column is written to it as a side effect.
        studio_features, synopsis_features, numerical_features,
        categorical_features: Feature blocks that provide ``.multiply``.
        genres_features: Genre feature block. When omitted, the notebook-level
            ``genres_features`` variable is used (the original behaviour).

    Returns:
        The result of ``linear_kernel`` on the horizontally stacked, weighted
        feature blocks.

    Raises:
        NameError: If ``genres_features`` is neither passed nor defined globally.
    """
    from scipy.sparse import hstack

    if genres_features is None:
        # Backward compatibility: existing callers rely on the global variable.
        try:
            genres_features = globals()['genres_features']
        except KeyError:
            raise NameError(
                "genres_features was not passed and no notebook-level "
                "'genres_features' is defined"
            ) from None

    # Apply popularity decay (stored on the metadata; not part of the similarity)
    current_year = pd.Timestamp.now().year
    anime_metadata['decayed_popularity'] = anime_metadata['popularity'] * (
        params['popularity_decay'] ** (current_year - anime_metadata['start_year'])
    )

    # Recalculate similarities with weights
    weighted_features = hstack([
        genres_features.multiply(params['genre_weight']),
        studio_features.multiply(params['studio_weight']),
        synopsis_features.multiply(params['synopsis_weight']),
        numerical_features.multiply(params['numerical_weight']),
        categorical_features.multiply(params['categorical_weight'])
    ])

    # Update similarity matrix
    weighted_cosine_sim = linear_kernel(weighted_features, weighted_features)
    return weighted_cosine_sim

def calculate_genre_weights(params, anime_metadata):
    """Map every genre found in ``anime_metadata['genres']`` to the boost factor.

    Entries of the ``genres`` column that are not lists are ignored. Every
    genre receives the same value, ``params['genre_boost_factor']``.
    """
    distinct_genres = set()
    for entry in anime_metadata['genres']:
        if isinstance(entry, list):
            distinct_genres.update(entry)
    return {name: params['genre_boost_factor'] for name in distinct_genres}


    
def get_content_recommendations(anime_id, cosine_sim, anime_metadata, genre_weights, min_similarity_threshold, n=10):
    """Recommend up to ``n`` titles similar to ``anime_id`` with a boosted score.

    Args:
        anime_id: Value to look up in ``anime_metadata['anime_id']``.
        cosine_sim: Square similarity matrix, row-aligned with ``anime_metadata``.
        anime_metadata: DataFrame with the columns ``anime_id``, ``title``,
            ``synopsis``, ``mean``, ``popularity`` and ``genres``.
        genre_weights: Mapping genre name -> multiplicative boost.
        min_similarity_threshold: Candidates whose similarity is below
            ``min_similarity_threshold * 10`` are dropped.
        n: Number of nearest titles considered before thresholding.

    Returns:
        list[dict]: One record per kept title (most similar first) with the
        columns above plus ``score`` = ``mean`` times the product of the
        weights of its genres.

    Raises:
        IndexError: If ``anime_id`` is not present in ``anime_metadata``.
    """
    # Use row positions rather than index labels so the lookup stays aligned
    # with `cosine_sim` even when the DataFrame index is not 0..n-1.
    matches = np.flatnonzero((anime_metadata['anime_id'] == anime_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"anime_id {anime_id!r} not found in anime_metadata")
    position = int(matches[0])

    # Rank all titles by similarity (stable, so ties keep their row order) and
    # drop the query title explicitly: with un-normalised scores it is not
    # guaranteed to sit at rank 0.
    scores = np.asarray(cosine_sim[position]).ravel()
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != position][:n]

    # Filter by minimum similarity threshold
    # NOTE(review): the x10 scaling is kept from the original code; confirm it
    # matches the scale of the similarity values in use.
    cutoff = min_similarity_threshold * 10
    kept = [int(candidate) for candidate in ranked if scores[candidate] >= cutoff]

    columns = ['anime_id', 'title', 'synopsis', 'mean', 'popularity', 'genres']
    recommendations = anime_metadata.iloc[kept][columns].to_dict('records')

    # Apply genre boosting
    for rec in recommendations:
        genres = rec['genres'] if isinstance(rec['genres'], (list, tuple, set)) else ()
        boost = 1.0
        for genre in genres:
            boost *= genre_weights.get(genre, 1.0)  # unknown genres leave the score unchanged
        rec['score'] = rec['mean'] * boost

    return recommendations

In [None]:
# 7. Verification
# Re-score the tuned parameters on at most 50 users of the rating split.
# `objective` needs the rating tables and every feature block, so they are
# passed by keyword to avoid mixing up positional slots.
# NOTE(review): `test_df` was already used during tuning, so this is a sanity
# check rather than an unbiased hold-out estimate.
eval_user_ids = pd.Series(test_df['user_id'].unique())
eval_user_ids = eval_user_ids.sample(min(50, len(eval_user_ids)))
test_precision = objective(
    best,
    train=train_df,
    test=test_df[test_df['user_id'].isin(eval_user_ids)],
    anime_metadata=anime_metadata,
    genres_features=genres_features,
    studio_features=studio_features,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    synopsis_features=synopsis_features,
)
# The objective returns the negated mean recall@10, hence the sign flip.
print(f"Final test recall@10: {-test_precision:.3f}")

In [None]:
# hyperopt reports each parameter under the label given in `param_space`, so
# `best` already carries 'min_similarity_threshold'. Only copy from a legacy
# 'min_sim' entry when such a key actually exists, instead of raising KeyError.
if 'min_sim' in best:
    best['min_similarity_threshold'] = best['min_sim']


In [None]:
# Store the (possibly updated) best parameters with the model and persist it.
# The 'models' directory must already exist; open() does not create it.
final_model['params'] = best
with open('models/optimized_content_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

In [None]:
test_users

---

In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Per-user 80/20 split of the ratings of users with at least 10 ratings,
# written to CSV for reuse.
df = pd.read_csv('data_new/animelist_v1.csv')
user_count = df['user_id'].value_counts()
active_users = user_count[user_count >= 10].index
df_filtered = df[df['user_id'].isin(active_users)]
train, test = [], []
grouped = df_filtered.groupby('user_id')
# Iterate over the groups explicitly: each group keeps its `user_id` column,
# which groupby.apply only guarantees through deprecated behaviour.
train_test = [
    train_test_split(user_rows, test_size=0.2, random_state=42)
    for _, user_rows in grouped
]
train_df = pd.concat([tt[0] for tt in train_test])
test_df = pd.concat([tt[1] for tt in train_test])
train_df.to_csv('data_new/train_data.csv', index=False)
test_df.to_csv('data_new/test_data.csv', index=False)

---


In [None]:
# Per-user 80/20 split of the ratings of users with at least 50 ratings.
df = pd.read_csv('data_new/animelist_v1.csv')
user_count = df['user_id'].value_counts()
active_users = user_count[user_count >= 50].index
df_filtered = df[df['user_id'].isin(active_users)]
train, test = [], []
grouped = df_filtered.groupby('user_id')
# Iterate over the groups explicitly: each group keeps its `user_id` column,
# which groupby.apply only guarantees through deprecated behaviour.
train_test = [
    train_test_split(user_rows, test_size=0.2, random_state=42)
    for _, user_rows in grouped
]
train_df = pd.concat([tt[0] for tt in train_test])
test_df = pd.concat([tt[1] for tt in train_test])
train_df.shape, test_df.shape

In [None]:
train_df['user_id'].nunique()

In [None]:
# Load the tuned model. NOTE: this rebinds `content_data`; the dict saved to
# this path earlier in the notebook has the keys 'params', 'cosine_sim' and
# 'genre_weights' (no 'preprocessor' or 'feature_matrix').
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('models/optimized_content_model.pkl', 'rb') as f:
    content_data = pickle.load(f)

In [None]:
# Manual walkthrough for one user: the anime IDs they rated in each split.
train_items = train_df[train_df['user_id'] == 'kudacukiii']['anime_id'].values
test_items = test_df[test_df['user_id'] == 'kudacukiii']['anime_id'].values

In [None]:
# Spot check: print the score this user gave to their first training title.
temp = train_df[train_df['user_id'] == 'kudacukiii']
for anime in train_items:
    print(temp[temp['anime_id'] == anime]['score'].values[0])
    break  # only the first title is inspected

In [None]:
# Sum, for every candidate row, the similarity to each title the user rated in
# the training split (up to 999 nearest candidates per rated title).
recommendation_pool = {}
for anime_id in train_items:
    # A rated title without a metadata row yields an empty match; indexing it
    # with [0] raises IndexError (not KeyError), so test for it explicitly.
    matches = anime_metadata[anime_metadata['anime_id'] == anime_id].index
    if len(matches) == 0:
        print(f"Skipping invalid anime ID: {anime_id}")
        continue
    idx = matches[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:1000]
    for i, score in sim_scores:
        # Keys are row positions in `cosine_sim`, not anime IDs.
        recommendation_pool[i] = recommendation_pool.get(i, 0) + score

In [None]:
recommendation_pool

In [None]:
# Keep the 1000 highest-scoring candidates
top_recs = sorted(recommendation_pool.items(), key=lambda x: -x[1])[:1000]
# `recommendation_pool` is keyed by row position in the similarity matrix.
# Translate positions to anime IDs so that comparing `top_ids` with
# `test_items` (which holds anime IDs) is meaningful.
_anime_id_by_position = anime_metadata['anime_id'].to_numpy()
top_ids = [_anime_id_by_position[x[0]] for x in top_recs]

In [None]:
top_ids

In [None]:
test_items

In [None]:
set(top_ids).intersection(test_items)

In [None]:
anime_metadata