In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, FeatureHasher
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from collections import defaultdict # Added
from sklearn.metrics import average_precision_score, precision_score, ndcg_score # Added
from tqdm import tqdm

In [2]:
# Mount Google Drive inside the Colab runtime; every file path configured in the
# next cell lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Locations of the input CSVs on the mounted Google Drive.
_drive_inputs = '/content/drive/My Drive'

wine_file = _drive_inputs + '/XWines_Full_100K_wines.csv'
train_file = _drive_inputs + '/trainset.csv'

# One test split per scenario key (cold/warm user x cold/warm item, per the file names).
_scenario_files = (
    ('CU_CI', 'testset_cold_user_cold_item.csv'),
    ('CU_WI', 'testset_cold_user_warm_item.csv'),
    ('WU_CI', 'testset_warm_user_cold_item.csv'),
    ('WU_WI', 'testset_warm_user_warm_item.csv'),
)
test_files = {
    scenario_key: f'{_drive_inputs}/{csv_name}'
    for scenario_key, csv_name in _scenario_files
}

# Output directory prefix (keeps its trailing slash; note it is spelled 'MyDrive',
# without the space used by the input paths).
drive_save_path = '/content/drive/MyDrive/'


In [4]:
# --- Data Loading ---
# Read the wine attribute table and the training ratings from the CSV paths
# configured above.
df_wines = pd.read_csv(wine_file)
train_ratings = pd.read_csv(train_file)


In [5]:
# --- Restrict the catalogue to wines seen in training (avoids leakage) ---
# Distinct wine IDs that appear in the training ratings.
train_wine_ids = train_ratings['WineID'].unique()
# Boolean mask over the catalogue, then an independent, re-indexed copy of the matches.
in_training = df_wines['WineID'].isin(train_wine_ids)
df_wines_train = df_wines[in_training].copy().reset_index(drop=True)

In [6]:
# --- Text Preprocessing (Lowercase) ---
# Lower-case every textual attribute so that equal values compare equal
# regardless of capitalisation.
cols_lowercase = ['WineName', 'Type', 'Elaborate', 'Body', 'Acidity',
                  'Country', 'RegionName', 'WineryName']
for text_col in cols_lowercase:
    df_wines_train[text_col] = df_wines_train[text_col].str.lower()

In [7]:
# --- List-column Cleaning (Grapes & Harmonize) ---
# These columns hold lists serialized as strings (presumably like "['a', 'b']").
# Strip the square brackets and single quotes, split on commas, and keep the
# trimmed, lower-cased, non-empty items as real Python lists. Missing values
# become empty lists.
# The pattern must be the bare character class: with anything after it (such as a
# trailing '/') it would only match a bracket/quote followed by that character
# and therefore remove nothing.
list_markup_pattern = r"[\[\]']"
for col in ['Grapes', 'Harmonize']:
    df_wines_train[col] = (
        df_wines_train[col]
        .fillna('')
        .str.replace(list_markup_pattern, '', regex=True)
        .apply(lambda x: [w.strip().lower() for w in x.split(',') if w.strip()])
    )

In [8]:
# --- Numeric Preprocessing (ABV) ---
# Standardize alcohol-by-volume to zero mean / unit variance. The scaler is
# fitted on the training wines only and overwrites the ABV column in place.
scaler = StandardScaler()
abv_as_float = df_wines_train[['ABV']].astype(float)
df_wines_train['ABV'] = scaler.fit_transform(abv_as_float)

In [9]:
# --- Preserve wine data for readable recommendations ---
# Keep a separate copy of the descriptive columns for presenting results.
# NOTE: the copy is taken after the text columns were lower-cased and ABV was
# standardized in the cells above, so it holds the preprocessed values rather
# than the raw ones from the CSV.
context_cols=['WineID','WineName','WineryName','Type','Country','RegionName','ABV']
df_wines_original = df_wines_train[context_cols].copy()

In [10]:
# --- Categorical Features (One-Hot Encoding) ---
# Expand each categorical column into indicator columns named '<column>_<value>'
# (the later feature-selection step relies on these prefixes). All other columns
# of df_wines_train are carried over unchanged.
cat_features = ['Type','Elaborate','Body','Acidity','Country']
df_wines_train_encoded = pd.get_dummies(df_wines_train,columns=cat_features)

In [11]:
# --- TF-IDF Vectorization (Grapes + Harmonize) ---
# Flatten each wine's two list columns into space-separated text and concatenate
# them into a single document per wine.
grapes_text = df_wines_train['Grapes'].str.join(' ')
harmonize_text = df_wines_train['Harmonize'].str.join(' ')
corpus = grapes_text + ' ' + harmonize_text
# Despite its name, tfidf_vec is the fitted-and-transformed TF-IDF matrix
# (one row per wine), not the vectorizer object.
tfidf_vec = TfidfVectorizer().fit_transform(corpus)

In [12]:
# --- Feature hashing (RegionName) ---
# Use FeatureHasher to reduce the high-cardinality RegionName variable to 16 hashed
# columns; alternate_sign=False turns off the sign flipping of hashed values.
hasher = FeatureHasher(n_features=16,input_type='string',alternate_sign=False)
# Each row handed to the hasher is a one-element array holding that wine's region
# string, i.e. one iterable of strings per sample as input_type='string' expects.
hashed_region = hasher.fit_transform(df_wines_train[['RegionName']].values)

In [13]:
# --- Numeric + one-hot block of the sparse embedding ---
# The one-hot columns are recognised by the prefixes get_dummies gave them.
onehot_prefixes = ('Type_', 'Elaborate_', 'Body_', 'Acidity_', 'Country_')
onehot_cols = [name for name in df_wines_train_encoded.columns
               if name.startswith(onehot_prefixes)]
non_text_cols = ['ABV'] + onehot_cols
# Cast to float explicitly so every selected column is numeric before building the CSR matrix.
dense_numeric = df_wines_train_encoded[non_text_cols].astype(float).values
numeric_sparse = csr_matrix(dense_numeric)

In [14]:

# Map each WineID to its row position in df_wines_train. The feature matrix and
# the similarity matrix are built in this same row order, so the position is
# also the wine's row/column in the similarity matrix.
wine_id_to_idx = dict(zip(df_wines_train['WineID'], range(len(df_wines_train))))

In [15]:
# Stack the three feature groups column-wise into one sparse embedding per wine:
# numeric/one-hot features, TF-IDF terms, hashed region.
combined_train_sparse = hstack([numeric_sparse, tfidf_vec, hashed_region])

# --- Compute Similarity Matrix (Training Only) ---
# Calculate pairwise cosine similarity among wines based on embeddings.
# The result has one row and one column per training wine.
similarity_train_np_full = cosine_similarity(combined_train_sparse)
# Optimization: Convert to float32 to save memory and potentially speed up operations
similarity_matrix_np = similarity_train_np_full.astype(np.float32)
del similarity_train_np_full # Drop the name bound to the original (pre-cast) array so it can be freed

In [16]:
# Labelled view of the similarity matrix: rows and columns are both keyed by
# WineID, in the same order as df_wines_train. Used by the item-to-item
# recommendation helper further down.
similarity_df_train = pd.DataFrame(similarity_matrix_np, # Use the float32 version
                                   index=df_wines_train['WineID'],
                                   columns=df_wines_train['WineID'])

In [17]:
# Fallback prediction used whenever a user or an item is unknown.
global_mean_rating = train_ratings['Rating'].mean()

# Keep only ratings for wines that have a row in the similarity matrix
# (i.e. wines in df_wines_train, which are derived from train_wine_ids).
valid_wine_ids_in_sim_matrix = set(wine_id_to_idx.keys())
filtered_train_ratings = train_ratings[train_ratings['WineID'].isin(valid_wine_ids_in_sim_matrix)].copy()

# Average repeated ratings of the same wine by the same user with ONE grouped
# aggregation instead of running a separate groupby for every user inside a
# Python loop. Rows come out sorted by (UserID, WineID).
avg_user_wine_ratings = (
    filtered_train_ratings
    .groupby(['UserID', 'WineID'])['Rating']
    .mean()
    .reset_index()
)
# Translate WineIDs to similarity-matrix indices; every WineID is present in
# wine_id_to_idx because of the filtering above.
avg_user_wine_ratings['WineIdx'] = avg_user_wine_ratings['WineID'].map(wine_id_to_idx)

# `user_rated_wines_avg_ratings_idx_map` stores:
# { user_id: {wine_idx_in_similarity_matrix: avg_rating} }
user_rated_wines_avg_ratings_idx_map = {}
for user_id, wine_idx, avg_rating in tqdm(
        zip(avg_user_wine_ratings['UserID'].tolist(),
            avg_user_wine_ratings['WineIdx'].tolist(),
            avg_user_wine_ratings['Rating'].tolist()),
        total=len(avg_user_wine_ratings),
        desc="Preprocessing user ratings"):
    user_rated_wines_avg_ratings_idx_map.setdefault(user_id, {})[wine_idx] = avg_rating

Preprocessing user ratings: 100%|██████████| 1056035/1056035 [06:45<00:00, 2603.81it/s]


In [18]:
def predict_rating(user_id, target_wine_id,
                             user_rated_wines_map, # user_rated_wines_avg_ratings_idx_map
                             sim_matrix_np,       # similarity_matrix_np
                             wine_id_to_idx_map,  # wine_id_to_idx
                             global_mean):
    """Predict a rating as the similarity-weighted mean of the user's known ratings.

    Args:
        user_id: user to predict for.
        target_wine_id: wine whose rating is predicted.
        user_rated_wines_map: ``{user_id: {wine_idx: avg_rating}}`` where
            ``wine_idx`` indexes rows/columns of ``sim_matrix_np``.
        sim_matrix_np: 2-D item-item similarity array.
        wine_id_to_idx_map: ``{wine_id: wine_idx}``.
        global_mean: fallback prediction.

    Returns:
        ``global_mean`` when the wine is unknown (cold item), the user has no
        stored ratings (cold user), or the absolute similarities sum to zero;
        otherwise ``sum(rating * sim) / sum(|sim|)`` over the user's rated wines.
    """
    # Cold item: no features, hence no similarity row to work with.
    if target_wine_id not in wine_id_to_idx_map:
        return global_mean

    # Cold user: missing from the map (or mapped to an empty history).
    history = user_rated_wines_map.get(user_id)
    if not history:
        return global_mean

    target_row = wine_id_to_idx_map[target_wine_id]

    # Same iteration order for indices and ratings, so position i of one matches
    # position i of the other.
    rated_cols = list(history)
    known_ratings = np.array([history[col] for col in rated_cols], dtype=np.float32)
    sims_to_target = sim_matrix_np[target_row, rated_cols]

    # Normalise by the absolute similarity mass; zero means nothing the user
    # rated resembles the target wine.
    abs_mass = np.abs(sims_to_target).sum()
    if abs_mass == 0:
        return global_mean

    return np.dot(known_ratings, sims_to_target) / abs_mass

In [19]:
def measures_at_k(predictions_data, k=10, threshold=3.5):
    """Compute user-averaged Average Precision, Precision@k and nDCG@k.

    Args:
        predictions_data: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``true_r`` and ``est`` are used.
        k: rank cut-off for Precision@k and nDCG@k.
        threshold: a true or estimated rating counts as relevant when it is
            ``>= threshold``.

    Returns:
        Tuple ``(mean AP, mean Precision@k, mean nDCG@k)`` where each mean is a
        plain average over users; all three are ``0.0`` when there are no
        predictions.
    """
    # Bucket (estimated, true) rating pairs by user.
    per_user = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions_data:
        per_user[uid].append((est, true_r))

    if not per_user:  # no users / no predictions
        return 0.0, 0.0, 0.0

    ap_per_user = []
    precision_per_user = []
    ndcg_per_user = []

    for pairs in per_user.values():
        # Rank this user's items from highest to lowest estimated rating.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)

        est_scores = [est for est, _ in ranked]
        true_scores = [true_r for _, true_r in ranked]
        # Thresholded relevance labels for the precision-style metrics.
        true_flags = [int(score >= threshold) for score in true_scores]
        pred_flags = [int(score >= threshold) for score in est_scores]

        # Average Precision over the user's full list. The thresholded
        # predictions (not the raw estimates) are deliberately used as scores.
        # Users without any relevant item contribute 0.0.
        if sum(true_flags) > 0:
            ap_per_user.append(average_precision_score(true_flags, pred_flags))
        else:
            ap_per_user.append(0.0)

        # Precision@k on the k highest-ranked items.
        precision_per_user.append(
            precision_score(true_flags[:k], pred_flags[:k], zero_division=0)
        )

        # nDCG@k on raw ratings. A user with a single item gets the custom rule:
        # 1.0 only if the item is relevant and predicted relevant. (Every user
        # in per_user has at least one pair, so no empty-list case arises.)
        if len(ranked) == 1 and k > 0:
            only_est, only_true = ranked[0]
            hit = (only_true >= threshold) and (only_est >= threshold)
            ndcg_per_user.append(1.0 if hit else 0.0)
        else:
            # ndcg_score expects 2-D inputs: one row per query.
            ndcg_per_user.append(
                ndcg_score(np.asarray([true_scores]), np.asarray([est_scores]), k=k)
            )

    avg_average_precisions = sum(ap_per_user) / len(ap_per_user)
    avg_precisions_at_k = sum(precision_per_user) / len(precision_per_user)
    avg_ndcgs_at_k = sum(ndcg_per_user) / len(ndcg_per_user)

    return avg_average_precisions, avg_precisions_at_k, avg_ndcgs_at_k

In [20]:
# Number of test rows processed per progress-bar step in evaluate_in_batches_optimized.
batch_size = 100000 # Adjust batch size according to your requirements


In [21]:
def evaluate_in_batches_optimized(test_ratings_df,
                                  user_ratings_map_param,
                                  sim_matrix_param,
                                  wine_id_idx_map_param,
                                  global_mean_param,
                                  batch_size_param,
                                  scenario_name,
                                  ranking_k=5,  # Parameter for @k metrics
                                  ranking_threshold=3.5):  # Parameter for relevance threshold
    """Score one test set, save the predictions, and report error and ranking metrics.

    Every row of ``test_ratings_df`` (columns ``UserID``, ``WineID``, ``Rating``,
    ``RatingID``) is scored with ``predict_rating``. The predictions are written to
    ``{drive_save_path}predictions_{scenario_name}_optimized.csv`` (``drive_save_path``
    is the notebook-level output prefix) and the metrics are printed.

    Args:
        test_ratings_df: test ratings to score.
        user_ratings_map_param: ``{user_id: {wine_idx: avg_rating}}``.
        sim_matrix_param: item-item similarity array.
        wine_id_idx_map_param: ``{wine_id: wine_idx}``.
        global_mean_param: fallback prediction for cold users/items.
        batch_size_param: rows handled per progress-bar step.
        scenario_name: label used in progress/log output and in the CSV file name.
        ranking_k: cut-off for Precision@k / nDCG@k.
        ranking_threshold: rating at or above which an item counts as relevant.

    Returns:
        ``(rmse, mae, avg_ap, avg_precision_at_k, avg_ndcg_at_k)``. For an empty
        test set RMSE and MAE are ``nan`` and the ranking metrics are ``0.0``.
    """
    y_true_list, y_pred_list, rating_ids_list = [], [], []
    all_predictions_for_ranking = []  # (uid, iid, true_r, est, details) tuples for measures_at_k

    # Ceiling division: the last batch may be shorter than batch_size_param.
    num_batches = (len(test_ratings_df) + batch_size_param - 1) // batch_size_param

    for i in tqdm(range(num_batches), desc=f"Evaluating {scenario_name}"):
        start_idx = i * batch_size_param
        end_idx = min((i + 1) * batch_size_param, len(test_ratings_df))
        batch_df = test_ratings_df.iloc[start_idx:end_idx]

        current_preds_batch = []
        current_true_batch = batch_df.Rating.tolist()

        for user_id_val, wine_id_val, true_rating_val in zip(batch_df.UserID, batch_df.WineID, current_true_batch):
            pred = predict_rating(user_id_val, wine_id_val,
                                  user_ratings_map_param,
                                  sim_matrix_param,
                                  wine_id_idx_map_param,
                                  global_mean_param)
            current_preds_batch.append(pred)
            all_predictions_for_ranking.append((user_id_val, wine_id_val, true_rating_val, pred, None))

        y_true_list.extend(current_true_batch)
        y_pred_list.extend(current_preds_batch)
        rating_ids_list.extend(batch_df.RatingID.tolist())

    if y_true_list:
        rmse = np.sqrt(mean_squared_error(y_true_list, y_pred_list))
        mae = mean_absolute_error(y_true_list, y_pred_list)
    else:
        # The sklearn error metrics reject empty inputs; report "undefined"
        # instead of crashing so the empty-test-set branch below is reachable.
        rmse = mae = float('nan')

    predictions_output_df = pd.DataFrame({'RatingID': rating_ids_list, 'PredictedRating': y_pred_list})
    predictions_output_df.to_csv(f'{drive_save_path}predictions_{scenario_name}_optimized.csv', index=False)

    print(f"\n--- Results for {scenario_name} ---")
    print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    # Calculate and print ranking metrics
    if all_predictions_for_ranking:
        avg_ap, avg_p_at_k, avg_ndcg_at_k = measures_at_k(
            all_predictions_for_ranking,
            k=ranking_k,
            threshold=ranking_threshold
        )
        print(f'Avg. AveragePrecision: {avg_ap:.4f}')
        print(f'Avg. Precision@{ranking_k}: {avg_p_at_k:.4f}')
        print(f'Avg. nDCG@{ranking_k}: {avg_ndcg_at_k:.4f}')
    else:
        print("No predictions made, skipping ranking metrics.")
        avg_ap, avg_p_at_k, avg_ndcg_at_k = 0.0, 0.0, 0.0

    return rmse, mae, avg_ap, avg_p_at_k, avg_ndcg_at_k


# Evaluate every test scenario and collect its metrics keyed by scenario name.
# The cut-off and threshold are defined once so the metric labels below can
# never disagree with the k that was actually evaluated.
RANKING_K = 5            # cut-off for Precision@k / nDCG@k
RANKING_THRESHOLD = 3.5  # ratings >= this value count as relevant

all_results = {}
for scenario, test_file_path in test_files.items():
    current_test_ratings = pd.read_csv(test_file_path)
    rmse_val, mae_val, avg_ap_val, avg_p_at_k_val, avg_ndcg_at_k_val = evaluate_in_batches_optimized(
        current_test_ratings,
        user_rated_wines_avg_ratings_idx_map,
        similarity_matrix_np,
        wine_id_to_idx,
        global_mean_rating,
        batch_size,
        scenario,
        ranking_k=RANKING_K,
        ranking_threshold=RANKING_THRESHOLD
    )
    all_results[scenario] = {
        'RMSE': rmse_val, 'MAE': mae_val,
        'Avg. AP': avg_ap_val,
        f'Avg. P@{RANKING_K}': avg_p_at_k_val,
        f'Avg. nDCG@{RANKING_K}': avg_ndcg_at_k_val,
    }

Evaluating CU_CI: 100%|██████████| 1/1 [00:00<00:00, 112.02it/s]



--- Results for CU_CI ---
RMSE: 0.8220, MAE: 0.6164
Avg. AveragePrecision: 0.7089
Avg. Precision@5: 0.7088
Avg. nDCG@5: 0.7283


Evaluating CU_WI: 100%|██████████| 6/6 [00:05<00:00,  1.04it/s]



--- Results for CU_WI ---
RMSE: 1.0739, MAE: 0.6958
Avg. AveragePrecision: 0.8134
Avg. Precision@5: 0.6968
Avg. nDCG@5: 0.9253


Evaluating WU_CI: 100%|██████████| 1/1 [00:00<00:00, 50.51it/s]



--- Results for WU_CI ---
RMSE: 0.6852, MAE: 0.5062
Avg. AveragePrecision: 0.8389
Avg. Precision@5: 0.8389
Avg. nDCG@5: 0.8651


Evaluating WU_WI: 100%|██████████| 21/21 [00:48<00:00,  2.33s/it]



--- Results for WU_WI ---
RMSE: 0.7951, MAE: 0.5383
Avg. AveragePrecision: 0.8808
Avg. Precision@5: 0.7796
Avg. nDCG@5: 0.8896


In [22]:
# Print a compact per-scenario summary of the metrics collected in all_results,
# each value formatted to four decimal places.
for scenario, metrics in all_results.items():
    print(f"\nScenario: {scenario}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")



Scenario: CU_CI
RMSE: 0.8220
MAE: 0.6164
Avg. AP: 0.7089
Avg. P@5: 0.7088
Avg. nDCG@5: 0.7283

Scenario: CU_WI
RMSE: 1.0739
MAE: 0.6958
Avg. AP: 0.8134
Avg. P@5: 0.6968
Avg. nDCG@5: 0.9253

Scenario: WU_CI
RMSE: 0.6852
MAE: 0.5062
Avg. AP: 0.8389
Avg. P@5: 0.8389
Avg. nDCG@5: 0.8651

Scenario: WU_WI
RMSE: 0.7951
MAE: 0.5383
Avg. AP: 0.8808
Avg. P@5: 0.7796
Avg. nDCG@5: 0.8896


In [23]:
# --- Recommendation (Item-to-Item) ---
# Function recommending similar wines based on cosine similarity scores
def get_recommendation(wine_id_param, sim_df, wines_original_df, num_recs=5):
    """Return the wines most similar to ``wine_id_param``.

    Args:
        wine_id_param: WineID to find neighbours for.
        sim_df: square similarity DataFrame with WineIDs as both index and columns.
        wines_original_df: wine details containing at least ``WineID``,
            ``WineName``, ``Type`` and ``Country``.
        num_recs: number of neighbours to return.

    Returns:
        DataFrame with columns ``WineID``, ``Similarity``, ``WineName``, ``Type``,
        ``Country`` ordered by decreasing similarity, or an empty DataFrame
        (after printing a notice) when the wine is not in ``sim_df.index``.
    """
    if wine_id_param not in sim_df.index:
        print(f"WineID {wine_id_param} not found in the similarity matrix.")
        return pd.DataFrame()

    # Rank all wines by similarity to the query, then remove the query itself
    # and keep the best num_recs.
    scores = sim_df[wine_id_param]
    ranked = scores.sort_values(ascending=False)
    neighbours = ranked.drop(wine_id_param).head(num_recs)

    similar_wines_df = neighbours.reset_index()
    similar_wines_df.columns = ['WineID', 'Similarity']

    # Attach readable details; a left join keeps every neighbour even if its
    # details are missing.
    details = wines_original_df[['WineID', 'WineName', 'Type', 'Country']]
    return similar_wines_df.merge(details, on='WineID', how='left')


In [24]:
# Example usage: find wines similar to the wine of the first training rating.
example_wine_id_val = train_ratings['WineID'].iloc[0]
recommendations_result = get_recommendation(example_wine_id_val, similarity_df_train, df_wines_original)
print(f"\nItem-to-item recommendations for WineID{example_wine_id_val}:\n{recommendations_result}")


Item-to-item recommendations for WineID136168:
   WineID  Similarity                                           WineName Type  \
0  136255         1.0                             brunello di montalcino  red   
1  135927         1.0                             brunello di montalcino  red   
2  142393         1.0                             brunello di montalcino  red   
3  136716         1.0           leonardo da vinci brunello di montalcino  red   
4  153633         1.0  tenuta greppone mazzi riserva brunello di mont...  red   

  Country  
0   italy  
1   italy  
2   italy  
3   italy  
4   italy  


In [25]:
# --- Popular Wines utility (Cold-Start Recommendations) ---
# Identify most popular/highly-rated wines in training data, useful for cold-start recommendations
def popular_wines(train_ratings_df, wines_original_df, n_recs=5, min_ratings_thresh=10):
    """Return the highest-rated, sufficiently-rated wines for cold-start use.

    Args:
        train_ratings_df: ratings with ``WineID`` and ``Rating`` columns.
        wines_original_df: wine details containing at least ``WineID``,
            ``WineName``, ``Type`` and ``Country``.
        n_recs: number of wines to return.
        min_ratings_thresh: minimum number of ratings a wine needs to qualify.

    Returns:
        DataFrame with columns ``WineID``, ``WineName``, ``Type``, ``Country``
        ordered by decreasing mean rating. Wines without a row in
        ``wines_original_df`` are skipped, so fewer than ``n_recs`` rows come
        back only when fewer qualifying wines have details.
    """
    detail_cols = ['WineID', 'WineName', 'Type', 'Country']

    # Mean rating and number of ratings per wine.
    rating_stats = train_ratings_df.groupby('WineID')['Rating'].agg(['mean', 'count'])
    # Ignore wines with too few ratings for the mean to be meaningful.
    rating_stats = rating_stats[rating_stats['count'] >= min_ratings_thresh]
    ranked = rating_stats.sort_values('mean', ascending=False)

    # Only recommend wines we can describe. Selecting labels that are missing
    # from the details table would raise a KeyError, so filter before taking
    # the top n_recs.
    has_details = ranked.index.isin(wines_original_df['WineID'])
    top_ids = ranked[has_details].head(n_recs).reset_index()[['WineID']]

    # A left merge keeps the rows in the order of top_ids, i.e. by decreasing mean rating.
    return top_ids.merge(wines_original_df[detail_cols], on='WineID', how='left')

In [26]:
# Display example popular wine recommendations, using the function defaults
# (top 5 wines with at least 10 training ratings each).
popular=popular_wines(train_ratings,df_wines_original)
print(f"Popular wines for cold-start recommendations:\n{popular}")

Popular wines for cold-start recommendations:
   WineID                                           WineName       Type  \
0  183447                          wraith cabernet sauvignon        red   
1  117346      clos d'ambonnay blanc de noirs brut champagne  sparkling   
2  188413                                       jusqu'a l'os        red   
3  183348  cabernet sauvignon old sparky beckstoffer to k...        red   
4  122521                                 cristal vinothèque  sparkling   

         Country  
0  united states  
1         france  
2  united states  
3  united states  
4         france  
