In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, FeatureHasher
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Define your file paths
wine_file = '/content/drive/My Drive/XWines_Full_100K_wines.csv'
train_file = '/content/drive/My Drive/trainset.csv'
test_files = {
    'CU_CI': '/content/drive/My Drive/testset_cold_user_cold_item.csv',
    'CU_WI': '/content/drive/My Drive/testset_cold_user_warm_item.csv',
    'WU_CI': '/content/drive/My Drive/testset_warm_user_cold_item.csv',
    'WU_WI': '/content/drive/My Drive/testset_warm_user_warm_item.csv'
}
drive_save_path = '/content/drive/MyDrive/'


In [4]:
# --- Data Loading ---
df_wines = pd.read_csv(wine_file)
train_ratings = pd.read_csv(train_file)


In [5]:
# --- Selecting Wines ONLY from Training Data (avoids leakage) ---
# Prepare wines present in training data
train_wine_ids = train_ratings['WineID'].unique()
df_wines_train = df_wines[df_wines['WineID'].isin(train_wine_ids)].copy().reset_index(drop=True)

In [6]:
# --- Text Preprocessing (Lowercase) ---
# Standardize all categorical text attributes to lowercase for consistency
cols_lowercase = ['WineName', 'Type', 'Elaborate', 'Body', 'Acidity',
                  'Country', 'RegionName', 'WineryName']
df_wines_train[cols_lowercase] = df_wines_train[cols_lowercase].apply(lambda x: x.str.lower())

In [7]:
# --- List-column Cleaning (Grapes & Harmonize) ---
# Clean up columns that represent lists as strings, remove brackets/quotes, and convert to actual Python lists
for col in ['Grapes', 'Harmonize']:
    df_wines_train[col] = df_wines_train[col].fillna('') \
                         .str.replace(r'[\[\]\']/','',regex=True) \
                         .apply(lambda x: [w.strip().lower() for w in x.split(',') if w.strip()])

In [8]:
# --- Numeric Preprocessing (ABV) ---
# Normalize numerical ABV (alcohol by volume) feature using StandardScaler (mean=0, variance=1)
scaler = StandardScaler()
df_wines_train['ABV'] = scaler.fit_transform(df_wines_train[['ABV']].astype(float))

In [9]:
# --- Preserve original wine data for readable recommendations ---
# Save a copy of original wine data attributes for final presentation purposes
context_cols=['WineID','WineName','WineryName','Type','Country','RegionName','ABV']
df_wines_original = df_wines_train[context_cols].copy()

In [10]:
# --- Categorical Features (One-Hot Encoding) ---
# Represent categorical variables using one-hot encoding
cat_features = ['Type','Elaborate','Body','Acidity','Country']
df_wines_train_encoded = pd.get_dummies(df_wines_train,columns=cat_features)

In [11]:
# --- TF-IDF Vectorization (Grapes + Harmonize) ---
# Create a textual corpus by combining Grapes and Harmonize features
corpus = df_wines_train['Grapes'].str.join(' ')+' '+df_wines_train['Harmonize'].str.join(' ')
# Vectorize corpus using TF-IDF to capture the uniqueness of terms across wines
tfidf_vec = TfidfVectorizer().fit_transform(corpus)

In [12]:
# --- Feature hashing (RegionName) ---
# Use FeatureHasher to reduce high-cardinality categorical RegionName variable to 16 hashed features
hasher = FeatureHasher(n_features=16,input_type='string',alternate_sign=False)
hashed_region = hasher.fit_transform(df_wines_train[['RegionName']].values)

In [13]:
# --- Combine all features into Sparse Embeddings ---
# Identify numeric and encoded categorical features to combine with embeddings
non_text_cols = ['ABV'] + [col for col in df_wines_train_encoded if col.startswith(('Type_', 'Elaborate_', 'Body_', 'Acidity_', 'Country_'))]
# Convert explicitly to float to ensure correct numeric data types
numeric_sparse = csr_matrix(df_wines_train_encoded[non_text_cols].astype(float).values)

In [14]:

# Create a mapping from WineID to its index in the similarity matrix
# The order is based on df_wines_train['WineID'] as used for combined_train_sparse
wine_id_to_idx = {wine_id: i for i, wine_id in enumerate(df_wines_train['WineID'])}

In [15]:
combined_train_sparse = hstack([numeric_sparse, tfidf_vec, hashed_region])

# --- Compute Similarity Matrix (Training Only) ---
# Calculate pairwise cosine similarity among wines based on embeddings
similarity_train_np_full = cosine_similarity(combined_train_sparse)
# Optimization: Convert to float32 to save memory and potentially speed up operations
similarity_matrix_np = similarity_train_np_full.astype(np.float32)
del similarity_train_np_full # Free memory of the float64 version

In [29]:
similarity_df_train = pd.DataFrame(similarity_matrix_np, # Use the float32 version
                                   index=df_wines_train['WineID'],
                                   columns=df_wines_train['WineID'])

In [16]:
global_mean_rating = train_ratings['Rating'].mean()

# `user_rated_wines_avg_ratings_idx_map` will store:
# { user_id: {wine_idx_in_similarity_matrix: avg_rating} }
user_rated_wines_avg_ratings_idx_map = {}

# Filter train_ratings to include only wines present in our similarity matrix
# (i.e., wines in df_wines_train, which are derived from train_wine_ids)
valid_wine_ids_in_sim_matrix = set(wine_id_to_idx.keys())
filtered_train_ratings = train_ratings[train_ratings['WineID'].isin(valid_wine_ids_in_sim_matrix)].copy()

for user_id, group in tqdm(filtered_train_ratings.groupby('UserID'), desc="Preprocessing user ratings"):
    avg_ratings_for_user = group.groupby('WineID')['Rating'].mean()

    # Store ratings with wine indices instead of WineIDs for direct use with similarity_matrix_np
    ratings_with_indices = {
        wine_id_to_idx[wine_id]: rating
        for wine_id, rating in avg_ratings_for_user.items()
        # wine_id should always be in wine_id_to_idx due to pre-filtering of train_ratings
    }
    if ratings_with_indices: # Only add user if they have rated wines present in the similarity matrix
        user_rated_wines_avg_ratings_idx_map[user_id] = ratings_with_indices

Preprocessing user ratings: 100%|██████████| 1056035/1056035 [06:53<00:00, 2552.64it/s]


In [17]:
def predict_rating(user_id, wine_id, train_ratings, similarity_df, global_mean):
    user_history = train_ratings[train_ratings.UserID == user_id]

    if wine_id not in similarity_df.index:
        return global_mean

    rated_wines = user_history['WineID'][user_history['WineID'].isin(similarity_df.index)]

    if rated_wines.empty:
        return global_mean

    # Ensure unique wines to match with similarities
    rated_wines_unique = rated_wines.drop_duplicates()

    # Extract similarities for these unique rated wines
    similarities = similarity_df.loc[wine_id, rated_wines_unique]

    # Averaging multiple ratings for the same wine before alignment
    ratings_avg = user_history.groupby('WineID')['Rating'].mean().loc[similarities.index]


    if len(ratings_avg) != len(similarities):
        raise ValueError("Mismatch between ratings and similarities lengths.")

    # Edge case safeguard: handle zero similarity sum
    if similarities.abs().sum() == 0:
        return global_mean

    # Calculate prediction using dot product
    return np.dot(ratings_avg, similarities) / similarities.abs().sum()

In [18]:
# --- Evaluation Function (RMSE & MAE) ---
def predict_rating_optimized(user_id, target_wine_id,
                             user_rated_wines_map, # This is user_rated_wines_avg_ratings_idx_map
                             sim_matrix_np, # This is similarity_matrix_np
                             wine_id_to_idx_map, # This is wine_id_to_idx
                             global_mean):

    if target_wine_id not in wine_id_to_idx_map: # Target wine not in our known wines
        return global_mean

    target_wine_idx = wine_id_to_idx_map[target_wine_id]

    user_ratings_data = user_rated_wines_map.get(user_id)
    if not user_ratings_data: # User has no relevant rating history or unknown user
        return global_mean

    # user_ratings_data is a dict {rated_wine_idx: avg_rating}
    # These indices are already valid for sim_matrix_np
    rated_wine_indices = list(user_ratings_data.keys())

    # If rated_wine_indices is empty (shouldn't happen if user_ratings_data is not None and not empty)
    if not rated_wine_indices:
        return global_mean

    actual_ratings_for_these_indices = np.array([user_ratings_data[idx] for idx in rated_wine_indices], dtype=np.float32)

    # Get similarities: sim_matrix_np[target_wine_idx] is the row for target_wine_id
    # sim_matrix_np[target_wine_idx, rated_wine_indices] gets specific similarities
    item_similarities = sim_matrix_np[target_wine_idx, rated_wine_indices] # This is already np.float32

    sim_sum_abs = np.sum(np.abs(item_similarities))

    if sim_sum_abs == 0: # No similarity or similarities cancel out (unlikely with abs)
        return global_mean

    weighted_sum = np.dot(actual_ratings_for_these_indices, item_similarities)

    return weighted_sum / sim_sum_abs
    return rmse, mae, results_df

In [19]:
batch_size = 100000 # Adjust batch size according to your requirements


In [20]:
# Load and evaluate the test set
def evaluate_in_batches_optimized(test_ratings_df, # Renamed to avoid conflict
                                  # Pre-computed data passed directly:
                                  user_ratings_map_param, # user_rated_wines_avg_ratings_idx_map
                                  sim_matrix_param,       # similarity_matrix_np
                                  wine_id_idx_map_param,  # wine_id_to_idx
                                  global_mean_param,      # global_mean_rating
                                  batch_size_param, scenario_name):

    y_true_list, y_pred_list, rating_ids_list = [], [], []

    num_batches = (len(test_ratings_df) + batch_size_param - 1) // batch_size_param

    for i in tqdm(range(num_batches), desc=f"Evaluating {scenario_name}"):
        start_idx = i * batch_size_param
        end_idx = min((i + 1) * batch_size_param, len(test_ratings_df))
        batch_df = test_ratings_df.iloc[start_idx:end_idx]

        # Optimized loop using zip for slightly better performance than .apply or .iterrows
        current_preds_batch = []
        for user_id_val, wine_id_val in zip(batch_df.UserID, batch_df.WineID):
            pred = predict_rating_optimized(user_id_val, wine_id_val,
                                            user_ratings_map_param,
                                            sim_matrix_param,
                                            wine_id_idx_map_param,
                                            global_mean_param)
            current_preds_batch.append(pred)

        y_true_list.extend(batch_df.Rating.tolist())
        y_pred_list.extend(current_preds_batch)
        rating_ids_list.extend(batch_df.RatingID.tolist())

    rmse = np.sqrt(mean_squared_error(y_true_list, y_pred_list))
    mae = mean_absolute_error(y_true_list, y_pred_list)

    predictions_output_df = pd.DataFrame({'RatingID': rating_ids_list, 'PredictedRating': y_pred_list})
    predictions_output_df.to_csv(f'{drive_save_path}predictions_{scenario_name}_optimized.csv', index=False)

    print(f"{scenario_name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}")
    return rmse, mae

In [25]:
for scenario, test_file_path in test_files.items():
    current_test_ratings = pd.read_csv(test_file_path)
    rmse_val, mae_val = evaluate_in_batches_optimized(
        current_test_ratings,
        user_rated_wines_avg_ratings_idx_map, # Pass pre-computed map
        similarity_matrix_np,                 # Pass NumPy similarity matrix
        wine_id_to_idx,                       # Pass WineID to index map
        global_mean_rating,                   # Pass global mean
        batch_size,
        scenario
    )

Evaluating CU_CI: 100%|██████████| 166/166 [00:00<00:00, 6439.35it/s]


CU_CI - RMSE: 0.8220, MAE: 0.6164


Evaluating CU_WI: 100%|██████████| 5068/5068 [00:06<00:00, 735.92it/s]


CU_WI - RMSE: 1.0739, MAE: 0.6958


Evaluating WU_CI: 100%|██████████| 355/355 [00:00<00:00, 6333.14it/s]


WU_CI - RMSE: 0.6852, MAE: 0.5062


Evaluating WU_WI: 100%|██████████| 20368/20368 [00:51<00:00, 396.52it/s]


WU_WI - RMSE: 0.7951, MAE: 0.5383


In [26]:
# --- Recommendation (Item-to-Item) ---
# Function recommending similar wines based on cosine similarity scores
def get_recommendation(wine_id_param, sim_df, wines_original_df, num_recs=5):
    if wine_id_param not in sim_df.index: # Check if wine is in our similarity matrix
        print(f"WineID {wine_id_param} not found in the similarity matrix.")
        return pd.DataFrame() # Return empty DataFrame if not found

    # Get similarity scores for the given wine_id, sort them, drop the wine itself, and take top N
    similar_wines_series = sim_df[wine_id_param].sort_values(ascending=False).drop(wine_id_param).head(num_recs)
    similar_wines_df = similar_wines_series.reset_index()
    similar_wines_df.columns = ['WineID', 'Similarity']

    # Merge with original wine details to make recommendations readable
    recommendations_df = pd.merge(similar_wines_df, wines_original_df[['WineID', 'WineName', 'Type', 'Country']], on='WineID', how='left')
    return recommendations_df


In [30]:
# Example usage of function: Provide similar wines for user-selected wine
if not train_ratings.empty and train_ratings['WineID'].iloc[0] in similarity_df_train.index:
    example_wine_id_val = train_ratings['WineID'].iloc[0]
    recommendations_result = get_recommendation(example_wine_id_val, similarity_df_train, df_wines_original)
    print(f"\nItem-to-item recommendations for WineID {example_wine_id_val}:\n{recommendations_result}")
else:
    # Fallback if the first wine is not suitable or train_ratings is empty
    # Try a wine ID known to be in df_wines_train (if available)
    if not df_wines_train.empty:
        example_wine_id_val = df_wines_train['WineID'].iloc[0]
        if example_wine_id_val in similarity_df_train.index:
             recommendations_result = get_recommendation(example_widinie_id_val, similarity_df_train, df_wines_original)
             print(f"\nItem-to-item recommendations for WineID {example_wine_id_val}:\n{recommendations_result}")
        else:
            print(f"Could not find example wine ID {example_wine_id_val} in similarity matrix for recommendations.")
    else:
        print("No wines available in df_wines_train for recommendation example.")


Item-to-item recommendations for WineID 136168:
   WineID  Similarity                                           WineName Type  \
0  136255         1.0                             brunello di montalcino  red   
1  135927         1.0                             brunello di montalcino  red   
2  142393         1.0                             brunello di montalcino  red   
3  136716         1.0           leonardo da vinci brunello di montalcino  red   
4  153633         1.0  tenuta greppone mazzi riserva brunello di mont...  red   

  Country  
0   italy  
1   italy  
2   italy  
3   italy  
4   italy  


In [31]:
# --- Popular Wines utility (Cold-Start Recommendations) ---
# Identify most popular/highly-rated wines in training data, useful for cold-start recommendations
def popular_wines(train_ratings_df, wines_original_df, n_recs=5, min_ratings_thresh=10):
    # Group by WineID and calculate mean rating and count of ratings
    popular_df = train_ratings_df.groupby('WineID')['Rating'].agg(['mean', 'count'])
    # Filter out wines with fewer than min_ratings_thresh ratings
    popular_df = popular_df[popular_df['count'] >= min_ratings_thresh]
    # Sort by mean rating in descending order and take top N
    popular_df = popular_df.sort_values('mean', ascending=False).head(n_recs)

    # Merge with original wine details for readability
    # We use .loc to select rows from wines_original_df whose WineID is in popular_df.index
    # and then select specific columns.
    popular_recs_df = wines_original_df[wines_original_df['WineID'].isin(popular_df.index)].copy()
    # To maintain the order from popular_df, we can reindex or merge and sort
    popular_recs_df = popular_recs_df.set_index('WineID').loc[popular_df.index].reset_index()

    return popular_recs_df[['WineID', 'WineName', 'Type', 'Country']] # Include WineName

In [32]:
# Display example popular wine recommendations
popular=popular_wines(train_ratings,df_wines_original)
print(f"Popular wines for cold-start recommendations:\n{popular}")

Popular wines for cold-start recommendations:
   WineID                                           WineName       Type  \
0  183447                          wraith cabernet sauvignon        red   
1  117346      clos d'ambonnay blanc de noirs brut champagne  sparkling   
2  188413                                       jusqu'a l'os        red   
3  183348  cabernet sauvignon old sparky beckstoffer to k...        red   
4  122521                                 cristal vinothèque  sparkling   

         Country  
0  united states  
1         france  
2  united states  
3  united states  
4         france  
