In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# --- Data Loading ---
df_wines = pd.read_csv("XWines_Slim_1K_wines.csv")
df_ratings = pd.read_csv("XWines_Slim_150K_ratings.csv")

  df_ratings = pd.read_csv("XWines_Slim_150K_ratings.csv")


In [3]:
df_wines.head()

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Website,Vintages
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,['Muscat/Moscato'],"['Pork', 'Rich Fish', 'Shellfish']",7.5,Medium-bodied,High,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."
1,100002,Ancellotta,Red,Varietal/100%,['Ancellotta'],"['Beef', 'Barbecue', 'Codfish', 'Pasta', 'Pizz...",12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,http://www.vinicolaperini.com.br,"[2016, 2015, 2014, 2013, 2012, 2011, 2010, 200..."
2,100003,Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],"['Beef', 'Lamb', 'Poultry']",12.0,Full-bodied,High,BR,Brazil,1001,Serra Gaúcha,10002,Castellamare,https://www.emporiocastellamare.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
3,100005,Maison de Ville Cabernet-Merlot,Red,Assemblage/Bordeaux Red Blend,"['Cabernet Sauvignon', 'Merlot']","['Beef', 'Lamb', 'Game Meat', 'Poultry']",11.0,Full-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora,http://www.vinicolaaurora.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."
4,100007,Do Lugar Moscatel Espumantes,Sparkling,Varietal/100%,['Muscat/Moscato Bianco'],"['Pork', 'Rich Fish', 'Shellfish']",7.5,Medium-bodied,High,BR,Brazil,1001,Serra Gaúcha,10012,Dal Pizzol,http://www.dalpizzol.com.br,"[2018, 2017, 2016, 2015, 2014, 2013, 2012, 201..."


In [4]:
df_ratings.head()

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
0,143,1356810,103471,1950,4.5,2021-11-02 20:52:59
1,199,1173759,111415,1951,5.0,2015-08-20 17:46:26
2,348,1164877,111395,1952,5.0,2020-11-13 05:40:26
3,374,1207665,111433,1953,5.0,2017-05-05 06:44:13
4,834,1075841,111431,1955,5.0,2016-09-14 20:18:38


In [5]:
# --- Store Original Wine IDs ---
# Keep a separate Series of WineIDs for indexing later
wine_ids = df_wines['WineID'].copy()

In [6]:
# --- Create DataFrame for Preference Matching (Before Heavy Transformation) ---
# This DataFrame holds data in a more human-readable format, suitable for
# matching user preferences expressed as strings (e.g., 'Type': 'red').
# Define columns needed for direct preference matching
preference_cols_list = ['WineID', 'WineName', 'Type', 'Grapes', 'Body', 'Acidity', 'Country', 'RegionName', 'WineryName', 'Elaborate']
# Select only columns that actually exist in the loaded DataFrame
valid_preference_cols = [col for col in preference_cols_list if col in df_wines.columns]

if 'WineID' in valid_preference_cols: # Ensure WineID exists for indexing
    df_for_prefs = df_wines[valid_preference_cols].copy()
    # Set WineID as index for easier lookup during preference matching
    df_for_prefs.set_index('WineID', inplace=True)
    print(f"Preference DataFrame created with columns: {df_for_prefs.columns.tolist()}")
else:
    df_for_prefs = pd.DataFrame() # Empty DataFrame if WineID or other columns are missing
    warnings.warn("Warning: Could not create preference DataFrame (WineID missing or no suitable columns found). Preference-based cold start might fail.")

Preference DataFrame created with columns: ['WineName', 'Type', 'Grapes', 'Body', 'Acidity', 'Country', 'RegionName', 'WineryName', 'Elaborate']


In [7]:
# --- Text Preprocessing (Lowercase) ---
# Columns to convert to lowercase
preprocessing = ['WineName', 'Type', 'Elaborate', 'Body', 'Acidity', 'Country', 'RegionName', 'WineryName']
for col in preprocessing:
    df_wines[col] = df_wines[col].str.lower()

In [8]:
# --- List-like Column Preprocessing (Grapes, Harmonize) ---
preprocessing2 = ['Grapes', 'Harmonize']

for col in preprocessing2:
    if col in df_wines.columns:
        # 1. Fill NaN with empty string FIRST, then ensure string type
        #    This prevents np.nan from becoming the string 'nan'
        series = df_wines[col].fillna('').astype(str) # <-- Swapped order here

        # 2. Remove unwanted characters
        series = series.str.replace('[', '', regex=False)
        series = series.str.replace(']', '', regex=False)
        series = series.str.replace("'", '', regex=False)
        series = series.str.replace('/', '', regex=False) # Remove slash

        # 3. Split into a Series of lists
        series_list = series.str.split(',')

        # 4. Clean items WITHIN each list: strip whitespace and lowercase.
        #    Handles cases where split might produce items with leading/trailing spaces.
        #    Includes a check for `isinstance(item, str)` for safety.
        cleaned_series_list = series_list.apply(
            lambda lst: [item.strip().lower() for item in lst if isinstance(item, str)] # Apply strip() and lower()
            if isinstance(lst, list) else # Check if the element is actually a list
            lst # Handle potential non-list elements
        )

        # 5. Assign back, converting potential [''] resulting from empty strings OR original NaNs to []
        df_wines[col] = cleaned_series_list.apply(lambda x: [] if x == [''] else x)

    else:
         print(f"Warning: Column '{col}' not found in DataFrame.")

In [9]:
# --- Numeric Preprocessing (ABV) ---
# Convert ABV to float
df_wines['ABV'] = df_wines['ABV'].astype(float)

In [10]:
 # Scale ABV using StandardScaler
scaler = StandardScaler()
df_wines['ABV'] = scaler.fit_transform(df_wines[['ABV']])

In [11]:
# --- Categorical Preprocessing (One-Hot Encoding) ---
categorical_features = ['Type', 'Elaborate', 'Body', 'Acidity', 'Country']
df_wines = pd.get_dummies(df_wines, columns=categorical_features)

In [12]:
# --- Text Vectorization (TF-IDF for Grapes & Harmonize) ---
grapes_text = df_wines['Grapes'].apply(lambda lst: ' '.join(lst))
harmonize_text = df_wines['Harmonize'].apply(lambda lst: ' '.join(lst))
# Combine the text from both columns into a single corpus for vectorization
corpus = grapes_text + ' ' + harmonize_text
# Initialize and fit TF-IDF Vectorizer
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(corpus)


In [13]:
# --- High-Cardinality Categorical Preprocessing (Feature Hashing for RegionName) ---
N_FEATURES_HASH = 16 # Number of hash buckets (dimensionality of the output)
# Initialize FeatureHasher
# input_type='string' assumes the column contains strings
# alternate_sign=False keeps hashed values non-negative (often simpler)
hasher = FeatureHasher(n_features=N_FEATURES_HASH, input_type='string', alternate_sign=False)
# Prepare input for hasher: needs to be an iterable of iterables (list of lists/tuples)
# Each inner list contains the single string feature value for that row.
region_feature_values = [[region] for region in df_wines['RegionName']] # [[r1], [r2], ...] 
# Fit and transform the data
hashed_features_sparse = hasher.fit_transform(region_feature_values)

# Create new column names for the hashed features
hashed_feature_columns = [f'RegionHash_{i}' for i in range(N_FEATURES_HASH)]
# Create a DataFrame from the hashed features (sparse matrix converted to dense array)
df_hashed_regions = pd.DataFrame(hashed_features_sparse.toarray(),
columns=hashed_feature_columns,
index=df_wines.index)


In [14]:
# Concatenate the hashed features DataFrame with the main DataFrame
df_wines = pd.concat([df_wines.drop(['RegionName'], axis=1), df_hashed_regions], axis=1)

In [15]:
# Drop the original RegionName column as it's now represented by hashed features
df_wines = df_wines.drop(['WineryName','WineName'], axis=1)

In [16]:
# --- Combine Features into a Single Sparse Matrix ---
from scipy.sparse import csr_matrix
non_text_columns = ['ABV'] + [col for col in df_wines.columns if col.startswith('type_')
or col.startswith('elaborate_')
or col.startswith('body_')
or col.startswith('acidity_')
or col.startswith('country_')]
non_text_columns = [col for col in non_text_columns if col not in hashed_feature_columns]

if non_text_columns:
    numeric_sparse = csr_matrix(df_wines[non_text_columns].values)
else:
    numeric_sparse = None


In [17]:
# Combine all sparse matrices horizontally
from scipy.sparse import hstack

matrices = []
if numeric_sparse is not None:
    matrices.append(numeric_sparse)
    matrices.append(tfidf_matrix)
    matrices.append(hashed_features_sparse)

combined_sparse = hstack(matrices)
print("Combined feature matrix shape:", combined_sparse.shape)

Combined feature matrix shape: (1007, 278)


In [18]:
# --- Create Embedding DataFrame (Dense Representation) ---
# This converts the sparse matrix to a dense numpy array and then to a DataFrame.
combined_dense = combined_sparse.toarray()
embedding_df = pd.DataFrame(combined_dense, index=wine_ids)
embedding_df.index.name = "WineID"

In [19]:
embedding_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,268,269,270,271,272,273,274,275,276,277
WineID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-1.774251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100002,-0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,-0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100005,-0.711804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100007,-1.774251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
# --- Calculate Cosine Similarity ---
# Calculate similarity between wines based on the combined sparse features.
# cosine_similarity calculates row-wise similarities
similarity_matrix = cosine_similarity(combined_sparse)

In [21]:
# Create a DataFrame from the similarity matrix for easier lookup
# Use the original wine_ids for both index and columns
similarity_df = pd.DataFrame(similarity_matrix, index=wine_ids, columns=wine_ids)

In [22]:
similarity_df.head()

WineID,100001,100002,100003,100005,100007,100008,100010,100012,100013,100014,...,198580,198885,199204,199306,199360,199408,199481,199533,199885,200139
WineID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1.0,0.516307,0.516307,0.629946,0.952912,0.285456,-0.10889,0.545759,-0.623514,-0.623514,...,0.081255,0.241784,0.601641,0.565256,0.019903,0.012275,-0.10889,0.317677,0.231239,0.057731
100002,0.516307,1.0,0.555854,0.565764,0.516307,0.488982,-0.02778,0.189163,-0.208467,-0.208467,...,0.020476,0.094202,0.543265,0.176538,-0.009229,0.038551,-0.030144,0.066188,0.066188,0.070012
100003,0.516307,0.555854,1.0,0.871542,0.516307,0.951008,0.072776,0.189163,-0.213585,-0.213585,...,0.020476,0.101021,0.573418,0.196291,0.013615,-0.009229,0.024082,0.104534,0.088385,0.091915
100005,0.629946,0.565764,0.871542,1.0,0.629946,0.761955,0.014089,0.306634,-0.353273,-0.353273,...,0.033191,0.141283,0.613129,0.299768,0.021178,0.001845,0.031678,0.133691,0.142405,0.082375
100007,0.952912,0.516307,0.516307,0.629946,1.0,0.285456,-0.10889,0.542749,-0.623514,-0.623514,...,0.075553,0.235748,0.587169,0.548891,0.008771,0.002992,-0.10889,0.285907,0.220422,0.057731


In [23]:
# --- Calculate Wine Statistics (for Cold Start) ---
wine_stats = pd.DataFrame(columns=['WineID', 'rating_count', 'rating_mean', 'WineName']) # Ensure WineName for popularity
MIN_RATINGS_FOR_POPULARITY = 5 # Define a threshold for how many ratings a wine needs to be considered "popular"

if not df_ratings.empty and 'WineID' in df_ratings.columns and 'Rating' in df_ratings.columns:
    wine_stats_temp = df_ratings.groupby('WineID')['Rating'].agg(['count', 'mean']).reset_index()
    wine_stats_temp.rename(columns={'count': 'rating_count', 'mean': 'rating_mean'}, inplace=True)

    # Add WineName and other context to wine_stats
    context_cols_to_merge = ['WineID', 'WineName', 'Type', 'Country'] # Add a few key context
    valid_context_cols = [col for col in context_cols_to_merge if col in df_wines.columns]

    if 'WineID' in df_wines.columns:
         wine_stats = pd.merge(wine_stats_temp, df_wines[valid_context_cols], on='WineID', how='left')
         for col in valid_context_cols: # Fill NaNs for context columns
             if col != 'WineID' and col in wine_stats.columns and wine_stats[col].isnull().any():
                 wine_stats[col].fillna('Unknown', inplace=True)
         print(f"Wine statistics calculated and context merged for {len(wine_stats)} wines.")
    else:
        wine_stats = wine_stats_temp # Use without full context if original df is problematic
        warnings.warn("Warning: 'WineID' not found in original wine data. Wine_stats context might be limited.")
else:
    warnings.warn("Warning: Ratings data is empty or missing required columns. Cannot calculate wine statistics.")
# print(wine_stats.head())

Wine statistics calculated and context merged for 1007 wines.


In [24]:
from sklearn.model_selection import train_test_split

# --- Train/Test Split of Ratings Data ---
 # test_size=0.2 means 20% for testing, 80% for training
    # random_state ensures reproducibility
    # stratify attempts to keep the proportion of ratings per user similar in both sets,
    # which can be important if some users have very few ratings. Can be removed if it causes issues.
ratings_train, ratings_test = train_test_split(
        df_ratings,
        test_size=0.2,
        random_state=42,
        #stratify=df_ratings['UserID']# Stratify by UserID
)

In [25]:
# --- Recommendation Functions ---

def get_recommendation(target_wine_id, similarity_df, num_recs=5):
    """
    CONTENT-BASED: Gets wines most similar to the target wine.

    Args:
        target_wine_id: The ID of the wine to find similar items for.
        similarity_df: DataFrame containing pairwise wine similarities.
        num_recs: The number of recommendations to return.

    Returns:
        pandas.Series: Series containing similar WineIDs and their similarity scores,
                       or an empty Series if the wine or data is not found.
    """
    # Input validation
    if similarity_df.empty or target_wine_id not in similarity_df.index:
        # warnings.warn(f"Warning: WineID {target_wine_id} not found in similarity data or similarity data empty.")
        return pd.Series(dtype='float64') # Return empty series

    # Get similarity scores for the target wine
    sim_scores = similarity_df.loc[target_wine_id]

    # Find the top N+1 similar wines (including the target itself)
    # Using nlargest is efficient
    recs = sim_scores.nlargest(num_recs + 1)

    # Exclude the target wine itself from the recommendations
    recs = recs[recs.index != target_wine_id]

    return recs.head(num_recs)

In [26]:
def predict_rating(user_id, target_wine_id, df_ratings_history, similarity_df, k=10):
    """
    CONTENT-BASED: Predicts rating for a user and a target wine based on
                   the user's ratings of similar wines found in df_ratings_history.

    Args:
        user_id: The ID of the user for whom to predict the rating.
        target_wine_id: The ID of the wine to predict the rating for.
        df_ratings_history: DataFrame containing the user's KNOWN rating history
                            (e.g., ratings_train during evaluation).
        similarity_df: DataFrame containing pairwise wine similarities.
        k: The number of nearest neighbors (similar wines rated by the user) to consider.

    Returns:
        float or None: The predicted rating, or None if prediction is not possible.
    """
    # --- Input Validation and Data Retrieval ---
    if target_wine_id not in similarity_df.columns:
        # Fallback: Return user's average rating *from their history* if they exist, else None
        user_ratings_check = df_ratings_history.loc[df_ratings_history['UserID'] == user_id, 'Rating']
        return user_ratings_check.mean() if not user_ratings_check.empty else None

    target_similarities = similarity_df[target_wine_id]

    # Use the provided rating history (e.g., training data)
    user_ratings = df_ratings_history[df_ratings_history['UserID'] == user_id].set_index('WineID')['Rating']

    if user_ratings.empty:
        # If user has no history in the provided df (e.g., new user for training set)
        # Calculate global average rating *from the provided history* as fallback
        # This avoids leaking information from the test set during evaluation.
        global_average = df_ratings_history['Rating'].mean()
        return global_average if not np.isnan(global_average) else None # Return None if history is empty


    # --- Find Relevant Neighbors ---
    rated_indices = user_ratings.index.intersection(target_similarities.index)
    rated_indices = rated_indices.drop(target_wine_id, errors='ignore')

    # Calculate user's average rating *from their history* for fallback
    user_average_rating = user_ratings.mean() # Used if no similar items found

    if rated_indices.empty:
        return user_average_rating # Fallback to user average from history

    similarities_of_rated = target_similarities.loc[rated_indices]
    positive_similarities = similarities_of_rated[similarities_of_rated > 0]

    if positive_similarities.empty:
        return user_average_rating # Fallback to user average from history

    # --- Calculate Weighted Average ---
    actual_k = min(k, len(positive_similarities))
    top_k_neighbors = positive_similarities.nlargest(actual_k)
    neighbor_ratings = user_ratings.loc[top_k_neighbors.index]
    weighted_sum = (top_k_neighbors * neighbor_ratings).sum()
    sum_of_weights = top_k_neighbors.sum()

    # --- Final Prediction ---
    if sum_of_weights == 0:
        return user_average_rating # Fallback to user average from history

    predicted_rating = weighted_sum / sum_of_weights
    return predicted_rating

In [27]:
def rank_items_using_predict_rating(user_id, candidate_items, df_ratings_history, similarity_df, k=10, num_recs=10):
    """
    Predicts ratings for a list of candidate items for a user (using their known history)
    and ranks them.

    Args:
        user_id: The ID of the user.
        candidate_items (list): A list of WineIDs to predict ratings for.
        df_ratings_history: DataFrame containing user's KNOWN rating history.
        similarity_df: DataFrame containing pairwise wine similarities.
        k (int): Number of neighbors for the predict_rating function.
        num_recs (int): The maximum number of recommendations to return.

    Returns:
        pd.DataFrame: DataFrame with 'WineID' and 'predicted_rating' columns, sorted.
    """
    # (No changes needed, but uses history e.g., ratings_train)
    predictions = {}
    if not candidate_items:
        return pd.DataFrame()
    # print(f"  Predicting ratings for {len(candidate_items)} candidate items...") # Verbose
    for wine_id in candidate_items:
        # Use the provided rating history for prediction
        predicted = predict_rating(user_id, wine_id, df_ratings_history, similarity_df, k=k)
        if predicted is not None:
            predictions[wine_id] = predicted
    if not predictions:
        return pd.DataFrame()
    sorted_candidates = sorted(predictions.items(), key=lambda item: item[1], reverse=True)
    recs_df = pd.DataFrame(sorted_candidates, columns=['WineID', 'predicted_rating']).head(num_recs)
    # print(f"  Ranking complete. Returning top {len(recs_df)} items.") # Verbose
    return recs_df


In [28]:
def check_if_user_has_ratings(user_id, df_ratings):
    """Checks if a user exists and has rated at least one item in the given ratings data."""
    # (No changes needed, but will be called with ratings_train usually)
    if df_ratings is None or df_ratings.empty or 'UserID' not in df_ratings.columns:
        return False
    return (df_ratings['UserID'] == user_id).any()


def find_candidate_items_for_user(user_id, all_wine_ids, df_ratings_history):
    """
    Identifies potential wine candidates for recommendation to a user,
    excluding items already present in their known history (df_ratings_history).

    Args:
        user_id: The ID of the user.
        all_wine_ids (iterable): A list or index of all possible WineIDs.
        df_ratings_history: DataFrame containing the user's KNOWN rating history.

    Returns:
        list: A list of WineIDs that the user has not rated in the history.
    """
    # (No changes needed, but uses history e.g., ratings_train)
    if df_ratings_history is None or df_ratings_history.empty or 'UserID' not in df_ratings_history.columns or 'WineID' not in df_ratings_history.columns:
        return list(all_wine_ids)
    rated_by_user = set(df_ratings_history.loc[df_ratings_history['UserID'] == user_id, 'WineID'].unique())
    if not rated_by_user:
        return list(all_wine_ids)
    candidates = [wine_id for wine_id in all_wine_ids if wine_id not in rated_by_user]
    return candidates

In [29]:
def recommend_for_user(user_id,
                       df_ratings_history,
                       similarity_df,
                       df_wines_original,
                       wine_stats, # Used for popularity
                       num_recs=10,
                       k_neighbors_pred=15,
                       min_ratings_pop=MIN_RATINGS_FOR_POPULARITY # Pass the threshold
                       ):
    """
    Generates wine recommendations. Uses personalized for known users,
    popularity-based for new users.
    """
    print(f"\n--- Generating recommendations for UserID: {user_id} ---")

    if similarity_df is None or similarity_df.empty:
         print("ERROR: Similarity matrix is missing. Cannot recommend.")
         return pd.DataFrame()
    all_wine_ids = similarity_df.index
    if not all_wine_ids.any():
         print("ERROR: No wine IDs in similarity matrix. Cannot recommend.")
         return pd.DataFrame()

    user_has_history = check_if_user_has_ratings(user_id, df_ratings_history)
    final_recs_df = pd.DataFrame()
    reason = "Unknown"

    if user_has_history:
        print(f"User {user_id} has rating history. Using personalized strategy.")
        reason = 'Personalized Prediction'
        candidate_items = find_candidate_items_for_user(user_id, all_wine_ids, df_ratings_history)
        if not candidate_items:
            print(f"  User {user_id} has rated all available items. No new recommendations.")
            return pd.DataFrame()

        recs_pred = rank_items_using_predict_rating(
            user_id, candidate_items, df_ratings_history, similarity_df,
            k=k_neighbors_pred, num_recs=num_recs
        )
        if not recs_pred.empty:
            print(f"  Generated {len(recs_pred)} personalized recommendations.")
            final_recs_df = recs_pred
        else:
            print("  Failed to generate personalized recommendations.")

    else: # Cold User: Implement Popularity Fallback
        print(f"User {user_id} is new or has no history. Using popularity-based cold start.")
        reason = 'Cold Start (Popularity)'
        if wine_stats is not None and not wine_stats.empty and \
           'rating_count' in wine_stats.columns and \
           'rating_mean' in wine_stats.columns and \
           'WineID' in wine_stats.columns:

            # Filter for wines with a minimum number of ratings
            popular_wines = wine_stats[wine_stats['rating_count'] >= min_ratings_pop].copy()

            if not popular_wines.empty:
                # Sort by mean rating (descending), then by count (descending as tie-breaker)
                popular_wines_sorted = popular_wines.sort_values(
                    by=['rating_mean', 'rating_count'], ascending=[False, False]
                )
                # Select top N, ensuring 'WineID' and a score column exist
                final_recs_df = popular_wines_sorted[['WineID', 'rating_mean']].head(num_recs)
                final_recs_df = final_recs_df.rename(columns={'rating_mean': 'popularity_score'}) # Or 'predicted_rating' for consistency
                print(f"  Generated {len(final_recs_df)} recommendations based on popularity (min {min_ratings_pop} ratings).")
            else:
                print(f"  No wines found meeting the popularity criteria (min {min_ratings_pop} ratings). Trying global top if any stats exist.")
                # Fallback to any wine if the strict criteria yields nothing
                if not wine_stats.empty:
                    popular_wines_sorted = wine_stats.sort_values(
                        by=['rating_mean', 'rating_count'], ascending=[False, False]
                    )
                    final_recs_df = popular_wines_sorted[['WineID', 'rating_mean']].head(num_recs)
                    final_recs_df = final_recs_df.rename(columns={'rating_mean': 'popularity_score'})
                    print(f"  Generated {len(final_recs_df)} recommendations based on global popularity (no min rating count filter).")
                else:
                    print("  Wine statistics are empty. Cannot generate popularity recommendations.")
        else:
            print("  Wine statistics are missing or incomplete. Cannot generate popularity recommendations.")

    # --- Final Formatting and Context Addition ---
    if not final_recs_df.empty:
        final_recs_df['reason'] = reason
        # Ensure WineID column for merging context
        if 'WineID' not in final_recs_df.columns:
            warnings.warn("Warning: 'WineID' column missing from recommendations before context merge.")
            return final_recs_df # Return as is, cannot merge

        context_cols_to_add = ['WineName', 'Type', 'Country', 'RegionName', 'WineryName']
        valid_context_cols = ['WineID'] + [col for col in context_cols_to_add if col in df_wines_original.columns]

        if len(valid_context_cols) > 1:
             try:
                 # Ensure types match for merge
                 if df_wines_original['WineID'].dtype != final_recs_df['WineID'].dtype:
                    final_recs_df.loc[:, 'WineID'] = final_recs_df['WineID'].astype(df_wines_original['WineID'].dtype)

                 final_recs_df = pd.merge(
                     final_recs_df,
                     df_wines_original[valid_context_cols],
                     on='WineID',
                     how='left'
                 )
                 for col in valid_context_cols:
                     if col != 'WineID' and col in final_recs_df.columns and final_recs_df[col].isnull().any():
                         final_recs_df.loc[:, col] = final_recs_df[col].fillna('Unknown')
                 # print("  Context added.") # Less verbose
             except Exception as e:
                  warnings.warn(f"Could not merge context: {e}")
        # Reorder columns if WineName exists
        first_cols = ['WineID', 'WineName']
        if 'WineName' in final_recs_df.columns and 'WineID' in final_recs_df.columns:
             ordered_cols = first_cols + [col for col in final_recs_df.columns if col not in first_cols]
             final_recs_df = final_recs_df[ordered_cols]
    else:
        print("No recommendations were generated.")
    return final_recs_df

In [30]:
data_ready = True
required_dfs = {
'df_ratings_train': ratings_train,
'df_ratings_test': ratings_test,
'similarity_df': similarity_df,
'df_for_prefs': df_for_prefs,
'wine_stats': wine_stats,
'df_wines_original': df_wines
}



In [31]:
   # --- Example 1: Item-to-Item Recommendation (Unaffected by split) ---

if not wine_ids.empty:
    example_wine_id_item = wine_ids.iloc[0]
    print(f"Finding recommendations similar to WineID: {example_wine_id_item}...")
    recommendations_item = get_recommendation(example_wine_id_item, similarity_df, num_recs=5)
    if not recommendations_item.empty:
        rec_df_item = recommendations_item.reset_index()
        rec_df_item.columns = ['WineID', 'similarity']
        rec_df_item = pd.merge(rec_df_item, df_wines[['WineID']], on='WineID', how='left')
        print("Recommendations:")
        print(rec_df_item)
    else:
        print(f"Could not find recommendations for WineID {example_wine_id_item}.")
else:
    print("Skipping item-to-item example: wine_ids list is empty.")



Finding recommendations similar to WineID: 100001...
Recommendations:
   WineID  similarity
0  100073    1.000000
1  100061    0.999034
2  100007    0.952912
3  100088    0.902500
4  184497    0.832754


In [33]:
   # --- Example 1: Item-to-Item (Unaffected by split) ---
if not wine_ids.empty and not similarity_df.empty :
    example_wine_id_item = wine_ids.iloc[0]
    recommendations_item = get_recommendation(example_wine_id_item, similarity_df, num_recs=3)
    if not recommendations_item.empty:
        rec_df_item = recommendations_item.reset_index(); rec_df_item.columns = ['WineID', 'similarity']
        rec_df_item = pd.merge(rec_df_item, df_wines[['WineID']], on='WineID', how='left')
        print(rec_df_item)
    else: print(f"No item recommendations for WineID {example_wine_id_item}.")
else: print("Skipping item-to-item example (wine_ids or similarity_df missing).")

   WineID  similarity
0  100073    1.000000
1  100061    0.999034
2  100007    0.952912


In [37]:
from numpy import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error  # --- Evaluation: Predict Ratings on Test Set ---
actual_ratings, predicted_ratings_eval = [], []
if not ratings_test.empty and not ratings_train.empty:
    for row in ratings_test.itertuples(index=False):
        predicted = predict_rating(row.UserID, row.WineID, ratings_train, similarity_df, k=15)
        if predicted is not None:
            actual_ratings.append(row.Rating)
            predicted_ratings_eval.append(predicted)
    if actual_ratings:
            rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings_eval))
            mse = mean_squared_error(actual_ratings, predicted_ratings_eval)
            mae = mean_absolute_error(actual_ratings, predicted_ratings_eval)
            print(f"Test Set RMSE: {rmse:.4f} (evaluated on {len(actual_ratings)} pairs)")
            print(f"Test Set MSE: {mse:.4f} (evaluated on {len(actual_ratings)} pairs)")
            print(f"Test Set MAE: {mae:.4f} (evaluated on {len(actual_ratings)} pairs)")
    else: print("No predictions made on test set for RMSE.")
else: print("Skipping RMSE evaluation (ratings_test or ratings_train empty).")

Test Set RMSE: 0.6258 (evaluated on 30000 pairs)
Test Set MSE: 0.3917 (evaluated on 30000 pairs)
Test Set MAE: 0.4732 (evaluated on 30000 pairs)


In [36]:
  # --- Example 2: Master Recommender - Existing User (using Training History) ---
if not ratings_train.empty:
    example_user_id_existing = ratings_train['UserID'].unique()[0]
    recs_existing = recommend_for_user(
        user_id=example_user_id_existing, df_ratings_history=ratings_train,
        similarity_df=similarity_df, df_wines_original=df_wines,
            wine_stats=wine_stats, num_recs=3
        )
    if not recs_existing.empty: print(recs_existing) 
    else: print("No recommendations for existing user.")
else: print("Skipping existing user example (ratings_train empty).")


--- Generating recommendations for UserID: 1149860 ---
User 1149860 has rating history. Using personalized strategy.
  Generated 3 personalized recommendations.
   WineID  predicted_rating                   reason
0  141176          4.486825  Personalized Prediction
1  179690          4.480614  Personalized Prediction
2  180923          4.473372  Personalized Prediction


In [38]:
# --- Example 3: Master Recommender - New User (No History - Should get Popularity) ---
new_user_id_no_hist = 999999 # An ID not in ratings_train
recs_new_no_hist = recommend_for_user(
    user_id=new_user_id_no_hist, df_ratings_history=ratings_train, # Pass train for history check
    similarity_df=similarity_df, df_wines_original=df_wines,
    wine_stats=wine_stats, num_recs=3, min_ratings_pop=3 # Use a lower threshold for small dataset
)
if not recs_new_no_hist.empty:
    print("\nRecommendations for New User (Popularity):")
    print(recs_new_no_hist)
else:
    print("Could not generate popularity recommendations for the new user (check wine_stats and thresholds).")


--- Generating recommendations for UserID: 999999 ---
User 999999 is new or has no history. Using popularity-based cold start.
  Generated 3 recommendations based on popularity (min 3 ratings).

Recommendations for New User (Popularity):
     WineID  popularity_score                   reason
831  182711          4.807692  Cold Start (Popularity)
481  144337          4.794118  Cold Start (Popularity)
819  181199          4.794118  Cold Start (Popularity)
