In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, FeatureHasher
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm

In [2]:
# --- Data Loading ---
df_wines = pd.read_csv("XWines_Slim_1K_wines.csv")
df_ratings = pd.read_csv("XWines_Slim_150K_ratings.csv")

  df_ratings = pd.read_csv("XWines_Slim_150K_ratings.csv")


In [3]:
# --- Train/Test Split FIRST (clearly avoiding leakage) ---
train_ratings, test_ratings = train_test_split(df_ratings, test_size=0.2, random_state=42)

In [4]:
# --- Selecting Wines ONLY from Training Data (avoids leakage) ---
train_wine_ids = train_ratings['WineID'].unique()
df_wines_train = df_wines[df_wines['WineID'].isin(train_wine_ids)].copy().reset_index(drop=True)

In [5]:
# --- Text Preprocessing (Lowercase) ---
# Standardize all categorical text attributes to lowercase for consistency
cols_lowercase = ['WineName', 'Type', 'Elaborate', 'Body', 'Acidity', 
                  'Country', 'RegionName', 'WineryName']
df_wines_train[cols_lowercase] = df_wines_train[cols_lowercase].apply(lambda x: x.str.lower())

In [6]:
# --- List-column Cleaning (Grapes & Harmonize) ---
# Clean up columns that represent lists as strings, remove brackets/quotes, and convert to actual Python lists
for col in ['Grapes', 'Harmonize']:
    df_wines_train[col] = df_wines_train[col].fillna('') \
                         .str.replace(r'[\[\]\']/','',regex=True) \
                         .apply(lambda x: [w.strip().lower() for w in x.split(',') if w.strip()])

In [7]:
# --- Numeric Preprocessing (ABV) ---
# Normalize numerical ABV (alcohol by volume) feature using StandardScaler (mean=0, variance=1)
scaler = StandardScaler()
df_wines_train['ABV'] = scaler.fit_transform(df_wines_train[['ABV']].astype(float))

In [8]:
# --- Preserve original wine data for readable recommendations ---
# Save a copy of original wine data attributes for final presentation purposes
context_cols=['WineID','WineName','WineryName','Type','Country','RegionName','ABV']
df_wines_original = df_wines_train[context_cols].copy()

In [9]:
# --- Categorical Features (One-Hot Encoding) ---
# Represent categorical variables using one-hot encoding
cat_features = ['Type','Elaborate','Body','Acidity','Country']
df_wines_train_encoded = pd.get_dummies(df_wines_train,columns=cat_features)

In [10]:
# --- TF-IDF Vectorization (Grapes + Harmonize) ---
# Create a textual corpus by combining Grapes and Harmonize features
corpus = df_wines_train['Grapes'].str.join(' ')+' '+df_wines_train['Harmonize'].str.join(' ')
# Vectorize corpus using TF-IDF to capture the uniqueness of terms across wines
tfidf_vec = TfidfVectorizer().fit_transform(corpus)

In [11]:
# --- Feature hashing (RegionName) ---
# Use FeatureHasher to reduce high-cardinality categorical RegionName variable to 16 hashed features
hasher = FeatureHasher(n_features=16,input_type='string',alternate_sign=False)
hashed_region = hasher.fit_transform(df_wines_train[['RegionName']].values)

In [12]:
# --- Combine all features into Sparse Embeddings ---
# Identify numeric and encoded categorical features to combine with embeddings
non_text_cols = ['ABV'] + [col for col in df_wines_train_encoded if col.startswith(('Type_', 'Elaborate_', 'Body_', 'Acidity_', 'Country_'))]
# Convert explicitly to float to ensure correct numeric data types
numeric_sparse = csr_matrix(df_wines_train_encoded[non_text_cols].astype(float).values)

In [13]:

# Combine numeric, TF-IDF text vectors, and hashed region vectors into a single sparse embedding vector
combined_train_sparse = hstack([numeric_sparse, tfidf_vec, hashed_region])

# --- Compute Similarity Matrix (Training Only) ---
# Calculate pairwise cosine similarity among wines based on embeddings
similarity_train = cosine_similarity(combined_train_sparse)
similarity_df_train = pd.DataFrame(similarity_train,
                                   index=df_wines_train['WineID'],
                                   columns=df_wines_train['WineID'])

In [14]:
# --- Predict Rating (User-Based via Similarity) ---
# Predict rating for a given user and wine based on similarity-weighted ratings from the user's history
def predict_rating(user_id, wine_id, train_ratings, similarity_df, global_mean):
    user_history = train_ratings[train_ratings.UserID == user_id]
    rated_wines = user_history['WineID'][user_history['WineID'].isin(similarity_df.index)]
    if rated_wines.empty or wine_id not in similarity_df.index:
        return global_mean
    similarities = similarity_df.loc[wine_id, rated_wines]
    if similarities.abs().sum() == 0:
        return global_mean
    ratings = user_history.set_index('WineID').loc[similarities.index]['Rating']
    return np.dot(ratings, similarities) / similarities.abs().sum()

In [15]:
from numpy import sqrt


# --- Evaluation Function ---
def evaluate(test_ratings, train_ratings, similarity_df):
    global_mean = train_ratings['Rating'].mean()
    y_true, y_pred = [], []
    for _,r in tqdm(test_ratings.iterrows(),total=len(test_ratings)):
        pred = predict_rating(r.UserID,r.WineID,train_ratings,similarity_df,global_mean)
        y_true.append(r.Rating)
        y_pred.append(pred)
    rmse = sqrt(mean_squared_error(y_true,y_pred))
    mae = mean_absolute_error(y_true,y_pred)
    return rmse, mae

rmse, mae = evaluate(test_ratings,train_ratings,similarity_df_train)
print(f" RMSE: {rmse:.4f}\n MAE: {mae:.4f}")

100%|██████████| 30000/30000 [00:47<00:00, 625.37it/s]

 RMSE: 0.6217
 MAE: 0.4694





In [16]:
# --- Evaluation Function (RMSE & MAE) ---
# Evaluate recommendation predictions on the test set using RMSE and MAE
def evaluate(test_ratings, train_ratings, similarity_df):
    global_mean = train_ratings['Rating'].mean()
    y_true, y_pred = [], []
    for _,r in tqdm(test_ratings.iterrows(),total=len(test_ratings)):
        pred = predict_rating(r.UserID,r.WineID,train_ratings,similarity_df,global_mean)
        y_true.append(r.Rating)
        y_pred.append(pred)
    rmse = np.sqrt(mean_squared_error(y_true,y_pred))
    mae = mean_absolute_error(y_true,y_pred)
    return rmse, mae

In [17]:
# Execute evaluation and display performance results
rmse, mae = evaluate(test_ratings,train_ratings,similarity_df_train)
print(f" RMSE: {rmse:.4f}\n MAE: {mae:.4f}")

100%|██████████| 30000/30000 [00:42<00:00, 701.67it/s]

 RMSE: 0.6217
 MAE: 0.4694





In [18]:
# --- Recommendation (Item-to-Item) ---
# Function recommending similar wines based on cosine similarity scores
def get_recommendation(wine_id, similarity_df,df_wines_original,num_recs=5):
    if wine_id not in similarity_df.index:
        return pd.DataFrame()
    similar = similarity_df[wine_id].sort_values(ascending=False).drop(wine_id).head(num_recs).reset_index()
    similar.columns=['WineID','Similarity']
    return pd.merge(similar,df_wines_original[['WineID', 'Type', 'Country']], on='WineID', how='left')

In [19]:
# Example usage of function: Provide similar wines for user-selected wine
example_wine_id=train_ratings['WineID'].iloc[0]
recommendations=get_recommendation(example_wine_id,similarity_df_train,df_wines_original)

print(f"Item-to-item recommendations for WineID {example_wine_id}\n{recommendations}")

Item-to-item recommendations for WineID 193486
   WineID  Similarity   Type      Country
0  193487    1.000000  white  new zealand
1  193484    1.000000  white  new zealand
2  193482    0.999935  white  new zealand
3  193499    0.998375  white  new zealand
4  193489    0.998375  white  new zealand


In [20]:
# --- Popular Wines utility (Cold-Start Recommendations) ---
# Identify most popular/highly-rated wines in training data, useful for cold-start recommendations
def popular_wines(train_ratings, df_wines_original, n=5, min_ratings=10):
    popular = train_ratings.groupby('WineID')['Rating'].agg(['mean', 'count'])
    popular = popular[popular['count'] >= min_ratings].sort_values('mean',ascending=False).head(n)
    return df_wines_original.loc[
        df_wines_original['WineID'].isin(popular.index), ['WineID', 'Type', 'Country']
    ]

In [21]:
# Display example popular wine recommendations
popular=popular_wines(train_ratings,df_wines_original)
print(f"Popular wines for cold-start recommendations:\n{popular}")

Popular wines for cold-start recommendations:
     WineID       Type        Country
258  112084  sparkling         france
481  144337    dessert          italy
716  174184        red      australia
806  180330        red  united states
833  183379        red  united states
