# Content-based recommender
## Using: cosine similarity
## Predicting: Ratings, Top-K recommendations
## Next steps: try autoencoders for content-based recommendations or go straight to GNNs?

### Workflow:
* **Read the data**. During this step I keep only the required columns, parse dates for the time-aware train/test split, and format the string-list columns (Grapes, Harmonize).
* **Train/test split**. Per-user 80/20 split based on dates, so that each user's most recent records end up in the test split.
* **Preprocess wines data**:
    * **StandardScaler** - ABV numeric column
    * **One-hot-encoding** - Type, Elaborate, Body, Acidity, since these columns have a reasonable number of classes.
    * **Frequency encoding** - WineryName, RegionName, Country
    * **TF-IDF or MultiLabelBinarizer** - Grapes, Harmonize. These columns can be treated either as categories or as text features. I will compare the difference in results between the two encodings.
    Note: custom wrappers were created for preprocessing so that every transformer is compatible with the pipeline and outputs a csr_matrix.
    * **Create user profiles**. Normalize both user profiles and wine matrix.
    * **Predict ratings**. Evaluate using MAE and RMSE.
    * **Give top-k recommendations**. Evaluate using Recall@k. I will add more metrics later.

In [2]:
import pandas as pd
import numpy as np
import ast

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders import TargetEncoder
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

from scipy.sparse import csr_matrix, issparse, lil_matrix, hstack, vstack, save_npz


In [3]:

# Turn a stringified list into a real Python list
# (e.g. "['a', 'b', 'c']" → ['a', 'b', 'c'])
def parse_list_col(s):
    """Evaluate the Python literal stored in the string *s* and return it."""
    parsed = ast.literal_eval(s)
    return parsed

# Format a stringified list as a single-space-separated string
# (e.g. "['a', 'b', 'c']" → "a b c")
def parse_list_col_str(s):
    """Flatten a stringified list into space-separated text.

    Square brackets at either end and all single quotes are removed, and
    commas become separators. Runs of whitespace are then collapsed so the
    items end up separated by exactly one space, as the example above shows
    (replacing ", " by a space alone would leave two spaces between items).

    Args:
        s: Text such as "['a', 'b', 'c']".

    Returns:
        The items joined by single spaces; "" for "[]" or an empty string.
    """
    flattened = s.strip("[]").replace("'", "").replace(',', ' ')
    # split() with no argument drops leading/trailing and repeated whitespace
    return ' '.join(flattened.split())

# Time-based split of a DataFrame by user
# (e.g. 80% train, 20% test) based on the date column
def user_time_split(df, date_col='Date', split_ratio=0.8):
    """Split every user's rows chronologically into an early and a late part.

    Rows are grouped by 'UserID' (groups keep their order of first
    appearance). Each group is sorted by ``date_col`` and cut at
    ``int(len(group) * split_ratio)``: rows before the cut go to the first
    frame, the remaining (most recent) rows go to the second frame.

    Args:
        df: DataFrame with a 'UserID' column and a ``date_col`` column.
        date_col: Name of the column used for chronological ordering.
        split_ratio: Fraction of each user's rows placed in the first frame.

    Returns:
        A ``(train, held_out)`` tuple of DataFrames. Both are empty frames
        with the columns of ``df`` when ``df`` has no rows.
    """
    train_parts, val_parts = [], []
    for _, group in df.groupby('UserID', sort=False):
        group = group.sort_values(date_col)
        cut = int(len(group) * split_ratio)
        train_parts.append(group.iloc[:cut])
        val_parts.append(group.iloc[cut:])

    if not train_parts:
        # pd.concat raises on an empty list, so hand back two empty frames
        return df.iloc[:0].copy(), df.iloc[:0].copy()
    return pd.concat(train_parts), pd.concat(val_parts)


In [4]:

# Read the wines metadata (only the listed columns); the stringified list
# columns Grapes and Harmonize are converted to plain strings by
# parse_list_col_str while the file is read
print("▶ Reading wines metadata…")
wines = pd.read_csv(
    './data/XWines_Full_100K_wines.csv',
    usecols=['WineID', 'Type', 'Elaborate', 'ABV', 'Body', 'Acidity', 'RegionName', 'WineryName', 'Grapes','Harmonize','Country'],
    converters={
        'Grapes':    parse_list_col_str,
        'Harmonize': parse_list_col_str
    }
)
print(f"   ✓ Loaded wines: {len(wines):,} rows")

# Index the frame by WineID and keep the IDs, in row order, for later use
wines = wines.set_index('WineID')
wine_ids = wines.index.tolist()

# Read the ratings data and convert the 'Date' column to datetimes
print("▶ Reading ratings…")
ratings = pd.read_csv(
    './data/XWines_Full_21M_ratings.csv',
    usecols=['UserID','WineID','Date','Rating'],
)
# `date_format` expects a strftime string (or a per-column dict), not a
# callable, so the dates are parsed explicitly here instead. A value that
# cannot be parsed raises right away rather than possibly leaving the column
# as plain strings, which the time-based split would then sort as text.
ratings['Date'] = pd.to_datetime(ratings['Date'])
print(f"   ✓ Loaded ratings: {len(ratings):,} rows")

▶ Reading wines metadata…
   ✓ Loaded wines: 100,646 rows
▶ Reading ratings…
   ✓ Loaded ratings: 21,013,536 rows


In [5]:
## Noise reduction

# Keep only the ratings of wines that were rated at least 5 times
print("▶ Filtering wines with ≥5 ratings…")
wine_counts = ratings['WineID'].value_counts()
good_wines  = wine_counts.index[(wine_counts >= 5).to_numpy()]
ratings     = ratings.loc[ratings['WineID'].isin(good_wines)]
print(f"   ✓ Ratings remaining after wine filter: {len(ratings):,} rows")

# Then keep only the ratings of users who (still) have at least 5 reviews
print("▶ Filtering users with ≥5 reviews…")
user_counts = ratings['UserID'].value_counts()
good_users  = user_counts.index[(user_counts >= 5).to_numpy()]
ratings     = ratings.loc[ratings['UserID'].isin(good_users)]
print(f"   ✓ Ratings remaining after user filter: {len(ratings):,} rows")

▶ Filtering wines with ≥5 ratings…
   ✓ Ratings remaining after wine filter: 21,013,536 rows
▶ Filtering users with ≥5 reviews…
   ✓ Ratings remaining after user filter: 21,013,536 rows


In [6]:
# Per-user time-based split of the ratings dataset into train and test sets
# (earliest 80% of each user's ratings → train, most recent 20% → test)
print("▶ Splitting off test set (80/20 by user-time)…")
train, test = user_time_split(ratings, date_col='Date', split_ratio=0.8)
print(f"   ✓ train: {len(train):,} rows, test: {len(test):,} rows")
# # (75% train, 25% val)
# print("▶ Splitting train/val (75/25 by user-time)…")
# train, val = user_time_split(train_val, date_col='Date', split_ratio=0.75)
# print(f"   ✓ train: {len(train):,} rows, val: {len(val):,} rows")

▶ Splitting off test set (80/20 by user-time)…
   ✓ train: 16,396,768 rows, test: 4,616,768 rows


In [7]:
# Group the wine columns by the encoder each group is sent through

# Use StandardScaler for numerical features
numerical_features = ['ABV']
# Use one-hot encoding for categorical features
categorical_features = ['Type', 'Elaborate', 'Body', 'Acidity']
# Use TF-IDF for the list-like columns, which are read in as text
text_features = ['Grapes', 'Harmonize']
# Use frequency encoding for these name-like categorical features
freq_features = ['WineryName', 'RegionName', 'Country']


In [8]:
class StandardScalerWrapper(BaseEstimator, TransformerMixin):
    """StandardScaler (``with_mean=False``) whose output is always sparse.

    The input column names seen by ``fit`` are remembered and reported back
    unchanged by ``get_feature_names_out``.
    """

    def __init__(self):
        self.scaler = StandardScaler(with_mean=False)
        self.feature_names = []

    def fit(self, X, y=None):
        """Record the column names of *X* and fit the wrapped scaler on it."""
        self.feature_names = X.columns.tolist()
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Scale *X*, wrapping a dense result in a csr_matrix."""
        scaled = self.scaler.transform(X)
        return scaled if issparse(scaled) else csr_matrix(scaled)

    def get_feature_names_out(self, input_features=None):
        """Return the column names captured during ``fit`` as an array."""
        return np.array(self.feature_names)

class TFIDFWrapper(BaseEstimator, TransformerMixin):
    """TF-IDF over several text columns treated as one document per row.

    The values of all columns in a row are joined with spaces, so every
    column shares a single vocabulary.
    """

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.feature_names = []

    @staticmethod
    def _rows_as_text(X):
        # One space-joined string per row, built from all columns of X
        return X.apply(lambda row: ' '.join(row.values), axis=1)

    def fit(self, X, y=None):
        """Learn the vocabulary from the row-wise joined text of *X*."""
        self.tfidf.fit(self._rows_as_text(X))
        self.feature_names = self.tfidf.get_feature_names_out()
        return self

    def transform(self, X):
        """Return the TF-IDF matrix of the row-wise joined text as csr_matrix."""
        return csr_matrix(self.tfidf.transform(self._rows_as_text(X)))

    def get_feature_names_out(self, input_features=None):
        """Return the vocabulary terms learned during ``fit`` as an array."""
        return np.array(self.feature_names)

class MultiLabelWrapper(BaseEstimator, TransformerMixin):
    """Multi-hot encode columns whose cells are iterables of labels.

    One MultiLabelBinarizer is fitted per column; ``transform`` builds one
    sparse indicator block per column and stacks the blocks side by side.
    Labels not seen during ``fit`` are ignored.
    """

    def __init__(self):
        self.encoders = {}
        self.feature_names = []

    def fit(self, X, y=None):
        """Fit one binarizer per column and collect ``<col>__<label>`` names."""
        self.feature_names = []
        for col in X.columns:
            binarizer = MultiLabelBinarizer()
            binarizer.fit(X[col])
            self.encoders[col] = binarizer
            # Feature names contributed by this column
            self.feature_names += [f"{col}__{cls}" for cls in binarizer.classes_]
        return self

    def transform(self, X):
        """Return the column-wise indicator blocks stacked into one csr matrix."""
        blocks = []
        for col in X.columns:
            classes = self.encoders[col].classes_
            position = dict(zip(classes, range(len(classes))))
            indicator = lil_matrix((len(X), len(classes)), dtype=np.uint8)

            for row_no, labels in enumerate(X[col]):
                for label in labels:
                    col_no = position.get(label)
                    # Labels absent from the fitted classes are skipped
                    if col_no is not None:
                        indicator[row_no, col_no] = 1

            blocks.append(indicator.tocsr())
        return hstack(blocks, format='csr')

    def get_feature_names_out(self, input_features=None):
        """Return the ``<col>__<label>`` names collected during ``fit``."""
        return np.array(self.feature_names)

class FrequencyEncoder:
    """Encode each category by the relative frequency it had during ``fit``.

    Every input column yields one output column named ``<col>_freq``.
    Categories that were not seen during ``fit`` (and missing values) are
    encoded as 0.
    """
    # NOTE(review): unlike the other wrappers in this file, this class does
    # not inherit BaseEstimator/TransformerMixin. It only offers fit/transform
    # by duck typing — confirm this is accepted by the scikit-learn version in
    # use.

    def __init__(self):
        self.feature_names = []
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Learn the normalised value counts of every column of *X*."""
        self.feature_names = X.columns.tolist()
        # Rebuild the dict from scratch so that refitting on different columns
        # does not keep the frequency maps of a previous fit around.
        self.freq_maps = {
            col: X[col].value_counts(normalize=True) for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a csr matrix with one frequency column per column of *X*."""
        encoded_columns = []
        for col in X.columns:
            # A column that was never fitted maps everything to NaN → 0
            freq_map = self.freq_maps.get(col, {})
            col_freq = X[col].map(freq_map).fillna(0).values.reshape(-1, 1)
            encoded_columns.append(csr_matrix(col_freq))
        return hstack(encoded_columns, format='csr')

    def get_feature_names_out(self, input_features=None):
        """Return ``<col>_freq`` for every column seen during ``fit``."""
        return np.array([f"{col}_freq" for col in self.feature_names])

In [9]:
## Preprocessing pipeline
## One small pipeline per feature group, combined column-wise by a ColumnTransformer

# Numerical: scaled by StandardScalerWrapper
numerical_pipeline = Pipeline([
    ('scaler', StandardScalerWrapper()),
    ])

# Categorical via one-hot-encoding (categories unseen at fit time are ignored)
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

# Text features via TF-IDF
tfidf_pipeline = Pipeline([
    ('tfidf', TFIDFWrapper())
])

# Categorical features via frequency encoding, then scaled without centering.
# NOTE: the step name 'date' is only a label; the step is the FrequencyEncoder.
freq_pipeline = Pipeline([
    ('date', FrequencyEncoder()),
    ('scaler', StandardScaler(with_mean=False))
])

# Preprocessor: route each feature group to its pipeline
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features),
    ('tfidf', tfidf_pipeline, text_features),
    ('names', freq_pipeline, freq_features),

])

# Single-step pipeline wrapping the ColumnTransformer
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])


In [10]:
# Fit the preprocessing pipeline on the wines data and encode it in one call
wines_features = preprocessing_pipeline.fit_transform(wines)



In [11]:
# WineID → row position of that wine in the wines_features matrix
wine_id_to_idx = dict(zip(wine_ids, range(len(wine_ids))))

# UserID → row position, numbered in order of first appearance in the train split
user_ids = train['UserID'].unique()
user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))


In [12]:
# Create a sparse user × wine matrix for the ratings train data

# Map user IDs and wine IDs to indices (NaN where an ID has no index)
row_idx = train['UserID'].map(user_id_to_idx)
col_idx = train['WineID'].map(wine_id_to_idx)
# Keep only ratings whose user and wine both have an index
mask = row_idx.notna() & col_idx.notna()

# csr_matrix SUMS entries that share the same (row, col). If a user rated the
# same wine more than once, the stored value would be the sum of those ratings
# and could exceed the rating scale. Collapse repeated pairs to their mean.
pairs = pd.DataFrame({
    'row': row_idx[mask].to_numpy(dtype=np.int64),
    'col': col_idx[mask].to_numpy(dtype=np.int64),
    'rating': train.loc[mask, 'Rating'].to_numpy(),
}).groupby(['row', 'col'], sort=False, as_index=False)['rating'].mean()
row_idx = pairs['row'].to_numpy()
col_idx = pairs['col'].to_numpy()
# Get the (de-duplicated) ratings values
data = pairs['rating'].to_numpy()

# Create the sparse matrix
ratings_train = csr_matrix((data, (row_idx, col_idx)), 
                     shape=(len(user_id_to_idx), len(wine_id_to_idx)))
print(f"   ✓ Ratings matrix shape: {ratings_train.shape}")

   ✓ Ratings matrix shape: (1056079, 100646)


In [13]:

# L2-normalise every wine vector so that dot products act as cosine similarities
wines_norm = normalize(wines_features, norm='l2', axis=1)

# User profile = rating-weighted sum of the wines the user rated, L2-normalised per user
user_profiles = normalize(ratings_train @ wines_norm, norm='l2', axis=1)


In [None]:
def predict_for_test_pairs(user_profiles, wines, ratings_test, user_id_to_idx, item_id_to_idx, batch_size=100_000):
    """Predict a rating for every known (user, wine) pair of ``ratings_test``.

    The score of a pair is the dot product of the user's profile row and the
    wine's feature row. The score is mapped linearly onto the rating scale
    with ``1 + 4 * ((score + 1) / 2)``, i.e. -1 → 1 and +1 → 5.

    Pairs whose user or wine has no index are SKIPPED, so the result can be
    shorter than ``ratings_test``. Callers must align it with the rows that
    were actually scored.

    Pairs are scored in vectorised batches rather than one DataFrame row at a
    time, which avoids a Python-level loop with a sparse row lookup per rating.

    Args:
        user_profiles: Sparse matrix with one profile row per user index.
        wines: Sparse matrix with one feature row per wine index.
        ratings_test: DataFrame with "UserID" and "WineID" columns.
        user_id_to_idx: Mapping UserID → row of ``user_profiles``.
        item_id_to_idx: Mapping WineID → row of ``wines``.
        batch_size: Number of pairs scored per vectorised step.

    Returns:
        List of predicted ratings (floats) in the row order of
        ``ratings_test``, without the skipped pairs.
    """
    # NaN marks an ID that has no index (cold-start user or unknown wine)
    u_pos = ratings_test["UserID"].map(user_id_to_idx).to_numpy(dtype=float)
    i_pos = ratings_test["WineID"].map(item_id_to_idx).to_numpy(dtype=float)
    known = ~(np.isnan(u_pos) | np.isnan(i_pos))
    u_idx = u_pos[known].astype(np.int64)
    i_idx = i_pos[known].astype(np.int64)

    preds = np.empty(len(u_idx), dtype=float)
    for start in range(0, len(u_idx), batch_size):
        chunk = slice(start, start + batch_size)
        # Row-wise dot product: elementwise product of the paired rows, summed
        scores = user_profiles[u_idx[chunk]].multiply(wines[i_idx[chunk]]).sum(axis=1)
        scores = np.asarray(scores, dtype=float).ravel()
        preds[chunk] = 1 + 4 * ((scores + 1) / 2)  # rescale
    return preds.tolist()

# def predict_ratings_batch(user_profiles, wines_norm, batch_size=1000):
#     n_users = user_profiles.shape[0]
#     n_wines = wines_norm.shape[0]
#     predicted_rows =[]

#     for start in range(0, n_users, batch_size):
#         end = min(start + batch_size, n_users)
#         batch_profiles = user_profiles[start:end]
#         # Compute the predicted ratings for the current batch
#         print(f"Scoring users {start} to {end - 1}, batch size: {batch_profiles.shape[0]}")
#         pred_ratings_batch = batch_profiles.dot(wines_norm.T).toarray()
#         # Scale to 1-5 range
#         pred_ratings_batch = 1 + 4 * ((pred_ratings_batch + 1) / 2)
#         predicted_rows.append(pred_ratings_batch)

#     return np.vstack(predicted_rows)

# def get_predicted_rating(user_id, wine_id, user_id_to_idx, wine_id_to_idx, ratings_pred):
#     u = user_id_to_idx.get(user_id)
#     i = wine_id_to_idx.get(wine_id)
#     if u is not None and i is not None:
#         print(f"Predicted rating for user {user_id} and wine {wine_id}: {ratings_pred.shape}")
#         return ratings_pred[u, i]
#     else:
#         return np.nan  # for unknown user/item



In [None]:
# ratings_pred = predict_ratings_batch(user_profiles, wines_norm)
# Predict a rating for the (user, wine) pairs of the test split;
# pairs with an unknown user or wine are skipped by predict_for_test_pairs
ratings_pred = predict_for_test_pairs(user_profiles, wines_norm, test, user_id_to_idx, wine_id_to_idx)

In [31]:

# test['Rating_pred'] = test.apply(
#     lambda row: get_predicted_rating(row['UserID'], row['WineID'], 
#                                      user_id_to_idx, wine_id_to_idx, ratings_pred), axis=1)

# Drop unknown predictions (e.g., cold-start)
# test = test.dropna(subset=['Rating_pred'])

# predict_for_test_pairs skips pairs whose user or wine has no index, so
# `ratings_pred` only lines up with the test rows where both are known.
# Compare against exactly those rows; using all of test['Rating'] would
# mismatch in length as soon as a single pair was skipped.
known_pairs = (
    test['UserID'].map(user_id_to_idx).notna()
    & test['WineID'].map(wine_id_to_idx).notna()
)
y_true = test.loc[known_pairs, 'Rating']

rmse = root_mean_squared_error(y_true, ratings_pred)
mae = mean_absolute_error(y_true, ratings_pred)

print(f"✅ RMSE: {rmse:.4f}")
print(f"✅ MAE:  {mae:.4f}")

✅ RMSE: 1.1927
✅ MAE:  0.9830


In [32]:

def recommend_top_k_batch(user_profiles, wines_norm, ratings_train, k=5, batch_size=1000):
    """Return, for every user, the indices of the k best-scoring unseen wines.

    Users are processed in batches of ``batch_size``. For each batch the
    scores are the product of the profile rows with ``wines_norm.T``. Wines
    that have a stored entry in the user's row of ``ratings_train`` are
    excluded from that user's recommendations.

    Args:
        user_profiles: Sparse matrix with one profile row per user.
        wines_norm: Sparse matrix with one feature row per wine.
        ratings_train: Sparse user × wine matrix of training ratings.
        k: Maximum number of recommendations per user.
        batch_size: Number of users scored per batch.

    Returns:
        A list with one entry per user: an array of wine indices ordered by
        decreasing score (fewer than k when fewer unseen wines exist), or an
        empty list when the user has no unseen wine left.
    """
    recommendations = []
    n_users = user_profiles.shape[0]
    print(f"   ✓ Number of users: {n_users}")
    for start in range(0, n_users, batch_size):
        end = min(start + batch_size, n_users)
        batch_profiles = user_profiles[start:end]

        # Compute cosine similarities for this batch
        print(f"Scoring users {start} to {end - 1}, batch size: {batch_profiles.shape[0]}")
        scores = batch_profiles.dot(wines_norm.T)  # shape: (batch_size, n_items)

        for i, u in enumerate(range(start, end)):
            # Column indices of the wines this user already rated
            user_seen = ratings_train.getrow(u).indices
            score_row = scores[i].toarray().ravel()
            # Mask out items the user has already rated
            score_row[user_seen] = -np.inf
            n_valid = int(np.count_nonzero(score_row > -np.inf))

            # If the user has rated all items, return an empty list
            if n_valid == 0:
                top_k_sorted = []
            else:
                k_actual = min(k, n_valid)
                # Partition on kth = k_actual - 1: the first k_actual positions
                # then hold the best scores. Using kth = k_actual is out of
                # bounds whenever k_actual equals the number of wines.
                top_k = np.argpartition(-score_row, k_actual - 1)[:k_actual]
                top_k_sorted = top_k[np.argsort(-score_row[top_k])]
            recommendations.append(top_k_sorted)

    return recommendations


In [None]:
# Get top-5 recommendations for each user
top_k = 5
recommendations = recommend_top_k_batch(user_profiles, wines_norm, ratings_train, top_k, batch_size=10000)

   ✓ Number of users: 1056079
Scoring users 0 to 9999, batch size: 10000
Scoring users 10000 to 19999, batch size: 10000
Scoring users 20000 to 29999, batch size: 10000
Scoring users 30000 to 39999, batch size: 10000


In [50]:
def recall_at_k(recommendations, test_df, user_id_to_idx, item_id_to_idx, k=10):
    """Compute recall@k pooled over all known users of ``test_df``.

    For every user that has an index, the first k recommended wine indices
    are compared with the indices of that user's test wines. The result is
    the total number of hits divided by the total number of test wines of
    those users. Test wines without an index still count in the denominator.

    Args:
        recommendations: Sequence indexed by user index; each entry is a
            sequence of recommended wine indices, best first.
        test_df: DataFrame with 'UserID' and 'WineID' columns.
        user_id_to_idx: Mapping UserID → user index.
        item_id_to_idx: Mapping WineID → wine index.
        k: Number of leading recommendations to consider per user.

    Returns:
        hits / total as a float, or 0 when no test wine was counted.
    """
    hit_count = 0
    total = 0

    # Build the lookup once: `x in dict.values()` is a linear scan, which the
    # loop below would otherwise repeat for every recommended item.
    valid_item_indices = set(item_id_to_idx.values())

    # Group test data by user
    test_groups = test_df.groupby('UserID')['WineID'].apply(set).to_dict()

    for user_id, relevant_items in test_groups.items():
        u_idx = user_id_to_idx.get(user_id)
        if u_idx is None:
            continue  # skip unknown users
        rec_ids = {idx for idx in recommendations[u_idx][:k] if idx in valid_item_indices}
        relevant_idx = {item_id_to_idx[i] for i in relevant_items if i in item_id_to_idx}

        hit_count += len(rec_ids & relevant_idx)
        total += len(relevant_items)

    return hit_count / total if total > 0 else 0


# Calculate recall@k on the test split, with k set to top_k (the number of
# recommendations requested per user)
recall = recall_at_k(recommendations, test, user_id_to_idx, wine_id_to_idx, k=top_k)
print(f"Recall@{top_k}: {recall:.4f}")

Recall@5: 0.0166
