In [35]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

# WORK IN PROGRESS! NOT READY FOR USE YET

# Creation of Train and Test Set

As the dataset is currently not split at all, we need dedicated train and test sets.

This notebook walks through the process of creating such a split and can be used to transform the original dataset into our training and test sets.

There are multiple key factors to consider when creating a test set:
- Temporal drift
- Cold-Start problem
- Distribution similarity

## Temporal drift
Over time, concept drift will most likely occur, leading to different distributions. To predict a value you look into the past; you don't predict a value from the past using knowledge of the future, as this would lead to data leakage.

## Cold-Start problem
This is one of the most central aspects a recommender system has to deal with. The cold-start problem refers to whether a user/item is already known at training time or is first introduced at test time.
We want to cover the following 4 cases extensively in order to get a full diagnostic view of our model's performance.
- Hot/Hot (Known User/ Known Wine) -> Standard case
- Hot/Cold (Known User/ New Wine) -> A new wine is introduced
- Cold/Hot (New User/ Known Wine) -> A new user gets wine recommendations
- Cold/Cold (New User/ New Wine) -> most challenging case, first time user sees a brand new wine


## Distribution similarity

In [3]:
# Load the complete ratings table from the local copy of the dataset.
ratings_csv_path = r"../All-XWines_Full_100K_wines_21M_ratings/XWines_Full_21M_ratings.csv"
df_reviews = pd.read_csv(ratings_csv_path)

  df_reviews = pd.read_csv(r"../All-XWines_Full_100K_wines_21M_ratings/XWines_Full_21M_ratings.csv")


In [5]:
# Parse the review timestamps so they can be sorted and compared as datetimes.
date_column = "Date"
df_reviews[date_column] = pd.to_datetime(df_reviews[date_column])

In [6]:
# Order the reviews chronologically. A stable sort keeps rows that share the
# same timestamp in their original order, so the positional train/test cut
# made further down is reproducible at the boundary.
df_reviews = df_reviews.sort_values("Date", kind="mergesort")

In [7]:
# Preview the ten earliest reviews after sorting.
df_reviews.head(n=10)

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
838057,838058,1395486,136168,2005,2.0,2012-01-03 08:20:53
1582521,1582522,1395486,174565,2008,4.0,2012-01-03 08:27:03
1578306,1578307,1385968,165088,2008,1.0,2012-01-04 10:34:12
725564,725565,1576414,136365,2004,4.0,2012-01-07 14:18:01
2985506,2985507,1177579,114749,2010,3.0,2012-01-11 15:01:43
1392601,1392602,1187471,174257,2007,4.0,2012-01-11 17:51:49
1578604,1578605,1855030,115399,2008,4.0,2012-01-14 13:41:43
2080735,2080736,1855030,117018,2009,4.0,2012-01-14 13:42:47
302652,302653,1785135,111511,1998,4.0,2012-01-14 15:54:09
1579771,1579772,1193574,136595,2008,5.0,2012-01-14 16:15:04


In [49]:
# Chronological 80/20 hold-out: the cut is positional, so it relies on
# df_reviews already being ordered by "Date" (earliest rows -> train).
split_ratio = 0.80
n_reviews = len(df_reviews)
split_index = int(n_reviews * split_ratio)
df_reviews_train, df_reviews_test = df_reviews.iloc[:split_index], df_reviews.iloc[split_index:]
n_train, n_test = len(df_reviews_train), len(df_reviews_test)
print(f"Size of all reviews: {n_reviews}")
print(f"Size of train set: {n_train}")
print(f"Size of test set: {n_test} making it {(n_test / (n_test + n_train))*100} %")

Size of all reviews: 21013536
Size of train set: 16810828
Size of test set: 4202708 making it 20.00000380706988 %


In [50]:
# Distinct user / wine IDs in the full data and in the train split, followed by
# the share of all IDs that the train split covers.
u_users_total, u_wines_total = set(df_reviews["UserID"]), set(df_reviews["WineID"])
u_users_train, u_wines_train = set(df_reviews_train["UserID"]), set(df_reviews_train["WineID"])
user_coverage_pct = (len(u_users_train) / len(u_users_total)) * 100
wine_coverage_pct = (len(u_wines_train) / len(u_wines_total)) * 100
print(f"There are {len(u_users_train)} unique users in the train set. Making it {user_coverage_pct}% of all users")
print(f"There are {len(u_wines_train)} unique wines in the train set. Making it {wine_coverage_pct}% of all wines")

There are 997072 unique users in the train set. Making it 94.41263390333489% of all users
There are 99325 unique wines in the train set. Making it 98.6874788863939% of all wines


In [51]:
# split_test_data -> split test data into 4 different subsets
def split_test_data(df, train_users=None, train_wines=None):
    """Split test reviews into the four warm/cold segments.

    A row is assigned by whether its ``UserID`` and ``WineID`` occur in the
    training ID collections.

    Parameters
    ----------
    df : pandas.DataFrame
        Test reviews; the columns ``UserID`` and ``WineID`` are read.
    train_users : set or list-like, optional
        User IDs known from training. Defaults to the notebook-level
        ``u_users_train``.
    train_wines : set or list-like, optional
        Wine IDs known from training. Defaults to the notebook-level
        ``u_wines_train``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Always contains the keys ``warm_user_warm_item``, ``cold_user``,
        ``cold_item`` and ``cold_user_cold_item``. A segment without rows is
        an empty frame (the previous row-wise version omitted such keys, so
        later lookups could raise ``KeyError``). Index labels, column order
        and dtypes of ``df`` are carried over.
    """
    # Fall back to the notebook globals so existing single-argument calls keep working.
    if train_users is None:
        train_users = u_users_train
    if train_wines is None:
        train_wines = u_wines_train

    # Vectorized membership tests replace the per-row Python loop, which is
    # very slow on millions of rows and does not preserve dtypes across rows.
    known_user = df['UserID'].isin(train_users)
    known_wine = df['WineID'].isin(train_wines)

    return {
        'warm_user_warm_item': df[known_user & known_wine],
        'cold_user': df[~known_user & known_wine],
        'cold_item': df[known_user & ~known_wine],
        'cold_user_cold_item': df[~known_user & ~known_wine],
    }

# === Partition the held-out reviews into the 4 warm/cold segments ===
test_segments = split_test_data(df=df_reviews_test)

In [59]:
def describe_test_set(df_segment, df_train, name="Segment"):
    """Print summary statistics for one test segment relative to the train set.

    Parameters
    ----------
    df_segment : pandas.DataFrame
        Test rows to describe; the columns ``UserID``, ``WineID``, ``Rating``
        and ``Date`` are read.
    df_train : pandas.DataFrame
        Training rows; only ``UserID`` and ``WineID`` are read, to decide
        which IDs of the segment are cold (absent from training).
    name : str, default "Segment"
        Label printed upper-cased in the header line.

    Returns
    -------
    None
        All results are written to stdout.
    """
    print(f"{name.upper()} — Test Set Description")
    print(f"Size: {len(df_segment):,}")

    # An empty segment has no users or wines, so the percentages below would
    # divide by zero; report the emptiness and stop instead of crashing.
    if len(df_segment) == 0:
        print("Segment is empty - nothing to describe.")
        return

    n_users = df_segment['UserID'].nunique()
    n_wines = df_segment['WineID'].nunique()
    print(f"Unique users: {n_users:,}")
    print(f"Unique wines: {n_wines:,}")

    # Vectorized membership test against the train IDs; avoids rebuilding two
    # large Python sets from the full train frame on every call.
    is_cold_user = ~df_segment['UserID'].isin(df_train['UserID'])
    is_cold_wine = ~df_segment['WineID'].isin(df_train['WineID'])
    n_cold_users = df_segment.loc[is_cold_user, 'UserID'].nunique()
    n_cold_wines = df_segment.loc[is_cold_wine, 'WineID'].nunique()

    # nunique() ignores missing IDs, so it can be 0 even for a non-empty frame.
    cold_user_share = n_cold_users / n_users if n_users else float('nan')
    cold_wine_share = n_cold_wines / n_wines if n_wines else float('nan')

    print(f"% Cold users: {cold_user_share:.1%}")
    print(f"% Cold items: {cold_wine_share:.1%}")
    print("\nRating stats:", df_segment['Rating'].describe())
    print("\nUser review counts (mean ± std):", df_segment.groupby('UserID').size().agg(['mean', 'std']))
    print("Wine review counts (mean ± std):", df_segment.groupby('WineID').size().agg(['mean', 'std']))
    print("Temporal range:", df_segment['Date'].min(), "→", df_segment['Date'].max())


In [60]:
# Known user, known wine: the standard recommendation case.
describe_test_set(
    df_segment=test_segments["warm_user_warm_item"],
    df_train=df_reviews_train,
    name="Hot User : Hot Wine",
)

HOT USER : HOT WINE — Test Set Description
Size: 3,536,778
Unique users: 543,869
Unique wines: 88,263
% Cold users: 0.0%
% Cold items: 0.0%

Rating stats: count    3.536778e+06
mean     4.007587e+00
std      6.738004e-01
min      1.000000e+00
25%      3.500000e+00
50%      4.000000e+00
75%      4.500000e+00
max      5.000000e+00
Name: Rating, dtype: float64

User review counts (mean ± std): mean     6.502996
std     12.547336
dtype: float64
Wine review counts (mean ± std): mean     40.070902
std     108.744318
dtype: float64
Temporal range: 2020-11-13 18:19:41 → 2021-12-31 23:59:56


In [61]:
# User unseen in training, wine known from training.
describe_test_set(
    df_segment=test_segments["cold_user"],
    df_train=df_reviews_train,
    name="Cold User : Hot Wine",
)

COLD USER : HOT WINE — Test Set Description
Size: 624,842
Unique users: 59,007
Unique wines: 63,676
% Cold users: 100.0%
% Cold items: 0.0%

Rating stats: count    624842.000000
mean          3.903239
std           0.754119
min           1.000000
25%           3.500000
50%           4.000000
75%           4.500000
max           5.000000
Name: Rating, dtype: float64

User review counts (mean ± std): mean    10.589286
std     10.220943
dtype: float64
Wine review counts (mean ± std): mean     9.812834
std     25.899012
dtype: float64
Temporal range: 2020-11-13 18:23:20 → 2021-12-31 23:59:27


In [62]:
# User known from training, wine unseen in training.
describe_test_set(
    df_segment=test_segments["cold_item"],
    df_train=df_reviews_train,
    name="Hot User : Cold Wine",
)

HOT USER : COLD WINE — Test Set Description
Size: 35,456
Unique users: 27,729
Unique wines: 1,321
% Cold users: 0.0%
% Cold items: 100.0%

Rating stats: count    35456.000000
mean         3.865947
std          0.685216
min          1.000000
25%          3.500000
50%          4.000000
75%          4.000000
max          5.000000
Name: Rating, dtype: float64

User review counts (mean ± std): mean    1.278661
std     0.782587
dtype: float64
Wine review counts (mean ± std): mean    26.840273
std     60.861338
dtype: float64
Temporal range: 2020-11-13 21:54:40 → 2021-12-31 23:56:15


In [63]:
# Both the user and the wine are unseen in training.
describe_test_set(
    df_segment=test_segments["cold_user_cold_item"],
    df_train=df_reviews_train,
    name="Cold User : Cold Wine",
)

COLD USER : COLD WINE — Test Set Description
Size: 5,632
Unique users: 4,768
Unique wines: 1,015
% Cold users: 100.0%
% Cold items: 100.0%

Rating stats: count    5632.000000
mean        3.851296
std         0.761721
min         1.000000
25%         3.500000
50%         4.000000
75%         4.000000
max         5.000000
Name: Rating, dtype: float64

User review counts (mean ± std): mean    1.181208
std     0.539491
dtype: float64
Wine review counts (mean ± std): mean     5.548768
std     12.935866
dtype: float64
Temporal range: 2020-11-14 17:40:06 → 2021-12-31 23:42:53
