In [1]:
# Switch the working directory two levels up from where the kernel started
# (presumably the project root, so that the `models.*` imports below resolve — confirm).
# NOTE(review): a relative `%cd` is not idempotent; re-running this cell climbs two more levels.
%cd ../..

/home/kirill/PycharmProjects/pythonProject/RecoServiceTemplate


In [2]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset

from models.userknn.userknn import UserKnn
from models.config import UserKnn_model, Popular_model

# Render every DataFrame column and allow up to 200 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Get KION dataset

In [3]:
# Read the three KION tables from the dataset directory configured for the UserKnn model.
dataset_dir = UserKnn_model.dataset_path
interactions = pd.read_csv(f"{dataset_dir}/interactions.csv")
users = pd.read_csv(f"{dataset_dir}/users.csv")
items = pd.read_csv(f"{dataset_dir}/items.csv")

In [4]:
# Rename the raw KION columns to the rectools column constants and parse the
# interaction date. The frame is rebound rather than mutated with `inplace=True`,
# which keeps the step chainable.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

# Address the column through the same constant the rename used, instead of a
# hard-coded 'datetime' literal that only works while the two happen to match.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [5]:
# Earliest and latest interaction timestamps in the dataset.
interaction_dates = interactions['datetime']
min_date = interaction_dates.min()
max_date = interaction_dates.max()

# userkNN model CV (7 days)

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base 



In [6]:
# Cross-validation layout: seven test folds, each one week long.
n_folds, unit, n_units = 7, "W", 1

# Latest interaction timestamp, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back by the length of all folds plus one extra unit to get the first border.
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Test fold borders

In [7]:
from rectools.model_selection import TimeRangeSplit

# One border more than the number of folds, spaced `n_units` units of `unit` apart.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
fold_borders = date_range.values.astype('datetime64[D]')
print(f"Test fold borders: {fold_borders}")

# Time-based fold generator with all three filters switched on.
split_filters = {
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": True,
}
cv = TimeRangeSplit(date_range=date_range, **split_filters)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


In [8]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics

# Evaluation metrics, all at the same cut-off: two accuracy metrics (precision, recall)
# and two "beyond accuracy" ones (novelty, serendipity).
top_k = 10
metrics = {
    "prec@10": Precision(k=top_k),
    "recall@10": Recall(k=top_k),
    "novelty": MeanInvUserFreq(k=top_k),
    "serendipity": Serendipity(k=top_k),
}

# Candidate implicit recommenders, built with their default arguments.
models = dict(
    cosine_itemknn=CosineRecommender(),
    tfidf_itemknn=TFIDFRecommender(),
)


# Model training by fold (fold length = 7 days)

In [None]:
%%time

# Cross-validate every candidate model on each time-based fold and collect
# one metrics record per (fold, model) pair in `results`.
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    # Positional row selections for this fold; the test part keeps only
    # the `Columns.UserItem` columns.
    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    # Items present in the training part of the fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name, **metric_values}
        results.append(fold)
        


{'End date': Timestamp('2021-08-13 00:00:00', freq='4D'),
 'Start date': Timestamp('2021-08-09 00:00:00', freq='4D'),
 'Test': 162953,
 'Test items': 5755,
 'Test users': 71051,
 'Train': 4649162,
 'Train items': 15415,
 'Train users': 850489}


  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-17 00:00:00', freq='4D'),
 'Start date': Timestamp('2021-08-13 00:00:00', freq='4D'),
 'Test': 172960,
 'Test items': 5951,
 'Test users': 75887,
 'Train': 4867782,
 'Train items': 15519,
 'Train users': 880449}


  0%|          | 0/880449 [00:00<?, ?it/s]

# 👌 Metrics 

`Metrics by fold`



## Metric description 
- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.novelty.MeanInvUserFreq.html#rectools.metrics.novelty.MeanInvUserFreq"> Mean Inverse User Frequency (novelty)</a>

- ### <a href="https://rectools.readthedocs.io/en/latest/api/rectools.metrics.serendipity.Serendipity.html"> Serendipity = novelty and relevance</a>

In [29]:
# 7 days, 7 folds: one row per (fold, model) record collected by the CV loop.
df_metrics = pd.DataFrame.from_records(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.002922,0.014984,9.260631,3.8e-05
1,0,tfidf_itemknn,0.004639,0.024704,8.321053,3.9e-05
2,1,cosine_itemknn,0.003003,0.015476,9.35899,4.9e-05
3,1,tfidf_itemknn,0.00459,0.024952,8.381346,4.7e-05
4,2,cosine_itemknn,0.002793,0.014482,9.503543,5.3e-05
5,2,tfidf_itemknn,0.004015,0.021479,8.513031,5.1e-05
6,3,cosine_itemknn,0.002605,0.013047,9.595365,5.2e-05
7,3,tfidf_itemknn,0.004076,0.020768,8.612428,5.7e-05
8,4,cosine_itemknn,0.002469,0.011693,9.624027,5.7e-05
9,4,tfidf_itemknn,0.00372,0.018538,8.647398,5.8e-05


In [77]:
# Persist the weekly-fold metrics under a run-specific file name: the later CV runs in
# this notebook also pickle `df_metrics`, and a shared file name would overwrite this table.
df_metrics.to_pickle("df_metrics_7d.pickle")

## Metrics mean by fold
`we can compare two models`

In [79]:
# Per-model average of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].mean()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.002662,0.013196,9.527538,5.3e-05
tfidf_itemknn,0.004041,0.020986,8.545006,5.2e-05


## Metrics std by fold

`If the difference between the models' metrics is smaller than the std value, no significant difference is observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [80]:
# Per-model standard deviation of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].std()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.000245,0.00179,0.163769,8e-06
tfidf_itemknn,0.00043,0.00292,0.144612,7e-06


In [46]:
# Keep the per-model metric means for the model-to-model comparison.
df = df_metrics.groupby('model')[list(metrics)].mean()

In [59]:
# Mean metric gap between the models: cosine minus tf-idf (negative => tf-idf is higher).
cosine_means = df.loc['cosine_itemknn']
tfidf_means = df.loc['tfidf_itemknn']
diff = cosine_means.sub(tfidf_means)
diff

prec@10       -1.379527e-03
recall@10     -7.790503e-03
novelty        9.825315e-01
serendipity    3.613396e-07
dtype: float64

#  userkNN model  CV (4 days)

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base 



In [9]:
# Cross-validation layout: three test folds, each four days long.
n_folds, unit, n_units = 3, "D", 4

# Latest interaction timestamp, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back by the length of all folds plus one extra unit to get the first border.
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-08-09 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [14]:
# Single-cell version of the 4-day setup: CV settings followed by the fold splitter.
# The import is placed at the top of the cell instead of between statements.
from rectools.model_selection import TimeRangeSplit

# Three test folds, each four days long.
n_folds, unit, n_units = 3, "D", 4

# Latest interaction timestamp, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

# One border more than the number of folds, spaced `n_units` units of `unit` apart.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
fold_borders = date_range.values.astype('datetime64[D]')
print(f"Test fold borders: {fold_borders}")

# Time-based fold generator with all three filters switched on.
split_filters = {
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": True,
}
cv = TimeRangeSplit(date_range=date_range, **split_filters)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

Start date and last date of the test fold: (Timestamp('2021-08-09 00:00:00'), Timestamp('2021-08-22 00:00:00'))
start_date: 2021-08-09 00:00:00
last_date: 2021-08-22 00:00:00
periods: 4
freq: 4D

Test fold borders: ['2021-08-09' '2021-08-13' '2021-08-17' '2021-08-21']
Real number of folds: 3


In [10]:
from rectools.model_selection import TimeRangeSplit

# One border more than the number of folds, spaced `n_units` units of `unit` apart.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
fold_borders = date_range.values.astype('datetime64[D]')
print(f"Test fold borders: {fold_borders}")

# Time-based fold generator with all three filters switched on.
split_filters = {
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": True,
}
cv = TimeRangeSplit(date_range=date_range, **split_filters)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

start_date: 2021-08-09 00:00:00
last_date: 2021-08-22 00:00:00
periods: 4
freq: 4D

Test fold borders: ['2021-08-09' '2021-08-13' '2021-08-17' '2021-08-21']
Real number of folds: 3


# Model training by fold (fold length = 4 days)

In [25]:
%%time

# Cross-validate every candidate model on each time-based fold and collect
# one metrics record per (fold, model) pair in `results`.
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    # Positional row selections for this fold; the test part keeps only
    # the `Columns.UserItem` columns.
    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    # Items present in the training part of the fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name, **metric_values}
        results.append(fold)
        


{'End date': Timestamp('2021-08-13 00:00:00', freq='4D'),
 'Start date': Timestamp('2021-08-09 00:00:00', freq='4D'),
 'Test': 162953,
 'Test items': 5755,
 'Test users': 71051,
 'Train': 4649162,
 'Train items': 15415,
 'Train users': 850489}


  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-17 00:00:00', freq='4D'),
 'Start date': Timestamp('2021-08-13 00:00:00', freq='4D'),
 'Test': 172960,
 'Test items': 5951,
 'Test users': 75887,
 'Train': 4867782,
 'Train items': 15519,
 'Train users': 880449}


  0%|          | 0/880449 [00:00<?, ?it/s]

  0%|          | 0/880449 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-21 00:00:00', freq='4D'),
 'Start date': Timestamp('2021-08-17 00:00:00', freq='4D'),
 'Test': 171074,
 'Test items': 5824,
 'Test users': 74740,
 'Train': 5106361,
 'Train items': 15589,
 'Train users': 913604}


  0%|          | 0/913604 [00:00<?, ?it/s]

  0%|          | 0/913604 [00:00<?, ?it/s]

CPU times: user 1d 17h 43s, sys: 3min 41s, total: 1d 17h 4min 25s
Wall time: 4h 8min 48s


In [26]:
# One row per (fold, model) record collected by the 4-day CV loop.
df_metrics = pd.DataFrame.from_records(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.002912,0.016831,8.264488,4.2e-05
1,0,tfidf_itemknn,0.005143,0.030107,8.2559,5.7e-05
2,1,cosine_itemknn,0.002912,0.016824,8.311312,4.2e-05
3,1,tfidf_itemknn,0.00508,0.030262,8.289351,5.3e-05
4,2,cosine_itemknn,0.002807,0.015895,8.344919,4.5e-05
5,2,tfidf_itemknn,0.004888,0.028442,8.295118,5.7e-05


In [27]:
# Persist the 4-day-fold metrics under a run-specific file name: the 2-day run further
# down writes "df_metrics.pickle", which would otherwise overwrite this table.
df_metrics.to_pickle("df_metrics_4d.pickle")

In [28]:
# Per-model average of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].mean()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.002877,0.016517,8.306906,4.3e-05
tfidf_itemknn,0.005037,0.029604,8.280123,5.6e-05


In [29]:
# Per-model standard deviation of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].std()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,6.1e-05,0.000538,0.040396,2e-06
tfidf_itemknn,0.000133,0.001009,0.021175,2e-06


In [30]:
# Keep the per-model metric means for the model-to-model comparison.
df = df_metrics.groupby('model')[list(metrics)].mean()

In [31]:
# Mean metric gap between the models: cosine minus tf-idf (negative => tf-idf is higher).
cosine_means = df.loc['cosine_itemknn']
tfidf_means = df.loc['tfidf_itemknn']
diff = cosine_means.sub(tfidf_means)
diff

prec@10       -0.002160
recall@10     -0.013087
novelty        0.026783
serendipity   -0.000013
dtype: float64

#  userkNN model  CV (2 days)

Compare implicit `CosineRecommender` and `TFIDFRecommender` as an ItemKnn base 

In [32]:
# Cross-validation layout: three test folds, each two days long.
n_folds, unit, n_units = 3, "D", 2

# Latest interaction timestamp, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back by the length of all folds plus one extra unit to get the first border.
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-08-15 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [33]:
from rectools.model_selection import TimeRangeSplit

# One border more than the number of folds, spaced `n_units` units of `unit` apart.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
fold_borders = date_range.values.astype('datetime64[D]')
print(f"Test fold borders: {fold_borders}")

# Time-based fold generator with all three filters switched on.
split_filters = {
    "filter_already_seen": True,
    "filter_cold_items": True,
    "filter_cold_users": True,
}
cv = TimeRangeSplit(date_range=date_range, **split_filters)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

start_date: 2021-08-15 00:00:00
last_date: 2021-08-22 00:00:00
periods: 4
freq: 2D

Test fold borders: ['2021-08-15' '2021-08-17' '2021-08-19' '2021-08-21']
Real number of folds: 3


# Model training by fold (fold length = 2 days)

In [34]:
%%time

# Cross-validate every candidate model on each time-based fold and collect
# one metrics record per (fold, model) pair in `results`.
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    # Positional row selections for this fold; the test part keeps only
    # the `Columns.UserItem` columns.
    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    # Items present in the training part of the fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name, **metric_values}
        results.append(fold)
        


{'End date': Timestamp('2021-08-17 00:00:00', freq='2D'),
 'Start date': Timestamp('2021-08-15 00:00:00', freq='2D'),
 'Test': 94423,
 'Test items': 4868,
 'Test users': 48702,
 'Train': 4985269,
 'Train items': 15565,
 'Train users': 896791}


  0%|          | 0/896791 [00:00<?, ?it/s]

  0%|          | 0/896791 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-19 00:00:00', freq='2D'),
 'Start date': Timestamp('2021-08-17 00:00:00', freq='2D'),
 'Test': 86337,
 'Test items': 4821,
 'Test users': 45282,
 'Train': 5106361,
 'Train items': 15589,
 'Train users': 913604}


  0%|          | 0/913604 [00:00<?, ?it/s]

  0%|          | 0/913604 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-21 00:00:00', freq='2D'),
 'Start date': Timestamp('2021-08-19 00:00:00', freq='2D'),
 'Test': 93419,
 'Test items': 4925,
 'Test users': 48683,
 'Train': 5217461,
 'Train items': 15628,
 'Train users': 928207}


  0%|          | 0/928207 [00:00<?, ?it/s]

  0%|          | 0/928207 [00:00<?, ?it/s]

CPU times: user 1d 20h 18min 49s, sys: 3min 53s, total: 1d 20h 22min 42s
Wall time: 4h 29min 11s


In [35]:
# One row per (fold, model) record collected by the 2-day CV loop.
df_metrics = pd.DataFrame.from_records(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.00247,0.015899,8.409843,3.9e-05
1,0,tfidf_itemknn,0.004447,0.029247,8.359713,5.2e-05
2,1,cosine_itemknn,0.002425,0.015732,8.441512,4e-05
3,1,tfidf_itemknn,0.00426,0.02824,8.409286,5.2e-05
4,2,cosine_itemknn,0.002533,0.016571,8.457175,4.5e-05
5,2,tfidf_itemknn,0.004217,0.027957,8.438648,5.2e-05


In [36]:
# Write the current metrics table to disk in the working directory.
pd.to_pickle(df_metrics, "df_metrics.pickle")

In [37]:
# Per-model average of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].mean()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,0.002476,0.016067,8.436177,4.2e-05
tfidf_itemknn,0.004308,0.028481,8.402549,5.2e-05


In [38]:
# Per-model standard deviation of every metric column across folds.
df_metrics.groupby('model')[list(metrics)].std()

Unnamed: 0_level_0,prec@10,recall@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cosine_itemknn,5.4e-05,0.000444,0.024113,3.125663e-06
tfidf_itemknn,0.000123,0.000678,0.039896,3.889944e-07


In [39]:
# Keep the per-model metric means for the model-to-model comparison.
df = df_metrics.groupby('model')[list(metrics)].mean()

In [40]:
# Mean metric gap between the models: cosine minus tf-idf (negative => tf-idf is higher).
cosine_means = df.loc['cosine_itemknn']
tfidf_means = df.loc['tfidf_itemknn']
diff = cosine_means.sub(tfidf_means)
diff

prec@10       -0.001832
recall@10     -0.012414
novelty        0.033628
serendipity   -0.000010
dtype: float64

# Comparison

In [29]:
# 7 days, 7 folds
# NOTE(review): `results` is reassigned by every CV loop in this notebook, so this cell
# builds the table from whichever run executed last. On a top-to-bottom run that is the
# 2-day loop, not the 7-day one — the 7-day table is only reproduced if this cell is
# evaluated right after the 7-day loop.
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.002922,0.014984,9.260631,3.8e-05
1,0,tfidf_itemknn,0.004639,0.024704,8.321053,3.9e-05
2,1,cosine_itemknn,0.003003,0.015476,9.35899,4.9e-05
3,1,tfidf_itemknn,0.00459,0.024952,8.381346,4.7e-05
4,2,cosine_itemknn,0.002793,0.014482,9.503543,5.3e-05
5,2,tfidf_itemknn,0.004015,0.021479,8.513031,5.1e-05
6,3,cosine_itemknn,0.002605,0.013047,9.595365,5.2e-05
7,3,tfidf_itemknn,0.004076,0.020768,8.612428,5.7e-05
8,4,cosine_itemknn,0.002469,0.011693,9.624027,5.7e-05
9,4,tfidf_itemknn,0.00372,0.018538,8.647398,5.8e-05


In [26]:
# 4 days, 3 folds
# NOTE(review): `results` is reassigned by every CV loop in this notebook, so this cell
# builds the table from whichever run executed last. On a top-to-bottom run that is the
# 2-day loop, not the 4-day one — the 4-day table is only reproduced if this cell is
# evaluated right after the 4-day loop.
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.002912,0.016831,8.264488,4.2e-05
1,0,tfidf_itemknn,0.005143,0.030107,8.2559,5.7e-05
2,1,cosine_itemknn,0.002912,0.016824,8.311312,4.2e-05
3,1,tfidf_itemknn,0.00508,0.030262,8.289351,5.3e-05
4,2,cosine_itemknn,0.002807,0.015895,8.344919,4.5e-05
5,2,tfidf_itemknn,0.004888,0.028442,8.295118,5.7e-05


In [35]:
# 2 days, 3 folds: rebuild the table from the records currently held in `results`.
df_metrics = pd.DataFrame.from_records(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,novelty,serendipity
0,0,cosine_itemknn,0.00247,0.015899,8.409843,3.9e-05
1,0,tfidf_itemknn,0.004447,0.029247,8.359713,5.2e-05
2,1,cosine_itemknn,0.002425,0.015732,8.441512,4e-05
3,1,tfidf_itemknn,0.00426,0.02824,8.409286,5.2e-05
4,2,cosine_itemknn,0.002533,0.016571,8.457175,4.5e-05
5,2,tfidf_itemknn,0.004217,0.027957,8.438648,5.2e-05


# Results

The best result is fold 0 of the 4-day split with tfidf_itemknn:

`prec@10` = 0.005143
`recall@10` = 0.030107
`novelty` = 8.255900
`serendipity` = 0.000057

This variant has the highest prec@10 and recall@10 but relatively low novelty. However, since userkNN will be blended with the popular model, novelty will be low anyway.

Validation with the last week as the test set: `map@10` = 0.0826746
Validation with a 4-day test set starting at (last_date - 12 days): `map@10` = 0.0848538
