In [1]:
import pandas as pd
import numpy as np

# 0.3.0
from rectools import Columns
from rectools.dataset import Dataset

## Getting preprocess dataset

In [2]:
interactions = pd.read_csv('data/interactions.csv', index_col=0)
users = pd.read_csv('data/users.csv', index_col=0)
items = pd.read_csv('data/items.csv', index_col=0)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [3]:
interactions.head(3)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0


In [4]:
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [5]:
items.head(3)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


Import prepared class for UserKNN model training

In [6]:
from UserKNN import UserKnn

And then import implicit nn recommenders in the way to test them

In [7]:
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender

And download our prepared popular models 

In [8]:
import pickle

with open('rectools_datasets/dataset.pickle', 'rb') as handle:
    dataset = pickle.load(handle)
    
with open('rectools_datasets/dataset_pct.pickle', 'rb') as handle:
    dataset_pct = pickle.load(handle)

with open('pop_models/pop_sum_weight.pickle', 'rb') as handle:
    pop_sum_weight = pickle.load(handle)
    
with open('pop_models/pop_sum_weight_pct.pickle', 'rb') as handle:
    pop_sum_weight_pct = pickle.load(handle)
    
with open('pop_models/pop_n_users.pickle', 'rb') as handle:
    pop_n_users = pickle.load(handle)
    
with open('pop_models/pop_n_interactions.pickle', 'rb') as handle:
    pop_n_interactions = pickle.load(handle)

### Preparing cross validation to test models

We will use TimeRangeSplit for CV in the way it is closed to the real world, because we have only historical data to train models in order to predict recos in the future for users

So let's understand what range for historical data we have

In [9]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")
print(f'Days in total: {(max_date - min_date).days}')
print(f'Weeks in total: {(max_date - min_date).days // 7}')
print(f'Month in total: {(max_date - min_date).days // 30}')

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00
Days in total: 162
Weeks in total: 23
Month in total: 5


As we can see there are enough month to use them like a fold, so let's make for test last 4 weeks

In [10]:
n_folds = 1
unit = "W"
n_units = 4
periods = n_folds + 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)

date_range = pd.date_range(start=start_date, periods=periods, freq=unit, tz=last_date.tz)

print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

Test fold borders: ['2021-07-18' '2021-07-25']


In [11]:
from rectools.model_selection.time_split import TimeRangeSplitter

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [12]:
from rectools.dataset import Interactions

# Creating Interactions object in the way to use CV
interactions = Interactions(interactions)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Real number of folds: 1


So let's define models and metrics in the way to make CV

In [13]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP

metrics = {
    "Precision@10": Precision(k=10),
    "Recall@10": Recall(k=10),
    "Novelty": MeanInvUserFreq(k=10),
    "Serendipity": Serendipity(k=10),
    "MAP@10": MAP(k=10)
}

# few simple models to compare
models = {
    "cosine_itemknn": CosineRecommender(),
    "tfidf_itemknn": TFIDFRecommender(),
    "bm25_itemknn": BM25Recommender(K=20)
}

Let's create features for users and items like in previous notebook

In [14]:
from sklearn import preprocessing

_, bins = pd.qcut(items["release_year"], 10, retbins=True)
year_feature = pd.DataFrame(
    {
        "id": items["item_id"],
        "value": pd.cut(items["release_year"], bins=bins, labels=bins[:-1]),
        "feature": "release_year",
    }
)

items["genre"] = items["genres"].str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

age_rating_feature = items[['item_id', 'age_rating']].dropna()
age_rating_feature.columns = ["id", "value"]
age_rating_feature['feature'] = 'age_rating'

content_type_feature = items[['item_id', 'content_type']].dropna()
content_type_feature.columns = ["id", "value"]
content_type_feature['feature'] = 'content_type'

item_feat = pd.concat([genre_feature, 
                       year_feature, 
                       age_rating_feature,
                       content_type_feature])

age_feature = users[['user_id', 'age']].dropna()
age_feature.columns = ["id", "value"]
age_feature['feature'] = 'age'

sex_feature = users[['user_id', 'sex']].dropna()
sex_feature.columns = ["id", "value"]
sex_feature['feature'] = 'sex'
user_feat = pd.concat([age_feature,
                       sex_feature
                      ])

In [15]:
from rectools.models import ImplicitItemKNNWrapperModel

In [16]:
results = []

(train_ids, test_ids, _) = next(cv.split(interactions, collect_fold_stats=True))

train = interactions.df.iloc[train_ids].copy()
test = interactions.df.iloc[test_ids].copy()

catalog = train[Columns.Item].unique()

# Dataset object constraction for ImplicitItemKNNWrapperModel
user_feature_train = user_feat[user_feat['id'].isin(train[Columns.User])]
user_feature_test = user_feat[user_feat['id'].isin(test[Columns.User])]

items_feature_train = item_feat[item_feat['id'].isin(train[Columns.Item])]
items_feature_test = item_feat[item_feat['id'].isin(test[Columns.Item])]

train_dts = Dataset.construct(
    train,
    user_features_df=user_feature_train,
    item_features_df=items_feature_train,
    cat_item_features=['genre', 
                       'release_year',
                       'age_rating',
                       'content_type'
                       ],
    cat_user_features=['age', 'sex']

)

test_dts = Dataset.construct(
    test,
    user_features_df=user_feature_test,
    item_features_df=items_feature_test,
    cat_item_features=['genre', 
                       'release_year',
                       'age_rating',
                       'content_type'
                       ],
    cat_user_features=['age', 'sex']

)

for model_name, model in models.items():
    
    print(model_name)
    userknn_model = ImplicitItemKNNWrapperModel(model, verbose=1)
    userknn_model.fit(train_dts)

    recos = userknn_model.recommend(
        users=train[Columns.User].unique(),
        dataset=train_dts,
        k=10,
        filter_viewed=True,
    )

    metric_values = calc_metrics(
        metrics,
        reco=recos,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
    )

    print(metric_values)
    results.append({"model": model_name, **metric_values})

cosine_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

{'Precision@10': 0.028536379404783638, 'Recall@10': 0.15382073850844413, 'MAP@10': 0.07345204436308347, 'Novelty': 9.583207320909077, 'Serendipity': 1.3224037014240433e-05}
tfidf_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

{'Precision@10': 0.037698557604528025, 'Recall@10': 0.19432669276817516, 'MAP@10': 0.09153657927049391, 'Novelty': 7.543119085433283, 'Serendipity': 1.9906498770253037e-05}
bm25_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

{'Precision@10': 0.043408800438196095, 'Recall@10': 0.2278975986151315, 'MAP@10': 0.11251849850166812, 'Novelty': 4.0965778360598515, 'Serendipity': 5.57914820900483e-06}


Let's see the results of metrics evaling:

In [17]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,model,Precision@10,Recall@10,MAP@10,Novelty,Serendipity
0,cosine_itemknn,0.028536,0.153821,0.073452,9.583207,1.3e-05
1,tfidf_itemknn,0.037699,0.194327,0.091537,7.543119,2e-05
2,bm25_itemknn,0.043409,0.227898,0.112518,4.096578,6e-06


Let's check the last model to understand if all recos have len == 10

In [18]:
recos.groupby('user_id')['rank'].count().unique()

array([10,  9,  4,  2,  8,  3,  1,  6,  7])

As we can see there are some recos that don't have all recos that we need, so let's try to solve it

For the first let's get user_id where recos < 10

In [19]:
user_id_n_recos = recos.groupby('user_id')['rank'].count()

In [20]:
user_wo_recos = np.array(user_id_n_recos[user_id_n_recos < 10].index)

So let's get popular items for this users

In [21]:
pop_recos = pop_n_users.recommend(
                user_wo_recos, 
                dataset=dataset, 
                k=10, 
                filter_viewed=True
            )

Add +10 to rank in the way we want to save right order

In [22]:
pop_recos['rank'] += 10

In [23]:
pop_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,4139,10440,202457.0,11
1,4139,15297,193123.0,12
2,4139,9728,132865.0,13
3,4139,13865,122119.0,14
4,4139,4151,91167.0,15


And concat two recos into one df

In [24]:
recos_all = pd.concat([recos, pop_recos])

In [25]:
recos_all = recos_all.sort_values(['user_id', 'rank'])

Let's redefine rank

In [26]:
recos_all['rank'] = recos_all[['user_id', 'rank']].groupby('user_id').cumcount() + 1
recos_all

Unnamed: 0,user_id,item_id,score,rank
5797574,0,15297,1.753745e+08,1
5797575,0,10440,1.428871e+08,2
5797576,0,13865,9.451971e+07,3
5797577,0,4151,9.006657e+07,4
5797578,0,9728,8.127836e+07,5
...,...,...,...,...
5915169,1097556,4151,1.675141e+08,6
5915170,1097556,4880,1.506286e+08,7
5915171,1097556,142,1.067408e+08,8
5915172,1097556,2657,9.952675e+07,9


And delete all recos that more then 10

In [27]:
recos_all = recos_all[recos_all['rank'] <= 10]
recos_all

Unnamed: 0,user_id,item_id,score,rank
5797574,0,15297,1.753745e+08,1
5797575,0,10440,1.428871e+08,2
5797576,0,13865,9.451971e+07,3
5797577,0,4151,9.006657e+07,4
5797578,0,9728,8.127836e+07,5
...,...,...,...,...
5915169,1097556,4151,1.675141e+08,6
5915170,1097556,4880,1.506286e+08,7
5915171,1097556,142,1.067408e+08,8
5915172,1097556,2657,9.952675e+07,9


Let's define it like a function to use in metrics evaling

In [28]:
def add_more_recos(recos: pd.DataFrame, pop_model, dataset) -> pd.DataFrame:
    user_id_n_recos = recos.groupby('user_id')['rank'].count()
    user_wo_recos = np.array(user_id_n_recos[user_id_n_recos < 10].index)
    
    pop_recos = pop_model.recommend(
                user_wo_recos, 
                dataset=dataset, 
                k=10, 
                filter_viewed=True
            )
    pop_recos['rank'] += 10
    
    recos_all = pd.concat([recos, pop_recos]).sort_values(['user_id', 'rank'])
    recos_all['rank'] = recos_all[['user_id', 'rank']].groupby('user_id').cumcount() + 1
    
    return recos_all[recos_all['rank'] <= 10]

Let's check that function work correctly

In [29]:
add_more_recos(recos, pop_n_users, dataset)

Unnamed: 0,user_id,item_id,score,rank
5797574,0,15297,1.753745e+08,1
5797575,0,10440,1.428871e+08,2
5797576,0,13865,9.451971e+07,3
5797577,0,4151,9.006657e+07,4
5797578,0,9728,8.127836e+07,5
...,...,...,...,...
5915169,1097556,4151,1.675141e+08,6
5915170,1097556,4880,1.506286e+08,7
5915171,1097556,142,1.067408e+08,8
5915172,1097556,2657,9.952675e+07,9


It is sounds like a plan! Let's recolculate metrics using this function to add recos

For the first define a dict where all pop models and datasets will be defined

In [30]:
pop_models_dict = {
    "pop_n_users": (pop_n_users, dataset),
    "pop_n_interactions": (pop_n_interactions, dataset),
    "pop_sum_weight": (pop_sum_weight, dataset),
    "pop_sum_weight_pct": (pop_sum_weight_pct, dataset_pct)
}

And reeval metrics again:

In [31]:
results_w_pop = []
for model_name, model in models.items():

    print(model_name)
    userknn_model = ImplicitItemKNNWrapperModel(model, verbose=1)
    userknn_model.fit(train_dts)

    recos = userknn_model.recommend(
        users=train[Columns.User].unique(),
        dataset=train_dts,
        k=10,
        filter_viewed=True,
    )

    # Add recos using different pop models
    for key in pop_models_dict:
        recos_w_pop = recos.copy()

        print(model_name, '|', key)
        model_name_w_pop = model_name + f' | {key}'

        recos_w_pop = add_more_recos(recos_w_pop,
                                     pop_models_dict[key][0], 
                                     pop_models_dict[key][1])

        metric_values = calc_metrics(
            metrics,
            reco=recos_w_pop,
            interactions=test,
            prev_interactions=train,
            catalog=catalog,
        )

        print(metric_values)
        results_w_pop.append({"model": model_name_w_pop, **metric_values})

cosine_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

cosine_itemknn | pop_n_users
{'Precision@10': 0.028536379404783638, 'Recall@10': 0.15382073850844413, 'MAP@10': 0.07345204436308347, 'Novelty': 9.583063949980408, 'Serendipity': 1.3224037014240433e-05}
cosine_itemknn | pop_n_interactions
{'Precision@10': 0.028536379404783638, 'Recall@10': 0.15382073850844413, 'MAP@10': 0.07345204436308347, 'Novelty': 9.583063949980408, 'Serendipity': 1.3224037014240433e-05}
cosine_itemknn | pop_sum_weight
{'Precision@10': 0.028536379404783638, 'Recall@10': 0.15382073850844413, 'MAP@10': 0.07345204436308347, 'Novelty': 9.58308131342954, 'Serendipity': 1.3224037014240433e-05}
cosine_itemknn | pop_sum_weight_pct
{'Precision@10': 0.028536379404783638, 'Recall@10': 0.15382073850844413, 'MAP@10': 0.07345204436308347, 'Novelty': 9.583064909208696, 'Serendipity': 1.3224037014240433e-05}
tfidf_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

tfidf_itemknn | pop_n_users
{'Precision@10': 0.037698557604528025, 'Recall@10': 0.19432669276817516, 'MAP@10': 0.09153657927049391, 'Novelty': 7.542962738816669, 'Serendipity': 1.9906498770253044e-05}
tfidf_itemknn | pop_n_interactions
{'Precision@10': 0.037698557604528025, 'Recall@10': 0.19432669276817516, 'MAP@10': 0.09153657927049391, 'Novelty': 7.542962738816669, 'Serendipity': 1.9906498770253044e-05}
tfidf_itemknn | pop_sum_weight
{'Precision@10': 0.037698557604528025, 'Recall@10': 0.19432669276817516, 'MAP@10': 0.09153657927049391, 'Novelty': 7.542981451593764, 'Serendipity': 1.9906498770253044e-05}
tfidf_itemknn | pop_sum_weight_pct
{'Precision@10': 0.037698557604528025, 'Recall@10': 0.19432669276817516, 'MAP@10': 0.09153657927049391, 'Novelty': 7.542963874096594, 'Serendipity': 1.9906498770253044e-05}
bm25_itemknn


  0%|          | 0/14928 [00:00<?, ?it/s]

  0%|          | 0/687200 [00:00<?, ?it/s]

bm25_itemknn | pop_n_users
{'Precision@10': 0.043408800438196095, 'Recall@10': 0.2278975986151315, 'MAP@10': 0.11251849850166812, 'Novelty': 4.09643446513118, 'Serendipity': 5.579148209004831e-06}
bm25_itemknn | pop_n_interactions
{'Precision@10': 0.043408800438196095, 'Recall@10': 0.2278975986151315, 'MAP@10': 0.11251849850166812, 'Novelty': 4.09643446513118, 'Serendipity': 5.579148209004831e-06}
bm25_itemknn | pop_sum_weight
{'Precision@10': 0.043408800438196095, 'Recall@10': 0.2278975986151315, 'MAP@10': 0.11251849850166812, 'Novelty': 4.096451828580313, 'Serendipity': 5.579148209004831e-06}
bm25_itemknn | pop_sum_weight_pct
{'Precision@10': 0.043408800438196095, 'Recall@10': 0.2278975986151315, 'MAP@10': 0.11251849850166812, 'Novelty': 4.096435424359467, 'Serendipity': 5.579148209004831e-06}


In [32]:
df_metrics_w_pop = pd.DataFrame(results_w_pop)
df_metrics_w_pop

Unnamed: 0,model,Precision@10,Recall@10,MAP@10,Novelty,Serendipity
0,cosine_itemknn | pop_n_users,0.028536,0.153821,0.073452,9.583064,1.3e-05
1,cosine_itemknn | pop_n_interactions,0.028536,0.153821,0.073452,9.583064,1.3e-05
2,cosine_itemknn | pop_sum_weight,0.028536,0.153821,0.073452,9.583081,1.3e-05
3,cosine_itemknn | pop_sum_weight_pct,0.028536,0.153821,0.073452,9.583065,1.3e-05
4,tfidf_itemknn | pop_n_users,0.037699,0.194327,0.091537,7.542963,2e-05
5,tfidf_itemknn | pop_n_interactions,0.037699,0.194327,0.091537,7.542963,2e-05
6,tfidf_itemknn | pop_sum_weight,0.037699,0.194327,0.091537,7.542981,2e-05
7,tfidf_itemknn | pop_sum_weight_pct,0.037699,0.194327,0.091537,7.542964,2e-05
8,bm25_itemknn | pop_n_users,0.043409,0.227898,0.112518,4.096434,6e-06
9,bm25_itemknn | pop_n_interactions,0.043409,0.227898,0.112518,4.096434,6e-06


In [33]:
pd.concat([df_metrics_w_pop, df_metrics]).sort_values('model')

Unnamed: 0,model,Precision@10,Recall@10,MAP@10,Novelty,Serendipity
2,bm25_itemknn,0.043409,0.227898,0.112518,4.096578,6e-06
9,bm25_itemknn | pop_n_interactions,0.043409,0.227898,0.112518,4.096434,6e-06
8,bm25_itemknn | pop_n_users,0.043409,0.227898,0.112518,4.096434,6e-06
10,bm25_itemknn | pop_sum_weight,0.043409,0.227898,0.112518,4.096452,6e-06
11,bm25_itemknn | pop_sum_weight_pct,0.043409,0.227898,0.112518,4.096435,6e-06
0,cosine_itemknn,0.028536,0.153821,0.073452,9.583207,1.3e-05
1,cosine_itemknn | pop_n_interactions,0.028536,0.153821,0.073452,9.583064,1.3e-05
0,cosine_itemknn | pop_n_users,0.028536,0.153821,0.073452,9.583064,1.3e-05
2,cosine_itemknn | pop_sum_weight,0.028536,0.153821,0.073452,9.583081,1.3e-05
3,cosine_itemknn | pop_sum_weight_pct,0.028536,0.153821,0.073452,9.583065,1.3e-05


As we can see metrics hasn't changed in general and there is may be one reason. The reason is that we don't consider cold users, where pop model can help us

So let's make final model according to evaled metrics and winner is:  
```bm25_itemknn```

### For the first train offline model

In [16]:
train = interactions.df.copy()

catalog = train[Columns.Item].unique()

# Dataset object constraction for ImplicitItemKNNWrapperModel
user_feature_train = user_feat[user_feat['id'].isin(train[Columns.User])]
items_feature_train = item_feat[item_feat['id'].isin(train[Columns.Item])]

train_dts = Dataset.construct(
    train,
    user_features_df=user_feature_train,
    item_features_df=items_feature_train,
    cat_item_features=['genre', 
                       'release_year',
                       'age_rating',
                       'content_type'
                       ],
    cat_user_features=['age', 'sex']

)


userknn_model = ImplicitItemKNNWrapperModel(BM25Recommender(), verbose=1)
userknn_model.fit(train_dts)

offline_recos = userknn_model.recommend(
    users=train[Columns.User].unique(),
    dataset=train_dts,
    k=10,
    filter_viewed=True,
)

  0%|          | 0/15706 [00:00<?, ?it/s]

  0%|          | 0/962179 [00:00<?, ?it/s]

In [17]:
offline_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,88906900000.0,1
1,176549,10440,79918380000.0,2
2,176549,15297,71302420000.0,3
3,176549,3734,69446720000.0,4
4,176549,4151,42316410000.0,5


In [32]:
offline = offline_recos.copy()

In [46]:
os.makedirs('knn_models_offline', exist_ok=True)
offline.to_csv('knn_models_offline/bm25_offline.csv')

Let's make dict from csv table to make offline recos in service

In [39]:
users_items_dict = offline.groupby('user_id')['item_id'].apply(list).to_dict()

In [40]:
users_items_dict[176549]

[13865, 10440, 15297, 3734, 4151, 7571, 4880, 142, 16270, 4457]

Let's save dictionary to use it in service

In [45]:
import json
with open('knn_models_offline/bm25_itemknn_offline.json', 'w') as handle:
    json.dump(users_items_dict, handle)

### And then try to create online model

We will use UserKNN class that has been prepared, but add one method to make prediction for 1 user

But we make some changes in order to use it with BM25Recommender

In [18]:
from UserKNN import UserKnnBM25
from implicit.nearest_neighbours import BM25Recommender
from random import choice
import pandas as pd
import os

interactions = pd.read_csv('data/interactions.csv', index_col=0)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

For the first test how model works on parts of train

In [19]:
model = UserKnnBM25(BM25Recommender(), N_users=50)

In [20]:
part_for_test = interactions[:100000].copy()
model.fit(part_for_test)

  0%|          | 0/83986 [00:00<?, ?it/s]

In [21]:
users_part = part_for_test['user_id'].unique().tolist()

In [22]:
model.predict_for_one_user(choice(users_part), 10)

[5766, 3797, 6646, 6210, 5560, 16361, 11237]

Looks like everything is correct  
So train all model

In [24]:
model = UserKnnBM25(BM25Recommender(), N_users=50)
model.fit(interactions)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [43]:
model.predict_for_one_user(176549, 10)

[4273, 7793, 5695, 15469, 7453, 11586, 5482, 6809, 10688, 13865]

Let's save model

In [28]:
os.makedirs('knn_models', exist_ok=True)
with open('knn_models/bm25_itemknn.pickle', 'wb') as handle:
    pickle.dump(model, handle)

Loading the model to check that everything is correct

In [29]:
with open('knn_models/bm25_itemknn.pickle', 'rb') as handle:
    load_model = pickle.load(handle)

In [42]:
load_model.predict_for_one_user(176549, 10)

[4273, 7793, 5695, 15469, 7453, 11586, 5482, 6809, 10688, 13865]