In [1]:
import pandas as pd
import numpy as np

# 0.3.0
from rectools import Columns
from rectools.dataset import Dataset

## Getting preprocess dataset

In [2]:
interactions = pd.read_csv('data/interactions.csv', index_col=0)
users = pd.read_csv('data/users.csv', index_col=0)
items = pd.read_csv('data/items.csv', index_col=0)

In [3]:
interactions.head(3)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0


In [4]:
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [5]:
items.head(3)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


## Preparing data for ```rectools.dataset.Dataset``` class

### Item features construction

Binarize years in release_year feature

Creating year_feature df

In [6]:
_, bins = pd.qcut(items["release_year"], 10, retbins=True)
year_feature = pd.DataFrame(
    {
        "id": items["item_id"],
        "value": pd.cut(items["release_year"], bins=bins, labels=bins[:-1]),
        "feature": "release_year",
    }
)
year_feature.head()

Unnamed: 0,id,value,feature
0,10711,1983.0,release_year
1,2508,2012.0,release_year
2,10716,2009.0,release_year
3,7868,2014.0,release_year
4,16268,1897.0,release_year


Adding genres like features

In [7]:
items["genre"] = items["genres"].str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


Adding age_rating like feature

In [8]:
age_rating_feature = items[['item_id', 'age_rating']].dropna()
age_rating_feature.columns = ["id", "value"]
age_rating_feature['feature'] = 'age_rating'
age_rating_feature.head()

Unnamed: 0,id,value,feature
0,10711,16.0,age_rating
1,2508,16.0,age_rating
2,10716,16.0,age_rating
3,7868,16.0,age_rating
4,16268,12.0,age_rating


In [9]:
content_type_feature = items[['item_id', 'content_type']].dropna()
content_type_feature.columns = ["id", "value"]
content_type_feature['feature'] = 'content_type'
content_type_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


Creating general item_feat

In [10]:
item_feat = pd.concat([genre_feature, 
                       year_feature, 
                       age_rating_feature,
                       content_type_feature])
item_feat = item_feat[item_feat['id'].isin(interactions['item_id'])]

In [11]:
item_feat

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


### User features construction

Adding age like a feature

In [12]:
from sklearn import preprocessing
age_feature = users[['user_id', 'age']].dropna()
age_feature.columns = ["id", "value"]
# Let's convert categorical type into int for age
lab_encoder = preprocessing.LabelEncoder()
age_feature['value'] = lab_encoder.fit_transform(age_feature['value'])
age_feature['feature'] = 'age'
age_feature.head()

Unnamed: 0,id,value,feature
0,973171,1,age
1,962099,0,age
2,1047345,3,age
3,721985,3,age
4,704055,2,age


Adding sex like a feature

In [13]:
sex_feature = users[['user_id', 'sex']].dropna()
sex_feature.columns = ["id", "value"]
# Converting categorical feature to int
sex_feature['value'] = lab_encoder.fit_transform(sex_feature['value'])
sex_feature['feature'] = 'age'
sex_feature.head()

Unnamed: 0,id,value,feature
0,973171,1,age
1,962099,1,age
2,1047345,0,age
3,721985,0,age
4,704055,0,age


Creating general user_feat

In [14]:
user_feat = pd.concat([age_feature,
                       sex_feature
                      ])
user_feat = user_feat[user_feat['id'].isin(interactions['user_id'])]

### Constracting ```Dataset``` object

In [15]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_feat,
    item_features_df=item_feat,
    cat_item_features=['genre', 
                       'release_year',
                       'age_rating',
                       'content_type'
                      ],
    cat_user_features=['age', 'sex']
)

### Creating simple pop model

In [16]:
from rectools.models.popular import PopularModel

Creating models with different 'popularity score' methods evaling

In [17]:
%%time
pop_n_users = PopularModel()
pop_n_users.fit(dataset);

CPU times: user 1.54 s, sys: 112 ms, total: 1.65 s
Wall time: 1.65 s


<rectools.models.popular.PopularModel at 0x7f7f6d2353a0>

In [18]:
%%time
pop_n_interactions = PopularModel(popularity='n_interactions')
pop_n_interactions.fit(dataset)

CPU times: user 65.5 ms, sys: 28.7 ms, total: 94.2 ms
Wall time: 93.7 ms


<rectools.models.popular.PopularModel at 0x7f7f6d235820>

In [19]:
%%time
pop_mean_weight = PopularModel(popularity='mean_weight')
pop_mean_weight.fit(dataset)

CPU times: user 81.7 ms, sys: 607 µs, total: 82.3 ms
Wall time: 83.5 ms


<rectools.models.popular.PopularModel at 0x7f7f6d235b80>

In [20]:
%%time
pop_sum_weight = PopularModel(popularity='sum_weight')
pop_sum_weight.fit(dataset);

CPU times: user 71.2 ms, sys: 11.6 ms, total: 82.8 ms
Wall time: 82.4 ms


<rectools.models.popular.PopularModel at 0x7f7f6d235f70>

In [21]:
pd.concat([
    pop_n_users.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_n_interactions.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_mean_weight.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_sum_weight.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
], axis=1).drop(['user_id', 'rank'], axis=1)

Unnamed: 0,item_id,score,title,content_type,item_id.1,score.1,title.1,content_type.1,item_id.2,score.2,title.2,content_type.2,item_id.3,score.3,title.3,content_type.3
0,10440,202457.0,Хрустальный,series,10440,202457.0,Хрустальный,series,1714,635143.666667,Люби меня таким,series,10440,5534083000.0,Хрустальный,series
1,15297,193123.0,Клиника счастья,series,15297,193123.0,Клиника счастья,series,3642,442423.0,Обручальное кольцо,series,15297,4432354000.0,Клиника счастья,series
2,9728,132865.0,Гнев человеческий,film,9728,132865.0,Гнев человеческий,film,856,400153.0,Приют комедиантов,series,4151,959195800.0,Секреты семейной жизни,series
3,13865,122119.0,Девятаев,film,13865,122119.0,Девятаев,film,3480,373410.666667,Жизнь Мехэк,series,6192,936851900.0,Отчаянные домохозяйки,series
4,4151,91167.0,Секреты семейной жизни,series,4151,91167.0,Секреты семейной жизни,series,16216,340005.5,Анна-детективъ,series,14,865261700.0,Жестокий Стамбул,series
5,3734,74803.0,Прабабушка легкого поведения,film,3734,74803.0,Прабабушка легкого поведения,film,13395,339193.0,Крамола,series,13865,818712400.0,Девятаев,film
6,2657,68581.0,Подслушано,series,2657,68581.0,Подслушано,series,11566,328638.566667,Татьянин день,series,9728,781720900.0,Гнев человеческий,film
7,4880,55043.0,Афера,series,4880,55043.0,Афера,series,1914,294688.0,Неравный брак,series,9996,629719500.0,Немцы,series
8,142,45367.0,Маша,film,142,45367.0,Маша,film,2788,290355.5,Ганга,series,16228,579679300.0,Содержанки,series
9,6809,40372.0,Дуров,film,6809,40372.0,Дуров,film,3750,285721.0,Юлия Высоцкая,series,496,445214600.0,Воскресший Эртугрул,series


As we can see:
 - There is no difference between n_users and n_iteractions (may be it linked with that all users interact with popular items no more than once);
 - mean_weight and sum_weight method doesn't work good with not normed weights, because there are only series in the top for mean_weight method.
 
 So let's normalize weights for interaction in order to get better results

Weight attribute of the dataset shows the time that user spend to interact with item. So it is obvious that users spend more time for series than for films and others

In [22]:
interactions.merge(
    items[['content_type', 'item_id']], 
    on='item_id', 
    how='left'
).groupby('content_type')['weight'].mean()

content_type
film       4570.452922
series    19803.803874
Name: weight, dtype: float64

But there is another attribute that can be helpful for us. This attribute is ```watched_pct``` that we can use like a weight, because it shows us how many percent of item have been watched by user

### Let's create mean_weights, sum_weights models using watched_pct, like a weight

In [23]:
interactions_pct = interactions.copy()
interactions_pct[Columns.Weight] = interactions_pct['watched_pct']
interactions_pct.drop('watched_pct', axis=1, inplace=True)
interactions_pct.head(2)

Unnamed: 0,user_id,item_id,datetime,weight
0,176549,9506,2021-05-11,72.0
1,699317,1659,2021-05-29,100.0


In [24]:
dataset_pct = Dataset.construct(
    interactions_df=interactions_pct,
    user_features_df=user_feat,
    item_features_df=item_feat,
    cat_item_features=['genre', 
                       'release_year',
                       'age_rating',
                       'content_type'
                      ],
    cat_user_features=['age', 'sex']
)

In [25]:
%%time
pop_mean_weight_pct = PopularModel(popularity='mean_weight')
pop_mean_weight_pct.fit(dataset_pct);

CPU times: user 87.2 ms, sys: 1.31 ms, total: 88.5 ms
Wall time: 100 ms


<rectools.models.popular.PopularModel at 0x7f7f6d247670>

In [26]:
%%time
pop_sum_weight_pct = PopularModel(popularity='sum_weight')
pop_sum_weight_pct.fit(dataset_pct);

CPU times: user 69.9 ms, sys: 8.78 ms, total: 78.7 ms
Wall time: 77.9 ms


<rectools.models.popular.PopularModel at 0x7f7f6d235df0>

In [27]:
pd.concat([
    pd.concat([
        pop_mean_weight.recommend(
            dataset.user_id_map.external_ids[:1], 
            dataset=dataset, 
            k=10, 
            filter_viewed=False  # False - same items to every user
        ).merge(items[['item_id', 'title', 'content_type']], 
               on='item_id',
               how='left'),

        pop_sum_weight.recommend(
            dataset.user_id_map.external_ids[:1], 
            dataset=dataset, 
            k=10, 
            filter_viewed=False  # False - same items to every user
        ).merge(items[['item_id', 'title', 'content_type']], 
               on='item_id',
               how='left'),
    ], axis=1).drop(['user_id', 'rank'], axis=1),

    pd.concat([
        pop_mean_weight_pct.recommend(
            dataset_pct.user_id_map.external_ids[:1], 
            dataset=dataset_pct, 
            k=10, 
            filter_viewed=False  # False - same items to every user
        ).merge(items[['item_id', 'title', 'content_type']], 
               on='item_id',
               how='left'),

        pop_sum_weight_pct.recommend(
            dataset_pct.user_id_map.external_ids[:1], 
            dataset=dataset_pct, 
            k=10, 
            filter_viewed=False  # False - same items to every user
        ).merge(items[['item_id', 'title', 'content_type']], 
               on='item_id',
               how='left'),
    ], axis=1).drop(['user_id', 'rank'], axis=1)
]
)
# |   pop_mean_weight   |   pop_sum_weight   |
# | pop_mean_weight_pct | pop_sum_weight_pct |

Unnamed: 0,item_id,score,title,content_type,item_id.1,score.1,title.1,content_type.1
0,1714,635143.666667,Люби меня таким,series,10440,5534083000.0,Хрустальный,series
1,3642,442423.0,Обручальное кольцо,series,15297,4432354000.0,Клиника счастья,series
2,856,400153.0,Приют комедиантов,series,4151,959195800.0,Секреты семейной жизни,series
3,3480,373410.666667,Жизнь Мехэк,series,6192,936851900.0,Отчаянные домохозяйки,series
4,16216,340005.5,Анна-детективъ,series,14,865261700.0,Жестокий Стамбул,series
5,13395,339193.0,Крамола,series,13865,818712400.0,Девятаев,film
6,11566,328638.566667,Татьянин день,series,9728,781720900.0,Гнев человеческий,film
7,1914,294688.0,Неравный брак,series,9996,629719500.0,Немцы,series
8,2788,290355.5,Ганга,series,16228,579679300.0,Содержанки,series
9,3750,285721.0,Юлия Высоцкая,series,496,445214600.0,Воскресший Эртугрул,series


and...  
Nothing become better (may be)  
Let's compare this models in the next nootebok.  
So let's save datasets and models using pickle  

In [28]:
import pickle
import os
os.makedirs('rectools_datasets', exist_ok=True)

with open('rectools_datasets/dataset.pickle', 'wb') as handle:
    pickle.dump(dataset, handle)
with open('rectools_datasets/dataset_pct.pickle', 'wb') as handle:
    pickle.dump(dataset_pct, handle)

In [29]:
os.makedirs('pop_models', exist_ok=True)

with open('pop_models/pop_sum_weight.pickle', 'wb') as handle:
    pickle.dump(pop_sum_weight, handle)
    
with open('pop_models/pop_sum_weight_pct.pickle', 'wb') as handle:
    pickle.dump(pop_sum_weight_pct, handle)
    
with open('pop_models/pop_n_users.pickle', 'wb') as handle:
    pickle.dump(pop_n_users, handle)
    
with open('pop_models/pop_n_interactions.pickle', 'wb') as handle:
    pickle.dump(pop_n_interactions, handle)

Checking that everything has been saved correct

In [30]:
with open('rectools_datasets/dataset.pickle', 'rb') as handle:
    dataset = pickle.load(handle)
    
with open('rectools_datasets/dataset_pct.pickle', 'rb') as handle:
    dataset_pct = pickle.load(handle)

with open('pop_models/pop_sum_weight.pickle', 'rb') as handle:
    pop_sum_weight = pickle.load(handle)
    
with open('pop_models/pop_sum_weight_pct.pickle', 'rb') as handle:
    pop_sum_weight_pct = pickle.load(handle)
    
with open('pop_models/pop_n_users.pickle', 'rb') as handle:
    pop_n_users = pickle.load(handle)
    
with open('pop_models/pop_n_interactions.pickle', 'rb') as handle:
    pop_n_interactions = pickle.load(handle)
    

In [31]:
pd.concat([
    pop_n_users.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_n_interactions.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_sum_weight.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
    
    pop_sum_weight_pct.recommend(
        dataset_pct.user_id_map.external_ids[:1], 
        dataset=dataset_pct, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    ).merge(items[['item_id', 'title', 'content_type']], 
           on='item_id',
           how='left'),
], axis=1).drop(['user_id', 'rank'], axis=1)

Unnamed: 0,item_id,score,title,content_type,item_id.1,score.1,title.1,content_type.1,item_id.2,score.2,title.2,content_type.2,item_id.3,score.3,title.3,content_type.3
0,10440,202457.0,Хрустальный,series,10440,202457.0,Хрустальный,series,10440,5534083000.0,Хрустальный,series,15297,10451575.0,Клиника счастья,series
1,15297,193123.0,Клиника счастья,series,15297,193123.0,Клиника счастья,series,15297,4432354000.0,Клиника счастья,series,10440,9449820.0,Хрустальный,series
2,9728,132865.0,Гнев человеческий,film,9728,132865.0,Гнев человеческий,film,4151,959195800.0,Секреты семейной жизни,series,13865,8247354.0,Девятаев,film
3,13865,122119.0,Девятаев,film,13865,122119.0,Девятаев,film,6192,936851900.0,Отчаянные домохозяйки,series,9728,7648170.0,Гнев человеческий,film
4,4151,91167.0,Секреты семейной жизни,series,4151,91167.0,Секреты семейной жизни,series,14,865261700.0,Жестокий Стамбул,series,3734,5361004.0,Прабабушка легкого поведения,film
5,3734,74803.0,Прабабушка легкого поведения,film,3734,74803.0,Прабабушка легкого поведения,film,13865,818712400.0,Девятаев,film,4151,3788442.0,Секреты семейной жизни,series
6,2657,68581.0,Подслушано,series,2657,68581.0,Подслушано,series,9728,781720900.0,Гнев человеческий,film,142,2786902.0,Маша,film
7,4880,55043.0,Афера,series,4880,55043.0,Афера,series,9996,629719500.0,Немцы,series,8636,2247976.0,Белый снег,film
8,142,45367.0,Маша,film,142,45367.0,Маша,film,16228,579679300.0,Содержанки,series,6809,2132168.0,Дуров,film
9,6809,40372.0,Дуров,film,6809,40372.0,Дуров,film,496,445214600.0,Воскресший Эртугрул,series,11237,1676397.0,День города,film


Let's save offline result of POPRecs

In [32]:
recos = pop_sum_weight.recommend(
    dataset.user_id_map.external_ids, 
    dataset=dataset, 
    k=20, 
    filter_viewed=True
)

In [34]:
os.makedirs('pop_models_offline', exist_ok=True)
recos.to_csv('pop_models_offline/pop_sum_weight.csv')

In [35]:
users_items_dict = recos.groupby('user_id')['item_id'].apply(list).to_dict()
users_items_dict[176549]

[10440,
 15297,
 4151,
 6192,
 14,
 13865,
 9996,
 16228,
 496,
 3734,
 6006,
 7476,
 2657,
 5471,
 4880,
 12192,
 2720,
 6809,
 11640,
 8270]

In [36]:
import json
with open('pop_models_offline/pop_sum_weight_offline.json', 'w') as handle:
    json.dump(users_items_dict, handle)

In [39]:
pop_sum_weight.recommend(
        dataset.user_id_map.external_ids[:1], 
        dataset=dataset, 
        k=10, 
        filter_viewed=False  # False - same items to every user
    )['item_id'].tolist()

[10440, 15297, 4151, 6192, 14, 13865, 9728, 9996, 16228, 496]