In [1]:
!pip install rectools
!pip install pandas
!pip install numba
!pip install numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rectools
  Downloading RecTools-0.3.0-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 3.5 MB/s 
[?25hCollecting nmslib<3.0.0,>=2.0.4
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 50.4 MB/s 
[?25hCollecting attrs<22.0.0,>=19.1.0
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 4.9 MB/s 
Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 23.7 MB/s 
Collecting Markdown<3.3,>=3.2
  Downloading Markdown-3.2.2-py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 5.2 MB/s 
Collecting lightfm<2.0,>=1.16
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 50.6 MB/s 
Collecting pybind11<2.6.2
  Down

In [11]:
import pandas as pd
import numpy as np
import os

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization


## KION DATA

In [3]:
# Mount Google Drive at /content/drive; this import only exists on Google Colab.
# NOTE(review): none of the cells visible in this part of the notebook read or
# write under /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import requests
# Location of the KION dataset archive. The path contains a fixed revision hash,
# so repeated runs download the same data_original.zip.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [5]:
# Stream the archive to disk in 1 MiB chunks instead of buffering it in memory.
# The response is used as a context manager so the connection is released
# even if writing fails part-way.
with requests.get(url, stream=True) as req:
    # Abort on HTTP errors rather than saving an error page as kion.zip.
    req.raise_for_status()
    # Content-Length may be absent; 0 is then used as the progress-bar total.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    # Closing the bar on exit flushes its final state, so the rendered output
    # does not stay frozen at an intermediate percentage.
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:  69%|██████▉   | 54.5M/78.8M [00:00<00:00, 286MiB/s]

In [6]:
import zipfile as zf

# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even if extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [43]:
# Raw interaction log from the extracted archive.
interactions = pd.read_csv('data_original/interactions.csv')
# NOTE(review): this overwrites an attribute on the imported rectools `Columns`
# object for the whole kernel session, so every later Columns.Datetime lookup
# resolves to 'last_watch_dt' (the date column of this CSV). Confirm that
# rectools itself tolerates the override; renaming the column would be the
# less invasive alternative.
Columns.Datetime = 'last_watch_dt'

In [44]:
# Load the user and item tables from the data_original directory.
users, items = (
    pd.read_csv(csv_path)
    for csv_path in ('data_original/users.csv', 'data_original/items.csv')
)

### **Preparing interactions**

In [45]:
# Remove rows whose date string is not exactly 10 characters long (the length
# of a 'YYYY-MM-DD' value), then parse the remaining strings as datetimes.
malformed_dates = interactions[Columns.Datetime].str.len() != 10
interactions.drop(index=interactions.loc[malformed_dates].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime],
    format='%Y-%m-%d',
)

In [46]:
# Earliest and latest interaction dates, both as scalar Timestamps.
# No trailing comma after either expression: `x = value,` would silently bind
# a 1-tuple instead of the Timestamp. max_date anchors the train/test split.
min_date = interactions[Columns.Datetime].min()
max_date = interactions[Columns.Datetime].max()

In [47]:
# Preview the first rows after the date column has been parsed.
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [48]:
# Interaction weight: 3 when more than 10% of the item was watched, otherwise 1
# (rows with a missing watched_pct also fall into the "otherwise" branch).
watched_over_10_pct = interactions['watched_pct'] > 10
interactions[Columns.Weight] = np.where(watched_over_10_pct, 3, 1)

In [49]:
# Preview the first rows again to check the newly added weight column.
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,3
1,699317,1659,2021-05-29,8317,100.0,3
2,656683,7107,2021-05-09,10,0.0,1
3,864613,7638,2021-07-05,14483,100.0,3
4,964868,9506,2021-04-30,6725,100.0,3


In [50]:
# Time-based split: rows dated on or after (max_date - 7 days) form the test
# set, everything earlier forms the train set.
split_date = max_date - pd.Timedelta(days=7)
in_train_period = interactions[Columns.Datetime] < split_date
in_test_period = interactions[Columns.Datetime] >= split_date
train = interactions.loc[in_train_period].copy()
test = interactions.loc[in_test_period].copy()

# Shapes are reported before any of the filtering below is applied.
print(f"train: {train.shape}")
print(f"test: {test.shape}")

# Drop training interactions whose total_dur is below 300.
short_views = train.loc[train["total_dur"] < 300].index
train.drop(index=short_views, inplace=True)

# Filter cold users out of the test set: users that do not appear in the
# (already filtered) train set.
cold_users = set(test[Columns.User]) - set(train[Columns.User])
cold_rows = test.loc[test[Columns.User].isin(cold_users)].index
test.drop(index=cold_rows, inplace=True)

train: (4985269, 6)
test: (490982, 6)


## Preparing users

In [51]:
# Replace missing user attributes with the literal 'Unknown', then keep only
# the users that occur in the train interactions.
users.fillna('Unknown', inplace=True)
seen_in_train = users[Columns.User].isin(train[Columns.User])
users = users.loc[seen_in_train].copy()

In [52]:
# Reshape the wide users table into long (id, value, feature) rows: one frame
# per attribute, stacked on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, attribute])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=attribute)
    for attribute in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [53]:
# Feature rows restricted to users that appear in the train interactions.
user_in_train = user_features['id'].isin(train[Columns.User])
train_user_features = user_features.loc[user_in_train]

## Preparing items


In [54]:
# Keep only the items that occur in the train interactions.
item_in_train = items[Columns.Item].isin(train[Columns.Item])
items = items.loc[item_in_train].copy()

Genre features

In [55]:
# Normalise the comma-separated genres string (lower-case, no space after the
# comma) and split it into a list stored in a new "genre" column.
normalized_genres = items["genres"].str.lower().str.replace(", ", ",", regex=False)
items["genre"] = normalized_genres.str.split(",")

# One (id, value, feature) row per item/genre pair.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .set_axis(["id", "value"], axis=1)
    .assign(feature="genre")
)
genre_feature.head()


Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


Content type features

In [56]:
# One (id, value, feature) row per item carrying its content_type.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .set_axis(["id", "value"], axis=1)
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


In [58]:
# Stack genre rows and content-type rows into a single long item-feature table.
item_feature_parts = [genre_feature, content_feature]
item_features = pd.concat(item_feature_parts)
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [59]:
# Feature rows restricted to items that appear in the train interactions.
feature_item_in_train = item_features['id'].isin(train[Columns.Item])
train_item_features = item_features.loc[feature_item_in_train]

### Save

In [60]:
# Write the prepared tables to CSV files in the current working directory.
# Note: items carries the list-valued "genre" column, which CSV stores as text.
for csv_name, table in (
    ('prepared_interactions.csv', interactions),
    ('prepared_items.csv', items),
    ('prepared_users.csv', users),
):
    table.to_csv(csv_name, index=False)

In [61]:
# Write the long-format item and user feature tables to CSV.
for csv_name, table in (
    ('prepared_featured_items.csv', item_features),
    ('prepared_featured_users.csv', user_features),
):
    table.to_csv(csv_name, index=False)

In [62]:
# Write the train/test interaction splits and the train-only feature tables.
for csv_name, table in (
    ('prepared_interactions_train.csv', train),
    ('prepared_interactions_test.csv', test),
    ('prepared_featured_users_train.csv', train_user_features),
    ('prepared_featured_items_train.csv', train_item_features),
):
    table.to_csv(csv_name, index=False)

## Models

In [63]:
# Request a single OpenBLAS thread for the model-fitting code that follows.
# NOTE(review): numpy and implicit are imported in an earlier cell; BLAS
# libraries commonly read this variable when they are loaded, so confirm the
# setting actually takes effect here or move it above the imports.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation and convergence warnings — consider narrowing it to specific
# categories or modules.
warnings.filterwarnings('ignore')