In [1]:
!pip install rectools  >> None
!pip install pandas  >> None
!pip install numba  >> None
!pip install numpy  >> None

In [2]:
import pandas as pd
import numpy as np
import os

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization


## KION DATA

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import requests
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [5]:
req = requests.get(url, stream=True)

with open('kion.zip', 'wb') as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download: 100%|█████████▉| 78.6M/78.8M [00:13<00:00, 5.89MiB/s]

In [6]:
import zipfile as zf

files = zf.ZipFile('kion.zip','r')
files.extractall()
files.close()

In [7]:
interactions = pd.read_csv('data_original/interactions.csv')
Columns.Datetime = 'last_watch_dt'

In [8]:
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

### **Interactions prepare**

In [9]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [10]:
min_date = interactions[Columns.Datetime].min(), 
max_date = interactions[Columns.Datetime].max()

In [11]:
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [12]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [13]:
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,3
1,699317,1659,2021-05-29,8317,100.0,3
2,656683,7107,2021-05-09,10,0.0,1
3,864613,7638,2021-07-05,14483,100.0,3
4,964868,9506,2021-04-30,6725,100.0,3


In [14]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train.drop(train.query("total_dur < 300").index, inplace=True)

# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

train: (4985269, 6)
test: (490982, 6)


## User prepare

In [15]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [16]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [17]:
train_user_features = user_features.loc[user_features['id'].isin(train[Columns.User])]

## Item prepare


In [18]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

Genre

In [19]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()


Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


Content

In [20]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


In [21]:
item_features = pd.concat((genre_feature, content_feature))
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [22]:
train_item_features = item_features.loc[item_features['id'].isin(train[Columns.Item])]

### Save

In [23]:
interactions.to_csv('/content/drive/MyDrive/RS/prepared_interactions.csv', index=False)
items.to_csv('/content/drive/MyDrive/RS/prepared_items.csv', index=False)
users.to_csv('/content/drive/MyDrive/RS/prepared_users.csv', index=False)

kion dataset download: 100%|██████████| 78.8M/78.8M [00:30<00:00, 5.89MiB/s]

In [24]:
item_features.to_csv( '/content/drive/MyDrive/RS/prepared_featured_items.csv', index=False)
user_features.to_csv('/content/drive/MyDrive/RS/prepared_featured_users.csv', index=False)

In [25]:
train.to_csv('/content/drive/MyDrive/RS/prepared_interactions_train.csv', index=False)
test.to_csv('/content/drive/MyDrive/RS/prepared_interactions_test.csv', index=False)

train_user_features.to_csv('/content/drive/MyDrive/RS/prepared_featured_users_train.csv', index=False)
train_item_features.to_csv('/content/drive/MyDrive/RS/prepared_featured_items_train.csv', index=False)

## Models

In [26]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

import warnings
warnings.filterwarnings('ignore')

In [27]:
interactions = pd.read_csv('/content/drive/MyDrive/RS/prepared_interactions.csv')
items = pd.read_csv('/content/drive/MyDrive/RS/prepared_items.csv')
users =pd.read_csv('/content/drive/MyDrive/RS/prepared_users.csv')

In [28]:
item_features = pd.read_csv('/content/drive/MyDrive/RS/prepared_featured_items.csv')
user_features = pd.read_csv('/content/drive/MyDrive/RS/prepared_featured_users.csv')

In [29]:
train = pd.read_csv('/content/drive/MyDrive/RS/prepared_interactions_train.csv')
test = pd.read_csv('/content/drive/MyDrive/RS/prepared_interactions_test.csv')

In [30]:
train_user_features = pd.read_csv('/content/drive/MyDrive/RS/prepared_featured_users_train.csv')
train_item_features = pd.read_csv('/content/drive/MyDrive/RS/prepared_featured_items_train.csv')

### ALS

In [31]:
!pip install optuna  >> None

In [32]:
import optuna

In [33]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 12
TOTAL_ITERATIONS = 30

In [46]:
dataset = Dataset.construct(
    interactions_df=train
)

metric_map = MAP(k = K_RECOS)

def objective_ALS(trial):
    '''Objective for optuna'''  
    num_factors = trial.suggest_int('factors', low=32, high=128, step=32)
    regularization  = trial.suggest_float('regularization', low=0.001, high=0.01, step=0.005)
    iterations = trial.suggest_int('iterations', low=5, high=15, step=5)
    
    als = ImplicitALSWrapperModel(
        model = AlternatingLeastSquares(
            factors = num_factors,
            regularization = regularization ,
            iterations = iterations,
            use_gpu = True,
            num_threads = NUM_THREADS,
            random_state = RANDOM_STATE
        )
    )
    
    als.fit(dataset)
    
    recos = als.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    map_10 = metric_map.calc(recos, test)
    
    return map_10

In [47]:
study = optuna.create_study(directions=["maximize"])
study.optimize(objective_ALS, n_trials=10)

[32m[I 2022-12-14 15:50:35,923][0m A new study created in memory with name: no-name-b4a659e4-17a5-4aab-9b51-9311e9a46719[0m
[32m[I 2022-12-14 15:51:56,828][0m Trial 0 finished with value: 0.024371760687707283 and parameters: {'factors': 64, 'regularization': 0.001, 'iterations': 10}. Best is trial 0 with value: 0.024371760687707283.[0m
[32m[I 2022-12-14 15:53:20,226][0m Trial 1 finished with value: 0.024740965506254087 and parameters: {'factors': 64, 'regularization': 0.006, 'iterations': 15}. Best is trial 1 with value: 0.024740965506254087.[0m
[32m[I 2022-12-14 15:54:23,943][0m Trial 2 finished with value: 0.027518659616956532 and parameters: {'factors': 32, 'regularization': 0.001, 'iterations': 15}. Best is trial 2 with value: 0.027518659616956532.[0m
[32m[I 2022-12-14 15:55:39,512][0m Trial 3 finished with value: 0.02128553850819735 and parameters: {'factors': 96, 'regularization': 0.006, 'iterations': 5}. Best is trial 2 with value: 0.027518659616956532.[0m
[32m[I

In [58]:
def objective_BPR(trial):
    '''Objective for optuna'''  
    num_factors = trial.suggest_int('factors', low=159, high=256, step=128)
    regularization  = trial.suggest_float('regularization', low=0.001, high=0.01, step=0.005)
    iterations = trial.suggest_int('iterations', low=5, high=15, step=5)
    
    bpr = ImplicitALSWrapperModel(
        model = BayesianPersonalizedRanking(
            factors = num_factors,
            regularization = regularization ,
            iterations = iterations,
            use_gpu = True,
            num_threads = NUM_THREADS,
            random_state = RANDOM_STATE
        )
    )
    
    bpr.fit(dataset)
    
    recos = bpr.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    map_10 = metric_map.calc(recos, test)
    
    return map_10

In [59]:
study = optuna.create_study(directions=["maximize"])
study.optimize(objective_BPR, n_trials=10)

[32m[I 2022-12-14 16:11:03,958][0m A new study created in memory with name: no-name-ff831539-aaa3-439e-98bd-5f4ea2d9a9fa[0m
[32m[I 2022-12-14 16:12:27,822][0m Trial 0 finished with value: 0.06621397053452968 and parameters: {'factors': 159, 'regularization': 0.006, 'iterations': 5}. Best is trial 0 with value: 0.06621397053452968.[0m
[32m[I 2022-12-14 16:13:55,056][0m Trial 1 finished with value: 0.05670679151157323 and parameters: {'factors': 159, 'regularization': 0.006, 'iterations': 10}. Best is trial 0 with value: 0.06621397053452968.[0m
[32m[I 2022-12-14 16:15:21,418][0m Trial 2 finished with value: 0.07097924694936714 and parameters: {'factors': 159, 'regularization': 0.001, 'iterations': 10}. Best is trial 2 with value: 0.07097924694936714.[0m
[32m[I 2022-12-14 16:16:43,491][0m Trial 3 finished with value: 0.06702273853220114 and parameters: {'factors': 159, 'regularization': 0.006, 'iterations': 5}. Best is trial 2 with value: 0.07097924694936714.[0m
[32m[I 202

In [None]:
def objective_LFM(trial):
    '''Objective for optuna'''  
    num_comp = trial.suggest_int('no_components', low=32, high=128, step=32)
    learning_rate  = trial.suggest_float('learning_rate', low=0.001, high=0.1, step=0.005)
    
    lfm = LightFMWrapperModel(
        model = LightFM(
            no_components=num_comp,
            learning_rate=learning_rate, 
            loss='warp',
            rho=0.9,
            epsilon=1e-5,
            user_alpha=0,
            item_alpha=0,
            random_state=RANDOM_STATE,
        ),
        epochs=1,
        num_threads=NUM_THREADS,
    )
    lfm.fit(dataset)
    
    
    recos = lfm.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    map_10 = metric_map.calc(recos, test)
    
    return map_10

In [None]:
study = optuna.create_study(directions=["maximize"])
study.optimize(objective_LFM, n_trials=10)

[32m[I 2022-12-14 16:25:32,379][0m A new study created in memory with name: no-name-579d947c-6d1d-44be-9220-e0cd740677e0[0m
[32m[I 2022-12-14 16:26:46,963][0m Trial 0 finished with value: 0.07403274010295241 and parameters: {'no_components': 32, 'learning_rate': 0.096}. Best is trial 0 with value: 0.07403274010295241.[0m
[32m[I 2022-12-14 16:28:01,240][0m Trial 1 finished with value: 0.07771201385594466 and parameters: {'no_components': 32, 'learning_rate': 0.021}. Best is trial 1 with value: 0.07771201385594466.[0m
[32m[I 2022-12-14 16:29:53,129][0m Trial 2 finished with value: 0.07595548802501967 and parameters: {'no_components': 96, 'learning_rate': 0.066}. Best is trial 1 with value: 0.07771201385594466.[0m
[32m[I 2022-12-14 16:31:24,722][0m Trial 3 finished with value: 0.07630382797904361 and parameters: {'no_components': 64, 'learning_rate': 0.061}. Best is trial 1 with value: 0.07771201385594466.[0m
[32m[I 2022-12-14 16:32:55,641][0m Trial 4 finished with value: