# ДЗ 5

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as sts
import scipy.sparse as sp
from itertools import islice, cycle
from implicit.nearest_neighbours import CosineRecommender
from implicit.cpu.bpr import BayesianPersonalizedRanking
from implicit.cpu.als import AlternatingLeastSquares
from implicit.cpu.lmf import LogisticMatrixFactorization

import os
import torch
import warnings
import threadpoolctl
import re
from pathlib import Path
from rectools.dataset import Dataset

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import SASRecModel, BERT4RecModel
from rectools.visuals import MetricsApp
from rectools.dataset import Dataset, Interactions



In [2]:
df_train = pd.read_csv('hse_train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4842338 entries, 0 to 4842337
Data columns (total 3 columns):
 #   Column     Dtype
---  ------     -----
 0   user_id    int64
 1   item_id    int64
 2   timestamp  int64
dtypes: int64(3)
memory usage: 110.8 MB


In [3]:
df_sub = pd.read_csv('sub0.csv')
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14039620 entries, 0 to 14039619
Data columns (total 2 columns):
 #   Column   Dtype
---  ------   -----
 0   user_id  int64
 1   items    int64
dtypes: int64(2)
memory usage: 214.2 MB


In [4]:
df_train.describe()

Unnamed: 0,user_id,item_id,timestamp
count,4842338.0,4842338.0,4842338.0
mean,351420.3,90093.62,1511887000.0
std,202389.9,52169.75,199849.6
min,0.0,0.0,1511539000.0
25%,176237.2,44691.0,1511705000.0
50%,351428.5,90007.0,1511876000.0
75%,526505.0,135268.0,1512050000.0
max,701980.0,180659.0,1512259000.0


In [5]:
df_train.head()

Unnamed: 0,user_id,item_id,timestamp
0,258671,74254,1511701649
1,258671,115615,1511841435
2,258671,176624,1512105022
3,240498,45484,1511605442
4,240498,39504,1511756830


In [6]:
df_sub.head()

Unnamed: 0,user_id,items
0,258671,1233
1,258671,119169
2,258671,131247
3,258671,105133
4,258671,180111


In [27]:
def baseline_most_popular(train_df, top_n=10):
    item_popularity = train_df['item_id'].value_counts().reset_index()
    item_popularity.columns = ['item_id', 'count']
  
    top_items = item_popularity.sort_values('count', ascending=False).head(top_n)['item_id'].values
    return list(top_items)

top_10_items = baseline_most_popular(df_train, top_n=10)
all_users = df_train['user_id'].unique()
recommendations = {user: top_10_items for user in all_users}


In [7]:
def prepare_submission(recommendations, sample_sub):
    sub_data = []
    for user, items in recommendations.items():
        sub_data.append({'user_id': user, 'items': items[0]})
    submission = pd.DataFrame(sub_data)
    if sample_sub is not None:
        submission = submission[submission['user_id'].isin(sample_sub['user_id'])]
    
    return submission

In [None]:

submission = prepare_submission(recommendations, df_sub)

submission.to_csv('baseline_popularity_submission.csv', index=False)
print("Baseline submission file saved!")

Baseline submission file saved!


In [23]:
submission.head()

Unnamed: 0,user_id,items
0,258671,100454
1,240498,100454
2,512761,100454
3,259030,100454
4,584368,100454


In [28]:
from scipy.sparse import csr_matrix, lil_matrix

def baseline_user_similarity_sparse(train_df, top_n=10):
    user_ids = train_df['user_id'].unique()
    item_ids = train_df['item_id'].unique()
    
    user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
    item_to_idx = {item: idx for idx, item in enumerate(item_ids)}
    idx_to_item = {idx: item for item, idx in item_to_idx.items()}
    
    num_users = len(user_ids)
    num_items = len(item_ids)
    user_item_matrix = lil_matrix((num_users, num_items), dtype=np.int8)
    
    for _, row in train_df.iterrows():
        user_idx = user_to_idx[row['user_id']]
        item_idx = item_to_idx[row['item_id']]
        user_item_matrix[user_idx, item_idx] = 1  
    
    user_item_matrix_csr = user_item_matrix.tocsr()
    
    item_popularity = np.array(user_item_matrix_csr.sum(axis=0)).flatten()
    
    global_top_items = np.argsort(-item_popularity)[:top_n]
    global_top_items = [idx_to_item[idx] for idx in global_top_items]
    
    user_recommendations = {}
    for user in user_ids:
        user_idx = user_to_idx[user]
        
        _, items_bought = user_item_matrix_csr[user_idx].nonzero()
        
        if len(items_bought) == 0:
            user_recommendations[user] = global_top_items
            continue
        
        bought_popularity = item_popularity[items_bought]
        sorted_indices = np.argsort(-bought_popularity)
        top_personal_items = items_bought[sorted_indices[:top_n]]
        
        recommended_items = [idx_to_item[idx] for idx in top_personal_items]
        
        if len(recommended_items) < top_n:
            additional = [item for item in global_top_items 
                         if item not in recommended_items][:top_n - len(recommended_items)]
            recommended_items.extend(additional)
        
        user_recommendations[user] = recommended_items[0]
    
    return user_recommendations

personal_recommendations = baseline_user_similarity_sparse(df_train, top_n=10)


In [29]:
personal_submission = prepare_submission(personal_recommendations, df_sub)
personal_submission.to_csv('baseline_sparse_submission.csv', index=False)
print("Sparse matrix baseline submission file saved!")

Sparse matrix baseline submission file saved!


In [30]:
personal_submission.head()

Unnamed: 0,user_id,items
0,258671,176624
1,240498,45484
2,512761,163625
3,259030,114262
4,584368,97674


In [None]:
personal_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701981 entries, 0 to 701980
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  701981 non-null  int64 
 1   items    701981 non-null  object
dtypes: int64(1), object(1)
memory usage: 10.7+ MB


In [10]:
df_train["weight"] = 1.0

interactions = Interactions(
    df_train.rename(columns={
        "user_id": Columns.User,
        "item_id": Columns.Item,
        "timestamp": Columns.Datetime,
        "weight": Columns.Weight
    })
)

dataset = Dataset.construct(interactions.df)

In [None]:
model = SASRecModel(
    session_max_len=20,
    loss="softmax",
    n_factors=64,
    lr=0.001,
    batch_size=128,
    deterministic=True,
    epochs=1,
    verbose=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [44]:
DEVICE = torch.device("cuda")

In [41]:
torch.cuda.current_device()

0

In [21]:
model.fit(dataset)

  unq_values = pd.unique(values)
  PydanticSerializationUnexpectedValue(Expected `str` - serialized value may not be as expected [input_value=('rectools.models.nn.item...net.CatFeaturesItemNet'), input_type=tuple])
  return self.__pydantic_serializer__.to_python(
/home/xande/anaconda3/envs/hse-11/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                     | Params | Mode 
-----------------------------------------------------------------
0 | torch_model | TransformerTorchBackbone | 11.6 M | train
-----------------------------------------------------------------
11.6 M    Trainable params
0         Non-trainable params
11.6 M    Total params
46.438    Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode
/home/xande/anaconda3/envs/hse-11/lib/python3.1

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


<rectools.models.nn.transformers.sasrec.SASRecModel at 0x7f321199f510>

In [27]:
torch.save(model, "sas_rec_model.pt")

In [32]:
df_train.head()

Unnamed: 0,user_id,item_id,timestamp,weight
0,258671,74254,1511701649,1.0
1,258671,115615,1511841435,1.0
2,258671,176624,1512105022,1.0
3,240498,45484,1511605442,1.0
4,240498,39504,1511756830,1.0


In [48]:
recs = model.recommend(
    users=df_train["user_id"].unique(),  
    dataset=dataset,
    k=3,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

NotFittedError: SASRecModel isn't fitted, call method `fit` first.

In [None]:
df_sasrec = recs.to_pandas()
df_sasrec.head()

In [None]:
recs.to_pandas().to_csv("sasrec_submission.csv", index=False)

In [None]:
personal_submission = prepare_submission(df_sasrec, df_sub)
personal_submission.to_csv('sasrec_submission_submission.csv', index=False)
print("SasRec file saved!")