In [None]:
import pandas as pd
import numpy as np

import scipy.sparse as sparse
import implicit 

from tqdm.notebook import tqdm
import seaborn as sns

In [None]:
# Parquet inputs consumed by DataPreprocessor.fit() / filter_train().
TRAIN_PATH = 'data/train.parquet.gzip'
CANDIDATES_PATH = 'data/fresh_candidates.parquet.gzip'

# Fixed list of 20 item ids used as the fallback recommendation when a user
# cannot be scored by the model (see the prediction loop further down).
# NOTE(review): presumably the globally most popular items -- confirm how this
# list was produced.
pop_preds = [
    4628, 103927, 146586, 18584, 75560,
    44269, 58977, 227420, 130953, 11244,
    130122, 173607, 121430, 195239, 73059,
    52801, 105708, 224095, 55854, 24951,
]

In [None]:
class DataPreprocessor:
    """Build the weighted user-item matrix that is fed to the ALS model.

    Usage: ``fit(train_path, candidates_path)`` loads the interaction log,
    then ``transform()`` returns a ``scipy.sparse.csr_matrix`` with one row
    per user and one column per item.

    The weight of an interaction is
    ``(clipped timespent * per-user recency weight) ** ts_power``.

    ``transform()`` rewrites ``self.train`` in place (filters rows and replaces
    raw ids with matrix indices), so call it once per ``fit()``.

    Parameters
    ----------
    penalty_threshold : float
        Constant added to the per-user recency weight (which lies in [0, 1]),
        so a user's oldest interaction still gets a non-zero weight.
    ts_quantile : float
        Per-item quantile of ``timespent`` used as the upper clipping bound.
    ts_power : float
        Exponent applied to the combined weight.
    """

    def __init__(self, penalty_threshold=.25, ts_quantile=.95, ts_power=2.5):
        self.penalty_threshold = penalty_threshold
        self.quantile = ts_quantile
        self.ts_power = ts_power

    def get_penalty(self):
        """Return the per-row recency weight of ``self.train``.

        The row label (``self.train.index``) is used as the ordering of the
        interactions. For every user the labels are min-max normalised to
        [0, 1] and ``penalty_threshold`` is added.

        A user with a single row has ``max == min``; the normalisation would
        be 0/0 (NaN) and would poison the sparse matrix, so such rows are
        treated as the user's most recent interaction (normalised value 1.0).

        Returns
        -------
        pandas.Series
            One value per row of ``self.train``, in row order, with a default
            ``RangeIndex``.
        """
        row_rank = pd.Series(np.asarray(self.train.index))
        # Group positionally by user; only the rank column is aggregated
        # instead of transforming every column of the frame twice.
        per_user = row_rank.groupby(self.train["user_id"].to_numpy())
        first_rank = per_user.transform("min")
        last_rank = per_user.transform("max")

        span = last_rank - first_rank
        # Mask zero spans so the division yields NaN only for single-row
        # users, then give those rows the full recency weight.
        recency = ((row_rank - first_rank) / span.where(span != 0)).fillna(1.0)
        return recency + self.penalty_threshold

    def clip_timespent(self):
        """Clip ``timespent`` from above by its per-item quantile.

        The bound is the ``self.quantile`` quantile of ``timespent`` within
        each ``item_id``, truncated to an integer.

        Returns
        -------
        numpy.ndarray
            Clipped values, one per row of ``self.train``, in row order.
        """
        item_cap = (
            self.train.groupby("item_id")["timespent"]
            .transform("quantile", self.quantile)
            .astype(int)
        )
        # Element-wise minimum without stacking both columns into a 2-D array.
        return np.minimum(self.train["timespent"].to_numpy(), item_cap.to_numpy())

    def filter_train(self):
        """Keep only rows whose ``item_id`` appears in the candidates file."""
        candidates_df = pd.read_parquet(self.candidates_path)
        keep = self.train["item_id"].isin(candidates_df["item_id"])
        # Explicit copy: idx_mapping() assigns columns on the result, which
        # would otherwise write into a slice of the unfiltered frame.
        self.train = self.train[keep].copy()

    def idx_mapping(self):
        """Replace raw user/item ids in ``self.train`` by contiguous indices.

        Indices follow order of first appearance. The forward
        (``user2idx`` / ``item2idx``) and reverse (``idx2user`` /
        ``idx2item``) dictionaries are stored on the instance.
        """
        self.user2idx = {raw: idx for idx, raw in enumerate(self.train["user_id"].unique())}
        self.item2idx = {raw: idx for idx, raw in enumerate(self.train["item_id"].unique())}

        self.idx2user = {idx: raw for raw, idx in self.user2idx.items()}
        self.idx2item = {idx: raw for raw, idx in self.item2idx.items()}

        self.train["user_id"] = self.train["user_id"].map(self.user2idx)
        self.train["item_id"] = self.train["item_id"].map(self.item2idx)

    def fit(self, train_path, candidates_path):
        """Load the training interactions and remember the candidates path.

        Parameters
        ----------
        train_path : str
            Parquet file with at least ``user_id``, ``item_id`` and
            ``timespent`` columns.
        candidates_path : str
            Parquet file with an ``item_id`` column; read later by
            ``filter_train()``.
        """
        self.candidates_path = candidates_path
        self.train = pd.read_parquet(train_path)
        # Shift by one so a row with timespent == 0 still gets a positive weight.
        self.train["timespent"] = self.train["timespent"] + 1

    def transform(self):
        """Compute interaction weights and return the user-item matrix.

        Weights are computed on the full log first (so recency and quantiles
        see every interaction); only then are non-candidate items dropped and
        ids remapped.

        Returns
        -------
        scipy.sparse.csr_matrix
            Shape ``(n_users, n_items)``; repeated (user, item) pairs are
            summed by the sparse constructor.
        """
        # Work positionally so the result does not depend on index alignment.
        recency_weight = self.get_penalty().to_numpy()
        clipped_ts = self.clip_timespent()
        self.train["feature_als"] = np.power(clipped_ts * recency_weight, self.ts_power)

        self.filter_train()
        self.idx_mapping()

        # Pass the shape explicitly: it documents the orientation and keeps
        # the constructor working when the filtered frame is empty.
        shape = (len(self.user2idx), len(self.item2idx))
        sparse_user_item = sparse.csr_matrix(
            (
                self.train["feature_als"].to_numpy(),
                (self.train["user_id"].to_numpy(), self.train["item_id"].to_numpy()),
            ),
            shape=shape,
        )
        return sparse_user_item
        

In [9]:
# Preprocessor with its default hyper-parameters
# (penalty_threshold=.25, ts_quantile=.95, ts_power=2.5).
prepr = DataPreprocessor()

In [10]:
%%time
# Load the training interactions into prepr.train and remember where the
# candidate items live; no weighting or filtering happens yet.
prepr.fit(TRAIN_PATH, CANDIDATES_PATH)

CPU times: user 7.18 s, sys: 2.28 s, total: 9.46 s
Wall time: 4.13 s


In [11]:
%%time
# Build the weighted user x item CSR matrix. This mutates prepr.train
# (rows are filtered and ids remapped), so re-run prepr.fit() before
# calling transform() a second time.
sp_mat = prepr.transform()

CPU times: user 2min 7s, sys: 31.3 s, total: 2min 38s
Wall time: 2min 17s


In [12]:
# ALS on implicit feedback. `random_state` pins the random initialisation of
# the factor matrices so that re-running the notebook trains from the same
# starting point (it was previously unseeded, i.e. different on every run).
model = implicit.als.AlternatingLeastSquares(
    factors=756,
    regularization=0.001,
    iterations=40,
    use_native=True,
    use_cg=True,
    random_state=42,
)

In [13]:
# Train on the user x item matrix (rows are users, as built by
# prepr.transform() and as indexed later via sp_mat[user_idx]).
model.fit(sp_mat)

  0%|          | 0/40 [00:00<?, ?it/s]

In [18]:
# Users to produce recommendations for; only the `user_id` column is read below.
test = pd.read_parquet('data/test.parquet.gzip')

In [19]:
def recommend(user_id, filter_items=None):
    """Return the model's top-20 items for a raw ``user_id``.

    Relies on the notebook globals ``prepr`` (id mappings), ``model`` (fitted
    ALS model) and ``sp_mat`` (user x item matrix).

    Parameters
    ----------
    user_id
        Raw user id; it must be a key of ``prepr.user2idx``.
    filter_items
        Forwarded unchanged to ``model.recommend``.

    Returns
    -------
    list
        Raw item ids, in the order returned by the model. An index missing
        from ``prepr.idx2item`` is reported as ``None``.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``prepr.user2idx``.
    """
    row = prepr.user2idx[user_id]
    # model.recommend returns a tuple; only its first element (item indices)
    # is used here.
    item_rows = model.recommend(row, sp_mat[row], N=20, filter_items=filter_items)[0]
    return [prepr.idx2item.get(item_row) for item_row in item_rows]

In [20]:
# Score every test user; users the model has never seen get the fixed
# fallback list instead.
num_errors = 0
predictions = []

for user_id in tqdm(test.user_id):
    try:
        predictions.append(recommend(user_id, None))
    except KeyError:
        # recommend() raises KeyError when the user is absent from
        # prepr.user2idx (no training rows left after candidate filtering).
        # Only that case falls back; any other exception -- and Ctrl-C --
        # now propagates instead of being silently counted as an "error".
        predictions.append(list(pop_preds))  # copy: rows must not share one list
        num_errors += 1
print(f"OK but {num_errors} errors") 

  0%|          | 0/200000 [00:00<?, ?it/s]

OK but 15 errors


In [21]:
# `predictions` was filled in the same order as test.user_id, so it lines up
# row-for-row with `test`. Note this mutates `test` in place.
test['predictions'] = predictions
test

Unnamed: 0,user_id,predictions
0,7,"[162251, 63017, 115127, 77577, 221001, 97314, ..."
1,8,"[97249, 142183, 105397, 44222, 163702, 101122,..."
2,9,"[227299, 144772, 63495, 149513, 96717, 32474, ..."
3,11,"[39597, 143520, 211646, 128262, 38136, 159480,..."
4,18,"[190377, 155973, 139289, 120767, 206293, 18141..."
...,...,...
199995,1000160,"[220549, 52187, 198552, 91078, 170326, 44273, ..."
199996,1000165,"[210739, 14866, 120027, 157041, 23921, 216251,..."
199997,1000166,"[73237, 20953, 23837, 101401, 66499, 157041, 8..."
199998,1000168,"[179166, 98609, 51819, 53550, 75882, 119088, 1..."


In [22]:
# Write the submission (test users plus their `predictions` column).
# NOTE(review): assumes data/submission_files/ already exists -- confirm.
test.to_parquet('data/submission_files/als_top20.parquet.gzip', compression='gzip')