In [1]:
import os
import json
import pandas as pd
import numpy as np
import tqdm
import scipy.sparse as sp

import implicit
import lightfm
import warnings
from validation_prepare import *
warnings.filterwarnings('ignore')

In [2]:
def get_recomend(model,X,fe,test_users,user_history_films,user_features = None,item_features = None,mode = 'lightfm',N = 20,is_filter_history = True):
    """Build top-``N`` movie recommendations for every known user in ``test_users``.

    Parameters
    ----------
    model : fitted recommender.
        ``mode='lightfm'`` calls ``model.predict(user_row, item_cols,
        item_features=..., user_features=...)``; ``mode='implicit'`` calls
        ``model.recommend(...)`` and iterates the result as
        ``(item_col, score)`` pairs.
    X : interaction matrix; ``X.shape[1]`` is the number of item columns and,
        in implicit mode, ``X.tocsr()`` is handed to the model.
    fe : object exposing ``match_user_row`` (user_uid -> matrix row) and
        ``train_movie_match_row_movie`` (item column -> movie id).
    test_users : iterable of user_uid values.
    user_history_films : mapping ``matrix row -> collection of item columns``
        that must not be recommended when ``is_filter_history`` is true.
    user_features, item_features : forwarded to ``model.predict`` (lightfm only).
    mode : ``'lightfm'`` or ``'implicit'``; any other value yields an empty dict.
    N : number of recommendations per user.
    is_filter_history : drop the user's known items from the candidates.

    Returns
    -------
    dict
        ``user_uid -> list[int]`` of movie ids, best first. Users missing from
        ``fe.match_user_row`` are skipped.
    """
    result = {}

    # Loop-invariant work: the full column index and the CSR view of X.
    all_cols = np.arange(X.shape[1])
    user_items = X.tocsr() if mode == 'implicit' else None

    for user_uid in tqdm.tqdm(test_users):

        # transform user_uid to model's internal user category (matrix row)
        try:
            user_cat = fe.match_user_row[user_uid]
        except LookupError:
            continue

        if mode == 'lightfm':
            if is_filter_history:
                seen = user_history_films.get(user_cat, ())
                seen_cols = np.fromiter(seen, dtype=all_cols.dtype, count=len(seen))
                need_cols = all_cols[~np.isin(all_cols, seen_cols)]
            else:
                need_cols = all_cols
            recs = model.predict(user_cat,need_cols,item_features=item_features,user_features=user_features)
            # argsort yields positions inside `need_cols`; translate them back
            # to real matrix columns before looking the movie ids up.
            best_cols = need_cols[np.argsort(-recs)[:N]]
            # convert numpy integers to int so the result is JSON serializable
            result[user_uid] = [int(fe.train_movie_match_row_movie[col]) for col in best_cols]

        elif mode == 'implicit':
            if is_filter_history:
                recs = model.recommend(
                    user_cat,
                    user_items,
                    N=N,
                    filter_already_liked_items=True,
                    # the history mapping is keyed by matrix row, not by user_uid
                    filter_items=list(user_history_films.get(user_cat, ())),
                )
            else:
                recs = model.recommend(
                    user_cat,
                    user_items,
                    N=N,filter_already_liked_items=False,
                )
            # drop scores and map the model's item columns to movie ids
            result[user_uid] = [int(fe.train_movie_match_row_movie[i]) for i, _ in recs]
    return result

def get_predict(model,X,fe,test_users,user_history_films,user_features = None,item_features = None,mode = 'lightfm',):
    """Score every item column for each known user in ``test_users``.

    Parameters
    ----------
    model : fitted recommender.
        ``mode='lightfm'`` calls ``model.predict(user_row, item_cols,
        item_features=..., user_features=...)``; ``mode='implicit'`` calls
        ``model.rank_items(user_row, X.tocsr(), item_cols)`` and iterates the
        result as ``(item_col, score)`` pairs.
    X : interaction matrix; ``X.shape[1]`` is the number of item columns.
    fe : object exposing ``match_user_row`` (user_uid -> matrix row).
    test_users : iterable of user_uid values.
    user_history_films : unused; kept so the signature mirrors ``get_recomend``.
    user_features, item_features : forwarded to ``model.predict`` (lightfm only).
    mode : ``'lightfm'`` or ``'implicit'``; any other value yields an empty dict.

    Returns
    -------
    dict
        ``user_uid -> scores`` where position ``j`` holds the score of item
        column ``j`` (the array returned by ``predict`` in lightfm mode, a
        list in implicit mode). Users missing from ``fe.match_user_row`` are
        skipped.
    """
    result = {}

    # Loop-invariant work: an explicitly ordered column index (so score
    # position == column index) and the CSR view of X for implicit models.
    all_cols = np.arange(X.shape[1])
    user_items = X.tocsr() if mode == 'implicit' else None

    for user_uid in tqdm.tqdm(test_users):

        # transform user_uid to model's internal user category (matrix row)
        try:
            user_cat = fe.match_user_row[user_uid]
        except LookupError:
            continue

        if mode == 'lightfm':
            result[user_uid] = model.predict(user_cat,all_cols,item_features=item_features,user_features=user_features)

        elif mode == 'implicit':
            ranked = model.rank_items(user_cat, user_items, all_cols)
            # rank_items output is re-sorted by item column so that the scores
            # line up with the column index
            ranked = sorted(ranked,key = lambda pair:pair[0])
            result[user_uid] = [score for _, score in ranked]
    return result

In [3]:
pd.set_option('display.max_columns',100)

DATA_PATH = '../okko/orig_data'
PREPARED_PATH = './prepared_data/'

In [4]:
# Load the prepared actions table from PREPARED_PATH.
actions = pd.read_pickle(PREPARED_PATH+'actions_one_table.pkl')

actions.sort_index(inplace = True) # Just in case; otherwise the split will not work (presumably the get_train_test split further down - confirm)

# Preview the first rows.
actions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action,consumption_mode,device_manufacturer,device_type,rating,watched_time,duration,type
user_uid,element_uid,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,51,44165460.0,watch,S,99.0,0.0,,12382.0,3600,1
0,72,43758290.0,watch,S,99.0,0.0,,5653.0,6000,1
0,207,43719040.0,watch,S,99.0,0.0,,2646.0,5400,1
0,209,43778140.0,watch,S,99.0,0.0,,6971.0,7200,1
0,434,43381090.0,watch,S,99.0,0.0,,5894.0,6600,1


In [5]:
# Only the third value returned by get_target is needed here.
_,_,watch_actions,_ = get_target(actions)

# Attach the 'rel_dur' column of watch_actions to the full action log.
# NOTE: not idempotent - re-running this cell on the same kernel tries to join
# a second 'rel_dur' column onto `actions`.
actions = actions.join(watch_actions['rel_dur'])

# Rows without a matching watch record get 0; infinite values are replaced by 1.
actions['rel_dur'] = actions['rel_dur'].fillna(0).replace(np.inf,1)

In [6]:
idx = get_train_test(actions)

6558458
2186152
2186153


In [7]:
actions.iloc[idx[0]].index.get_level_values(2).max()

43362401.96226887

In [8]:
actions.iloc[idx[1]].index.get_level_values(2).min()

43362401.97085199

In [9]:
actions.iloc[idx[1]].index.get_level_values(2).max()

43828341.47903843

In [10]:
actions.iloc[idx[2]].index.get_level_values(2).min()

43828341.48519237

In [11]:
actions.consumption_mode.value_counts()

S    8296227
P     873834
R     472951
Name: consumption_mode, dtype: int64

In [12]:
with open(os.path.join(DATA_PATH, 'test_users.json'), 'r') as f:
    test_users = set(json.load(f)['users'])


In [13]:
actions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,action,consumption_mode,device_manufacturer,device_type,rating,watched_time,duration,type,rel_dur
user_uid,element_uid,ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,51,44165460.0,watch,S,99.0,0.0,,12382.0,3600,1,3.439444
0,72,43758290.0,watch,S,99.0,0.0,,5653.0,6000,1,0.942167
0,207,43719040.0,watch,S,99.0,0.0,,2646.0,5400,1,0.49
0,209,43778140.0,watch,S,99.0,0.0,,6971.0,7200,1,0.968194
0,434,43381090.0,watch,S,99.0,0.0,,5894.0,6600,1,0.89303


In [14]:
# Вроде не пересекается.
train,test,valid = actions.iloc[idx[0]],actions.iloc[idx[1]],actions.iloc[idx[2]]

In [15]:
%time 
dur_being_train,dur_films_train,watch_actions_train,target_train = get_target(train)
dur_being_test,dur_films_test,watch_actions_test,target_test = get_target(test)
dur_being_valid,dur_films_valid,watch_actions_valid,target_valid = get_target(valid)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


In [16]:
# Load the precomputed movie (catalogue) features.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this project's own preparation step.
import pickle
with open(PREPARED_PATH+'catalogue_features.pkl','rb') as f:
    match_element_row,match_row_element,match_columns,element_matrix = pickle.load(f)
# Two lookup tables over match_columns: entry -> position and position -> entry.
movie_match_columns = {i:ii for ii,i in enumerate(match_columns)}
movie_columns_match = {ii:i for ii,i in enumerate(match_columns)}

# Second pickle: passed below to FeatureExtractor as its second positional argument.
with open(PREPARED_PATH+'bag_of_attr_movie.pkl','rb') as f:
    bag_of_attr = pickle.load(f)

In [17]:
# NOTE(review): this first FeatureExtractor (is_censor=True, delimiter=-1,
# mode='duration', target 'rel_dur') is overwritten by the assignment right
# below before it is ever fitted - looks like a leftover experiment; confirm
# and remove if so.
fe = FeatureExtractor({'movie_attr_matrix':element_matrix,'movie_match_columns':movie_match_columns,
                       'movie_columns_match':movie_columns_match,'movie_match_row_movie':match_row_element,
                      'movie_match_movie_row':match_element_row,},bag_of_attr,is_censor = True,delimiter=-1,mode = 'duration',
                     target_col_name='rel_dur')

# The extractor actually used from here on: is_censor=False, delimiter=4,
# remaining constructor arguments left at their defaults.
fe = FeatureExtractor({'movie_attr_matrix':element_matrix,'movie_match_columns':movie_match_columns,
                       'movie_columns_match':movie_columns_match,'movie_match_row_movie':match_row_element,
                      'movie_match_movie_row':match_element_row,},bag_of_attr,is_censor =False,delimiter=4,)

# Fit on the training slice only.
fe.fit(train)

100%|██████████| 389188/389188 [03:19<00:00, 1946.98it/s]


(389188, 29221) 389188 29221


<validation_prepare.FeatureExtractor at 0x7f344265c208>

In [18]:
train_ = fe.transform(train)
test_ = fe.transform(test)
valid_ = fe.transform(valid)

train_.shape,test_.shape 

cfe = ColdFeatureExtractor(fe)

cfe.fit(train)

max(cfe.train_movie_cols)

# sp.csc_matrix(cfe.fitted_FE.movie_attr_matrix)[cfe.train_movie_rows,cfe.train_movie_cols]



train_res = cfe.transform(test)


print(train_res['train_user'].shape,train_res['test_user'].shape,train_res['new_test_user'].shape)

print(train_res['train_movie'].shape,train_res['test_movie'].shape)

8015 389188
(286374, 1) (281788, 3)
(389188, 8015)
8015 389188
(85260, 1) (63113, 3)
(389188, 8015)
8015 389188
(67156, 1) (44606, 3)
(389188, 8015)


100%|██████████| 35799/35799 [00:13<00:00, 2601.27it/s]


(389188, 29221) (389188, 29221) (6654, 29221)
(8015, 29232) (251, 29232)


In [19]:
np.min(train_)

0.0

In [20]:
'''
Здесь начинается русское поле экспериментов над параметрами моделей.
'''

'\nЗдесь начинается русское поле экспериментов над параметрами моделей.\n'

In [21]:
train_.shape,test_.shape,valid_.shape

((389188, 8015), (389188, 8015), (389188, 8015))

In [22]:
train_res['train_movie']

<8015x29232 sparse matrix of type '<class 'numpy.float64'>'
	with 204594 stored elements in Compressed Sparse Row format>

In [23]:
%%time
from collections import defaultdict
a,b = train_.nonzero()
D_row_col_train = defaultdict(set)
D_user_movie_train = defaultdict(set)
for i,ii in tqdm.tqdm(zip(a,b),total = len(a)):
    D_row_col_train[i].add(ii)
    D_user_movie_train[fe.match_row_user[i]].add(fe.train_movie_match_row_movie[ii])

from collections import defaultdict
a,b = test_.nonzero()
D_row_col_test = defaultdict(set)
D_user_movie_test = defaultdict(set)
for i,ii in tqdm.tqdm(zip(a,b),total = len(a)):
    D_row_col_test[i].add(ii)
    D_user_movie_test[fe.match_row_user[i]].add(fe.train_movie_match_row_movie[ii])

from collections import defaultdict
a,b = valid_.nonzero()
D_row_col_valid = defaultdict(set)
D_user_movie_valid = defaultdict(set)
for i,ii in tqdm.tqdm(zip(a,b),total = len(a)):
    D_row_col_valid[i].add(ii)
    D_user_movie_valid[fe.match_row_user[i]].add(fe.train_movie_match_row_movie[ii])

100%|██████████| 281264/281264 [00:01<00:00, 251883.72it/s]
100%|██████████| 63024/63024 [00:00<00:00, 219041.97it/s]
100%|██████████| 44525/44525 [00:00<00:00, 284423.58it/s]

CPU times: user 1.55 s, sys: 22.7 ms, total: 1.58 s
Wall time: 1.57 s





In [24]:

seed = 0
epochs = 30
num_threads=4

model = lightfm.LightFM(loss = 'warp',random_state=seed)

In [25]:
from implicit.nearest_neighbours import TFIDFRecommender,CosineRecommender,NearestNeighboursScorer,BM25Recommender,bm25_weight
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k,precision_at_k

In [26]:
from lightfm.evaluation import precision_at_k,auc_score

In [27]:
np.max(train_)

10.0

In [28]:
model_im = BM25Recommender()#CosineRecommender()
model_im.fit(train_.T)

100%|██████████| 8015/8015 [00:00<00:00, 239801.88it/s]


In [29]:
# recs_model_im_test = get_predict(model_im,train_,fe,D_user_movie_test.keys(),D_row_col_train,mode = 'implicit')

# list(recs_model_im_test.keys())[0]

# len(recs_model_im_test[17])

In [30]:
model = lightfm.LightFM(loss = 'logistic',random_state=seed,max_sampled=100,no_components=100)
model.fit(train_,epochs=30,)

<lightfm.lightfm.LightFM at 0x7f33fffc1710>

In [31]:
# Здесь простецкий стекинг делать

In [32]:
recs_model_test = get_predict(model,train_,fe,D_user_movie_test.keys(),D_row_col_train,mode = 'lightfm')
len(recs_model_test[list(recs_model_test.keys())[0]])
len(recs_model_test[17])

100%|██████████| 26360/26360 [01:31<00:00, 286.82it/s]


8015

In [34]:
recs_model_valid = get_predict(model,train_,fe,D_user_movie_valid.keys(),D_row_col_train,mode = 'lightfm')
len(recs_model_valid[list(recs_model_valid.keys())[0]])
#len(recs_model_valid[17])

100%|██████████| 19605/19605 [01:11<00:00, 274.42it/s]


8015

In [35]:
recs_model_test_true = get_predict(model,train_,fe,test_users,D_row_col_train,mode = 'lightfm')
len(recs_model_test_true[list(recs_model_test_true.keys())[0]]),len(recs_model_test_true)

100%|██████████| 50000/50000 [02:24<00:00, 345.37it/s]


(8015, 39299)

In [36]:
with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r') as f:
    catalogue = json.load(f)
    
catalogue = {int(k): v for k, v in catalogue.items()}

In [51]:
def get_movie_feat(movie_id,data_catalogue_orig):
    """Return the movie's attributes as a ``{attribute: 1}`` indicator dict.

    ``data_catalogue_orig`` maps a movie id to a record with an
    ``'attributes'`` iterable. The lookup uses this argument rather than the
    notebook-level ``catalogue`` global, so the function works with whatever
    catalogue the caller passes. Raises ``KeyError`` for an unknown movie id.
    """
    attributes = data_catalogue_orig[movie_id]['attributes']
    return {attribute: 1 for attribute in attributes}
def convert_to_range(X,data_catalogue_orig,path = PREPARED_PATH+'train.txt', label_name = 'target',need_feat = None):
    """Convert ``X`` to a libsvm-style text file plus a companion query file.

    For every unique value of index level 0 of ``X`` (presumably the user id,
    as in ``wm_cut`` - confirm) one group is written. Each row of the group
    becomes a line ``"<int label> <key>:<value> ... \\n"`` in ``path``, where
    the keys are the movie's catalogue attributes (value 1) updated with the
    ``need_feat`` columns of that row. ``path + '.query'`` receives one line
    per group holding the number of rows in it.

    Parameters
    ----------
    X : DataFrame indexed so that ``X.loc[key]`` yields the rows of one group,
        whose own first index level is the movie id.
    data_catalogue_orig : catalogue mapping forwarded to ``get_movie_feat``.
    path : output file for the samples; the query file is ``path + '.query'``.
    label_name : column holding the label (cast to ``int``).
    need_feat : columns copied into each line as extra features.
    """
    sample_lines = []
    group_sizes = []
    outer_keys = np.unique(X.index.get_level_values(0))
    for outer_key in tqdm.tqdm(outer_keys):
        group = X.loc[outer_key]
        n_rows = 0
        for movie_id in group.index.get_level_values(0):
            extra = group.loc[movie_id,need_feat].to_dict()
            features = get_movie_feat(movie_id,data_catalogue_orig)
            # row-level features are merged over the catalogue attributes
            features.update(extra)
            # the integer label comes first on every line (original note: rating 0...10)
            label = str(int(group.loc[movie_id,label_name]))
            pairs = ''.join(str(name)+':'+str(value)+' ' for name,value in features.items())
            sample_lines.append(label+' '+pairs+'\n')
            n_rows += 1
        group_sizes.append(str(n_rows)+'\n')
    with open(path,'w') as out:
        out.writelines(sample_lines)
    with open(path+'.query','w') as out:
        out.writelines(group_sizes)

In [52]:
wm = pd.read_pickle(PREPARED_PATH+'some_wm.pkl')
wm_cut = wm.loc[~wm.rating.isnull()]

In [53]:
wm_cut.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rel_dur,type,rating,action_enc,cons_enc,first_ts,first_ts_1,diff_ts,duration,feature_1,feature_2,feature_3,feature_4,feature_5,is_history,is_future,diff_novice_ts,target,mean_rating_user,num_watched_user,mean_target_user,mean_rating_movie,num_watched_movie,mean_target_movie,probably_kp_rate
user_uid,element_uid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1,1653,1.005185,1.0,10.0,watch+rate,R,42020580.0,44249580.0,177102.426069,90,41661080.0,0.733306,18,1.141929,0.449667,1,0,-1.0,1,9.666667,21,0.761905,8.879898,11152,0.884953,0.837383
1,2245,1.024394,1.0,10.0,watch+rate,S,42227380.0,42197680.0,4771.864115,110,40975260.0,0.784022,45,1.140273,0.449667,1,0,-1.0,1,9.666667,21,0.761905,8.952146,54153,0.753642,0.893999
1,10084,0.0,1.0,9.0,rate,,42256000.0,42232150.0,3722.29505,80,40741680.0,0.687308,19,1.138604,0.68041,1,0,-1.0,-1,9.666667,21,0.761905,7.384956,23946,0.6665,0.782572
3,6409,1.221,1.0,10.0,watch+rate,P,42597580.0,42597210.0,99080.013576,100,42500430.0,0.769551,13,1.138604,0.654707,0,0,-1.0,1,10.0,54,0.555556,8.710054,19196,0.923422,0.876214
14,1354,0.9925,1.0,7.0,watch+rate,R,43137700.0,42534380.0,32361.807478,100,41367230.0,0.655429,14,1.141929,0.68041,1,0,-1.0,1,7.0,8,1.0,7.529058,4651,0.911847,0.748454


In [54]:
need_feat =['rel_dur','diff_ts','duration','feature_1','feature_2','feature_3','feature_4','feature_5','num_watched_user'
            ,'mean_rating_movie','probably_kp_rate']

In [55]:
convert_to_range(wm_cut,catalogue,label_name = 'rating',need_feat = need_feat)

100%|██████████| 104563/104563 [05:51<00:00, 297.20it/s]


In [56]:
from sklearn.datasets import load_svmlight_file

In [57]:
import lightgbm

In [59]:
d = lightgbm.Dataset(PREPARED_PATH+'train.txt')

In [None]:
res = get_recomend(model_im,train_,fe,D_user_movie_train.keys(),D_row_col_train,mode = 'implicit',is_filter_history=False)

metric(D_user_movie_train,res)

In [None]:
res = get_recomend(model_im,train_,fe,D_user_movie_test.keys(),D_row_col_train,mode = 'implicit')

metric(D_user_movie_test,res)

In [None]:
res = get_recomend(model_im,train_,fe,D_user_movie_valid.keys(),D_row_col_train,mode = 'implicit')

metric(D_user_movie_valid,res)

In [None]:
#precision_at_k(model,train_,test_,k = 20)

In [None]:
# Организуем hyperopt
import colorama
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials


# Hyperopt search space; get_params() below turns one sample into LightFM
# constructor keyword arguments.
space ={
        'loss': hp.choice( 'loss', ['logistic', 'warp' ] ),
        'learning_schedule': hp.choice("learning_schedule", ['adagrad','adadelta']),
        # quniform samples are floats on a grid (step 0.05 here)
        'rho':  hp.quniform('rho', 0.75, 0.99,0.05),
        # float samples; get_params() casts max_sampled / no_components to int
        'max_sampled': hp.quniform('max_sampled', 10, 1000,20),
        # loguniform bounds are exponents: roughly exp(-6.9) ~ 1e-3 up to exp(-1) ~ 0.37
        'learning_rate': hp.loguniform('learning_rate', -6.9, -1),  
        'no_components':hp.quniform('no_components',5,200,5),
       }
# Module-level state updated by objective() through `global`.
cur_best_loss = np.inf
cnt = 1


def get_params(space):
    """Turn one hyperopt sample into LightFM constructor keyword arguments.

    ``rho`` is capped at 0.99 and the float-valued ``max_sampled`` /
    ``no_components`` samples are cast to ``int``; the remaining entries are
    copied unchanged.
    """
    return {
        'loss': space['loss'],
        'learning_schedule': space['learning_schedule'],
        'rho': min(0.99,space['rho']),
        'max_sampled': int(space['max_sampled']),
        'learning_rate': space['learning_rate'],
        'no_components': int(space['no_components']),
    }
def objective(space):
    """Hyperopt objective: fit LightFM with the sampled params and score it.

    Trains on ``train_`` for 30 epochs, builds recommendations for the users of
    ``D_user_movie_test`` with ``get_recomend`` and returns the negated
    ``metric`` as the loss. Updates the module-level ``cur_best_loss`` and
    ``cnt`` and prints progress every 10th attempt and on every new best.
    """
    global cur_best_loss,cnt
    params = get_params(space)

    candidate = lightfm.LightFM(random_state=seed,**params)
    candidate.fit(train_,epochs=30)
    recommendations = get_recomend(candidate,train_,fe,D_user_movie_test.keys(),D_row_col_train)

    # the metric is negated so that a lower loss means a better model
    loss_value = -metric(D_user_movie_test,recommendations)

    if cnt%10 == 0:
        print('Попытка номер:',cnt)

    improved = loss_value < cur_best_loss
    if improved:
        cur_best_loss = loss_value
        banner = 'NEW BEST LOSS={}'.format(-cur_best_loss)
        print(colorama.Fore.GREEN + banner + colorama.Fore.RESET)
        print(params)

    cnt+=1
    return {'loss':loss_value, 'status': STATUS_OK }
trials = Trials()
# best = hyperopt.fmin(fn=objective,
#                      space=space,
#                      algo=tpe.suggest,
#                      max_evals=200,
#                      trials=trials,
#                      verbose=2)
p={'loss': 'warp', 'learning_schedule': 'adagrad', 'rho': 0.75, 'max_sampled': 460, 'learning_rate': 0.005108939164793534, 'no_components': 160}

In [None]:
res = get_recomend(model,train_,fe,D_user_movie_test.keys(),D_row_col_train)#test_users#,mode = 'implicit'


In [None]:
metric(D_user_movie_test,res)

In [None]:
# Валидация
res = get_recomend(model,valid_,fe,D_user_movie_valid.keys(),D_row_col_train)#,mode = 'implicit'
metric(D_user_movie_valid,res)

In [None]:
len(D_user_movie_test)

In [None]:
len(set(D_user_movie_test.keys()) & set(res.keys()))

In [None]:
'''
Здесь заканчивается русское поле экспериментов
'''

In [None]:
actions.head()

In [None]:
wm = pd.read_pickle(PREPARED_PATH+'some_wm.pkl')

In [None]:
wm.head()

In [None]:
# Check that the pipeline does not choke on the full dataset.

# Same catalogue mappings as before, here with is_censor=True and delimiter=4.
fe = FeatureExtractor({'movie_attr_matrix':element_matrix,'movie_match_columns':movie_match_columns,
                       'movie_columns_match':movie_columns_match,'movie_match_row_movie':match_row_element,
                      'movie_match_movie_row':match_element_row,},bag_of_attr,is_censor = True,delimiter=4,)


# fe = FeatureExtractor({'movie_attr_matrix':element_matrix,'movie_match_columns':movie_match_columns,
#                        'movie_columns_match':movie_columns_match,'movie_match_row_movie':match_row_element,
#                       'movie_match_movie_row':match_element_row,},bag_of_attr,is_censor = False)
# Fit and transform on the whole action log (not only the train slice).
X = fe.fit_transform(actions)

In [None]:
cfe = ColdFeatureExtractor(fe)

res_action = cfe.fit_transform(actions)

In [None]:
#res_action.keys()

In [None]:
#len(fe.train_movie_match_movie_row),len(fe.match_user_row)

In [None]:
X_user = res_action['train_user']
X_movie = res_action['train_movie']
X_user.shape,X_movie.shape

In [None]:
# def fit_lightfm(train,item_features=None,seed = 0)
seed = 0
epochs = 30
num_threads=4

model = lightfm.LightFM(loss = 'warp',random_state=seed)


# model.fit(train_matrix,user_features = train_user,item_features = element_matrix,epochs = epochs,num_threads = num_threads,
#          verbose = True)


In [None]:


%time model.fit(X,epochs = epochs,num_threads = num_threads,verbose = True)

In [None]:
%%time
from collections import defaultdict
a,b = X.nonzero()
D = defaultdict(set)
for i,ii in tqdm.tqdm(zip(a,b),total = len(a)):
    D[i].add(ii)
# D = pd.DataFrame([a,b],index = ['row','columns']).T
# D = D.groupby('row').agg(lambda x:frozenset(x.values))
# D

In [None]:
#D[1]

In [None]:
len(fe.train_movie_match_row_movie)

In [None]:
print('sparsity',X.nnz/(X.shape[0]*X.shape[1]))

In [None]:
len( set(test_users) - set(fe.match_user_row.keys()))

In [None]:
from implicit.nearest_neighbours import TFIDFRecommender,CosineRecommender,NearestNeighboursScorer



In [None]:
# Для implicit  надо фильмы*юзеры матрицу.
# model = CosineRecommender()
# model.fit(X.T.tocsr())

In [None]:
model = CosineRecommender()
model.fit(X.T)

In [None]:
res = get_recomend(model,X,fe,test_users,D,mode = 'implicit')

In [None]:
# Build a submission with lightfm (or implicit, depending on `mode`).
# NOTE(review): `model` must match `mode`; the cells just above rebind `model`
# to a CosineRecommender, which has no lightfm-style `predict` - confirm which
# model is meant before running.
result = {}
mode = 'lightfm'#'implicit'#'lightfm'
cnt = 0

# Loop-invariant work: the full column index and the CSR view of X.
all_cols = np.arange(X.shape[1])
user_items = X.tocsr() if mode == 'implicit' else None

for user_uid in tqdm.tqdm(test_users):

    # transform user_uid to model's internal user category (matrix row)
    try:
        user_cat = fe.match_user_row[user_uid]
    except LookupError:
        continue
    if mode == 'lightfm':
        # candidates = every column the user has not interacted with yet
        seen = D.get(user_cat, ())
        seen_cols = np.fromiter(seen, dtype=all_cols.dtype, count=len(seen))
        need_cols = all_cols[~np.isin(all_cols, seen_cols)]
        recs = model.predict(user_cat,need_cols,num_threads=num_threads)
        # argsort yields positions inside `need_cols`; map them back to real
        # matrix columns before translating to movie ids
        need_movies = need_cols[np.argsort(-recs)[:20]]
        # convert numpy integers to int so the result can be json serialized later
        result[user_uid] = [int(fe.train_movie_match_row_movie[i]) for i in need_movies]

    # perform inference
    if mode == 'implicit':
        recs = model.recommend(
            user_cat,
            user_items,
            N=20,
            filter_already_liked_items=True,
            # D is keyed by matrix row, not by user_uid
            filter_items=list(D.get(user_cat, ()))
        )
        # drop scores and transform model's internal element category to element_uid for every prediction
        result[user_uid] = [int(fe.train_movie_match_row_movie[i]) for i, _ in recs]
    cnt += 1

In [None]:
user_cat

In [None]:
len(fe.train_movie_match_row_movie)

In [None]:
recs

In [None]:
#len(need_cols),len(recs),type(recs)

In [None]:
#plt.hist(recs)

In [None]:
len(result)

In [None]:

with open('answer.json', 'w') as f:
    json.dump(result, f)

In [None]:
result[0]

In [None]:
from lightfm.evaluation import auc_score,precision_at_k

# Compute and print precision@20 (this is precision_at_k, not AUC): `te` is
# scored as the test interactions and `tr` is passed as the train interactions.
# The variable name is kept for compatibility with the rest of the notebook.
train_auc = precision_at_k(model, te ,tr, k = 20,num_threads=4).mean()
print('Collaborative filtering test precision@20: %s' % train_auc)

In [None]:
new_old_tr_te_2.keys()

In [None]:
tr = new_old_tr_te_2['train_interactions']
te = new_old_tr_te_2['test_interactions']
uf = new_old_tr_te_2['user_features_train']
tf = new_old_tr_te_2['user_features_test']
print(tr.shape,te.shape,uf.shape,tf.shape)
# tr[(tr < 3) &  (tr>0)] = -1
# tr[(tr > 3)] = 1
# te[(tr < 3) &  (te>0)] = -1
# te[(tr > 3)] = 1

model.fit(train_matrix,user_features=uf,epochs = epochs,num_threads =num_threads,
         verbose = True)

In [None]:
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt


In [None]:
p = model.predict(0,np.arange(tr.shape[1]),user_features=tf[0,:],num_threads=num_threads)

In [None]:
plt.hist(p)

In [None]:
test_user[0,:]

In [None]:
a = model.predict(0,np.arange(train_matrix.shape[1]),user_features=test_user[0,:],num_threads=num_threads)
len(a)

In [None]:
from lightfm.evaluation import auc_score,precision_at_k

# Compute and print precision@k on the training interactions (this is
# precision_at_k with the library's default k, not AUC). The variable name is
# kept for compatibility with the rest of the notebook.
train_auc = precision_at_k(model, train_matrix,user_features = train_user,item_features = element_matrix ,num_threads=num_threads).mean()
print('Collaborative filtering train precision@k: %s' % train_auc)

In [None]:
model.user_feature_map

In [None]:
def get_answer(test_users_dict,test):
    """Placeholder that is not implemented yet; always returns ``None``."""
    return None