In [None]:
# https://github.com/Darel13712/ease_rec
## totally borrowed!

In [4]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import os, inspect, sys

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from load import *
def eval_NDCG(true, pred):
    top_k = pred

    for i, item in enumerate(top_k, 1):
        if item == true:
            return 1 / np.log2(i+1)
    return 0

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = load_data('../data/ml-100k/u.data', threshold=3)
df = df[df['rating']==1].reset_index(drop=True)


In [10]:
test_idx = []
user_id = df
for i in df['userId'].unique():
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
    
train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]


In [21]:
class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'userId'])
        items = self.item_enc.fit_transform(df.loc[:, 'movieId'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        df = pd.DataFrame()
        items = self.item_enc.transform(items)
        dd = train.loc[train['userId'].isin(users)]
        dd['ci'] = self.item_enc.transform(dd['movieId'])
        dd['cu'] = self.user_enc.transform(dd['userId'])
        g = dd.groupby('userId')
        for user, group in tqdm(g):
            watched = set(group['ci'])
            candidates = [item for item in items if item not in watched]
            u = group['cu'].iloc[0]
            pred = np.take(self.pred[u, :], candidates)
            res = np.argpartition(pred, -k)[-k:]
            r = pd.DataFrame({
                "userId": [user] * len(res),
                "movieId": np.take(candidates, res),
                "score": np.take(pred, res)
            }).sort_values('score', ascending=False)
            df = df.append(r, ignore_index=True)
        df['movieId'] = self.item_enc.inverse_transform(df['movieId'])
        return df

In [64]:
uid = 0
ease.user_enc.inverse_transform([0])[0]

array([0], dtype=int16)

In [67]:
ease.item_enc.inverse_transform(np.argsort(ease.pred[0]))

array([299, 741, 484, ..., 164,  11,  49], dtype=int16)

In [69]:
np.argsort(-ease.pred[0])

array([ 49,  11, 162, ..., 476, 728, 297], dtype=int64)

In [70]:
ease.pred[0][np.argsort(-ease.pred[0])]

array([ 1.03033893,  1.02958706,  1.0271893 , ..., -0.03484428,
       -0.03756666, -0.04414989])

In [74]:
np.unique(train[train['userId']==0]['movieId'])

array([  0,   2,   5,   6,   8,  11,  12,  13,  14,  15,  17,  18,  19,
        21,  24,  27,  31,  32,  38,  41,  42,  43,  44,  45,  46,  47,
        49,  50,  51,  54,  55,  56,  57,  58,  59,  60,  63,  64,  65,
        67,  71,  74,  75,  76,  78,  79,  80,  81,  83,  85,  86,  87,
        88,  89,  90,  92,  94,  95,  97,  99, 105, 106, 107, 108, 110,
       112, 113, 114, 118, 120, 122, 123, 126, 127, 128, 131, 132, 133,
       134, 136, 143, 145, 149, 150, 151, 153, 155, 156, 159, 160, 161,
       162, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
       177, 180, 181, 182, 183, 184, 185, 186, 189, 190, 191, 192, 193,
       194, 195, 196, 197, 198, 201, 202, 203, 205, 206, 207, 208, 209,
       211, 213, 215, 220, 221, 222, 223, 226, 227, 228, 229, 233, 234,
       235, 237, 238, 240, 241, 245, 247, 248, 249, 250, 252, 255, 256,
       257, 264, 266, 267, 268, 269], dtype=int16)

In [22]:
ease = EASE()

In [23]:
ease.fit(train)

In [45]:
pred = ease.predict(train, train['userId'].unique(), train['movieId'].unique(), 100)

100%|██████████| 942/942 [00:02<00:00, 349.91it/s]


In [46]:
pred

Unnamed: 0,userId,movieId,score
0,0,356,0.047607
1,0,750,0.045110
2,0,402,0.043548
3,0,519,0.036682
4,0,142,0.035914
5,0,179,0.034313
6,0,704,0.034257
7,0,658,0.034250
8,0,115,0.034247
9,0,96,0.032781


In [61]:
uid = 1
df[(df['userId']==uid) & (df['movieId'].isin(pred[pred['userId']==uid]['movieId']))]

Unnamed: 0,userId,movieId,rating
33375,1,284,1


In [62]:
train[(train['userId']==uid) & (train['movieId'].isin(pred[pred['userId']==uid]['movieId']))]

Unnamed: 0,userId,movieId,rating


In [50]:
for uid in range(942):
    pdf = df[(df['userId']==uid) & (df['movieId'].isin(pred[pred['userId']==uid]['movieId']))]
    if len(pdf) > 0:
        print(uid)

1
4
5
6
7
11
13
15
16
17
18
20
22
23
24
28
31
32
33
34
36
39
40
41
42
43
44
46
47
48
50
52
54
55
56
58
60
66
67
68
69
73
75
78
80
81
87
94
95
99
103
104
111
113
116
119
122
124
126
127
128
130
131
132
133
136
137
139
142
143
145
148
150
151
152
153
159
160
162
167
168
169
172
173
174
176
177
180
183
186
189
190
191
192
196
198
204
209
217
219
220
221
225
229
230
231
234
236
239
240
244
246
250
251
255
256
257
258
259
261
262
265
266
271
272
273
274
275
276
280
281
283
286
288
293
295
296
297
299
300
301
302
303
304
307
309
310
312
314
317
318
319
322
323
324
326
327
328
331
332
334
337
338
340
342
345
346
347
350
352
353
354
355
358
359
360
361
364
365
366
368
369
372
378
379
380
383
384
387
389
390
392
394
395
396
399
401
402
405
406
407
409
412
413
415
417
418
419
421
425
427
430
432
433
438
440
441
442
445
447
452
453
454
455
457
460
461
462
465
469
470
471
472
473
474
476
477
480
481
483
484
485
487
490
491
492
493
494
495
496
498
501
505
509
510
512
514
516
517
520
529
531
534
539

In [30]:
ease.pred.shape

(942, 1445)

In [32]:
train['userId'].unique().shape, train['movieId'].unique().shape, 

((942,), (1445,))