In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from load import *
from evals import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
SEED = 42
np.random.seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)

In [3]:
def load_data(filname):
    # only valid for currnet workspace
    f = open(filname, 'r')
    fs = f.readlines()
    f.close()

    df = pd.DataFrame(list(map(lambda x: x.split('\t'), fs)), columns=['userId', 'movieId', 'rating', 'time'])
    df = df.drop('time', axis=1)
    df['userId'] = df['userId'].astype(int)
    df['movieId'] = df['movieId'].astype(int)
    df['rating'] = df['rating'].astype(float)
    
    df = df[['userId', 'movieId', 'rating']]
    m_codes = df['movieId'].astype('category').cat.codes
    u_codes = df['userId'].astype('category').cat.codes
    df['movieId'] = m_codes
    df['userId'] = u_codes
    
    return df

def extract_from_df(df, n_positive):
    df_ = df.copy()
    rtd = []
    
    user_id = df['userId'].unique()
    
    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i]['movieId'].index, n_positive, replace=False))
        
    return rtd

In [4]:
df = load_data('../data/ml-100k/u.data')
uuid = df['userId'].unique()
uiid = df['movieId'].unique()


In [5]:
rtd = extract_from_df(df, 1)
train = df.drop(rtd)
test = df.loc[rtd]

100%|██████████| 943/943 [00:00<00:00, 1985.10it/s]


In [6]:
R = pd.pivot_table(train, index='userId', values='rating', columns='movieId').fillna(0)

P = np.where(R>0, 1, 0)
R = R.values

In [7]:
n_u = R.shape[0]
n_i = R.shape[1]

k = 20
alpha = 40
lamda = 150
epochs = 10

In [8]:
X = np.random.rand(n_u, k)*0.01
Y = np.random.rand(n_i, k)*0.01

C = (1 + alpha*R)

In [9]:
def loss_function(C, P, X, Y, r_lambda):
    predict_error = np.square(P - np.matmul(X, Y.T))
    
    regularization = r_lambda * (np.mean(np.square(X)) + np.mean(np.square(Y)))
    confidence_error = np.mean(C * predict_error)
    total_loss = confidence_error + regularization
    predict_error = np.mean(predict_error)
    return predict_error, confidence_error, regularization, total_loss

In [10]:
def update(x, y, p, c=C):
    xt = x.T
    yt = y.T
    
    for u in range(n_u):
        c_ = C[u, :]
        p_ = P[u, :]
        cu = np.diag(c_)
        
        ycy = y.T.dot(cu).dot(y)
        ycyi = ycy+lamda*np.identity(ycy.shape[0])
        ycp = y.T.dot(cu).dot(p_.T)
        
        x[u] = np.linalg.solve(ycyi, ycp)
        
    for i in range(n_i):
        c_ = C[:, i]
        p_ = P[:, i]
        ci = np.diag(c_)
        
        xcx = x.T.dot(ci).dot(x)
        xcxi = xcx+lamda*np.identity(xcx.shape[0])
        xcp = x.T.dot(ci).dot(p_.T)
        
        y[i] = np.linalg.solve(xcxi, xcp)
        
    return x, y

In [11]:
for e in tqdm(range(epochs)):
    X, Y = update(X, Y, C)
    predict_error, confidence_error, regularization, total_loss = loss_function(C, P, X, Y, lamda)
    print('----------------step %d----------------' %e)
    print("predict error: %f" % predict_error)
    print("confidence error: %f" % confidence_error)
    print("regularization: %f" % regularization)
    print("total loss: %f" % total_loss)

 10%|█         | 1/10 [00:17<02:33, 17.09s/it]

----------------step 0----------------
predict error: 0.161185
confidence error: 2.068850
regularization: 59.882542
total loss: 61.951392


 20%|██        | 2/10 [00:34<02:16, 17.08s/it]

----------------step 1----------------
predict error: 0.204791
confidence error: 0.346337
regularization: 41.705697
total loss: 42.052034


 30%|███       | 3/10 [00:51<01:59, 17.09s/it]

----------------step 2----------------
predict error: 0.183479
confidence error: 0.284980
regularization: 24.436931
total loss: 24.721911


 40%|████      | 4/10 [01:08<01:43, 17.17s/it]

----------------step 3----------------
predict error: 0.183041
confidence error: 0.270308
regularization: 20.359363
total loss: 20.629672


 50%|█████     | 5/10 [01:25<01:25, 17.11s/it]

----------------step 4----------------
predict error: 0.184066
confidence error: 0.264862
regularization: 19.003535
total loss: 19.268396


 60%|██████    | 6/10 [01:42<01:08, 17.09s/it]

----------------step 5----------------
predict error: 0.184955
confidence error: 0.262373
regularization: 18.352146
total loss: 18.614519


 70%|███████   | 7/10 [01:59<00:51, 17.06s/it]

----------------step 6----------------
predict error: 0.185648
confidence error: 0.261109
regularization: 17.963514
total loss: 18.224623


 80%|████████  | 8/10 [02:16<00:34, 17.03s/it]

----------------step 7----------------
predict error: 0.186194
confidence error: 0.260424
regularization: 17.704544
total loss: 17.964968


 90%|█████████ | 9/10 [02:33<00:17, 17.07s/it]

----------------step 8----------------
predict error: 0.186628
confidence error: 0.260035
regularization: 17.521659
total loss: 17.781693


100%|██████████| 10/10 [02:50<00:00, 17.09s/it]

----------------step 9----------------
predict error: 0.186977
confidence error: 0.259804
regularization: 17.388006
total loss: 17.647810





In [12]:
def eval_hit(X, y, df, test, user_id, item_ids, top_k):
    df = pd.concat([df, test])
    items = list(set(item_ids) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])
    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = np.dot(X[user_id], Y[items].squeeze(1).T)
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    if items[-1][0] in top_k:
            return 1
    return 0

def eval_NDCG(X, Y, df, test, user_id, item_ids, top_k):
    df = pd.concat([df, test])
    items = list(set(item_ids) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])
    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = np.dot(X[user_id], Y[items].squeeze(1).T)
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    for i, item in enumerate(top_k, 1):
        if item == test[test['userId']==user_id]['movieId'].values:
            return np.log(i) / np.log(i+2)
    return 0

def eval_hit_wrapper(X, Y, df, test, item_ids, top_k):
    def f(user_id):
        return eval_hit(X, Y, df, test, user_id, item_ids, top_k)
    return f

def eval_NDCG_wrapper(X, Y, df, test, item_ids, top_k):
    def f(user_id):
        return eval_NDCG(X, Y, df, test, user_id, item_ids, top_k)
    return f

In [13]:
hits10 = list(map(eval_hit_wrapper(X, Y, train, test, uiid, 10), uuid))
print(sum(hits10)/len(hits10))

0.6935312831389183


In [14]:
ndcg10 = list(map(eval_NDCG_wrapper(X, Y, train, test, uiid, 10), uuid))
print(sum(ndcg10)/len(ndcg10))

0.39732657914694164
