In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.utils import to_categorical

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        print(e)
        
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

In [3]:
def load_data(filname):
    f = open(filname, 'r')
    fs = f.readlines()
    f.close()

    df = pd.DataFrame(list(map(lambda x: x.split('\t'), fs)), columns=['userId', 'movieId', 'rating', 'time'])
    df = df.drop('time', axis=1)
    df['userId'] = df['userId'].astype(int)
    df['movieId'] = df['movieId'].astype(int)
    df['rating'] = df['rating'].astype(int)
    
    df = df[['userId', 'movieId', 'rating']]
#     df['rating'] = 1.
    m_codes = df['movieId'].astype('category').cat.codes
    u_codes = df['userId'].astype('category').cat.codes
    df['movieId'] = m_codes
    df['userId'] = u_codes
    
    return df

def add_negative(df, times=4):
    df_ = df.copy()
    user_id = df['userId'].unique()
    item_id = df['movieId'].unique()
    
    for i in tqdm(user_id):
        cnt = 0
        n = len(df_[df_['userId']==i])
        n_negative = min(n*times, len(item_id)-n)
        available_negative = list(set(uiid) - set(df[df['userId']==i]['movieId'].values))
        
        new = np.random.choice(available_negative, n_negative, replace=False)
        new = [[i, j, 0] for j in new]
        df_ = df_.append(pd.DataFrame(new, columns=df.columns), ignore_index=True)
    
    return df_

def extract_from_df(df, n_positive, n_negative):
    df_ = df.copy()
    rtd = []
    
    user_id = df['userId'].unique()
    
    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==1]['movieId'].index, n_positive, replace=False))
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==0]['movieId'].index, n_negative, replace=False))
        
    return rtd

In [4]:
df = load_data('./data/ml-100k/u.data')
uuid = df['userId'].unique()
uiid = df['movieId'].unique()


In [5]:
um = pd.pivot_table(df, values='rating', index='userId', columns='movieId').fillna(0)
um

movieId,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.metrics import mean_squared_error
def rmse(true, pred):
    user_idx, item_idx = np.nonzero(true)
    trues = [true[i][j] for i, j in zip(user_idx, item_idx)]
    preds = [pred[i][j] for i, j in zip(user_idx, item_idx)]
    return np.sqrt(mean_squared_error(trues, preds))

def matrix_fatorization(M, k, epochs, lr=0.01):
    n_user, n_item = M.shape
    
    U = np.random.normal(0., 1./k, (n_user, k))
    V = np.random.normal(0., 1./k, (n_item, k))
    
    u_idx, i_idx = np.nonzero(M)
    
    for e in (range(epochs)):
        for i, j in zip(u_idx, i_idx):
            e_ij = M[i][j] - np.dot(U[i,:], V[j,:].T)
            
            U[i, :] = U[i, :] + lr*(e_ij*V[j, :] - 0.01*U[i,:])
            V[j, :] = V[j, :] + lr*(e_ij*U[i, :] - 0.01*V[j,:])
            
        recon = np.dot(U, V.T)
        print(f'epochs: {e}:', rmse(M, recon))
    return U, V.T

In [7]:
U, V = matrix_fatorization(um.values, 32, 200)

epochs: 0: 2.4909834259591515
epochs: 1: 1.1521076234524399
epochs: 2: 1.0104802173826586
epochs: 3: 0.9744747270383864
epochs: 4: 0.9523888834977259
epochs: 5: 0.933672329735635
epochs: 6: 0.9149085864038816
epochs: 7: 0.8957985882084453
epochs: 8: 0.876626285527219
epochs: 9: 0.8571589735339687
epochs: 10: 0.8371639130477156
epochs: 11: 0.816687931741886
epochs: 12: 0.7959163057388908
epochs: 13: 0.7750926235311938
epochs: 14: 0.7545171787081046
epochs: 15: 0.734515451110103
epochs: 16: 0.7153756339664444
epochs: 17: 0.6973041863213724
epochs: 18: 0.6804193445103907
epochs: 19: 0.6647678217673921
epochs: 20: 0.6503451888842676
epochs: 21: 0.6371115337717312
epochs: 22: 0.625002459837312
epochs: 23: 0.6139376607658721
epochs: 24: 0.6038283708653496
epochs: 25: 0.594583749004476
epochs: 26: 0.5861157812171405
epochs: 27: 0.5783424769335224
epochs: 28: 0.5711895082381769
epochs: 29: 0.5645906788919018
epochs: 30: 0.558487651295158
epochs: 31: 0.5528292816853979
epochs: 32: 0.54757080229

In [8]:
recon = np.dot(U, V)
rmse(um.values, recon)

0.42459980845419526

In [15]:
def get_best(record, U, V=V, top_k=10):
    prev = np.nonzero(record[0])[0]
    candidates = np.argsort(-np.dot(U, V))
    
    res = []
    cnt = 0
    for c in candidates:
        if c not in prev:
            res.append(c)
            cnt += 1
        if cnt == top_k:
            return res

In [16]:
get_best(um.values, U[0], V, 10)

[1072, 1112, 461, 646, 407, 420, 316, 740, 427, 488]