In [1]:
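# Treat each user as a "sentence"
# and each movie as a "word", then learn embeddings from them
# Word2Vec negative sampling
# Recommend movies that are close, in the embedding space, to the movies the user has already watched
# Can be applied to a new user without rebuilding the model (cold start)
# Performance is lower than expected, so how best to apply this approach needs further investigation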

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os,sys,inspect
import gc
from tqdm import tqdm
import random

# Put the parent of the current directory at the front of sys.path, presumably so
# that the project modules imported right after this (`load`, `evals`) resolve.
# NOTE(review): inside a notebook the current frame has no real source file, so
# `currentdir` most likely falls back to the kernel's working directory — confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from load import *
from evals import *

import warnings
warnings.filterwarnings('ignore')

In [3]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

from sklearn.metrics.pairwise import cosine_similarity

# Seed every RNG used in this notebook (NumPy, TensorFlow, stdlib `random`).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only affects processes spawned from this kernel, not the kernel itself.
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU;
        # enabling it on the first device only fails on multi-GPU machines.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs are initialized.
        print(e)
        
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``."""
    gate = tf.math.tanh(tf.math.softplus(x))
    return x * gate

def leakyrelu(x, factor=0.2):
    """Leaky ReLU written as the maximum of ``x`` and ``factor * x``.

    Matches the usual leaky ReLU as long as ``0 <= factor <= 1``.
    """
    scaled = factor * x
    return tf.maximum(x, scaled)

## Load

In [4]:
# Load the MovieLens-100k interaction file through the project helper `load_data`
# (star-imported above). NOTE(review): the meaning of `threshold=0` is defined by
# that helper and is not visible here — confirm.
df = load_data('../data/ml-100k/u.data', threshold=0)
# Distinct user ids and movie ids present in the loaded frame.
# NOTE(review): the name `uuid` shadows the stdlib module of the same name.
uuid = df['userId'].unique()
uiid = df['movieId'].unique()


In [5]:
# Row labels of `df` to hold out; the next cell uses them with df.drop / df.loc to
# form the train/test split. NOTE(review): presumably one interaction per user
# (the progress bar runs over 943 items) — confirm against `extract_from_df`.
rtd = extract_from_df(df, 1, 0)

100%|██████████| 943/943 [00:02<00:00, 433.14it/s]


In [6]:
# Hold-out split: the rows labelled by `rtd` become the test set, all others train.
train = df.drop(rtd)
test = df.loc[rtd]

In [7]:
# User x movie matrix built from the training interactions only; cells without an
# interaction are filled with 0. pivot_table aggregates duplicates with its
# default aggfunc (mean).
u_i = pd.pivot_table(train, index='userId', columns='movieId', values='rating').fillna(0)
u_i

movieId,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# One "sentence" per user: the movieIds that user interacted with in `train`.
# np.argwhere yields column *positions* in the pivot table, so map them back to the
# column labels (the actual movieIds). Later cells look movies up by movieId, and
# positions only coincide with ids when no movieId is missing from the columns.
movie_ids = u_i.columns.values
groups = [
    list(movie_ids[np.argwhere(user_row).flatten()])
    for user_row in u_i.values
]

## Gensim

In [17]:
from gensim.models import Word2Vec

# Skip-gram (sg=1) Word2Vec with 5 negative samples and 32-dimensional vectors.
# min_count=1 keeps every movie in the vocabulary, however rarely it occurs.
# The corpus is deliberately NOT passed here: doing so would build the vocabulary
# and train with default settings, and the explicit build_vocab() call that
# follows re-initializes the weights anyway, so that work would be thrown away.
# (It also avoids np.array(groups), which raises on ragged lists in NumPy >= 1.24.)
model = Word2Vec(
      vector_size = 32,
      window=10,
      min_count=1,
      sg=1,
      negative=5)

In [18]:
# Build the vocabulary from the per-user movie lists. The plain list of lists is
# passed as-is: wrapping ragged lists in np.array raises ValueError on NumPy >= 1.24,
# and gensim only needs an iterable of token lists.
model.build_vocab(groups)

In [19]:
# Train for 100 passes over the per-user movie lists. The list of lists is passed
# directly (np.array on ragged lists raises ValueError on NumPy >= 1.24).
# compute_loss=True makes gensim keep track of the training loss.
model.train(groups,
         total_examples = model.corpus_count,
         epochs=100,
         compute_loss=True)

(9564495, 9905700)

In [20]:
# Stack the learned vectors in vocabulary order, so row r of the matrix belongs to
# the key whose model.wv.key_to_index value is r.
vocab_keys = list(model.wv.key_to_index)
embedding_matrix = model.wv[vocab_keys]
embedding_matrix.shape

(1682, 32)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def get_average(user_id, model=model, embedding=embedding_matrix):
    """Return the mean embedding of the movies `user_id` has in `train`.

    Parameters
    ----------
    user_id : value compared against ``train['userId']``.
    model : trained Word2Vec model; ``model.wv.key_to_index`` maps a movieId to
        its row in `embedding`.
    embedding : 2-D array of movie vectors, one row per vocabulary entry.

    Returns
    -------
    1-D array: the column-wise mean of the user's movie vectors. If the user has
    no rows in `train` the mean of an empty selection (NaNs) is returned.

    Raises
    ------
    KeyError if one of the user's movies is not in the model vocabulary.
    """
    seen_movies = train[train['userId']==user_id]['movieId'].values
    rows = [model.wv.key_to_index[movie] for movie in seen_movies]

    # Use the `embedding` argument (previously ignored in favour of the global
    # embedding_matrix) so that a caller-supplied matrix is actually honoured.
    return np.mean(embedding[rows], 0)

def top_n(user_id, k=10, uiid=uiid, model=model):
    """Return the `k` unseen movies most similar to the user's average embedding.

    Parameters
    ----------
    user_id : value compared against ``train['userId']``.
    k : maximum number of movie ids to return.
    uiid : iterable of all candidate movie ids.
    model : trained Word2Vec model used to map a movieId to its embedding row.

    Returns
    -------
    np.ndarray of at most `k` movie ids, ordered by decreasing cosine similarity
    between the movie vector and the user's mean vector. Empty if the user has
    already seen every candidate.
    """
    seen_movies = train[train['userId']==user_id]['movieId'].values
    unseen_movies = list(set(uiid) - set(seen_movies))
    if not unseen_movies:
        # Nothing left to recommend; cosine_similarity would reject an empty matrix.
        return np.array(unseen_movies)

    # Forward `model` so the user vector and the candidate rows come from the same model.
    user_vec = get_average(user_id, model=model)

    rows = [model.wv.key_to_index[movie] for movie in unseen_movies]

    # Score all candidates in a single call instead of once per candidate inside
    # the sort key.
    scores = cosine_similarity(embedding_matrix[rows], [user_vec])[:, 0]

    # sorted(..., reverse=True) is stable, so ties keep the candidates' original order.
    ranked = sorted(range(len(unseen_movies)), key=scores.__getitem__, reverse=True)
    return np.array([unseen_movies[pos] for pos in ranked[:k]])

In [22]:
# Hit rate @ 10: fraction of held-out (user, movie) rows whose movie appears in
# that user's top-10 recommendations.
cnt = 0
# Iterate over the row array once; `test.values` was previously rebuilt on every
# iteration. Rows are unpacked positionally as (user, item, <ignored third column>).
for user, item, _ in test.values:
    pred = top_n(user, 10)
    if item in pred:
        cnt += 1

cnt / len(test)

0.14528101802757157