In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import time
import sys
import pickle
import random
import scipy.signal as sp_signal
import pandas as pd
from datetime import datetime
import re

from numpy import dot
from numpy.linalg import norm

import hiddenlayer as hl
%matplotlib inline

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler
from torch.autograd import Variable
from torchvision import transforms

In [3]:
class NCF(nn.Module):
    def __init__(self, config):
        super(NCF, self).__init__()
        self.config = config
        self.user_dim = config['nUsers']
        self.movie_dim = config['nMovies']
        self.genres_dim = config['nGenres']
        
        self.g_latent_dim = config['genre_latent_dim']
        self.u_latent_dim = config['user_latent_dim']
        self.i_latent_dim = config['item_latent_dim']
        self.query_dim = config['Query_latent_dim']
        
        self.user_embedding = nn.Embedding(self.user_dim, self.u_latent_dim, 
                                           max_norm=1, scale_grad_by_freq = True)
        
        self.movie_embedding = nn.Embedding(self.movie_dim, self.i_latent_dim,
                                           max_norm=1, scale_grad_by_freq = True)

        self.genres_layers = nn.Sequential(
                                nn.Linear(self.genres_dim, self.g_latent_dim)
        )
        
        self.fc_layers = nn.Sequential(
                            nn.Linear(self.u_latent_dim + self.g_latent_dim + self.query_dim + 2, self.u_latent_dim*2),
                            nn.Tanh(),
                            nn.Linear(self.u_latent_dim*2,self.u_latent_dim),
                            nn.Tanh(),
                            nn.Linear(self.u_latent_dim,self.u_latent_dim),
                            nn.Tanh())
        
        self.cos_sim = nn.CosineSimilarity(dim=2, eps=1e-6)
        
    def forward(self, user_id, movie_id, genres, em_query, weekend, hr):
        
        user_em = self.user_embedding(user_id)
        movie_em = self.movie_embedding(movie_id)
        
        genres_em = self.genres_layers(genres).mean(dim=1, keepdim=True)
        
        total_em = torch.cat((user_em, genres_em, em_query, weekend.unsqueeze(dim=1), hr.unsqueeze(dim=1)), dim=2)
        final_em = self.fc_layers(total_em)
        return self.cos_sim(user_em,movie_em)*5, self.cos_sim(final_em,movie_em)*5
        

In [4]:
config = {'nUsers': 4380, 'nMovies': 3868, 
          'nGenres': 22,
          'user_latent_dim': 128, 'item_latent_dim': 128,
          'genre_latent_dim': 32, 'Query_latent_dim': 64
         }
print(config)

{'nUsers': 4380, 'nMovies': 3868, 'nGenres': 22, 'user_latent_dim': 128, 'item_latent_dim': 128, 'genre_latent_dim': 32, 'Query_latent_dim': 64}


In [5]:
CF = NCF(config)
CF.load_state_dict(torch.load('./../data/processed_data/cfmodel',map_location=torch.device('cpu')))
CF.eval()

NCF(
  (user_embedding): Embedding(4380, 128, max_norm=1, scale_grad_by_freq=True)
  (movie_embedding): Embedding(3868, 128, max_norm=1, scale_grad_by_freq=True)
  (genres_layers): Sequential(
    (0): Linear(in_features=22, out_features=32, bias=True)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=226, out_features=256, bias=True)
    (1): Tanh()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): Tanh()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): Tanh()
  )
  (cos_sim): CosineSimilarity()
)

In [6]:
movie_embedding = CF.movie_embedding.weight.data.detach().numpy()
user_embedding = CF.user_embedding.weight.data.detach().numpy()

In [7]:
genres_weight = CF.genres_layers[0].weight.data.detach().numpy()
genres_bias = CF.genres_layers[0].bias.data.detach().numpy()

In [8]:
fc=[{},{},{}]
for i in range(3):
    fc[i]['weight'] = CF.fc_layers[i*2].weight.data.detach().numpy()
    fc[i]['bias'] = CF.fc_layers[i*2].bias.data.detach().numpy()
    print(i, fc[i]['weight'].shape)

0 (256, 226)
1 (128, 256)
2 (128, 128)


In [9]:
model_weights = {'genres_weight':genres_weight,
                'genres_bias':genres_bias,
                'fc':fc
                }

In [10]:
genres_list= ['horror', 'fantasy', 'adventure', 'western', 'action', 'war', 'animation', 'comedy', 'family', 'mystery', 'movie', 'documentary', 'fiction', 'romance', 'music', 'history', 'drama', 'science', 'thriller', 'crime', 'tv', 'foreign']
genres_dict = dict(zip(genres_list, range(len(genres_list))))

model_weights['genres_list'] = genres_list
model_weights['genres_dict'] = genres_dict


In [11]:
with open('./../data/processed_data/keyword_embedding.pkl', 'rb') as f:
    kw_embedding = pickle.load(f)

In [12]:
kw_set = set(kw_embedding.keys())
kw_default_embedding = np.zeros(64)
n=0
for key in kw_embedding:
    kw_default_embedding += kw_embedding[key]
    n+=1

kw_default_embedding/=n

In [13]:
model_weights['kw_embedding'] = kw_embedding
model_weights['kw_default_embedding'] = kw_default_embedding
model_weights['kw_set'] = kw_set

In [14]:
with open('./../data/processed_data/model_weights','wb') as f:
    pickle.dump(model_weights,f)

In [15]:
def get_query_vector(query, model_weights):
    kws = [kw.lower() for kw in query.split(' ')]
    n = 0
    res = np.zeros(64)
    for kw in kws:
        if kw in model_weights['kw_set']:
            res+=model_weights['kw_embedding'][kw]
            n+=1
    if n==0:
        return model_weights['kw_default_embedding']
    else:
        return res/n

In [16]:
def get_genre_vector(genres, model_weights):
    v = np.zeros((22,1))
    for g in genres:
        v[model_weights['genres_dict'][g]]=1
    
    return model_weights['genres_weight'].dot(v).squeeze(1)+model_weights['genres_bias']

In [17]:
def compute_final_embedding(model_weights, query, user_embedding, genres, isWeekend, hr):
    query_embedding = get_query_vector(query, model_weights)
    genres_embedding = get_genre_vector(genres, model_weights)
    
    x = np.concatenate((user_embedding, 
                        genres_embedding,
                        query_embedding,
                        np.array([isWeekend,hr])))
    
    for fc in model_weights['fc']:
        x = fc['weight'].dot(x)+fc['bias']
        x = np.tanh(x)
    return x

In [18]:
genres_list

['horror',
 'fantasy',
 'adventure',
 'western',
 'action',
 'war',
 'animation',
 'comedy',
 'family',
 'mystery',
 'movie',
 'documentary',
 'fiction',
 'romance',
 'music',
 'history',
 'drama',
 'science',
 'thriller',
 'crime',
 'tv',
 'foreign']

In [19]:
movie_df = pd.read_csv(os.path.join('./../data/processed_data/','keywords_all.csv'))

In [23]:
querys = ''
genres = ['family','movie','animation']

In [24]:
a = compute_final_embedding(model_weights, querys, np.zeros(128),genres, 1, 12)

In [25]:
score = dot(movie_embedding,a)
movie_id = np.argsort(score)[::-1][:50]
movie_df.iloc[list(movie_id)]

Unnamed: 0.1,Unnamed: 0,title,movieId,qmdbId,tmdbId,year,genre_kws,kws_all
281,281,"Princess Bride, The",935,281,2493,1987,"['comedy', 'adventure', 'family', 'romance', '...",reference a man happy versus charming imperson...
317,317,Back to the Future,1005,317,105,1985,"['comedy', 'fiction', 'science', 'adventure', ...",robert lloyd discoveries escapade a and sci fi...
781,781,"Iron Giant, The",2202,781,10386,1999,"['fiction', 'science', 'adventure', 'family', ...",atomic mother a bird sitting and mother son te...
315,315,Groundhog Day,1000,315,137,1993,"['comedy', 'drama', 'fantasy', 'romance']",100 a stephen ramis character sculpting paquet...
1943,1943,"Polar Express, The",6187,1944,5255,2004,"['fantasy', 'adventure', 'family', 'animation']",robert leslie animation post bibliothek bell s...
2475,2475,Resident Evil: Extinction,7692,2476,7737,2007,"['action', 'fiction', 'science', 'horror']",conspiracy russell clearplay zombies r evil ma...
303,303,"Right Stuff, The",967,303,9549,1983,"['drama', 'history']",100 kaufman long space u s s barrier explorati...
2079,2079,"Perfect Man, The",6898,2080,15648,2005,"['comedy', 'family', 'drama', 'romance']",man woman mother o malley heather man admirer ...
2393,2393,Teen Wolf Too,7506,2394,15582,1987,"['comedy', 'family', 'fantasy']",jason christopher leitch teenager leitch betam...
0,0,Toy Story,0,0,862,1995,"['comedy', 'family', 'animation']",jealousy dolls next and tim star kid computer ...
