In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import time
import sys
import pickle
import random
import scipy.signal as sp_signal
import pandas as pd
from datetime import datetime
import re

from numpy import dot
from numpy.linalg import norm

import hiddenlayer as hl
%matplotlib inline

In [2]:
import torch
from torch import nn
# from torch.utils.data import Dataset, DataLoader, TensorDataset
# from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler
# from torch.autograd import Variable
# from torchvision import transforms

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print("Device is ", device)

Device is  cpu


In [4]:
class NCF(nn.Module):
    """Collaborative-filtering network producing a relevance and a rating score.

    ``config`` must provide ``nUsers``, ``nMovies``, ``nGenres``,
    ``user_latent_dim``, ``item_latent_dim`` and ``genre_latent_dim``.
    An optional ``query_latent_dim`` gives the width of the external query
    embedding passed to :meth:`forward`.  It defaults to 64, the value that
    used to be hard-coded, so existing configs and saved state dicts keep
    the exact same layer shapes.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.user_dim = config['nUsers']
        self.movie_dim = config['nMovies']
        self.genres_dim = config['nGenres']

        self.g_latent_dim = config['genre_latent_dim']
        self.u_latent_dim = config['user_latent_dim']
        self.i_latent_dim = config['item_latent_dim']
        # Width of the query embedding; previously the literal 64 was baked
        # into both the first relevance layer and the reshape in forward().
        self.q_latent_dim = config.get('query_latent_dim', 64)
        # Input width of the relevance MLP: movie + genre + query embeddings.
        self.relevance_in_dim = self.g_latent_dim + self.i_latent_dim + self.q_latent_dim

        # Embedding rows are renormalised to norm <= 1 when looked up (max_norm).
        self.user_embedding = nn.Embedding(self.user_dim, self.u_latent_dim,
                                           max_norm=1, scale_grad_by_freq=True)

        self.movie_embedding = nn.Embedding(self.movie_dim, self.i_latent_dim,
                                            max_norm=1, scale_grad_by_freq=True)

        # Single linear projection of the genre vector into the latent space.
        self.genres_layers = nn.Sequential(
            nn.Linear(self.genres_dim, self.g_latent_dim)
        )

        # MLP mapping [movie, genre, query] embeddings to a score in (0, 1).
        self.fc_layers_relevance = nn.Sequential(
            nn.Linear(self.relevance_in_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # Not used by forward(); kept so the module layout is unchanged.
        self.cos_sim = nn.CosineSimilarity(dim=2, eps=1e-6)

    def forward(self, user_id, movie_id, genres, em_query):
        """Compute the relevance and rating scores for a batch.

        Args:
            user_id: integer tensor of user indices.
            movie_id: integer tensor of movie indices.
            genres: float tensor whose last dimension is ``nGenres``.
            em_query: float tensor whose last dimension is the query width.

        The embeddings are concatenated along ``dim=2`` and multiplied with
        ``torch.bmm``, so all four inputs must yield 3-D tensors
        (presumably ``(batch, 1, dim)`` -- TODO confirm against the
        training code).  The element-wise sum and the ``bmm`` additionally
        require the user, genre and query widths to match
        ``item_latent_dim``.

        Returns:
            ``(relevance_score, rating_score)``, each reshaped to ``(-1, 1)``.
        """
        user_em = self.user_embedding(user_id)
        movie_em = self.movie_embedding(movie_id)
        genres_em = self.genres_layers(genres)

        # Query-side representation used for the dot-product rating score.
        final_query_em = user_em + genres_em + em_query

        relevance_em = torch.cat((movie_em, genres_em, em_query), dim=2)
        relevance_em = relevance_em.view(-1, self.relevance_in_dim)

        relevance_score = self.fc_layers_relevance(relevance_em)

        # Batched dot product between the query-side vector and the movie vector.
        rating_score = torch.bmm(final_query_em, movie_em.view(-1, self.i_latent_dim, 1))

        return relevance_score.view(-1, 1), rating_score.view(-1, 1)

In [5]:
# Model sizes: vocabulary sizes for users/movies/genres and the width of
# each latent space. Key order is kept so the printed dict is unchanged.
config = dict(
    nUsers=4380,
    nMovies=3868,
    nGenres=22,
    user_latent_dim=64,
    item_latent_dim=64,
    genre_latent_dim=64,
)
print(config)


{'nUsers': 4380, 'nMovies': 3868, 'nGenres': 22, 'user_latent_dim': 64, 'item_latent_dim': 64, 'genre_latent_dim': 64}


In [6]:
# Build the network, restore the saved v04 state dict (tensors are mapped to
# CPU storage while loading) and switch the model to evaluation mode.
CF = NCF(config).to(device)
cf_state_dict = torch.load('./../data/processed_data/cfmodel_v04',
                           map_location=torch.device('cpu'))
CF.load_state_dict(cf_state_dict)
CF.eval()

NCF(
  (user_embedding): Embedding(4380, 64, max_norm=1, scale_grad_by_freq=True)
  (movie_embedding): Embedding(3868, 64, max_norm=1, scale_grad_by_freq=True)
  (genres_layers): Sequential(
    (0): Linear(in_features=22, out_features=64, bias=True)
  )
  (fc_layers_relevance): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Sigmoid()
  )
  (cos_sim): CosineSimilarity()
)

In [7]:
# Export the embedding tables of the loaded model as NumPy arrays.
user_embedding = CF.user_embedding.weight.detach().cpu().numpy()
movie_embedding = CF.movie_embedding.weight.detach().cpu().numpy()
# Averaging over axis 0 (the user axis) gives a single "average user" vector.
ave_User_Em = np.mean(user_embedding, axis=0)

In [8]:
# Weight matrix and bias of the single linear layer that projects genres.
_genre_linear = CF.genres_layers[0]
genres_weight = _genre_linear.weight.detach().cpu().numpy()
genres_bias = _genre_linear.bias.detach().cpu().numpy()

In [9]:
# Collect weight/bias of the three Linear layers of the relevance MLP.
# They sit at even positions (0, 2, 4); the odd positions are activations.
fc_relevance = []
for ilayer in range(3):
    linear = CF.fc_layers_relevance[2 * ilayer]
    layer_params = {
        'weight': linear.weight.detach().cpu().numpy(),
        'bias': linear.bias.detach().cpu().numpy(),
    }
    fc_relevance.append(layer_params)
    print(layer_params['weight'].shape)

(64, 192)
(64, 64)
(1, 64)


In [10]:
# Two-way lookup between the first and second column of ind2qmdb.csv
# (presumably embedding row index <-> qmdb id, judging by the names).
ind_qmdb_table = pd.read_csv('./../data/processed_data/ind2qmdb.csv')
first_col = ind_qmdb_table.iloc[:, 0]
second_col = ind_qmdb_table.iloc[:, 1]
ind2qmdbId = dict(zip(first_col, second_col))
qmdbId2ind = dict(zip(second_col, first_col))

In [11]:
# Load the keyword -> embedding mapping.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
kw_embedding_path = './../data/processed_data/keyword_embedding.pkl'
with open(kw_embedding_path, 'rb') as kw_file:
    kw_embedding = pickle.load(kw_file)

In [12]:
# Known keywords, plus the mean of all keyword embeddings.
kw_set = set(kw_embedding)
n = len(kw_embedding)

# Accumulate in dict order so the floating-point result is unchanged.
kw_default_embedding = np.zeros(64)
for kw_vector in kw_embedding.values():
    kw_default_embedding += kw_vector
kw_default_embedding /= n

In [13]:
# Genre vocabulary; a genre's position in the list is its index in genres_dict.
genres_list = [
    'tv', 'movie', 'fantasy', 'documentary', 'comedy', 'drama',
    'mystery', 'horror', 'science', 'family', 'crime', 'history',
    'action', 'adventure', 'fiction', 'music', 'war', 'animation',
    'western', 'thriller', 'foreign', 'romance',
]
genres_dict = {genre: position for position, genre in enumerate(genres_list)}

# Bundle everything the NumPy-only scoring functions read from, so a single
# object can be pickled and passed around.
model_weights = dict(
    genres_weight=genres_weight,
    genres_bias=genres_bias,
    genres_list=genres_list,
    genres_dict=genres_dict,
    movie_embedding=movie_embedding,
    ave_user_embedding=ave_User_Em,
    fc_relevance=fc_relevance,
    kw_embedding=kw_embedding,
    kw_default_embedding=kw_default_embedding,
    kw_set=kw_set,
    ind2qmdbId=ind2qmdbId,
    qmdbId2ind=qmdbId2ind,
)

In [14]:
# Both directions of the index <-> id mapping, bundled for separate export.
indmap = dict(ind2qmdbId=ind2qmdbId, qmdbId2ind=qmdbId2ind)

In [15]:
# Persist the exported artefacts, one pickle file per object.
_exports = (
    ('./../data/processed_data/model_weights_v04', model_weights),
    ('./../data/processed_data/ave_User_Em_v04', ave_User_Em),
    ('./../data/processed_data/movie_embedding_v04', movie_embedding),
    ('./../data/processed_data/indmap', indmap),
)
for export_path, payload in _exports:
    with open(export_path, 'wb') as f:
        pickle.dump(payload, f)

In [16]:
def get_query_vector(query, model_weights):
    """Average the embeddings of the known keywords found in ``query``.

    Args:
        query: free-text string; it is split on single spaces and each
            token is lower-cased before lookup.
        model_weights: mapping with ``'kw_set'`` (known keywords),
            ``'kw_embedding'`` (keyword -> vector) and
            ``'kw_default_embedding'`` (fallback vector).

    Returns:
        The mean of the matched keyword vectors, or
        ``model_weights['kw_default_embedding']`` when no token is known.

    The accumulator is sized from the first matched vector instead of a
    hard-coded 64, so keyword embeddings of any width work.
    """
    known_keywords = model_weights['kw_set']
    total = None
    n = 0
    for token in query.split(' '):
        kw = token.lower()
        if kw not in known_keywords:
            continue
        print(kw)  # echo each matched keyword, as the notebook output shows
        vector = model_weights['kw_embedding'][kw]
        if total is None:
            # float64 accumulator with the same shape as the embeddings
            total = np.zeros(np.shape(vector))
        total += vector
        n += 1

    if n == 0:
        return model_weights['kw_default_embedding']
    return total / n

def get_genre_vector(genres, model_weights):
    """Project a set of genre names through the exported genre linear layer.

    Args:
        genres: iterable of genre names; names missing from
            ``model_weights['genres_dict']`` are ignored.  A single string
            is treated as one genre rather than iterated per character.
        model_weights: mapping with ``'genres_dict'`` (name -> column index),
            ``'genres_weight'`` (2-D array, one column per genre) and
            ``'genres_bias'``.

    Returns:
        1-D array ``genres_weight @ multi_hot + genres_bias``.

    The multi-hot length comes from the weight matrix instead of a
    hard-coded 22, so other genre vocabularies work too.
    """
    if isinstance(genres, str):
        genres = [genres]

    weight = model_weights['genres_weight']
    genre_index = model_weights['genres_dict']

    multi_hot = np.zeros(weight.shape[1])
    for g in genres:
        if g in genre_index:
            multi_hot[genre_index[g]] = 1

    return weight.dot(multi_hot) + model_weights['genres_bias']

In [17]:
def predict(model_weights, query, user_embedding, genres):
    """Score every movie for a text query, a user vector and a genre filter.

    Args:
        model_weights: mapping with at least ``'movie_embedding'`` (2-D
            array, one row per movie) and ``'fc_relevance'`` (list of
            ``{'weight', 'bias'}`` dicts for the relevance MLP), plus the
            keys read by ``get_query_vector`` and ``get_genre_vector``.
        query: free-text query string.
        user_embedding: 1-D user vector added to the genre and query vectors.
        genres: iterable of genre names.

    Returns:
        ``(relevance_score, rating_score)``, two 1-D arrays with one entry
        per movie.

    Earlier this function ignored ``user_embedding`` and ``genres`` and read
    the notebook globals ``ave_User_Em`` and ``genres_q`` instead; it now
    uses its arguments, so it works outside this notebook's kernel state.
    """
    query_embedding = get_query_vector(query, model_weights)
    genres_embedding = get_genre_vector(genres, model_weights)

    # Query-side vector for the dot-product rating score.
    final_query_em = user_embedding + genres_embedding + query_embedding

    movie_em = model_weights['movie_embedding']
    nM = movie_em.shape[0]
    # One input row per movie: [movie, genre, query]. The genre and query
    # vectors are identical for every movie, so they are repeated nM times.
    relevance_em = np.concatenate(
        (movie_em,
         genres_embedding.reshape(1, -1).repeat(nM, axis=0),
         query_embedding.reshape(1, -1).repeat(nM, axis=0)),
        axis=1)

    # NumPy forward pass of the relevance MLP: tanh on the hidden layers,
    # sigmoid on the last one.
    layers = model_weights['fc_relevance']
    last_layer = len(layers) - 1
    for i, fc in enumerate(layers):
        relevance_em = relevance_em.dot(fc['weight'].T) + fc['bias']
        if i < last_layer:
            relevance_em = np.tanh(relevance_em)
        else:
            relevance_em = 1. / (1. + np.exp(-relevance_em))

    relevance_score = relevance_em.squeeze(axis=1)

    rating_score = final_query_em.dot(movie_em.T)

    # Shape diagnostics, kept so the cell output stays the same.
    print(final_query_em.shape)
    print(movie_em.T.shape)
    print(rating_score.shape)

    return relevance_score, rating_score

In [49]:
# Example request: free-text query plus the genres to condition on.
genres_q = ["family", "animation"]
query = "john lasseter"

In [50]:
# Score all movies for the example request using the average user vector.
rel, rat = predict(
    model_weights=model_weights,
    query=query,
    user_embedding=ave_User_Em,
    genres=genres_q,
)

john
lasseter
(64,)
(64, 3868)
(3868,)


In [52]:
# Combined ranking signal: relevance and rating scores, equally weighted.
final_score = np.add(rel, rat)

In [53]:
# Indices of the 30 highest combined scores, best first.
ascending_order = np.argsort(final_score)
ind_combined = ascending_order[::-1][:30]

In [54]:
# Row 0: relevance scores, row 1: rating scores of the top combined hits.
top_relevance = rel[ind_combined]
top_rating = rat[ind_combined]
pd.DataFrame([top_relevance, top_rating])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.999672,0.905575,0.989556,0.999056,0.923518,0.970899,0.971915,0.94886,0.99448,0.999745,...,0.90286,0.995492,0.999962,0.987424,0.800687,0.907888,0.955933,0.990811,0.967741,0.795983
1,0.632045,0.717348,0.565383,0.508932,0.577643,0.510057,0.50712,0.527522,0.481013,0.444441,...,0.40077,0.306326,0.300154,0.303642,0.458357,0.339227,0.284136,0.249104,0.233861,0.387042


In [55]:
# `movie_df` is otherwise only created by a cell further down the notebook,
# so on a fresh kernel (Restart & Run All) this cell raised NameError.
# Load the metadata here so the cell runs in top-to-bottom order.
movie_df = pd.read_csv(os.path.join('./../data/processed_data/','keywords_all.csv'))

# Metadata rows of the top combined-score movies, best first.
movie_df.iloc[list(ind_combined)]

Unnamed: 0.1,Unnamed: 0,title,movieId,qmdbId,tmdbId,year,genre_kws,kws_all
3142,3142,Winnie the Pooh,8962,3143,51162,2011,"['family', 'animation']",oppl duringcreditsstinger hunger stephen bd vi...
3039,3039,Tangled,8768,3040,38757,2010,"['family', 'animation']",runner byron selfishness mother animal chamele...
3478,3478,Turbo,9564,3479,77950,2013,"['family', 'animation']",animal car pitjor racing super animals ryan po...
929,929,"Tigger Movie, The",2633,929,15655,2000,"['family', 'animation']",animation woman tigger animals disney the anim...
2619,2619,Space Chimps,7972,2620,11802,2008,"['family', 'animation']",daniels jeff hines kirk aftercreditsstinger ch...
3119,3119,Kung Fu Panda 2,8930,3120,49444,2011,"['family', 'animation']",animation woman much yuh jennifer panda black ...
1658,1658,Rugrats Go Wild!,4986,1659,20694,2003,"['family', 'animation']",rugrats a daily and island jungle kids wildlif...
3402,3402,"Batman: The Dark Knight Returns, Part 2",9417,3403,142061,2013,"['action', 'animation']",comics frank animation peter oliva super retur...
2027,2027,Pooh's Heffalump Movie,6749,2028,13682,2005,"['family', 'animation']",animation jimmy jim kath music sansom cute ani...
1002,1002,Dinosaur,2880,1002,10567,2000,"['family', 'animation']",animation alfre cataclysm prehistoric lemur an...


In [134]:
# Two-stage ranking: keep the 100 most relevant movies, then order that
# shortlist by rating score, highest first.
ind_rel = np.argsort(rel)[::-1][:100]
rating_order = np.argsort(rat[ind_rel])[::-1]
ind = list(ind_rel[rating_order])

In [135]:
# Peek at the first ten components of the average user embedding.
print(ave_User_Em[0:10])

[-0.01720215  0.00289822 -0.00434898  0.03746572  0.04820991  0.00210612
  0.00730949  0.01476944 -0.04041719  0.01017506]


In [136]:
# First ten movie indices of the re-ranked shortlist.
print(ind[0:10])

[3039, 2619, 3462, 3146, 2708, 1650, 3803, 2450, 2445, 1821]


In [25]:
# Movie metadata table (title, ids, year, genre and keyword columns).
keywords_csv_path = os.path.join('./../data/processed_data/', 'keywords_all.csv')
movie_df = pd.read_csv(keywords_csv_path)

In [138]:
# Metadata rows for the relevance-filtered, rating-ordered shortlist.
reranked_rows = list(ind)
movie_df.iloc[reranked_rows]

Unnamed: 0.1,Unnamed: 0,title,movieId,qmdbId,tmdbId,year,genre_kws,kws_all
3039,3039,Tangled,8768,3040,38757,2010,"['family', 'animation']",runner byron selfishness mother animal chamele...
2619,2619,Space Chimps,7972,2620,11802,2008,"['family', 'animation']",daniels jeff hines kirk aftercreditsstinger ch...
3462,3462,Monsters University,9531,3463,62211,2013,"['family', 'animation']",animation buscemi dan bd video billy college s...
3146,3146,Alpha and Omega,8968,3147,12819,2010,"['family', 'animation']",ranger wolf dennis alpha and forbidden omega t...
2708,2708,Ponyo (Gake no ue no Ponyo),8139,2709,12429,2008,"['family', 'animation']",tokoro animation gake mother pitjors doi for p...
1650,1650,Finding Nemo,4916,1651,12,2003,"['family', 'animation']",protective anxiety harbor orphaned ellen chara...
3803,3803,Hotel Transylvania 2,10282,3804,159824,2015,"['comedy', 'family', 'animation']",sandler man backpacker magic skeleton hilariou...
2450,2450,"Simpsons Movie, The",7637,2451,35,2007,"['comedy', 'family', 'animation']",pig a dan kavner toplist07 than credits penis ...
2445,2445,Arn: The Knight Templar (Arn - Tempelriddaren),7628,2446,13491,2007,"['action', 'adventure', 'drama', 'romance']",joakim peter a arn templars the erlend s fight...
1821,1821,Home on the Range,5589,1822,13700,2004,"['family', 'animation']",roseanne animal farm animals range disney rabb...
