In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.core.common import SettingWithCopyWarning
import warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the three input tables: movie metadata, a ratings table that already
# carries movie attributes, and the raw user ratings.
df_movies = pd.read_csv('/kaggle/input/data-large/df_large.csv')
df_ratings = pd.read_csv('/kaggle/input/data-large/d_large.csv')
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')  # remove _small
# Never truncate cell contents when displaying frames. `None` is the supported
# "no limit" value; the old `-1` spelling has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [None]:
# Movie metadata frame: remove the `overview` column (it is not referenced
# anywhere below) and preview the first rows.
df_movies.drop(columns=['overview'], inplace=True)
df_movies.head()

In [None]:
# User ratings frame: expose the movie key as `MovieId` and keep only the
# columns listed below, in this order.
wanted_columns = ['userId', 'MovieId', 'title', 'genres', 'keywords', 'rating']
df_ratings = df_ratings.rename(columns={'id': 'MovieId'})[wanted_columns]
df_ratings.head()

In [None]:
# Alias the ratings frame as `d`. Note that `d` is rebound further down to the
# merge of `ratings` and `df_movies`, so this binding is only temporary.
d = df_ratings

In [None]:
# Report how many rows the raw ratings and the movie metadata contain.
n_ratings, n_movies = len(ratings), len(df_movies)
print("Size of ratings dataframe: ", n_ratings, "  Size of movies dataframe: ", n_movies)

In [None]:
from ast import literal_eval

# Parse each `genres` literal and keep at most its first three entries.
df_movies['genres'] = df_movies['genres'].map(lambda raw: literal_eval(raw)[:3])

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column written out with the CSV).
df_movies.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Split the (up to three) genres into separate columns.
# The expanded frame has as many columns as the longest genre list, so it is
# reindexed to exactly three columns: the assignment stays valid even when no
# movie has three genres, and missing genres are left empty (NaN/None).
genre_cols = ['genre1', 'genre2', 'genre3']
genres_wide = pd.DataFrame(df_movies['genres'].tolist(), index=df_movies.index)
df_movies[genre_cols] = genres_wide.reindex(columns=range(len(genre_cols)))
df_movies.head(2)

In [None]:
# Keep only movies that have both a first and a second genre, and report how
# many rows were removed.
rows_before = len(df_movies)
df_movies.dropna(subset=["genre1", "genre2"], inplace=True)
rows_after = len(df_movies)

print("Size of DataFrame after dropping movies that do not have 2 genre valus : ", rows_after)
print("Number of movies dropped: ", rows_before - rows_after)

In [None]:
# Integer code for every distinct first genre; codes follow the sorted order
# returned by np.unique.
genre1_list = np.unique(df_movies['genre1'])
g1_dict = dict(zip(genre1_list, range(len(genre1_list))))
g1_dict

In [None]:
# Integer code for every distinct second genre; codes follow the sorted order
# returned by np.unique.
genre2_list = np.unique(df_movies['genre2'])
g2_dict = dict(zip(genre2_list, range(len(genre2_list))))
g2_dict

In [None]:
# Replace the genre names with their integer codes.
# `Series.map` performs a direct dictionary lookup and yields a numeric column,
# whereas `DataFrame.replace` with a nested dict is slower and, depending on the
# pandas version, may leave the columns with object dtype.
# Every genre present was used to build the dictionaries, so no value is unmapped.
df_movies = df_movies.assign(
    genre1=df_movies['genre1'].map(g1_dict),
    genre2=df_movies['genre2'].map(g2_dict),
)
df_movies.head(2)

In [None]:
# Give the ratings' movie key the same name (`id`) as in `df_movies` so the two can be merged on it.
ratings = ratings.rename({'movieId': 'id'}, axis='columns')

In [None]:
# Join every rating with its movie metadata on the shared `id` column
# (default inner join: ratings without a matching movie are dropped).
d = ratings.merge(df_movies, on='id')
d.head(2)

In [None]:
# Release the raw ratings frame and remove merged columns that are not used below.
del ratings
d.drop(columns=['timestamp', 'popularity', 'release_date', 'actors', 'director'], inplace=True)

In [None]:
# Map the movie ids to contiguous values 0..n-1, as there are gaps between the raw ids.
t = {raw_id: idx for idx, raw_id in enumerate(np.unique(d['id']))}
d['id'] = d['id'].map(t)

In [None]:
# Shift user ids down by one so that they start from 0.
# Note: running this cell twice shifts the ids twice.
d['userId'] = d['userId'].sub(1)

In [None]:
# Preview the merged frame after the movie and user ids were re-based.
d.head(2)

In [None]:
# Getting the most common movie keywords

import ast
from collections import Counter

# Flatten every row's keyword list into one long list. Rows of `d` are ratings,
# so a movie's keywords are counted once for every rating it received.
temp = []
for raw_keywords in d['keywords']:
    temp.extend(ast.literal_eval(raw_keywords))

print(len(temp))

# Keep the tally under its own name: rebinding `Counter` to the instance would
# shadow the class and make this cell fail when it is run a second time.
keyword_counts = Counter(temp)
most_occur = keyword_counts.most_common(1500)  # (keyword, count) pairs, most frequent first


In [None]:
# Convert most common keywords to integer values

# Code of each frequent keyword = its rank in `most_occur` (0 is the most frequent).
k = {keyword: rank for rank, (keyword, _count) in enumerate(most_occur)}

# For every row, keep the codes of its first three frequent keywords, in the
# order in which they are listed for the movie.
f = []
for i in d['keywords']:
    temp = [k[j] for j in ast.literal_eval(i) if j in k][:3]
    f.append(temp)
    
        

In [None]:
# Attach the per-row lists of keyword codes (`f` has one entry per row of `d`, in row order).
d['key'] = f
d.head(2)

In [None]:
# Number of frequent keywords found for each row.
d['key_count'] = d['key'].apply(len)
print("Size before dropping: ", len(d))
# Keep only rows with at least two frequent keywords: both key1 and key2 are
# fed to the model later. `.copy()` makes `d` an independent frame, so the
# column assignments that follow do not write into a slice of the unfiltered data.
d = d[d['key_count'] > 1].copy()
print("Size of DataFrame after dropping : ", len(d))

In [None]:
# Create one column per keyword code.
# The expanded frame has as many columns as the longest code list (2 or 3), so
# it is reindexed to exactly three columns: the assignment stays valid even when
# no row has a third keyword, in which case key3 is all NaN.
key_cols = ['key1', 'key2', 'key3']
keys_wide = pd.DataFrame(d['key'].tolist(), index=d.index)
d[key_cols] = keys_wide.reindex(columns=range(len(key_cols)))
d.head(2)

In [None]:
# Map the keyword codes to contiguous values starting from 0.
# key1 and key2 share a single vocabulary, so a given keyword receives the same
# code in both columns.
key_list = np.unique(np.concatenate([d['key1'].to_numpy(), d['key2'].to_numpy()]))

t = {code: idx for idx, code in enumerate(key_list)}

# Old and new codes overlap (both are small integers). `Series.map` looks every
# value up exactly once, which is unambiguous and much faster than
# `DataFrame.replace` with a large nested dict.
d = d.assign(key1=d['key1'].map(t), key2=d['key2'].map(t))

In [None]:
# Remove the helper columns that are no longer needed, then spot-check one title.
d.drop(columns=['genre3', 'key', 'key_count', 'key3'], inplace=True)
d.loc[d['title'] == 'The Godfather'].head(3)

In [None]:
# Summary of the final training frame.
n_unique_movies = len(np.unique(d['id']))
n_unique_users = len(np.unique(d['userId']))
print("Number of unique movies : ", n_unique_movies,
      "\nNumber of unique users: ", n_unique_users,
      "\nTotal Number of enteries in the DataFrame: ", len(d))

# **MODEL**

In [None]:
import keras
from keras import backend as K

# Widths of the fully connected layers stacked on the concatenated embeddings.
hidden_units = (32, 4)

# Embedding width per feature: half the number of distinct values, capped at 50.
m_emb_size = min(len(np.unique(d.id)) // 2, 50)
u_emb_size = min(len(np.unique(d.userId)) // 2, 50)
g1_emb_size = min(len(np.unique(d.genre1)) // 2, 50)
g2_emb_size = min(len(np.unique(d.genre2)) // 2, 50)
k1_emb_size = min(len(np.unique(d.key1)) // 2, 50)
k2_emb_size = min(len(np.unique(d.key2)) // 2, 50)


def custom_activation(x):
    """Output activation: a sigmoid scaled by 6, so predictions lie in (0, 6)."""
    return (K.sigmoid(x) * 6)


# Each instance consists of six integer ids: user, movie, two genres, two keywords.
user_id_input = keras.Input(shape=(1,), name='user_id')
movie_id_input = keras.Input(shape=(1,), name='movie_id')

g1_id_input = keras.Input(shape=(1,), name='g1_id')
g2_id_input = keras.Input(shape=(1,), name='g2_id')

k1_id_input = keras.Input(shape=(1,), name='k1_id')
k2_id_input = keras.Input(shape=(1,), name='k2_id')

# One embedding table per input. The table size is (largest id + 1) so that the
# largest id is a valid row. The user table uses the user-derived width and the
# movie table the movie-derived width; the two sizes must not be crossed.
user_embedded = keras.layers.Embedding(d.userId.max() + 1, u_emb_size,
                                       input_length=1, name='user_embedding')(user_id_input)
movie_embedded = keras.layers.Embedding(d.id.max() + 1, m_emb_size,
                                        input_length=1, name='movie_embedding')(movie_id_input)

g1_embedded = keras.layers.Embedding(d.genre1.max() + 1, g1_emb_size,
                                     input_length=1, name='genre1_embedding')(g1_id_input)
g2_embedded = keras.layers.Embedding(d.genre2.max() + 1, g2_emb_size,
                                     input_length=1, name='genre2_embedding')(g2_id_input)

k1_embedded = keras.layers.Embedding(d.key1.max() + 1, k1_emb_size,
                                     input_length=1, name='key1_embedding')(k1_id_input)
k2_embedded = keras.layers.Embedding(d.key2.max() + 1, k2_emb_size,
                                     input_length=1, name='key2_embedding')(k2_id_input)

# Concatenate the embeddings (and remove the useless extra dimension)
concatenated = keras.layers.Concatenate()(
    [user_embedded, movie_embedded, g1_embedded, g2_embedded, k1_embedded, k2_embedded]
)
out = keras.layers.Flatten()(concatenated)

# Hidden layers: Dense + ReLU, each followed by 20% dropout.
for n_hidden in hidden_units:
    out = keras.layers.Dense(n_hidden, activation='relu')(out)
    out = keras.layers.Dropout(0.2)(out)

# A single output: our predicted rating
out = keras.layers.Dense(1, activation=custom_activation, name='prediction')(out)

model2 = keras.Model(
    inputs=[user_id_input, movie_id_input, g1_id_input, g2_id_input, k1_id_input, k2_id_input],
    outputs=out,
)


In [None]:
import tensorflow as tf

# Optimise mean squared error with Adam; MSE, MAE and MAPE are tracked as metrics.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model2.compile(optimizer, loss='MSE', metrics=['mse', 'mae', 'mape'])

# One column per model input, in the same order as the model's `inputs` list.
features = [d.userId, d.id, d.genre1, d.genre2, d.key1, d.key2]
history = model2.fit(
    features,
    d.rating,
    batch_size=2000,
    epochs=100,
    verbose=0,            # silent training; inspect `history` for the curves
    validation_split=.1,  # 10% of the rows are held out for validation
)

In [None]:
# Using Saved Model
# This cell rebinds `model2` to a model loaded from disk, replacing whatever
# `model2` was before.

from keras import backend as K

def custom_activation(x):
    """Output activation: a sigmoid scaled by 6, so values lie in (0, 6)."""
    return (K.sigmoid(x) * 6) 

from keras.models import load_model

# The custom activation is passed to `load_model` by name so it can be resolved
# when the saved model is rebuilt.
custom_objects={'custom_activation': custom_activation}

# Load the persisted model (the file name suggests it was saved alongside the
# Star Wars / id 188 example below - TODO confirm).
model2 = load_model("/kaggle/input/model/Startwars_188.h5", custom_objects = custom_objects)



In [None]:
import tensorflow as tf
from IPython.display import Image

# Render the model graph to a PNG file and display that file inline.
diagram_path = 'Embedding_Model.png'
tf.keras.utils.plot_model(model2, to_file=diagram_path, show_shapes=True, show_layer_names=False)
Image(filename=diagram_path, retina=False)

# **MODEL PERFORMANCE**

In [None]:

# Evaluation frame. `.copy()` gives an independent frame so the columns added
# below are not written into a slice of `d`.
# NOTE(review): this scores every row of `d`, not a held-out split.
X = d[['userId', 'id', 'genre1', 'genre2', 'rating', 'key1', 'key2']].copy()
y = d['rating']

# Feed one array per model input, in the same order used for training, and
# flatten the (n, 1) prediction matrix into a plain column.
feature_order = ['userId', 'id', 'genre1', 'genre2', 'key1', 'key2']
X['pred'] = model2.predict([X[col].to_numpy() for col in feature_order]).ravel()

# Absolute error of each prediction.
X['diff'] = (X['rating'] - X['pred']).abs()
X.head()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compare the predicted ratings with the actual ones.
actual, predicted = X.rating.values, X.pred.values
MSE = mean_squared_error(y_true=actual, y_pred=predicted)
MAE = mean_absolute_error(y_true=actual, y_pred=predicted)

print("MEAN SQUARED ERROR : ", MSE, "\nROOT MEAN SQUARED ERROR : ", MSE**(0.5), "\nMEAN ABSOLUTE ERROR : ", MAE)

In [None]:
# Movie Embedding Vector
# The layer is expected to hold exactly one weight array (the unpacking fails
# otherwise); row i of `w` is the vector of movie id i.
emb_layer = model2.get_layer('movie_embedding')
[w] = emb_layer.get_weights()
w[0]

# **Recommendations**

In [None]:
!pip install nmslib
import nmslib

In [None]:
# Searching in Embedding Space

# HNSW build-time and query-time parameters shared by both indexes.
M = 100
efC = 1000
efS = 1000
num_threads = 6
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
query_time_params = {'efSearch': efS}

# One angular-distance HNSW index per embedding table.
movies_index = nmslib.init(space='angulardist', method='hnsw')
user_index = nmslib.init(space='angulardist', method='hnsw')

# Fill each index with the weights of its embedding layer, then build it and
# set its query parameters.
for knn_index, layer_name in ((movies_index, 'movie_embedding'), (user_index, 'user_embedding')):
    knn_index.addDataPointBatch(model2.get_layer(layer_name).get_weights()[0])
    knn_index.createIndex(index_time_params)
    knn_index.setQueryTimeParams(query_time_params)

def get_knns(index, vecs, n_neighbour, num_threads=6):
    """Query the nearest neighbours of several vectors at once.

    Parameters
    ----------
    index : object exposing ``knnQueryBatch(vecs, k=..., num_threads=...)``
    vecs : sequence of query vectors
    n_neighbour : number of neighbours requested per query vector
    num_threads : threads passed to the batch query (default 6, the value that
        was previously hard-coded)

    Returns
    -------
    A ``zip`` iterator that transposes the per-query results returned by
    ``knnQueryBatch``: with (ids, distances) pairs it yields first all id
    lists, then all distance lists.
    """
    return zip(*index.knnQueryBatch(vecs, k=n_neighbour, num_threads=num_threads))

def get_knn(index, vec, n_neighbour):
    """Return the `n_neighbour` nearest neighbours of the single vector `vec`.

    Delegates to ``index.knnQuery`` and hands its result back unchanged.
    """
    neighbours = index.knnQuery(vec, k=n_neighbour)
    return neighbours

def suggest_movies_knn(movieId, n_suggest = 5):
    """Return the movies closest to `movieId` in the movie-embedding space.

    `movieId` is used as a row index into the weights of the 'movie_embedding'
    layer of `model2`; that row is looked up in `movies_index`.

    Returns the first element of the `get_knn` result (presumably the neighbour
    ids, with the distances discarded - confirm against nmslib's `knnQuery`).
    The query movie itself is likely to be among the results.
    """
    movie_vectors = model2.get_layer("movie_embedding").get_weights()[0]
    return get_knn(movies_index, movie_vectors[movieId], n_suggest)[0]
    
def suggest_users_knn(userId, n_suggest = 5):
    """Return the users closest to `userId` in the user-embedding space.

    `userId` is used as a row index into the weights of the 'user_embedding'
    layer of `model2`; that row is looked up in `user_index`.

    Returns the first element of the `get_knn` result (presumably the neighbour
    ids, with the distances discarded - confirm against nmslib's `knnQuery`).
    The query user itself is likely to be among the results.
    """
    user_vectors = model2.get_layer("user_embedding").get_weights()[0]
    return get_knn(user_index, user_vectors[userId], n_suggest)[0]

In [None]:
# Show one 'Star Wars' row to read off its re-indexed `id` (hard-coded in the next cell).
d[d['title']=='Star Wars'].head(1)

In [None]:
# Recommendations for Star Wars
movie_id = 188 # 188 --> Star Wars
j = suggest_movies_knn(movie_id, 8)
# Translate the neighbour ids back to distinct titles (sorted by np.unique).
similar_titles = np.unique(d.loc[d['id'].isin(j), 'title'])
print(" Recommended Movies based on Movie Embedding are : \n", list(similar_titles))

In [None]:
# Show one row of this title to read off its re-indexed `id` (hard-coded in the next cell).
d[d['title']=='The Lord of the Rings: The Fellowship of the Ring'].head(1)

In [None]:
# Recommendations for The Lord of the Rings: The Fellowship of the Ring
movie_id = 3087 # 3087 --> The Lord of the Rings: The Fellowship of the Ring
j = suggest_movies_knn(movie_id, 8)
# Translate the neighbour ids back to distinct titles (sorted by np.unique).
similar_titles = np.unique(d.loc[d['id'].isin(j), 'title'])
print(" Recommended Movies based on Movie Embedding are : \n", list(similar_titles))

In [None]:
# Rows that user 288 rated above 4.
d[(d['userId']==288) & (d['rating']>4)]

In [None]:
# Recommend similar profiles
user_id = 288
j = suggest_users_knn(user_id, 5)
# Keep only the neighbour ids that actually occur in `d` (distinct, sorted), at most ten.
similar_users = np.unique(d.loc[d['userId'].isin(j), 'userId'])
print(" Recommended Users based on user Embedding are : \n", list(similar_users)[:10])

In [None]:
# Recommendations Based on User Profile #288
user_id = 288
profile_columns = ['userId', 'id', 'title', 'genre1', 'genre2', 'key1', 'key2', 'rating', 'genres']
# The profile is the set of movies this user rated above 4.
liked_by_user = (d['userId'] == user_id) & (d['rating'] > 4)
user_profile = d.loc[liked_by_user, profile_columns]
user_profile # User Profile of user 288

In [None]:
# Average the embeddings of the movies in the user's profile to capture the
# user's interests. `w` holds one embedding row per movie id.
liked_ids = user_profile['id'].to_numpy()
if len(liked_ids) == 0:
    # Fail with a clear message instead of a bare division by zero.
    raise ValueError("user_profile is empty: cannot average movie embeddings")
avg_w = w[liked_ids].mean(axis=0)

In [None]:
# Recommending movies based on average movie embedding
j = get_knn(movies_index, avg_w, 5)[0]
# Translate the neighbour ids back to distinct titles (sorted by np.unique).
profile_titles = np.unique(d.loc[d['id'].isin(j), 'title'])
print(" Recommended Movies based on Movie Embedding are : \n", list(profile_titles))