In [1]:
import numpy as np
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import csv
from numpy import genfromtxt
from tensorflow.keras.models import Model

##  Prepare Data

You can find the dataset on [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/). 
* The original dataset has 9,742 movies rated by 610 users, and a total of 100,836 ratings<br> 
* After removing two insignificant movie genres and filtering out movies with fewer than 5 ratings, the dataset is reduced to $n_u = 610$ users and $n_m = 3649$ movies. <br> 

In [2]:
def load_data(data_dir='./ml-latest-small'):
    """
    Load the pre-processed MovieLens tables used throughout the notebook.

    Args:
        data_dir (str or path-like): folder containing movie_data.csv, user_data.csv,
            y.csv, item_vecs.csv, movies.csv and ratings.csv.
            Defaults to './ml-latest-small', the location the notebook has always used.
    Returns:
        movies (DataFrame): feature values of each movie,
            each movie could appear multiple times if it has multiple genres or got rated more than once
        users (DataFrame): feature values of each user, each user could appear multiple times if rated multiple movies
        y (DataFrame): target value (rating) of each example
        item_vecs (DataFrame): feature values of each movie, not affected by the number of ratings
        movie_dict (defaultdict(dict)): movie id (int) -> {"title": str, "genres": str}
        ratings (DataFrame): all ratings given by users
    """
    import os  # local import so the cell does not depend on an edit to the import cell

    def _path(file_name):
        # Build the full path of one file inside data_dir.
        return os.path.join(data_dir, file_name)

    movies = pd.read_csv(_path('movie_data.csv'))
    users = pd.read_csv(_path('user_data.csv'))
    y = pd.read_csv(_path('y.csv'))
    item_vecs = pd.read_csv(_path('item_vecs.csv'))

    movie_dict = defaultdict(dict)
    with open(_path('movies.csv'), newline='', encoding="utf8") as movie_file:
        reader = csv.reader(movie_file, delimiter=',', quotechar='"')
        next(reader, None)  # skip header (no-op on an empty file)
        for line in reader:
            if not line:
                # csv.reader yields [] for blank lines; there is nothing to record
                continue
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]

    ratings = pd.read_csv(_path('ratings.csv'))

    return movies, users, y, item_vecs, movie_dict, ratings

In [3]:
movies_df, users_df, y_df, item_vecs_df, movie_dict, ratings_df = load_data()

### Display the Datasets

In [4]:
movies_df.head()

Unnamed: 0,movieId,year,movie_ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92093,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1995,3.92093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1995,3.92093,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1995,3.92093,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1995,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
users_df.head()

Unnamed: 0,userId,user_ave_rating,user_rating_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,...,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
1,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,...,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
2,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,...,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
3,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,...,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
4,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,...,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714


In [6]:
y_df.head()

Unnamed: 0,rating
0,4.0
1,4.0
2,4.0
3,4.0
4,4.0


In [7]:
item_vecs_df.head()

Unnamed: 0,movieId,year,movie_ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92093,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1995,3.92093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1995,3.92093,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1995,3.92093,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1995,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
print(movies_df.shape)
print(users_df.shape)
print(y_df.shape)
print(item_vecs_df.shape)
print(ratings_df.shape)

(247900, 21)
(247900, 21)
(247900, 1)
(9167, 21)
(100836, 4)


### Variables for NN

In [10]:
num_user_features = users_df.shape[1] - 3  # user id, average rating and rating count are not used as training features
uvs = 3  # the user genre vector starts at column index 3 (the fourth column)
u_s = 3  # the first 3 user columns are skipped when training

num_item_features = movies_df.shape[1] - 1  # the movie id is not used as a training feature
ivs = 3  # the item genre vector starts at column index 3, the fourth column (Action, Adventure, ...)
i_s = 1  # the movie id column (index 0) is skipped when training

scaledata = True  # applies the StandardScaler to the feature data if True

### Scale and Split the Datasets

In [11]:
# convert the pandas DataFrames to numpy arrays; the scaling, splitting and
# prediction cells below index these arrays by column position
users = users_df.to_numpy()
movies = movies_df.to_numpy()
y = y_df.to_numpy()
item_vecs = item_vecs_df.to_numpy()

In [12]:
# scale the feature data to improve convergence, z = (x - u) / s
# The fitted scalers are kept because later cells reuse them
# (transform for new vectors, inverse_transform to recover user ids).
# NOTE(review): both scalers are fitted on the full tables, i.e. before the
# train/test split, so test-set statistics leak into the scaling.
if scaledata:
    # Always start from the raw frames so that re-running this cell never
    # standardises arrays that were already standardised by a previous run.
    scalerItem = StandardScaler()
    movies = scalerItem.fit_transform(movies_df.to_numpy())

    scalerUser = StandardScaler()
    users = scalerUser.fit_transform(users_df.to_numpy())

In [13]:
# split the datasets into training and testing sets (80% / 20%)
# A single call applies the same shuffled indices to all three arrays, so row i of
# the item, user and target arrays is guaranteed to land in the same subset.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(movies, users, y, train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training set shape: {item_train.shape}")
print(f"movie/item test  set shape: {item_test.shape}")
print(f"user training set shape: {user_train.shape}")
print(f"user test  set shape: {user_test.shape}")

movie/item training set shape: (198320, 21)
movie/item test  set shape: (49580, 21)
user training set shape: (198320, 21)
user test  set shape: (49580, 21)


In [14]:
# Map the target ratings into the range [-1, 1] with a MinMaxScaler.
# The scaler learns its range from the training targets only and is then
# applied unchanged to the test targets.
scaler = MinMaxScaler(feature_range=(-1, 1))
ynorm_train = scaler.fit_transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)

(198320, 1) (49580, 1)


##  Create Neural Networks for Both User Content and Movie Content

In [15]:
# Create neural networks for user content and movie content.
# Both towers have the same layout and end in a num_outputs-dimensional embedding.
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

# create the user input layer and point to the user network
# (shape must be a tuple, hence the trailing comma: `(n)` is just the int n)
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
# scale the user embedding to unit length
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input layer and point to the item network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
# scale the item embedding to unit length
vm = tf.linalg.l2_normalize(vm, axis=1)

# The model outputs the dot product of the two vectors vu and vm.
# Since both have unit length, the output lies in [-1, 1].
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 18)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           41888       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42400       ['input_2[0][0]']                
                                                                                              

### Compile and Fit Model

In [16]:
# Reset the TensorFlow seed before configuring training.
tf.random.set_seed(1)
# Mean squared error between the model output and the scaled rating.
cost_fn = tf.keras.losses.MeanSquaredError()
# Adam optimizer with a learning rate of 0.01.
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [17]:
tf.random.set_seed(1)
# fit the model on the training set for 30 epochs
# (user_train skips its first u_s columns, item_train skips its first i_s columns;
#  the targets are the ratings scaled to [-1, 1])
model.fit([user_train[:, u_s:], item_train[:, i_s:]], ynorm_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x20be62e5df0>

In [18]:
# evaluate() returns the loss value (and any metric values) of the model on the test set.
# `verbose` controls how the progress of the evaluation is reported:
# verbose=0 will show you nothing (silent)
# verbose=1 will show you an animated progress bar like this: [==========]
# verbose=2 will print a single summary line (step count, duration and loss)
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test, verbose=2)

1550/1550 - 3s - loss: 0.1180 - 3s/epoch - 2ms/step


0.11798270791769028

## Make Rating Predictions

### New User

In [19]:
# create a new user
new_user_id = 999
new_rating_ave = 3.0
new_rating_count = 12

# the new user's average rating for each genre
new_action = 5.0
new_adventure = 4
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 5
new_documentary = 4
new_drama = 1
new_fantasy = 1
new_film_noir = 5
new_horror = 1
new_musical = 1
new_mystery = 5
new_romance = 1
new_scifi = 5
new_thriller = 1
new_war = 5
new_western = 1

# The column order must match the user table:
# userId, user_ave_rating, user_rating_count, followed by the genre averages.
# (The first three columns are skipped at prediction time, but keeping the order
#  right means the fitted user scaler sees each value in the column it was fitted on.)
user_vec = np.array([[new_user_id, new_rating_ave, new_rating_count,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary, new_drama,
                      new_fantasy, new_film_noir, new_horror, new_musical,
                      new_mystery, new_romance, new_scifi, new_thriller,
                      new_war, new_western]])

In [20]:
def predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, ScalerUser, ScalerItem, scaledata=False):
    """
    Predict a rating for every (user row, item row) pair and rank the rows from
    the highest to the lowest predicted rating.

    Args:
        user_vecs (ndarray): unscaled user feature rows, one per item row
        item_vecs (ndarray): unscaled item feature rows
        model (tf.keras.Model()): neural network model that takes both user and item inputs
        u_s (int): first user column that is fed to the model
        i_s (int): first item column that is fed to the model
        scaler (MinMaxScaler): fitted target scaler, used to map predictions back to ratings
        ScalerUser (StandardScaler): fitted scaler for user vectors
        ScalerItem (StandardScaler): fitted scaler for item vectors
        scaledata (bool): when True, the inputs are scaled before being fed to the model
    Returns:
        sorted_index (List[int]): row indices ordered by descending predicted rating
        sorted_ypu (ndarray): predicted ratings in that order
        sorted_items (ndarray): rows of the (unscaled) item_vecs in that order
        sorted_user (ndarray): rows of the (unscaled) user_vecs in that order
    """
    # Decide what the network sees: scaled copies, or the vectors as given.
    user_input, item_input = user_vecs, item_vecs
    if scaledata:
        user_input = ScalerUser.transform(user_vecs)
        item_input = ScalerItem.transform(item_vecs)

    raw_prediction = model.predict([user_input[:, u_s:], item_input[:, i_s:]])
    # Undo the target scaling so the predictions are on the original rating scale.
    y_pu = scaler.inverse_transform(raw_prediction)

    if np.any(y_pu < 0):
        print("exists negative predictions")

    # argsort is ascending, so sort the negated predictions to rank best-first
    ranking = np.argsort(-y_pu, axis=0)
    sorted_index = ranking.reshape(-1).tolist()
    return sorted_index, y_pu[sorted_index], item_vecs[sorted_index], user_vecs[sorted_index]

In [21]:
# np.tile(A, reps) constructs an array by repeating A the number of times given by reps.
# Replicate the single new-user row so there is one user row per row of item_vecs.
user_vecs = np.tile(user_vec, (len(item_vecs), 1))

# scale the vectors (when scaledata is True) and make predictions for all item rows.
# The results are returned sorted by predicted rating, highest first.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, 
                                                                      scaler, scalerUser, scalerItem, 
                                                                      scaledata=scaledata)



#### Display Predictions for the New User

In [22]:
# column 0 of the item vectors is the movie id, column 2 the movie's average rating
movie_id = sorted_items[:, 0].astype(int)
rating_ave = sorted_items[:, 2].astype(float)
# column name -> list of values, later turned into a DataFrame
print_pred_movie = defaultdict(list)

In [23]:
movies_listed = set()
# The rows are already ordered best-first, so for a movie that occurs in several
# rows only its first (highest-rated) row is recorded.
for mid, predicted, ave in zip(movie_id, sorted_ypu[:, 0], rating_ave):
    if mid not in movies_listed:
        print_pred_movie['y_p'].append(predicted)
        movies_listed.add(mid)
        print_pred_movie['movie id'].append(mid)
        print_pred_movie['rating ave'].append(ave)
        print_pred_movie['title'].append(movie_dict[mid]['title'])
        print_pred_movie['genre'].append(movie_dict[mid]['genres'])

In [24]:
pred_new_user_rating = pd.DataFrame(data=print_pred_movie)

In [25]:
pred_new_user_rating.head(10)

Unnamed: 0,y_p,movie id,rating ave,title,genre
0,4.626996,84152,3.95,Limitless (2011),Sci-Fi|Thriller
1,4.623612,98124,3.928571,"Batman: The Dark Knight Returns, Part 1 (2012)",Action|Animation|Sci-Fi
2,4.623343,109487,3.993151,Interstellar (2014),Sci-Fi|IMAX
3,4.622888,111759,3.977273,Edge of Tomorrow (2014),Action|Sci-Fi|IMAX
4,4.622752,93840,4.022727,"Cabin in the Woods, The (2012)",Comedy|Horror|Sci-Fi|Thriller
5,4.61959,68237,3.96875,Moon (2009),Drama|Mystery|Sci-Fi|Thriller
6,4.619314,133771,4.0,The Lobster (2015),Comedy|Romance|Sci-Fi
7,4.619314,134130,4.0,The Martian (2015),Adventure|Drama|Sci-Fi
8,4.619043,112852,4.050847,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi
9,4.618533,106920,3.92,Her (2013),Drama|Romance|Sci-Fi


The predicted user rating is based on the user vector, which holds the user's average rating for each genre. <br>
If a user gives the maximum rating to a single genre and the minimum to all the others, and no user in the training data has a similar rating profile, the predicted ratings may not be meaningful.

Movies with multiple genres show up multiple times in the training data. For example, 'The Time Machine' has three genres: Adventure, Action, Sci-Fi

### Existing User

In [26]:
def get_user_vecs(user_id, user_train, item_vecs, ratings):
    """
    Build the prediction inputs for an existing user.

    Args:
        user_id (int): id of the user
        user_train (ndarray): unscaled user training rows; column 0 holds the user id
        item_vecs (ndarray): feature values of each movie; column 0 holds the movie id
        ratings (DataFrame): all ratings given by users, with the columns
            'userId', 'movieId' and 'rating'
    Returns:
        user_vecs (ndarray): the user's feature row repeated once per row of item_vecs
        y (ndarray): for each row of item_vecs, the rating this user gave that movie,
            or 0 when the user has not rated it
    Raises:
        ValueError: if no row of user_train belongs to user_id
    """
    # The id column may have gone through a scale / inverse-scale round trip, so
    # compare with a tolerance instead of exact float equality.
    matches = np.flatnonzero(np.isclose(user_train[:, 0], user_id))
    if matches.size == 0:
        raise ValueError(f"can't find the user {user_id} in user_train")
    user_vec = user_train[matches[0]]

    # generate and replicate the user vector to match the number of examples in the item vectors.
    num_items = len(item_vecs)
    user_vecs = np.tile(user_vec, (num_items, 1))

    # Look the user's ratings up once instead of re-filtering the table for every item.
    user_ratings = ratings.loc[ratings['userId'] == user_id]
    rating_by_movie = {}
    for movie_id, rating in zip(user_ratings['movieId'], user_ratings['rating']):
        # setdefault keeps the first rating if a movie occurs more than once
        rating_by_movie.setdefault(float(movie_id), rating)

    # rating of each item row, 0 where the user has not rated the movie
    y = np.array([rating_by_movie.get(float(movie_id), 0) for movie_id in item_vecs[:, 0]],
                 dtype=float)
    return user_vecs, y

In [27]:
# Predict the ratings of an existing user and compare them with the ratings the user actually gave.
uid = 111

# form a set of user vectors. This is the same vector, repeated once per item row.
# user_train is passed through inverse_transform so that get_user_vecs can search
# for the user id in unscaled rows.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train), item_vecs, ratings_df)

# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, 
                                                                      scalerUser, scalerItem, scaledata=scaledata)
# reorder the user's actual ratings the same way (0 marks a movie the user has not rated)
sorted_y = y_vecs[sorted_index]



#### Display Predictions for the Existing User

In [28]:
# column 0 of the item vectors is the movie id, column 2 the movie's average rating
movie_id = sorted_items[:, 0].astype(int)
rating_ave = sorted_items[:, 2].astype(float)
# column name -> list of values, later turned into a DataFrame
print_existing_user = defaultdict(list)

In [29]:
item_features = list(movies_df.columns)

In [30]:
# Walk the predictions best-first and keep only the rows of movies this user has rated.
for i in range(len(sorted_ypu)):
    # sorted_y is 0 for movies the user has not rated
    if sorted_y[i] != 0:
        # offset is the index, within the genre columns, of the first genre flagged 1 in this item row
        offset = np.where(sorted_items[i, ivs:] == 1)[0][0]
        # value of the matching genre column in the user's row
        genre_rating = sorted_user[i, uvs + offset]
        # name of that genre, taken from the item column names
        genre = item_features[ivs + offset]

        print_existing_user['y_p'].append(sorted_ypu[i, 0])
        print_existing_user['y'].append(sorted_y[i])
        print_existing_user['user'].append(sorted_user[i, 0].astype(int))
        print_existing_user['user genre ave'].append(genre_rating.astype(float))
        print_existing_user['movie rating ave'].append(rating_ave[i])
        print_existing_user['title'].append(movie_dict[movie_id[i]]['title'])
        print_existing_user['genre'].append(genre)

In [31]:
pred_exist_user_rating = pd.DataFrame(data=print_existing_user)

In [32]:
pred_exist_user_rating.head(15)

Unnamed: 0,y_p,y,user,user genre ave,movie rating ave,title,genre
0,3.92666,4.5,111,3.326613,4.429022,"Shawshank Redemption, The (1994)",Drama
1,3.881918,4.5,111,3.382948,3.8,The Boss Baby (2017),Comedy
2,3.878161,5.0,111,3.382948,3.833333,Deadpool (2016),Comedy
3,3.877383,5.0,111,3.382948,3.986111,Kingsman: The Secret Service (2015),Comedy
4,3.876554,2.5,111,3.382948,3.890625,Zootopia (2016),Comedy
5,3.865192,2.5,111,3.382948,3.813953,Inside Out (2015),Comedy
6,3.861678,5.0,111,3.382948,3.8,The Wedding Ringer (2015),Comedy
7,3.852442,4.0,111,3.382948,4.0,"Secret Life of Walter Mitty, The (2013)",Comedy
8,3.852229,3.0,111,3.382948,3.853659,Big Hero 6 (2014),Comedy
9,3.851339,5.0,111,3.382948,3.8125,Blended (2014),Comedy


### Find Similar Movies, Pre-Train Item Vectors

A similarity measure is the squared distance between the two vectors $ \mathbf{v_m^{(k)}}$ and $\mathbf{v_m^{(i)}}$ :
$$\left\Vert \mathbf{v_m^{(k)}} - \mathbf{v_m^{(i)}}  \right\Vert^2 = \sum_{l=1}^{n}(v_{m_l}^{(k)} - v_{m_l}^{(i)})^2$$

In [33]:
def sq_dist(a, b):
    """
    Squared Euclidean distance between two vectors.

    The element-wise differences are squared and summed along axis 0.
    """
    delta = a - b
    return np.square(delta).sum(axis=0)

A matrix of distances between movies can be computed once when the model is trained and then reused for new recommendations without retraining.<br>
We can build a model to run the movie vectors to generate the movie feature vector $v_m$ for each of the movies.<br>

In [34]:
# Stand-alone model that maps item features to the unit-length movie vector v_m.
# It reuses item_NN, so it shares the weights of the trained item tower.
# (shape must be a tuple, hence the trailing comma: `(n)` is just the int n)
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))
vm_m = item_NN(input_item_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)
model_m = Model(input_item_m, vm_m)
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                42400     
                                                                 
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 42,400
Trainable params: 42,400
Non-trainable params: 0
_________________________________________________________________


The item_vecs must be scaled to use with the trained model. The result of the prediction is a 32 entry feature vector for each movie.

In [35]:
# scale the item vectors with the fitted item scaler
scaled_item_vecs = scalerItem.transform(item_vecs)
# skip the first i_s columns and compute one feature vector per item row
item_vecs_pred = model_m.predict(scaled_item_vecs[:, i_s:])
print("Shape of movie feature vectors is {}".format(item_vecs_pred.shape))

Shape of movie feature vectors is (9167, 32)


The dataset contains 3649 unique movies, but the same movie appears as a separate vector for each of its genres.

In [36]:
def get_item_genre(item, ivs, item_features):
    """
    Look up the genre name of a single item vector.

    Args:
        item (ndarray): one item vector
        ivs (int): index of the first genre column
        item_features (List): column names of the item vector
    Returns:
        genre: name of the first genre column whose value is 1
    """
    genre_flags = item[ivs:] == 1
    # np.nonzero returns one index array per dimension; take the first one
    hit_positions = np.nonzero(genre_flags)[0]
    first_hit = hit_positions[0]
    return item_features[ivs + first_hit]

### Create a Matrix of All Squared Distances
- The value of (i, j) is the squared distance between movie i and movie j 

In [37]:
dim = len(item_vecs_pred)
dist = np.zeros((dim, dim))

# create the matrix one row at a time: broadcasting computes the squared distance
# from movie vector i to every movie vector in a single vectorised step, instead of
# dim * dim separate Python-level function calls
for i in range(dim):
    dist[i, :] = np.sum(np.square(item_vecs_pred - item_vecs_pred[i, :]), axis=1)

# The diagonal holds the distance of each vector to itself, so mask the diagonal
# to avoid selecting the same row as its own nearest neighbour.
# A boolean mask avoids allocating a second dim x dim float matrix.
m_dist = np.ma.masked_array(dist, mask=np.eye(dim, dtype=bool))

In [38]:
# column name -> list of values, later turned into a DataFrame
table = defaultdict(list)
# only the first 50 item rows are listed
for i in range(50):
    # find the index of the movie that has the smallest square distance to movie i
    # (row i of m_dist has its own diagonal entry masked)
    min_idx = np.argmin(m_dist[i])
    # column 0 of the item vectors is the movie id
    movie1_id = int(item_vecs[i, 0])
    movie2_id = int(item_vecs[min_idx, 0])
    genre1 = get_item_genre(item_vecs[i, :], ivs, item_features)
    genre2 = get_item_genre(item_vecs[min_idx, :], ivs, item_features)
    
    table["movie1"].append(movie_dict[movie1_id]['title'])
    table["genre1"].append(genre1)
    table["movie2"].append(movie_dict[movie2_id]['title'])
    table["genre2"].append(genre2)

In [39]:
similar_movie = pd.DataFrame(data=table)

In [40]:
similar_movie

Unnamed: 0,movie1,genre1,movie2,genre2
0,Toy Story (1995),Adventure,"Lion King, The (1994)",Adventure
1,Toy Story (1995),Animation,Wallace & Gromit: A Close Shave (1995),Animation
2,Toy Story (1995),Children,"Little Princess, A (1995)",Children
3,Toy Story (1995),Comedy,Emma (1996),Comedy
4,Toy Story (1995),Fantasy,Groundhog Day (1993),Fantasy
5,Jumanji (1995),Adventure,Executive Decision (1996),Adventure
6,Jumanji (1995),Children,James and the Giant Peach (1996),Children
7,Jumanji (1995),Fantasy,"Prophecy, The (1995)",Fantasy
8,Grumpier Old Men (1995),Comedy,Bad Boys (1995),Comedy
9,Grumpier Old Men (1995),Romance,"Bridges of Madison County, The (1995)",Romance
