In [69]:
import numpy as np
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import csv
import pickle5 as pickle
from numpy import genfromtxt
from tensorflow.keras.models import Model
import tabulate

##  Dataset 
You can find the dataset on [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/). 

* The dataset contains movies released after year 2000 and the dataset has $n_u = 395$ users and $n_m= 694$ movies. <br>
* For each movie, there's a movie title, release date, and one or more genres. This dataset contains little information about users other than their ratings. <br>

* This dataset is used to create training vectors for the neural networks described below. 


In [70]:
def load_data():
    
    with open('./data/content_item_train_header.txt', 'r') as i_features:
        item_features = []
        for line in i_features:
            item_features.extend(line.strip().split(','))

    with open('./data/content_user_train_header.txt', 'r') as u_features:
        user_features = []
        for line in u_features:
            user_features.extend(line.strip().split(','))
    
    #item_train = genfromtxt('./data/content_item_train.csv', delimiter=',')
    item_train = pd.read_csv('./data/content_item_train.csv', names = item_features, header=None)
    user_train = pd.read_csv('./data/content_user_train.csv', names = user_features, header=None)
    y_train    = pd.read_csv('./data/content_y_train.csv', header=None)                
            
    item_vecs = genfromtxt('./data/content_item_vecs.csv', delimiter=',')
    #item_vecs = pd.read_csv('./data/content_item_vecs.csv', header=None)


    movie_dict = defaultdict(dict)
    count = 0
    with open('./data/content_movie_list.csv', newline='') as movie:
        reader = csv.reader(movie, delimiter=',', quotechar='"')
        for line in reader:
            if count == 0:
                count += 1  # skip header
            else:
                count += 1
                movie_id = int(line[0])
                movie_dict[movie_id]["title"] = line[1]
                movie_dict[movie_id]["genres"] = line[2]
                
#     a = {'hello': 'world'} or a = [{'hello': 'world'}, 1, 2.3333, 4, True, "x", ("y", [[["z"], "y"], "x"]), {'today', today}]
#     with open('filename.pickle', 'wb') as handle:
#         pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
#     with open('filename.pickle', 'rb') as handle:
#         b = pickle.load(handle)
#     print(a == b) (True)

    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        user_to_genre = pickle.load(f)
        
    return(item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre)

In [71]:
# load data
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre = load_data()

In [72]:
user_train.head()

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
1,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
2,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
3,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
4,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875


In [73]:
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
uvs = 3  # user genre vector start
u_s = 3  # Not using first 3 features when training

In [74]:
item_train.head()

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874,2003,3.961832,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6874,2003,3.961832,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,6874,2003,3.961832,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,8798,2004,3.761364,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8798,2004,3.761364,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [75]:
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
ivs = 3  # the item genre vector starts at the fourth column(Action, Advanture, ...)
i_s = 1  # the column movie id will not be meaningful for training
scaledata = True  # applies the standard scalar to data if true

In [76]:
# convert the pandas dataframe to numpy
user_train = user_train.to_numpy()
user_train.shape

(58187, 17)

In [77]:
item_train = item_train.to_numpy()
item_train.shape

(58187, 17)

In [78]:
# scale training data to improve convergence, z = (x - u) / s
if scaledata:
    item_train_save = item_train
    user_train_save = user_train
    # Use StandardScaler to scale the data
    scalerItem = StandardScaler()
    scalerItem.fit(item_train)
    item_train = scalerItem.transform(item_train)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train)
    user_train = scalerUser.transform(user_train)

In [79]:
# split the datasets into traning and testing sets
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train.to_numpy(),    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training set shape: {item_train.shape}")
print(f"movie/item test  set shape: {item_test.shape}")

movie/item training set shape: (46549, 17)
movie/item test  set shape: (11638, 17)


In [80]:
user_train[0]

array([ 1.11518128, -0.89065377,  0.55107954,  0.67356455,  0.61127079,
        0.57779233,  0.71133592,  0.72105638,  0.46401949,  0.69378025,
        0.19576629,  0.25667377,  0.30812552,  0.54095844,  0.51780862,
        0.76467069,  0.47074832])

In [81]:
# Scale the target ratings using a Min Max Scaler to scale the target to be between -1 and 1
scaler = MinMaxScaler((-1, 1))
scaler.fit(y_train.reshape(-1, 1))
ynorm_train = scaler.transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)

(46549, 1) (11638, 1)


In [82]:
# Create neural networks for user content and movie content
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# The model output the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = Model([input_user, input_item], output)

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 14)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 32)           40864       input_4[0][0]                    
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 32)           41376       input_5[0][0]                    
____________________________________________________________________________________________

In [83]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [84]:
tf.random.set_seed(1)
# fit the model with training set (user_train ignore first 3 features, item_train ignore first feature)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], ynorm_train, epochs=30)

Train on 46549 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f895c5c5e10>

In [85]:
# find the loss of the model on test set
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)



0.10525702715847045

In [86]:
# create a new user 
new_user_id = 5000
new_rating_ave = 1.0
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

In [87]:
user_vec.shape

(1, 17)

In [88]:
len(item_vecs)

1883

In [89]:
def predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, ScalerUser, ScalerItem, scaledata=False):
    """ given a user vector, does the prediction on all movies in item_vecs returns
        an array predictions sorted by predicted rating,
        arrays of user and item, sorted by predicted rating sorting index
    """
    # if the vectors need to be scaled, use the fitted StandardScaler() to scale the vectors.
    if scaledata:
        scaled_user_vecs = ScalerUser.transform(user_vecs)
        scaled_item_vecs = ScalerItem.transform(item_vecs)
        y_p = model.predict([scaled_user_vecs[:, u_s:], scaled_item_vecs[:, i_s:]])
    else:
        y_p = model.predict([user_vecs[:, u_s:], item_vecs[:, i_s:]])
    # Scale back the data to the original representation. (0.5 to 5 rating)
    y_pu = scaler.inverse_transform(y_p)
    
    if np.any(y_pu < 0) : 
        print("Error, expected all positive predictions")
    #negate y_pu to get a descending list of indices
    sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  
    sorted_ypu   = y_pu[sorted_index]
    sorted_items = item_vecs[sorted_index]
    sorted_user  = user_vecs[sorted_index]
    return(sorted_index, sorted_ypu, sorted_items, sorted_user)

In [90]:
# np.tile(A, reps), construct an array by repeating A the number of times given by reps.
# generate and replicate the user vector to match the number movies in the data set.
user_vecs = np.tile(user_vec, (len(item_vecs), 1))

# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, 
                                                                      scaler, scalerUser, scalerItem, 
                                                                      scaledata=scaledata)

In [91]:
movie_id = sorted_items[:, 0].astype(int)
rating_ave = sorted_items[:, 2].astype(float)
print_pred_movie = defaultdict(list)

In [92]:
for i in range(len(movie_id)):
    print_pred_movie['y_p'].append(sorted_ypu[i, 0])
    print_pred_movie['movie id'].append(movie_id[i])
    print_pred_movie['rating ave'].append(rating_ave[i])
    print_pred_movie['title'].append(movie_dict[movie_id[i]]['title'])
    print_pred_movie['genre'].append(movie_dict[movie_id[i]]['genres'])

In [93]:
df1 = pd.DataFrame(data=print_pred_movie)

In [94]:
df1.head(10)

Unnamed: 0,y_p,movie id,rating ave,title,genre
0,4.602983,69406,3.5,"Proposal, The (2009)",Comedy|Romance
1,4.600286,48043,3.5,"Fountain, The (2006)",Drama|Fantasy|Romance
2,4.598306,5066,3.5,"Walk to Remember, A (2002)",Drama|Romance
3,4.59753,34162,3.508621,Wedding Crashers (2005),Comedy|Romance
4,4.597528,54004,3.454545,I Now Pronounce You Chuck and Larry (2007),Comedy|Romance
5,4.597121,6753,3.5,Secondhand Lions (2003),Children|Comedy|Drama
6,4.595761,6188,3.512821,Old School (2003),Comedy
7,4.594862,5785,3.5,Jackass: The Movie (2002),Action|Comedy|Documentary
8,4.593769,41285,3.441176,Match Point (2005),Crime|Drama|Romance
9,4.593555,8983,3.52,House of Flying Daggers (Shi mian mai fu) (2004),Action|Drama|Romance


The predicted user rating is based on the the user vector which includes a set of user genre rating <br>
For the case that a user only gives a maximum rating for one genre and minimums for the rest, if there's no similar user rating in the user vector, then the predicted rating may not be meaningful.

In [95]:
def get_user_vecs(user_id, user_train, item_vecs, user_to_genre):
    """ given a user_id, return:
        user train/predict matrix to match the size of item_vecs
        y vector with ratings for all rated movies and 0 for others of size item_vecs """

    if user_id not in user_to_genre:
        print("error: unknown user id")
        return(None)
    else:
        user_vec_found = False
        for i in range(len(user_train)):
            if user_train[i, 0] == user_id:
                user_vec = user_train[i]
                user_vec_found = True
                break
        if not user_vec_found:
            print("error in get_user_vecs, did not find uid in user_train")
        num_items = len(item_vecs)
        user_vecs = np.tile(user_vec, (num_items, 1))

        y = np.zeros(num_items)
        # walk through movies in item_vecs and get the movies, see if user has rated them
        for i in range(num_items):  
            movie_id = item_vecs[i, 0]
            if movie_id in user_to_genre[user_id]['movies']:
                rating = user_to_genre[user_id]['movies'][movie_id]
            else:
                rating = 0
            y[i] = rating
    return(user_vecs, y)

Note that movies with multiple genre's show up multiple times in the training data. For example,'The Time Machine' has three genre's: Adventure, Action, Sci-Fi

In [96]:
# Predict the rating of user 36. Compare the predicted ratings with the model's ratings.
uid =  36 

# form a set of user vectors. This is the same vector, transformed and repeated.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train), item_vecs, user_to_genre)

# scale the vectors and make predictions for all movies. Return results sorted by rating.
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, 
                                                                      scalerUser, scalerItem, scaledata=scaledata)
sorted_y = y_vecs[sorted_index]

In [97]:
movie_id = sorted_items[:, 0].astype(int)
rating_ave = sorted_items[:, 2].astype(float)

In [98]:
print_existing_user = defaultdict(list)

In [99]:
for i in range(len(sorted_ypu)):
    if sorted_y[i] != 0:
        offset = np.where(sorted_items[i, ivs:] == 1)[0][0]
        genre_rating = sorted_user[i, uvs + offset]
        genre = item_features[ivs + offset]

        print_existing_user['y_p'].append(sorted_ypu[i, 0])
        print_existing_user['y'].append(sorted_y[i])
        print_existing_user['user'].append(sorted_user[i, 0].astype(int))
        print_existing_user['user genre ave'].append(genre_rating.astype(float))
        print_existing_user['movie rating ave'].append(rating_ave[i])
        print_existing_user['title'].append(movie_dict[movie_id[i]]['title'])
        print_existing_user['genre'].append(genre)

In [100]:
df2 = pd.DataFrame(data=print_existing_user)

In [101]:
df2

Unnamed: 0,y_p,y,user,user genre ave,movie rating ave,title,genre
0,2.804359,3.0,36,3.0,2.863636,"Time Machine, The (2002)",Adventure
1,2.742683,3.0,36,3.0,2.863636,"Time Machine, The (2002)",Action
2,2.648199,3.0,36,3.0,2.863636,"Time Machine, The (2002)",Sci-Fi
3,1.91995,1.0,36,1.5,4.0,"Beautiful Mind, A (2001)",Drama
4,1.86923,1.5,36,1.75,3.520408,Road to Perdition (2002),Crime
5,1.868329,2.0,36,1.75,3.518182,Gangs of New York (2002),Crime
6,1.836663,1.0,36,1.0,4.0,"Beautiful Mind, A (2001)",Romance
7,1.805806,2.0,36,1.5,3.518182,Gangs of New York (2002),Drama
8,1.804587,1.5,36,1.5,3.520408,Road to Perdition (2002),Drama


#### Finding Similar Items
These are 32 entry vectors whose values are difficult to interpret. However, similar items will have similar vectors.

A similarity measure is the squared distance between the two vectors $ \mathbf{v_m^{(k)}}$ and $\mathbf{v_m^{(i)}}$ :
$$\left\Vert \mathbf{v_m^{(k)}} - \mathbf{v_m^{(i)}}  \right\Vert^2 = \sum_{l=1}^{n}(v_{m_l}^{(k)} - v_{m_l}^{(i)})^2$$

In [102]:
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    """
    
    d = np.sum(np.square(a - b), axis = 0)
    
    return (d)

The previous model needs inputs from both the user vector and movie vector<br>
A matrix of distances between movies can be computed once when the model is trained and then reused for new recommendations without retraining.<br>
We can build a model to run the movie vectors to generate the movie feature vector $v_m$ for each of the movies.<br>

In [103]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 32)           41376       input_6[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_l2_normalize_5/Squa [(None, 32)]         0           sequential_3[1][0]               
__________________________________________________________________________________________________
tf_op_layer_l2_normalize_5/Sum  [(None, 1)]          0           tf_op_layer_l2_normalize_5/Square
____________________________________________________________________________________________

Recall that the same movie will appear as a separate vector for each of its genres. The item_vecs must be scaled to use with the trained model. The result of the prediction is a 32 entry feature vector for each movie.

In [104]:
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (1883, 32)


<figure>
    <left> <img src="./images/distmatrix.PNG"   style="width:400px;height:225px;" ></center>
</figure>
We can then find the closest movie by finding the minimum along each row.

In [105]:
def get_item_genre(item, ivs, item_features):
    # np.where(item[ivs:] == 1) will return (array([x]),) where x is the index of the column equal to 1
    offset = np.where(item[ivs:] == 1)[0][0]
    genre = item_features[ivs + offset]
    return(genre, offset)

In [106]:
item = item_vecs[0, :]
item

array([4.05400e+03, 2.00100e+03, 2.84375e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00])

In [107]:
q = np.where(item[3:] == 1)
q

(array([7]),)

In [108]:
count = 50
dim = len(vms)
dist = np.zeros((dim,dim))

# create the matrix
for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])

# The diagonal contains all the products of the same movie, so mask the diagonal to avoid selecting the same movie
m_dist = np.ma.masked_array(dist, mask=np.identity(dist.shape[0]))  

# disp = [["movie1", "genres", "movie2", "genres"]]
# # get the top 50 movies for recommendation
# for i in range(count):
#     min_idx = np.argmin(m_dist[i])
#     movie1_id = int(item_vecs[i,0])
#     movie2_id = int(item_vecs[min_idx,0])
#     genre1,_  = get_item_genre(item_vecs[i,:], ivs, item_features)
#     genre2,_  = get_item_genre(item_vecs[min_idx,:], ivs, item_features)

#     disp.append( [movie_dict[movie1_id]['title'], genre1,
#                   movie_dict[movie2_id]['title'], genre2]
#                )
# table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
# table

In [109]:
table = defaultdict(list)

In [110]:
for i in range(50):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    genre1,_  = get_item_genre(item_vecs[i,:], ivs, item_features)
    genre2,_  = get_item_genre(item_vecs[min_idx,:], ivs, item_features)
    
    table["movie1"].append(movie_dict[movie1_id]['title'])
    table["genre1"].append(genre1)
    table["movie2"].append(movie_dict[movie2_id]['title'])
    table["genre2"].append(genre2)

In [111]:
df3 = pd.DataFrame(data=table)

In [112]:
df3

Unnamed: 0,movie1,genre1,movie2,genre2
0,Save the Last Dance (2001),Drama,John Q (2002),Drama
1,Save the Last Dance (2001),Romance,Mona Lisa Smile (2003),Romance
2,"Wedding Planner, The (2001)",Comedy,"Sweetest Thing, The (2002)",Comedy
3,"Wedding Planner, The (2001)",Romance,"Sweetest Thing, The (2002)",Romance
4,Hannibal (2001),Horror,Final Destination 2 (2003),Horror
5,Hannibal (2001),Thriller,"Sum of All Fears, The (2002)",Thriller
6,Saving Silverman (Evil Woman) (2001),Comedy,Cats & Dogs (2001),Comedy
7,Saving Silverman (Evil Woman) (2001),Romance,Save the Last Dance (2001),Romance
8,Down to Earth (2001),Comedy,Joe Dirt (2001),Comedy
9,Down to Earth (2001),Fantasy,"Haunted Mansion, The (2003)",Fantasy
