Imports:

In [None]:
#importing required packages
import pandas as pd
import tensorflow.compat.v1 as tf
import numpy as np
import collections
from IPython import display
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


Importing datasets:

In [None]:
# Column names applied to the tab-separated ratings file `u.data`.
columns_name = ["userId", "movieId", "rating", "timestamp"]

# Ratings DataFrame: one row per rating, with userId, movieId, rating and timestamp.
ratings_df = pd.read_csv(
    "u.data",
    sep="\t",
    names=columns_name,
)

In [None]:
# Column names applied to the pipe-separated user file `u.user`.
users_cols = ["userId", "age", "sex", "occupation", "zip_code"]

# User DataFrame: one row per user, keyed by userId.
users_df = pd.read_csv(
    "u.user",
    sep="|",
    names=users_cols,
    encoding="latin-1",
)

In [None]:
# One indicator column per genre, in the order they appear in `u.item`.
genre_cols = [
    "genre_unknown",
    "Action", "Adventure", "Animation", "Children", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Full column layout of the pipe-separated movie file: metadata first, then the genre flags.
movies_cols = [
    "movieId", "title", "release_date", "video_release_date", "imdb_url",
] + genre_cols

# Movie DataFrame keyed by movieId; the video release date column is dropped right after loading.
movies_df = pd.read_csv(
    "u.item",
    sep="|",
    names=movies_cols,
    encoding="latin-1",
).drop(columns="video_release_date")

Data Processing:

In [None]:
# userId and movieId start from 1 in the raw files; shift them to start from 0 so they
# can be used as row/column positions in the matrix factorization. The shifted ids are
# stored as strings.
# NOTE: this cell is not idempotent -- running it twice on the same frames fails,
# because the ids are already strings after the first run.
users_df["userId"] = (users_df["userId"] - 1).astype(str)
movies_df["movieId"] = (movies_df["movieId"] - 1).astype(str)
ratings_df["movieId"] = (ratings_df["movieId"] - 1).astype(str)
ratings_df["userId"] = (ratings_df["userId"] - 1).astype(str)
ratings_df["rating"] = ratings_df["rating"].astype(float)

# Release year = last '-'-separated piece of the release date. str() first, so a
# missing release date (NaN) yields the string 'nan' instead of raising.
movies_df["year"] = movies_df["release_date"].apply(lambda x: str(x).split("-")[-1])

# Remove only the trailing "(YYYY)" year suffix from the title. The previous pattern
# '[(\d\d\d\d)]' was a character class, so it deleted every digit and parenthesis
# anywhere in the title (e.g. "12 Angry Men (1957)" became "Angry Men").
# regex=True is passed explicitly because newer pandas versions default to literal
# matching in Series.str.replace.
movies_df["title"] = (
    movies_df["title"]
    .str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
    .str.strip()
)

end of data processing

Merging the ratings with the movie and user tables, so that each row holds a userId, a movieId, and the corresponding rating given by that user.


In [None]:
# Attach the movie columns to every rating (joined on movieId), then the user columns
# (joined on userId).
movielens_data = (
    ratings_df
    .merge(movies_df, on="movieId")
    .merge(users_df, on="userId")
)

In [None]:
# Desired column order for the movie table.
# NOTE(review): 'all_genres' and 'genre' are not created by any cell visible in this
# notebook, so selecting them unconditionally raises a KeyError on a fresh
# "Restart & Run All". Only the requested columns that actually exist are kept; if
# those two columns are meant to exist, the cell that builds them needs to be restored
# above this one.
_movie_column_order = [
    'movieId', 'title', 'release_date', 'imdb_url', 'all_genres', 'genre_unknown',
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'genre',
]
movies_df = movies_df[[col for col in _movie_column_order if col in movies_df.columns]]

In [None]:
# Preview the first rows instead of rendering the whole movie table.
movies_df.head()

Functions to split data, create sparse tensor, Model helper, and the model:

In [None]:
#splits dataset to train and test
def split_dataframe(df, test_frac = 0.2, random_state = 100):
    """Split a DataFrame into disjoint train and test parts.

    Args:
        df: DataFrame to split. Rows are assigned to a part by index label, so
            the index is expected to be unique.
        test_frac: Fraction of the rows sampled (without replacement) into the
            test part. Defaults to 0.2.
        random_state: Seed for the sampling, so the same split is produced on
            every call. Defaults to 100.

    Returns:
        A tuple (train, test): `test` holds the sampled rows, `train` every row
        whose index label was not sampled.

    Raises:
        ValueError: if `test_frac` is not within [0, 1].
    """
    if not 0.0 <= test_frac <= 1.0:
        raise ValueError("test_frac must be between 0 and 1, got %r" % (test_frac,))

    test = df.sample(frac = test_frac, replace = False, random_state = random_state)
    # Everything that was not drawn into the test part becomes training data.
    train = df[~df.index.isin(test.index)]

    return train, test

In [None]:
#creating sparse tensor
def build_sparse_tensor(ratings_df):
    """Build a (users x movies) tf.SparseTensor holding the given ratings.

    Args:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
            The ids must be convertible to integers (the preprocessing cell
            stores them as zero-based strings).

    Returns:
        A tf.SparseTensor whose indices are the (userId, movieId) pairs, whose
        values are the ratings, and whose dense shape is
        [number of rows in users_df, number of rows in movies_df] (both are
        module-level DataFrames).
    """
    # tf.SparseTensor needs int64 indices. Cast explicitly rather than relying on
    # TensorFlow to convert an array of string ids implicitly.
    indices = ratings_df[['userId', 'movieId']].values.astype(np.int64)
    ratings = ratings_df['rating'].values
    users = users_df.shape[0] #number of users
    movies = movies_df.shape[0] #number of movies

    sparseTensor = tf.SparseTensor(
        indices = indices, values = ratings,
        dense_shape = [users, movies])

    return sparseTensor

In [None]:
#function to calculate the loss function MSE
def sparse_mean_square_error(sparseTensor, user_em, movie_em):
    """Mean squared error between the stored ratings and the model's predictions.

    Args:
        sparseTensor: sparse ratings; its `indices` select which predictions
            are compared and its `values` are the actual ratings.
        user_em: user embedding matrix (one row per user).
        movie_em: movie embedding matrix (one row per movie).

    Returns:
        The loss tensor produced by tf.losses.mean_squared_error.
    """
    # Score of every user against every movie: user_em . movie_em^T
    all_scores = tf.matmul(user_em, movie_em, transpose_b=True)
    # Keep only the entries for which an actual rating exists.
    predicted = tf.gather_nd(all_scores, sparseTensor.indices)

    return tf.losses.mean_squared_error(sparseTensor.values, predicted)

In [None]:
#Regularization of the embeddings and of the predicted ratings
def regularization_loss(reg_coeff, gravity_coeff, U, V):
    """Regularization term added to the training loss.

    It is the sum of two parts:
      * an L2 part: reg_coeff * (sum(U*U)/rows(U) + sum(V*V)/rows(V))
      * a "gravity" part: gravity_coeff * sum((U^T U) * (V^T V)) / (rows(U) * rows(V))

    Args:
        reg_coeff: weight of the L2 part.
        gravity_coeff: weight of the gravity part.
        U: first embedding matrix; its static first dimension must be known.
        V: second embedding matrix; its static first dimension must be known.

    Returns:
        Scalar tensor with the total regularization loss.
    """
    rows_u = U.shape[0].value
    rows_v = V.shape[0].value

    # L2 part: squared entries of each matrix, averaged over its number of rows.
    mean_sq_u = tf.reduce_sum(U * U)/rows_u
    mean_sq_v = tf.reduce_sum(V * V)/rows_v
    l2_part = reg_coeff * (mean_sq_u + mean_sq_v)

    # Gravity part: element-wise product of the two Gram matrices, summed up.
    gram_u = tf.matmul(U, U, transpose_a=True)
    gram_v = tf.matmul(V, V, transpose_a=True)
    gravity_part = gravity_coeff * (1. / (rows_u*rows_v) * tf.reduce_sum(gram_u * gram_v))

    return l2_part + gravity_part

In [None]:
#Model Helper class where SGD is performed on the loss and Embeddings are optimized
class Rec_model(tf.Module):
  """Training helper: minimizes a loss tensor and exposes the learned embeddings.

  Constructor arguments:
    embedding_vars: dict mapping an embedding name to the variable holding it.
    loss: loss tensor to minimize; its graph is the one the training runs in.
    metrics: list of dicts mapping a metric name to a tensor; they are evaluated
      at every training step and reported while training.
  """

  #constructor
  def __init__(self, embedding_vars, loss, metrics):
    self._loss = loss
    self._metrics = metrics
    self._embedding_vars = embedding_vars
    # Evaluated embeddings; filled in by train(), None until then.
    self._embeddings = {embedding: None for embedding in embedding_vars}
    # Created lazily on the first train() call and reused afterwards.
    self._session = None

  #getter method for embeddings
  @property
  def embeddings(self):
    """Dict mapping each embedding name to its evaluated value (None before training)."""
    return self._embeddings

  #a call function to train with some default parameters
  def train(self, iterations = 100, learning_rate = 1., optimizer = tf.train.GradientDescentOptimizer):
    """Run the optimizer on the loss and store the resulting embeddings.

    Args:
      iterations: the training op is run `iterations + 1` times; must be >= 0.
      learning_rate: learning rate passed to the optimizer constructor.
      optimizer: optimizer class; instantiated as `optimizer(learning_rate)`.

    Returns:
      The metric values from the last training step, in the same list-of-dicts
      structure as the `metrics` passed to the constructor.

    Raises:
      ValueError: if `iterations` is negative (no step would run, so there
        would be no metrics to return).
    """
    if iterations < 0:
      raise ValueError("iterations must be non-negative, got %r" % (iterations,))

    with self._loss.graph.as_default():
      opt = optimizer(learning_rate)
      train = opt.minimize(self._loss)

      if self._session is None:
        # First call: create the session and initialize every variable in the graph,
        # including any state the optimizer just created.
        self._session = tf.Session()
        self._session.run(tf.global_variables_initializer())
      else:
        # Later calls reuse the session, so the trained embeddings are kept. Each call
        # builds a new optimizer though, and its own state variables (e.g. Adam or
        # Adagrad slots) are still uninitialized -- initialize only those.
        self._session.run(tf.variables_initializer(opt.variables()))

      with self._session.as_default():
        for i in range(iterations + 1):
          _, results = self._session.run((train, self._metrics))
          # Report every 10th step and the final one; '\r' overwrites the previous line.
          if (i % 10 == 0) or i == iterations:
            print("\r iteration number %d: " % i + ", ".join(["%s = %0.4f" % (error, value) for r in results for error, value in r.items()]),end='')

        # Snapshot the trained embedding values.
        for k, v in self._embedding_vars.items():
          self._embeddings[k] = v.eval()

    return results


In [None]:
#model function where the TensorFlow variables (the U and V embeddings) and the loss are initialized; the results are then passed to the model helper.
def build_model(ratings_df, embedding_dem, regularization_coeff, gravity_coeff, init_stddev, seed = None):
    """Build the matrix-factorization graph: embeddings, training loss and metrics.

    Args:
        ratings_df: ratings DataFrame; it is split into train and test parts
            with split_dataframe.
        embedding_dem: number of columns of each embedding matrix.
        regularization_coeff: weight of the L2 part of the regularization.
        gravity_coeff: weight of the gravity part of the regularization.
        init_stddev: standard deviation of the normal distribution used to
            initialize the embeddings.
        seed: optional integer seed for the random initialization. With the
            default None the initial embeddings differ from run to run.

    Returns:
        A tuple (embeddings, total_train_loss, metrics):
          embeddings: dict with the "user" (U) and "movie" (V) variables.
          total_train_loss: train MSE plus the regularization loss.
          metrics: one-element list holding a dict with the 'train_error' and
            'test_error' MSE tensors (both without regularization).
    """
    train, test = split_dataframe(ratings_df)

    train_sparse = build_sparse_tensor(train)
    test_sparse = build_sparse_tensor(test)

    # Use distinct op-level seeds so U and V are not drawn from the same random stream.
    if seed is None:
        u_seed, v_seed = None, None
    else:
        u_seed, v_seed = seed, seed + 1

    U = tf.Variable(tf.random_normal([train_sparse.dense_shape[0], embedding_dem], stddev=init_stddev, seed=u_seed))
    V = tf.Variable(tf.random_normal([train_sparse.dense_shape[1], embedding_dem], stddev=init_stddev, seed=v_seed))

    train_loss = sparse_mean_square_error(train_sparse, U, V)
    test_loss = sparse_mean_square_error(test_sparse, U, V)

    reg_loss = regularization_loss(regularization_coeff, gravity_coeff,  U, V)

    # Only the training loss is regularized; the reported metrics are the plain MSEs.
    total_train_loss = train_loss + reg_loss

    metrics = {
        'train_error': train_loss,
        'test_error': test_loss
    }
    embeddings = {
        "user": U,
        "movie": V
    }
    return embeddings, total_train_loss, [metrics]

Build the Model and Train it using the train function we created in the helper class:

In [None]:
# Build the graph; returns the embedding variables, the loss to minimize and the metrics.
em, loss, metrics = build_model(
    ratings_df,
    embedding_dem=25,
    regularization_coeff=0.1,
    gravity_coeff=1.0,
    init_stddev=0.5,
)

In [None]:
# Hand the embedding variables, loss and metrics over to the training helper.
model = Rec_model(embedding_vars=em, loss=loss, metrics=metrics)

In [None]:
# Train with the default gradient-descent optimizer; the cell output is the final metrics.
model.train(iterations=5000, learning_rate=1.0)

 iteration number 5000: train_error = 1.3870, test_error = 2.2911

[DictWrapper({'train_error': 1.3869758, 'test_error': 2.2911236})]

Functions to calculate Cosine Similarity Score of a movie or user to find similar movies or users to make recommendations:

In [None]:
#computes cosine similarity scores
def compute_scores(movie_em, item_em):
  """Cosine similarity between query embedding(s) and every row of `item_em`.

  cosine(a, b) = dot(a, b) / (|a| * |b|)

  Args:
    movie_em: the query: one embedding vector of shape (d,), or a batch of
      them of shape (n, d). Despite the name, callers in this notebook pass
      either a movie embedding or a user embedding here.
    item_em: candidate embeddings, one per row, shape (m, d).

  Returns:
    Array of shape (m,) for a single query vector, or (n, m) for a batch.
  """
  # Normalize every vector by its own L2 norm (along the embedding axis). Calling
  # np.linalg.norm without an axis gives the Frobenius norm of the whole array, which
  # is only the right divisor when the query is a single 1-D vector.
  query = movie_em / np.linalg.norm(movie_em, axis=-1, keepdims=True)
  candidates = item_em / np.linalg.norm(item_em, axis=1, keepdims=True)

  cos_sim_scores = query.dot(candidates.T)

  return cos_sim_scores

In [None]:
def user_recommendations(model, user, exclude_rated = False, k=6):
  """Display the top-k movies for an existing user, ranked by cosine score.

  Args:
    model: trained model exposing `embeddings["user"]` and `embeddings["movie"]`.
    user: zero-based user id, as an int or as a numeric string.
    exclude_rated: when true, movies the user has already rated (according to
      the module-level ratings_df) are left out of the ranking.
    k: number of rows to display.

  Assumes row i of the movie embedding corresponds to row i of the module-level
  movies_df (scores and titles are paired by position).
  """
  # The embedding matrix is indexed by integer position, while ratings_df stores
  # the ids as strings -- keep both forms of the id.
  user_row = int(user)
  user_key = str(user_row)

  scores = compute_scores(
      model.embeddings["user"][user_row], model.embeddings["movie"])

  df = pd.DataFrame({
      'cosine score': list(scores),
      'movie': movies_df['movieId'],
      'titles': movies_df['title']
  })

  if exclude_rated:
    # Compare ids as strings on both sides: comparing the string ids in ratings_df
    # with an integer `user` never matches, so nothing would be excluded.
    rated_movies = ratings_df.loc[ratings_df.userId.astype(str) == user_key, 'movieId'].astype(str)
    df = df[~df.movie.astype(str).isin(rated_movies)]

  display.display(df.sort_values(['cosine score'], ascending=False).head(k))


def new_recommendations(model, title, k=6):
  """Display the k movies most similar (by cosine score) to a given movie.

  Args:
    model: trained model exposing `embeddings["movie"]`.
    title: text to look for in the movie titles of the module-level movies_df.
      It is matched as a literal substring; the first matching movie is used.
    k: number of rows to display.

  Raises:
    ValueError: if no movie title contains `title`.

  Assumes row i of the movie embedding corresponds to row i of movies_df
  (scores and titles are paired by position).
  """
  # Literal substring match (regex=False), so titles containing characters such as
  # '(' or '?' can be searched; na=False skips missing titles.
  matches = movies_df[movies_df['title'].str.contains(title, regex=False, na=False)]
  if matches.empty:
    # The format string previously had no %s placeholder, so this line raised a
    # TypeError instead of the intended ValueError.
    raise ValueError("Found no movies with title %s" % title)

  first_match = matches.iloc[0]
  print("Recommendations for Movie: %s" % first_match['title'])

  # Look the movie up in the embedding by its movieId rather than by its DataFrame
  # index label, which only coincides with the id for a default RangeIndex.
  movie_id = int(first_match['movieId'])
  scores = compute_scores(
      model.embeddings["movie"][movie_id], model.embeddings["movie"])
  df = pd.DataFrame({
      'cosine score': list(scores),
      'titles': movies_df['title']
  })
  display.display(df.sort_values(['cosine score'], ascending=False).head(k))

In [None]:
# Inputs: the user id and k, the number of movies to recommend.
user_recommendations(model, user=0, exclude_rated=False, k=10)

Unnamed: 0,cosine score,movie,titles
47,0.781483,47,Hoop Dreams
80,0.759109,80,"Hudsucker Proxy, The"
432,0.734611,432,Heathers
159,0.732638,159,Glengarry Glen Ross
167,0.72728,167,Monty Python and the Holy Grail
12,0.725088,12,Mighty Aphrodite
168,0.722535,168,"Wrong Trousers, The"
178,0.721432,178,"Clockwork Orange, A"
155,0.719998,155,Reservoir Dogs
64,0.716954,64,What's Eating Gilbert Grape


In [None]:
# Input: the movie title to find similar movies for.
new_recommendations(model, "Pulp Fiction", k=10)

Recommendations for Movie: Pulp Fiction


Unnamed: 0,cosine score,titles
55,1.0,Pulp Fiction
11,0.907979,"Usual Suspects, The"
54,0.838927,"Professional, The"
97,0.817772,"Silence of the Lambs, The"
88,0.817433,Blade Runner
182,0.807629,Alien
237,0.802529,Raising Arizona
233,0.798982,Jaws
356,0.795394,One Flew Over the Cuckoo's Nest
199,0.791362,"Shining, The"


In [None]:
# View the movies user '0' has already rated, highest ratings first.
(
    movielens_data
    .loc[movielens_data.userId == '0', ['movieId', 'title', 'rating']]
    .sort_values(by='rating', ascending=False)
    .head(20)
)

Unnamed: 0,movieId,title,rating
1478,241,Kolya,5.0
1543,257,Contact,5.0
1646,43,Dolores Claiborne,5.0
1554,177,Angry Men,5.0
1553,227,Star Trek: The Wrath of Khan,5.0
1552,234,Mars Attacks!,5.0
1551,201,Groundhog Day,5.0
1547,47,Hoop Dreams,5.0
1651,189,Henry V,5.0
1545,215,When Harry Met Sally...,5.0
