## Setup

In [481]:
# Imports
import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import altair as alt
import sklearn
import sklearn.manifold
import tensorflow as tf
from tensorflow.keras import layers, models
from IPython import display

# Display options
# Show at most 10 rows when a dataframe is displayed
pd.options.display.max_rows = 10
# Render floats with three decimal places
pd.options.display.float_format = "{:.3f}".format
# Use Altair's default data transformer with its row limit disabled (max_rows=None)
alt.data_transformers.enable("default", max_rows=None)


# Returns a filtered dataframe by applying a mask from the result of applying function on column
def mask(df, col, function):
  """Return the rows of df selected by a predicate on one column.

  Args:
    df: dataframe to filter.
    col: name of the column handed to function.
    function: callable that receives df[col] and returns the indexer
      (typically a boolean Series) used to select from df.

  Returns:
    The result of indexing df with function(df[col]).
  """
  selector = function(df[col])
  return df[selector]


# Flattens the columns of the dataframe
def flatten_cols(df):
  """Collapse multi-level column labels into single strings, in place.

  Each tuple label is joined with spaces and stripped, e.g.
  ("rating", "count") -> "rating count" and ("user_id", "") -> "user_id".
  Labels that are not tuples (a dataframe whose columns are already flat)
  are kept as they are; joining a plain string would otherwise split it
  into space-separated characters.

  Args:
    df: dataframe whose columns are rewritten.

  Returns:
    The same dataframe object, to allow method chaining.
  """
  df.columns = [
    " ".join(col).strip() if isinstance(col, tuple) else col
    for col in df.columns.values
  ]
  return df


# Attach the two helpers to pd.DataFrame so they can be used inside method chains.
# NOTE(review): pandas already ships a built-in DataFrame.mask; this assignment
# replaces it for the whole session, so every later `.mask(...)` call in this
# notebook is the row filter defined above, not the pandas original.
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

# For applying recommendation system on user
# Starts as False; set again by the cell that reads 'user_ratings.csv'
USER_RATINGS = False

In [482]:
# MovieLens Dataset
# Directory holding the MovieLens 100K files; defined once so the location
# only has to be changed in a single place.
MOVIELENS_DIR = "movielens_datasets/movielens_100K_ratings"

# Users: pipe-separated, no header row
users_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
    f"{MOVIELENS_DIR}/u.user",
    sep="|",
    names=users_cols,
    encoding="latin-1",
)
# Number of users in the original dataset (before any personal user is appended)
USERS_COUNT = users.shape[0]

# Ratings: tab-separated, no header row
ratings_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    f"{MOVIELENS_DIR}/u.data",
    sep="\t",
    names=ratings_cols,
    encoding="latin-1",
)

# One indicator column per genre, in the order they appear in u.item
genre_cols = [
    "genre_unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
# Movies: pipe-separated, no header row; descriptive columns followed by the genre indicators
movies_cols = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
] + genre_cols
movies = pd.read_csv(
    f"{MOVIELENS_DIR}/u.item",
    sep="|",
    names=movies_cols,
    encoding="latin-1",
)

In [483]:
# Show data type of columns for each dataframe
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
print("Users Dataset Info:")
users.info()
print("Ratings Dataset Info:")
ratings.info()

Users Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB
None
Ratings Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None


In [484]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after the report.
print("Movies Dataset Info:")
movies.info()

Movies Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   imdb_url            1679 non-null   object 
 5   genre_unknown       1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children            1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null 

In [485]:
# Data Preprocessing
def id_shift(x):
    """Convert a 1-based numeric id into its 0-based string form."""
    return str(x - 1)


def _shift_id_column(df, col):
    """Apply id_shift to df[col] in place, at most once.

    The shift is only applied while the column is still numeric. After the
    first run the column holds strings, so re-running this cell leaves the
    ids untouched instead of failing on `str - int`.
    """
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].apply(id_shift)


# Shift ids to start from 0 instead of 1 and convert them to string
_shift_id_column(users, "user_id")
_shift_id_column(ratings, "movie_id")
_shift_id_column(ratings, "user_id")
_shift_id_column(movies, "movie_id")

# Convert ratings to float
ratings["rating"] = ratings["rating"].astype(float)

# Add year column for movies: the text after the last "-" of release_date
# (a missing release_date is stringified first, so its year is "nan")
movies["year"] = movies["release_date"].apply(lambda x: str(x).split("-")[-1])

In [486]:
# Number of movies flagged with each genre: the sum of every genre indicator column,
# stored as a {genre name: count} dictionary
genre_count = movies[genre_cols].sum().to_dict()
print("Genre occurences for movies dataframe:", genre_count)


# Some movies have multiple genres
# Two columns are added to the movies dataframe
# 'all_genres' is a combination of the movie's multiple genres
# 'genre' is a randomly selected genre from the movie's multiple genres
def mark_genres(movies, genres, rng=None):
  """Add 'genre' and 'all_genres' columns to movies, in place.

  For every movie the genres whose indicator column equals 1 are collected.
  'all_genres' is those names joined with "-", and 'genre' is one of them
  picked at random. A movie with no flagged genre gets "Other" in both columns.

  Args:
    movies: dataframe holding one indicator column per entry of genres.
    genres: list of genre column names.
    rng: optional object exposing a numpy-style `choice` method (for example
      np.random.default_rng(seed)) used for the random pick. Pass one to make
      the 'genre' column reproducible. Defaults to numpy's global random state.

  Returns:
    None; movies is modified in place.
  """
  chooser = np.random if rng is None else rng

  random_genre = []
  all_genres = []
  # Single pass over the rows: the list of active genres feeds both columns
  for movie_gs in zip(*[movies[genre] for genre in genres]):
    active = [genre for genre, g in zip(genres, movie_gs) if g == 1]
    if active:
      random_genre.append(chooser.choice(active))
      all_genres.append("-".join(active))
    else:
      random_genre.append("Other")
      all_genres.append("Other")

  movies["genre"] = random_genre
  movies["all_genres"] = all_genres


# Add the 'genre' and 'all_genres' columns to movies (modified in place) and preview them
mark_genres(movies, genre_cols)
print(movies[["title", "genre", "all_genres"]].head())

Genre occurences for movies dataframe: {'genre_unknown': 2, 'Action': 251, 'Adventure': 135, 'Animation': 42, 'Children': 122, 'Comedy': 505, 'Crime': 109, 'Documentary': 50, 'Drama': 725, 'Fantasy': 22, 'Film-Noir': 24, 'Horror': 92, 'Musical': 56, 'Mystery': 61, 'Romance': 247, 'Sci-Fi': 101, 'Thriller': 251, 'War': 71, 'Western': 27}
               title     genre                 all_genres
0   Toy Story (1995)  Children  Animation-Children-Comedy
1   GoldenEye (1995)    Action  Action-Adventure-Thriller
2  Four Rooms (1995)  Thriller                   Thriller
3  Get Shorty (1995)    Comedy        Action-Comedy-Drama
4     Copycat (1995)     Crime       Crime-Drama-Thriller


In [487]:
# Merge three dataframes into one containing all information
# Merge movies dataframe with ratings dataframe based on common column movie_id
# Merge users dataframe with ratings dataframe based on common column user_id
# Both merges use the default inner join, so a rating is kept only when its
# movie_id is present in movies and its user_id is present in users
movielens = ratings.merge(movies, on="movie_id").merge(users, on="user_id")


# Dataframe split function for training and test sets
def dataframe_split(df, test_split=0.1, random_state=None):
    """Randomly split a dataframe into a training and a test set.

    Args:
        df: dataframe to split.
        test_split: fraction of the rows sampled (without replacement) for the test set.
        random_state: optional seed or random state forwarded to DataFrame.sample;
            pass a value to obtain the same split on every run. The default (None)
            keeps the split random.

    Returns:
        A (train, test) tuple. test holds the sampled rows, and train holds every
        row whose index label does not appear in test.
    """
    test = df.sample(frac=test_split, replace=False, random_state=random_state)
    # Rows are matched by index label, so df is expected to have a unique index
    train = df[~df.index.isin(test.index)]
    return train, test

## Explore Data

In [488]:
# Summary statistics of the numeric user columns
# (user_id was converted to string during preprocessing, so only age is summarised)
users.describe()

Unnamed: 0,age
count,943.0
mean,34.052
std,12.193
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


In [489]:
# Summary (count / unique / top / freq) of the object-typed user columns
users.describe(include=object)

Unnamed: 0,user_id,gender,occupation,zip_code
count,943,943,943,943
unique,943,2,21,795
top,0,M,student,55414
freq,1,670,196,9


In [490]:
# The following is used to generate interactive Altair charts.

# Point selection keyed on the occupation field, used to slice the data
occupation_filter = alt.selection_point(fields=["occupation"])
# Bar chart counting users per occupation; bars outside the selection are drawn in light gray
occupation_chart = (
    alt.Chart()
    .mark_bar()
    .encode(
        x="count()",
        y=alt.Y("occupation:N"),
        color=alt.condition(
            occupation_filter,
            alt.Color("occupation:N", scale=alt.Scale(scheme="category20")),
            alt.value("lightgray"),
        ),
    )
    .properties(width=300, height=300)
    .add_params(occupation_filter)
)


# Create a function that generates a layered histogram chart
# This displays the full data in a histogram and the filtered data in another histogram
def filtered_hist(field, label, filter):
    """Build a two-layer histogram of field: filtered data plus the full data.

    Args:
        field: name of the data field binned on the x axis (at most 10 bins).
        label: title shown on the x axis.
        filter: Altair selection / predicate applied to the filtered layer.

    Returns:
        A layered Altair chart whose layers use independent y scales. No data is
        attached here; it is expected to be supplied by the enclosing chart.
    """
    x_axis = alt.X(field, bin=alt.Bin(maxbins=10), title=label)
    histogram = alt.Chart().mark_bar().encode(x=x_axis, y="count()").properties(width=300)

    # Layer order matters: the filtered histogram first, then the full-data
    # histogram in semi-transparent light gray
    selected_layer = histogram.transform_filter(filter)
    overall_layer = histogram.encode(
        color=alt.value("lightgray"),
        opacity=alt.value(0.7),
    )
    return alt.layer(selected_layer, overall_layer).resolve_scale(y="independent")

In [491]:
# Per-user rating statistics joined with the user attributes:
# 'rating count' is the number of ratings a user gave, 'rating mean' their average
users_ratings = (
    ratings.groupby("user_id", as_index=False)
    .agg({"rating": ["count", "mean"]})
    .pipe(flatten_cols)  # ("rating", "count") -> "rating count", etc.
    .merge(users, on="user_id")
)

print(users_ratings.sample(n=5))

    user_id  rating count  rating mean  age gender     occupation zip_code
149     232           110        4.345   38      M       engineer    98682
460     512            22        4.364   43      M  administrator    26241
479      53            65        3.692   22      M      executive    66315
517     564            35        4.543   40      M        student    55422
514     561            72        3.542   54      F  administrator    20879


In [492]:
# Create a chart of three subcharts to visualise the distribution of ratings per user
# The charts displayed showcase the rating count per user, rating mean per user, and the occupation chart
# The occupation chart is interactive, clicking on an occupation will filter the data by that occupation
# Full data is shown in gray and filtered data is superimposed and shown in blue
# Use SHIFT+click to select multiple occupations when filtering the data
# The dataframe is attached once here and shared by the three subcharts
alt.hconcat(
    filtered_hist("rating count", "number of ratings per user", occupation_filter),
    filtered_hist("rating mean", "mean user rating", occupation_filter),
    occupation_chart,
    data=users_ratings,
)

In [493]:
# Per-movie rating statistics joined with the movie attributes:
# 'rating count' is the number of ratings a movie received, 'rating mean' their average
movies_ratings = (
    ratings.groupby("movie_id", as_index=False)
    .agg({"rating": ["count", "mean"]})
    .pipe(flatten_cols)  # ("rating", "count") -> "rating count", etc.
    .merge(movies, on="movie_id")
)

preview_cols = ["movie_id", "rating count", "rating mean", "title", "release_date"]
print(movies_ratings[preview_cols].sample(n=5))

     movie_id  rating count  rating mean                        title  \
353      1315             7        2.714  Horse Whisperer, The (1998)   
56       1048            25        2.520          House Arrest (1996)   
1357      706            70        3.929       Enchanted April (1991)   
256      1228            13        2.154         Poison Ivy II (1995)   
1531      863            86        3.163   My Fellow Americans (1996)   

     release_date  
353   25-Dec-1997  
56    02-Aug-1996  
1357  01-Jan-1991  
256   01-Jan-1995  
1531  20-Dec-1996  


In [494]:
# Create a genre filter to slice the data
genre_filter = alt.selection_point(fields=["genre"])
# Create a genre bar chart using Altair
# The field type is spelled out (":N") on both the y and the color encoding, as in
# occupation_chart: the chart is created without data of its own, so an explicit
# type avoids relying on type inference from data attached elsewhere.
genre_chart = alt.Chart().mark_bar().encode(
    x="count()",
    y=alt.Y("genre:N"),
    color=alt.condition(
        genre_filter,
        alt.Color("genre:N"),
        alt.value("lightgray"),
    ),
).properties(width=300, height=300).add_params(genre_filter)

In [495]:
# Checking the most rated movies: the 10 movies with the highest number of ratings,
# shown with their title, rating count and mean rating
movies_ratings[["title", "rating count", "rating mean"]].sort_values(
    "rating count", ascending=False
).head(10)

Unnamed: 0,title,rating count,rating mean
1116,Star Wars (1977),583,4.358
858,Contact (1997),509,3.804
1671,Fargo (1996),508,4.156
773,Return of the Jedi (1983),507,4.008
898,Liar Liar (1997),485,3.157
889,"English Patient, The (1996)",481,3.657
891,Scream (1996),478,3.441
0,Toy Story (1995),452,3.878
904,Air Force One (1997),431,3.631
225,Independence Day (ID4) (1996),429,3.438


In [496]:
# Checking the highest rated movies with more than 150 ratings
# Note: .mask here is the custom row filter mask(df, col, function) attached to
# pd.DataFrame in the Setup cell, not pandas' built-in DataFrame.mask
movies_ratings[["title", "rating count", "rating mean"]].mask(
    "rating count", lambda x: x > 150
).sort_values("rating mean", ascending=False).head(10)

Unnamed: 0,title,rating count,rating mean
925,Schindler's List (1993),298,4.466
1108,Casablanca (1942),243,4.457
1272,"Shawshank Redemption, The (1994)",283,4.445
1242,Rear Window (1954),209,4.388
113,"Usual Suspects, The (1995)",267,4.386
1116,Star Wars (1977),583,4.358
369,Citizen Kane (1941),198,4.293
1046,To Kill a Mockingbird (1962),219,4.292
968,One Flew Over the Cuckoo's Nest (1975),264,4.292
1649,"Silence of the Lambs, The (1991)",390,4.29


In [497]:
# Create a chart of three subcharts to visualise the distribution of ratings per movie
# The charts displayed showcase the rating count per movie, rating mean per movie, and the genre chart
# The genre chart is interactive, clicking on a genre will filter the data by that genre
# Full data is shown in gray and filtered data is superimposed and shown in blue
# Use SHIFT+click to select multiple genres when filtering the data
# The dataframe is attached once here and shared by the three subcharts
alt.hconcat(
    filtered_hist("rating count", "number of ratings per movie", genre_filter),
    filtered_hist("rating mean", "mean movie rating", genre_filter),
    genre_chart,
    data=movies_ratings,
)

## Preparation
The matrix $A$ is defined as the ratings matrix where $A_{ij}$ is the rating of the $j$-th movie by the $i$-th user.\
The aim is to factorize the matrix $A$ into the product of the matrix $U$ and the matrix $V$ so that $A \approx UV^\top$.\
The matrix $U$ would be the user embedding matrix and $V$ would be the movie embedding matrix.\
Each row $U_{i}$ in matrix $U$ is an embedding vector of dimension $d$ representing user $i$, and each row $V_{j}$ in matrix $V$ is an embedding vector of dimension $d$ representing movie $j$.

In [498]:
# The ratings matrix A will be represented as a sparse representation since each user rates a small number of movies
def rating_sparse_tensor(ratings_df):
    """Build the sparse ratings matrix A from a ratings dataframe.

    Args:
        ratings_df: dataframe with "user_id", "movie_id" and "rating" columns.

    Returns:
        A tf.SparseTensor with one entry per row of ratings_df, placed at
        [user_id, movie_id] and holding the rating. Its dense shape is
        [number of rows in users, number of rows in movies], read from the
        module-level users and movies dataframes at call time.
    """
    # The ids are stored as strings after preprocessing, while SparseTensor
    # indices are int64, so the conversion is made explicit instead of being
    # left to TensorFlow's handling of an object-typed array.
    indices = ratings_df[["user_id", "movie_id"]].values.astype(np.int64)
    values = ratings_df["rating"].values
    return tf.SparseTensor(
        indices=indices,
        values=values,
        dense_shape=[users.shape[0], movies.shape[0]],
    )

In [499]:
# Return the mean squared error between A and the product of U and V
def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
    """Mean squared error between the observed ratings and their predictions.

    Only the entries present in sparse_ratings are evaluated. The prediction for
    an entry (i, j) is the dot product of row i of user_embeddings and row j of
    movie_embeddings.

    Args:
        sparse_ratings: sparse tensor whose indices are (user, movie) pairs and
            whose values are the ratings.
        user_embeddings: user embedding matrix, one row per user.
        movie_embeddings: movie embedding matrix, one row per movie.

    Returns:
        The result of tf.losses.mean_squared_error on the observed ratings and
        the predicted ratings.
    """
    user_rows = tf.gather(user_embeddings, sparse_ratings.indices[:, 0])
    movie_rows = tf.gather(movie_embeddings, sparse_ratings.indices[:, 1])
    # Row-wise dot product of the paired user / movie embeddings
    predictions = tf.reduce_sum(user_rows * movie_rows, axis=1)
    return tf.losses.mean_squared_error(sparse_ratings.values, predictions)

In [508]:
# Enter personal ratings of some movies
# Fill in the csv file 'user_ratings.csv' with personal ratings
# This generates movie recommendations based on the inputted movie ratings
user_ratings = pd.read_csv(
    "user_ratings.csv",
    names=["movie_title", "rating"],
    encoding="latin-1",
)

# Perform data cleaning on user rating column
# Non-numeric entries become NaN; anything outside the 0-5 scale (NaN included)
# is then mapped to 0, i.e. "not rated".
# The cleaned column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection may act on a temporary copy and leave the frame unchanged.
numeric_rating = pd.to_numeric(user_ratings["rating"], errors="coerce")
user_ratings["rating"] = numeric_rating.where(numeric_rating.between(0, 5), 0.0)

# Keep only the movies that were actually rated; .copy() makes the result an
# independent frame so the column assignments below do not write into a slice
user_ratings = user_ratings[user_ratings["rating"] > 0].copy()

USER_RATINGS = not user_ratings.empty

# The personal user gets the first id after the original users (ids are 0-based strings)
new_user_id = str(USERS_COUNT)
if USER_RATINGS:
    user_ratings["user_id"] = new_user_id
    # NOTE(review): the row position in the csv is used as the movie id, which
    # assumes the file lists every movie in movie_id order and has no header row
    # - confirm against the actual 'user_ratings.csv'
    user_ratings["movie_id"] = user_ratings.index.astype(str)
    user_ratings["rating"] = user_ratings["rating"].astype(float)
    print(user_ratings) # Show personal ratings of movies
    # Remove previous personal ratings
    ratings = ratings[ratings.user_id != new_user_id]
    # Add new personal ratings
    ratings = pd.concat([ratings, user_ratings[["user_id", "movie_id", "rating"]]], ignore_index=True)
    # Add new user to users dataframe (only once, even if this cell is re-run)
    if users.shape[0] == USERS_COUNT:
        # Duplicate the last row in users and change its id to add new user.
        # Selecting with a list keeps a one-row dataframe, so the column dtypes
        # of users are preserved by the concat.
        new_user = users.iloc[[USERS_COUNT - 1]].assign(user_id=new_user_id)
        users = pd.concat([users, new_user], ignore_index=True)
    number_of_ratings = len(user_ratings)
    text = "ratings" if number_of_ratings > 1 else "rating"
    print(f"Added your {number_of_ratings} {text}!")

## Model Training
A Collaborative Filtering model class (CFModel) is created to train a matrix factorization model.
This is achieved using stochastic gradient descent.