In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

import argparse
from collections import deque
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

In [2]:
def fetch_user_data(data_path):
    r"""Reads user data and refine it."""
    df_raw = pd.read_csv(data_path, header=None, names=["User", "Rating", "Date"], usecols=[0, 1, 2])

    tmp_movies = df_raw[df_raw["Rating"].isna()]["User"].reset_index()
    movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

    shifted_movie_indices = deque(movie_indices)
    shifted_movie_indices.rotate(-1)

    user_data = []
    for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
        if df_id_1<df_id_2:
            tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
        else:
            tmp_df = df_raw.loc[df_id_1+1:].copy()
            
        tmp_df["Movie"] = movie_id
        user_data.append(tmp_df)

    df = pd.concat(user_data)
    print(f"Shape of raw User-Ratings: {df.shape}")

    return df

In [3]:
def filter_data(df, movie_rating_threshold=10000, user_rating_threshold=200):
    r"""Discard outliers from data in both movies and users."""
    filter_movies = (df["Movie"].value_counts() > movie_rating_threshold)
    filter_movies = filter_movies[filter_movies].index.tolist()

    filter_users = (df["User"].value_counts() > user_rating_threshold)
    filter_users = filter_users[filter_users].index.tolist()

    df_filtered = df[(df['Movie'].isin(filter_movies)) & (df['User'].isin(filter_users))]

    print(f"Shape of filtered User-Ratings: {df_filtered.shape}")

    return df_filtered

In [4]:
def create_train_test_split(df_filtered, n=100000):
    r"""Splits dataframe into training and test."""
    df_filtered = df_filtered.drop('Date', axis=1).sample(frac=1).reset_index(drop=True)

    # splits data into training and test
    df_train = df_filtered[:-n]
    df_test = df_filtered[-n:]

    return df_train, df_test

In [5]:
# IMPORTANT: not used for DNN approach
def make_data_sparse(df):
    df_p = df.pivot_table(index="User", columns="Movie", values="Rating", fill_value=0)

    print(f"Shape of User-Movie: {df_p.shape}")

    return df_p

In [6]:
def fetch_movie_titles(data_path):
    r"""Reads movie titles and returns dataframe."""
    movie_titles = pd.read_csv(data_path, 
                               encoding="ISO-8859-1", 
                               header=None, 
                               names=["Id", "Year", "Name"]).set_index("Id")

    print(f"Shape of Movie-Titles: {movie_titles.shape}")

    return movie_titles

In [7]:
data_dir = "./data"
data_path = os.path.join(data_dir, "combined_data_1.txt")
# data_path = os.path.join(data_dir, "combined_data_all.txt")

movie_titles = fetch_movie_titles(os.path.join(data_dir, "movie_titles.csv"))

df_raw = fetch_user_data(data_path)
df_filtered = filter_data(df_raw)
df_train, df_test = create_train_test_split(df_filtered)

Shape of Movie-Titles: (17770, 2)
Shape of raw User-Ratings: (24053764, 4)
Shape of filtered User-Ratings: (4178032, 4)


In [8]:
print(f"Raw number of users: {len(df_raw['User'].unique())}")
print(f"Raw number of movies: {len(df_raw['Movie'].unique())}")

Raw number of users: 470758
Raw number of movies: 4499


In [9]:
# will map all IDs to [0, N]
user_id_mapping = {unique_id: i for i, unique_id in enumerate(df_filtered["User"].unique())}
movie_id_mapping = {unique_id: i for i, unique_id in enumerate(df_filtered["Movie"].unique())}

train_user_data = df_train["User"].map(user_id_mapping)
train_movie_data = df_train["Movie"].map(movie_id_mapping)

# same mapping ised for both training and test data
test_user_data = df_test["User"].map(user_id_mapping)
test_movie_data = df_test["Movie"].map(movie_id_mapping)

n_users = len(user_id_mapping)
print(f"Number of users: {n_users}")

n_movies = len(movie_id_mapping)
print(f"Number of movies: {n_movies}")

Number of users: 20828
Number of movies: 491


In [10]:
user_embedding_size = 12
movie_embedding_size = 5

In [11]:
user_id_input = Input(shape=[1], name="user")
movie_id_input = Input(shape=[1], name="movie")

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=user_embedding_size, 
                           input_dim=n_users,
                           input_length=1, 
                           name="user_embedding")(user_id_input)
movie_embedding = Embedding(output_dim=movie_embedding_size, 
                            input_dim=n_movies,
                            input_length=1, 
                            name="item_embedding")(movie_id_input)

# reshaping because embedding's output will have redundant dimension
user_vector = Reshape([user_embedding_size])(user_embedding)
movie_vector = Reshape([movie_embedding_size])(movie_embedding)

# input to model will be embedding of both movie and user
concat_input = Concatenate()([user_vector, movie_vector])

# model is shallow MLP
dense = Dense(256)(concat_input)
y = Dense(1)(dense)

In [12]:
# init model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss="mse", optimizer="adam")

# train model
train_history = model.fit(x=[train_user_data, train_movie_data],
                          y=df_train["Rating"],
                          batch_size=512, 
                          epochs=1,
                          validation_split=0.1,
                          shuffle=True)



In [13]:
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test["Rating"].values

In [14]:
mse = mean_squared_error(y_pred=y_pred, y_true=y_true)
print(f"MSE on test set: {mse}")

rmse = np.sqrt(mse)
print(f"RMSE on test set: {rmse}")

MSE on test set: 0.8221831156594356
RMSE on test set: 0.9067431365383669
