<a href="https://colab.research.google.com/github/yannboun/collaborativefiltering/blob/master/Colaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install implicit
!wget -O ml-1m.zip http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip

import pandas as pd
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
from scipy.sparse import coo_matrix
import implicit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error


movie_df = pd.read_csv("ml-1m/movies.dat", sep="::",names=["MovieID","movie","type"])
ratings_df = pd.read_csv("ml-1m/ratings.dat", sep="::",names=["UserID","MovieID","Rating","Time"])

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/5a/d8/6b4f1374ffa2647b72ac76960c71b984c6f3238090359fb419d03827d87a/implicit-0.4.2.tar.gz (1.1MB)
[K     |▎                               | 10kB 18.8MB/s eta 0:00:01[K     |▋                               | 20kB 3.0MB/s eta 0:00:01[K     |▉                               | 30kB 4.4MB/s eta 0:00:01[K     |█▏                              | 40kB 2.9MB/s eta 0:00:01[K     |█▌                              | 51kB 3.5MB/s eta 0:00:01[K     |█▊                              | 61kB 4.2MB/s eta 0:00:01[K     |██                              | 71kB 4.9MB/s eta 0:00:01[K     |██▍                             | 81kB 5.5MB/s eta 0:00:01[K     |██▋                             | 92kB 6.1MB/s eta 0:00:01[K     |███                             | 102kB 4.7MB/s eta 0:00:01[K     |███▎                            | 112kB 4.7MB/s eta 0:00:01[K     |███▌                            | 122kB 4.7MB/s eta 0:00:01[

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [0]:
## Start indexing at 0
ratings_df['UserID'] = ratings_df['UserID'] - 1
ratings_df["MovieID"] = ratings_df["MovieID"] - 1
movie_df["MovieID"] = movie_df["MovieID"] - 1

## Create dictionary to easily match movie ids to titles
movie_dict = movie_df.set_index("MovieID")[["movie"]].to_dict(orient="index")

## Set some ratings aside for testing

ratings_df_train, ratings_df_test = train_test_split(ratings_df, test_size=0.1, random_state=41)


In [0]:
usermovie_df = ratings_df_train.pivot(index='UserID', columns='MovieID', values='Rating').reindex(range(0,ratings_df["UserID"].max()), fill_value=0).transpose().reindex(range(0,ratings_df["MovieID"].max()), fill_value=0).transpose()
usermovie_df = usermovie_df.transpose().fillna(usermovie_df.mean(axis=1)).transpose()
users = usermovie_df.index.tolist()
movies = usermovie_df.columns.tolist()
usermovie_df = usermovie_df.values



## First approach:
By creating a simple neural network with tensorflow, we find an optimal series of encoders and decoders: e1, e2, d1, d2 such as:<br>

>$\overrightarrow{X}*ReLU(\overrightarrow{e_1})*\sigma(\overrightarrow{e_2})*ReLU
(\overrightarrow{d_1})*\sigma(\overrightarrow{d_2}) = \overrightarrow{X'}  \approx \overrightarrow{X}$

*Credits to Susan Li* :  https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-tensorflow-82e63d27b420

* Blanks are filled with user's average ratings instead of 0
* Used ReLU activations on first layers

In [0]:
num_input = ratings_df["MovieID"].max()
num_hidden_1 = 20
num_hidden_2 = 7


X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

def encoder(x):
    layer_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
y_pred = decoder_op
y_true = X

In [50]:

loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
eval_x = tf.placeholder(tf.int32, ) 
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()

with tf.Session() as session:
    epochs = 100
    batch_size = 35

    session.run(init)
    session.run(local_init)

    num_batches = int(usermovie_df.shape[0] / batch_size)
    train_df = np.array_split(usermovie_df//5, num_batches)
    
    for i in range(epochs):

        avg_cost = 0
        for batch in train_df:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        if (i+1) % 5 == 0 : print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    train_df = np.concatenate(train_df, axis=0)

    preds = session.run(decoder_op, feed_dict={X: train_df})*5

    pred_data = pred_data.append(pd.DataFrame(preds))

    pred_data = pred_data.stack().reset_index(name='Rating')
    pred_data.columns = ['UserID', 'MovieID', 'Rating']
    pred_data['UserID'] = pred_data['UserID'].map(lambda value: users[value])
    pred_data['MovieID'] = pred_data['MovieID'].map(lambda value: movies[value])
    
    keys = ['UserID', 'MovieID']
    index_1 = pred_data.set_index(keys).index
    index_2 = ratings_df_train.set_index(keys).index

    top_ten_ranked = pred_data[~index_1.isin(index_2)]
    top_ten_ranked = top_ten_ranked.sort_values(['UserID', 'Rating'], ascending=[True, False])
    top_ten_ranked = top_ten_ranked.groupby('UserID').head(10)

epoch: 5 Loss: 0.0085481125834333
epoch: 10 Loss: 0.008183790831068573
epoch: 15 Loss: 0.008107344850053102
epoch: 20 Loss: 0.008074943089944333
epoch: 25 Loss: 0.00805722699002471
epoch: 30 Loss: 0.008046128343097692
epoch: 35 Loss: 0.0080385553504362
epoch: 40 Loss: 0.008033075380723837
epoch: 45 Loss: 0.008028935687075
epoch: 50 Loss: 0.008025704227864396
epoch: 55 Loss: 0.008023115538255594
epoch: 60 Loss: 0.008020997856916902
epoch: 65 Loss: 0.008019235166504459
epoch: 70 Loss: 0.008017746607101587
epoch: 75 Loss: 0.00801647357036208
epoch: 80 Loss: 0.008015373302082162
epoch: 85 Loss: 0.008014413530851692
epoch: 90 Loss: 0.008013569267986471
epoch: 95 Loss: 0.008012821258933739
epoch: 100 Loss: 0.00801215430472566


# Second Approach:
Matrix factorisation via Alternate Least Square (GPU Optimized):

We alternately solve the following expression for U and for M :

>$\overrightarrow{U}\overrightarrow{M} \approx \overrightarrow{X} 
,where \\\overrightarrow{U} is\ a\ user\ (n_{users}*n_{hiddenfeatures})\ matrix, 
\\\overrightarrow{M} is\ a\ movie\ (n_{hiddenfeatures}*n_{movies})\ matrix,
$


In [51]:


item_user_data = coo_matrix((ratings_df_train["Rating"], (ratings_df_train["MovieID"],ratings_df_train["UserID"])), shape=(ratings_df_train["MovieID"].max()+1, ratings_df_train["UserID"].max()+1))
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=128, use_gpu = True)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

user_items = item_user_data.T.tocsr()


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [52]:
##Simple function to find a movie index number
def getKeysByValue(valueToFind):
    listOfKeys = list()
    listOfItems = movie_dict.items()
    for item  in listOfItems:
        if valueToFind.lower() in item[1]["movie"].lower():
            listOfKeys.append((item[0],item[1]["movie"]))
    return  listOfKeys

getKeysByValue("matrix")

[(2570, 'Matrix, The (1999)')]

In [53]:
# With this approach, we are able to find similar movies
i=2570
print(movie_dict[i])
[movie_dict[x] for (x, y) in model.similar_items(i)]

{'movie': 'Matrix, The (1999)'}


[{'movie': 'Matrix, The (1999)'},
 {'movie': 'Terminator 2: Judgment Day (1991)'},
 {'movie': 'Total Recall (1990)'},
 {'movie': 'Fugitive, The (1993)'},
 {'movie': 'Terminator, The (1984)'},
 {'movie': 'Face/Off (1997)'},
 {'movie': 'Fifth Element, The (1997)'},
 {'movie': 'Twelve Monkeys (1995)'},
 {'movie': 'Jurassic Park (1993)'},
 {'movie': 'Men in Black (1997)'}]

# Comparing results

In [68]:
#Let's choose a random user numer
usernumber = 1234
results = pd.DataFrame()
results["watched"] = [x["movie"] for x in ratings_df_train.loc[ratings_df_train['UserID'] == usernumber].sort_values(by=['Rating'], ascending=False)["MovieID"].head(10).map(movie_dict)]
results["NeuralNet"] = [x["movie"] for x in top_ten_ranked.loc[top_ten_ranked['UserID'] == usernumber]["MovieID"].head(10).map(movie_dict)]
results["MatrixFac"] = [movie_dict[x]["movie"] for (x, y) in model.recommend(usernumber, user_items, filter_already_liked_items =True)]
results

Unnamed: 0,watched,NeuralNet,MatrixFac
0,Life Is Beautiful (La Vita � bella) (1997),Star Wars: Episode IV - A New Hope (1977),Shine (1996)
1,Titanic (1997),American Beauty (1999),Scent of a Woman (1992)
2,"Cider House Rules, The (1999)","Godfather, The (1972)",Jerry Maguire (1996)
3,Sense and Sensibility (1995),Star Wars: Episode V - The Empire Strikes Back...,"Piano, The (1993)"
4,"Joy Luck Club, The (1993)",Raiders of the Lost Ark (1981),Fried Green Tomatoes (1991)
5,Good Will Hunting (1997),"Sixth Sense, The (1999)",Awakenings (1990)
6,"Secret Garden, The (1993)",Schindler's List (1993),Schindler's List (1993)
7,"Shawshank Redemption, The (1994)","Matrix, The (1999)",Leaving Las Vegas (1995)
8,Hope Floats (1998),"Princess Bride, The (1987)","Green Mile, The (1999)"
9,"Firm, The (1993)","Usual Suspects, The (1995)","American President, The (1995)"


Results are quite different. Let's examine the accuracy using the test set


In [0]:
df_compare = ratings_df_test[["UserID","MovieID","Rating"]].merge(pred_data, how="left", right_on = ["UserID","MovieID"], left_on = ["UserID","MovieID"], suffixes=("","_NN"))

In [0]:

def get_als_score(user, movie):
  return model.rank_items(user,user_items,[movie])[0][1]

df_compare["Rating_MF"] = df_compare.apply(lambda x: get_als_score(int(x["UserID"]),int(x["MovieID"])), axis=1)

## Since we will simply recomend top movie, 
## we are not really interested in the actual absolute store,
## but rather by the relative scores, in that optic, it makes sense to standatdize

scaler = StandardScaler()
df_compare["Rating_standard"]= scaler.fit_transform(df_compare["Rating"].values.reshape(-1,1))
df_compare["Rating_MF_standard"]= scaler.fit_transform(df_compare["Rating_MF"].values.reshape(-1,1))
df_compare["Rating_NN_standard"]= scaler.fit_transform(df_compare["Rating_NN"].values.reshape(-1,1))

df_compare=df_compare.dropna()

In [58]:
print("Neural Network MAE: {}".format(mean_absolute_error(df_compare["Rating_standard"], df_compare["Rating_NN_standard"])))
print("Matrix Factorization MAE: {}".format(mean_absolute_error(df_compare["Rating_standard"], df_compare["Rating_MF_standard"])))

Neural Network MAE: 0.9162304069335659
Matrix Factorization MAE: 1.0391789241501572


Both Absolute Errors are similar and around 1, meaning that on average our predictions are one standard deviation away from the real rating. We see below that the standard deviation of the original ratings (on a 1 to 5 scale) is also around 1. 
<br><br>
On this iteration, the Neural Network performed slightly better, but a k-fold CV would be necessary to take any conclusion



In [60]:
df_compare["Rating"].std()

1.113562891901298