# Recommendation System using Matrix Factorization in Python
###### The Matrix


### First, we import the libraries we need

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Then we load the datasets
### Here `ratings` and `movies` hold the locations of the ratings and movies CSV files; the loaded tables are stored in `df_ratings` and `df_movies`.

In [3]:
# Locations of the two CSV files (remote copies hosted on GitHub).
ratings = "https://github.com/couturierc/tutorials/raw/master/recommender_system/data/ratings.csv"
movies = "https://github.com/couturierc/tutorials/raw/master/recommender_system/data/movies.csv"

# To read local copies instead, point the two variables at the files on disk:
# ratings="./data/ratings.csv"
# movies="./data/movies.csv"

# Load the ratings table (comma is read_csv's default separator) and give
# its four columns the names used throughout the notebook.
df_ratings = pd.read_csv(ratings)
df_ratings.columns = ['userId', 'itemId', 'rating', 'timestamp']

# Load the movies table the same way and rename its three columns.
df_movies = pd.read_csv(movies)
df_movies.columns = ['itemId', 'title', 'genres']

In [4]:
df_movies.head(10)    # head() returns the first 5 rows by default; passing 10 returns the first 10 rows instead.


Unnamed: 0,itemId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
df_ratings.head(10)      # head() returns the first 5 rows by default; passing 10 returns the first 10 rows instead.

Unnamed: 0,userId,itemId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of df_movies.
# itemId is the only numeric column, and since it is an identifier only its count/min/max are informative.
df_movies.describe()

Unnamed: 0,itemId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column of df_ratings.
# The `rating` column is the one of interest; userId and itemId are identifiers.
df_ratings.describe()

Unnamed: 0,userId,itemId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [9]:
# The first step of collaborative filtering is to build a user-item matrix (a pivot table):
# one row per userId, one column per itemId, and each cell holds the corresponding rating.
# Cells with no rating are NaN; since most user/item pairs are unrated, the matrix is very sparse.
df_user_item = (
    df_ratings
    .pivot(index='userId', columns='itemId', values='rating')
    .sort_index(axis=0)   # order the rows by userId
    .sort_index(axis=1)   # order the columns by itemId
)

# Only the last expression of a cell is rendered automatically, so the preview
# needs an explicit display() call (available in the Jupyter/IPython kernel)
# to be shown together with the summary below.
display(df_user_item.head())

df_user_item.describe()

itemId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
count,215.0,110.0,52.0,7.0,49.0,102.0,54.0,8.0,16.0,132.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
std,0.834859,0.881713,1.054823,0.852168,0.907148,0.817224,0.977561,1.125992,0.974679,0.859381,...,,,,,,,,,,
min,0.5,0.5,0.5,1.0,0.5,1.0,1.0,1.0,1.5,0.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
25%,3.5,3.0,3.0,1.75,3.0,3.125,3.0,2.75,2.875,3.0,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
50%,4.0,3.5,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
75%,4.5,4.0,4.0,3.0,3.5,4.5,4.0,3.0,3.25,4.0,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
max,5.0,5.0,5.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [10]:
def encode_ids(data):
    '''Re-index the user and item ids of a rating dataframe.

    Every distinct userId / itemId is replaced by its rank of first
    appearance, so the encoded ids lie in range(number of unique ids).

    Returns a tuple of:
    - a copy of `data` whose userId and itemId columns hold the encoded ids
      (the input dataframe itself is left untouched)
    - the user mapping, shaped {'userId': {encoded id: original id}}
    - the item mapping, shaped {'itemId': {encoded id: original id}}
    '''
    encoded = data.copy()

    # One row per distinct id, in order of first appearance; the default
    # integer index of each frame is the encoded id of that row's value.
    unique_users = pd.DataFrame(encoded.userId.unique(), columns=['userId'])
    unique_items = pd.DataFrame(encoded.itemId.unique(), columns=['itemId'])

    dict_users = unique_users.to_dict()
    dict_items = unique_items.to_dict()

    # Reverse lookups: original id -> encoded id.
    user_codes = dict(zip(dict_users['userId'].values(), dict_users['userId'].keys()))
    item_codes = dict(zip(dict_items['itemId'].values(), dict_items['itemId'].keys()))

    encoded['userId'] = encoded.userId.map(user_codes)
    encoded['itemId'] = encoded.itemId.map(item_codes)

    return encoded, dict_users, dict_items

 



In [11]:
# Adapted from http://nicolas-hug.com/blog/matrix_facto_4
def SGD(data,           # dataframe containing 1 user|item|rating per row
        n_factors = 10, # number of latent factors per user / item
        alpha = .01,    # learning rate (step size of each SGD update)
        n_epochs = 1,   # number of passes of the SGD procedure over the data
       ):
    '''Learn the vectors P and Q (ie all the weights p_u and q_i) with SGD.

    Parameters:
    - data: dataframe with `userId`, `itemId` and `rating` columns, one
      rating per row. Ids are re-encoded internally with `encode_ids`.
    - n_factors: length of each user / item factor vector.
    - alpha: learning rate applied to every update.
    - n_epochs: number of full passes over the rows of `data`.

    Returns a tuple (p, q) of numpy arrays with shapes
    (number of unique users, n_factors) and (number of unique items, n_factors).
    Row k of p (resp. q) belongs to the user (resp. item) whose encoded id is k,
    i.e. the k-th distinct id in order of first appearance in `data`.

    The factors are drawn from numpy's global random state; call
    np.random.seed(...) beforehand for reproducible results.
    '''

    # Encoding userId's and itemId's in data (the mapping dicts are not needed here)
    encoded, _, _ = encode_ids(data)

    n_users = encoded.userId.nunique()  # number of unique users
    n_items = encoded.itemId.nunique()  # number of unique items

    # Randomly initialize the user and item factors.
    p = np.random.normal(0, .1, (n_users, n_factors))
    q = np.random.normal(0, .1, (n_items, n_factors))

    # Pull the three columns out once as plain arrays: indexing the dataframe
    # row by row inside the loop is orders of magnitude slower, and converting
    # a one-element Series with int()/float() is deprecated in recent pandas.
    user_idx = encoded.userId.to_numpy(dtype=int)
    item_idx = encoded.itemId.to_numpy(dtype=int)
    ratings = encoded.rating.to_numpy(dtype=float)

    # Optimization procedure
    for epoch in range(n_epochs):
        print ('epoch: ', epoch)
        # Loop over the ratings: u / i are positions in p / q (thanks to the encoding)
        for u, i, r_ui in zip(user_idx, item_idx, ratings):
            # difference between the known rating r_ui and the predicted rating (p_u . q_i)
            err = r_ui - np.dot(p[u], q[i])

            # Update vectors p_u and q_i.
            # p[u] is a *view* into p, so it must be copied before p[u] is
            # overwritten; otherwise q_i would be updated with the new p_u.
            p_old = p[u].copy()
            p[u] += alpha * err * q[i]
            q[i] += alpha * err * p_old

    return p, q
    
    
def estimate(u, i, p, q):
    '''Predicted rating of user u for item i.

    u and i are row positions in the factor arrays p (users) and q (items);
    the prediction is the scalar product of the two selected factor vectors.
    '''
    user_factors = p[u]
    item_factors = q[i].transpose()   # no-op for a 1-D vector; kept so the dimensions line up
    return np.dot(user_factors, item_factors)

In [None]:
# Learn the user factors p and item factors q from the full ratings table,
# using SGD's default hyper-parameters (10 factors, alpha = 0.01, 1 epoch).
p, q = SGD(df_ratings)


epoch:  0
