In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and echo its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preprocessing

In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# read_data
# Only the first two comma-separated fields of each line are loaded, named customer_id and rating.
df1 = pd.read_csv(
    "../input/netflix-prize-data/combined_data_1.txt",
    sep=",",
    header=None,
    names=['customer_id', 'rating'],
    usecols=[0, 1],
)
# df2 = pd.read_csv("../input/netflix-prize-data/combined_data_2.txt",sep=",",header=None,names = ['customer_id', 'rating'], usecols = [0,1])
# data = df1.append(df2)
# del df1,df2
# df3 = pd.read_csv("../input/netflix-prize-data/combined_data_3.txt",sep=",",header=None,names = ['customer_id', 'rating'], usecols = [0,1])
# data = data.append(df3)
# del df3
# df4 = pd.read_csv("../input/netflix-prize-data/combined_data_4.txt",sep=",",header=None,names = ['customer_id', 'rating'], usecols = [0,1])
# data = data.append(df4)
# del df4

In [20]:
# fill_in_movie_index
# Rows whose rating is missing are treated as movie headers: their customer_id field holds the
# movie id followed by one trailing character, which is stripped (presumably the ":" of lines
# such as "1:" -- verify against the raw file). The rows after a header are that movie's ratings.
is_movie_header = df1["rating"].isna()
df_movie = df1[is_movie_header]
# Align the stripped ids back onto every row, then carry each id forward over its ratings.
# ffill() replaces the chained `fillna(method="ffill", inplace=True)`, which is deprecated and
# does not write through to the frame under pandas Copy-on-Write.
movie_ids = df_movie["customer_id"].str[:-1].reindex(df1.index).ffill()
data = (
    df1.assign(movie_id=movie_ids)   # assign() returns a new frame, so df1 stays as loaded
    .loc[~is_movie_header]           # keep only the actual rating rows
    .astype(np.uint32)               # convert to int
    .reset_index(drop=True)          # labels == positions once the header rows are gone
)

In [27]:
MAX_MOVIE_ID = 500 # Reduced sample size for easy calculation
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so that label-based and
# positional (iloc) lookups on `data` refer to the same row.
data = data[data["movie_id"]<=MAX_MOVIE_ID].reset_index(drop=True)

# Recommendation Algorithm with Matrix Factorization
The model is fitted by minimizing the following regularized loss function:
$$
\min _{q^{*}, p^{*}} \sum_{(u, i) \in \kappa}\left(r_{u i}-q_{i}^{T} p_{u}\right)^{2}+\lambda\left(\left\|q_{i}\right\|^{2}+\left\|p_{u}\right\|^{2}\right)
$$
$\kappa$ is the set of $(u, i)$ pairs for which $r_{u i}$ is known (the training set). $q_i$ is the factor vector of movie $i$, and $p_u$ is the factor vector of user $u$. The constant $\lambda$ controls the extent of regularization.

In [28]:
def get_loss(data,p,q,learning_rate,customer_dict = None):
    '''
    Regularized squared error of the factorization p, q on the ratings in data.

    data: DataFrame with "customer_id", "movie_id" and "rating" columns
    p: customer factor matrix, one row per customer index
    q: movie factor matrix; the row used for movie_id m is q[m-1]
    learning_rate: weight applied to the squared-norm penalty (it plays the role of
        lambda in the loss formula; the parameter name is kept so existing calls still work)
    customer_dict: mapping customer_id -> row of p; when omitted it is rebuilt with
        recoding_customer_id(data["customer_id"])
    returns: sum of squared rating errors plus learning_rate times the squared norms of the
        first movie_num rows of q and the first customer_num rows of p
    '''
    if customer_dict is None:
        # Previously this name was read from an undefined global and raised NameError.
        customer_dict = recoding_customer_id(data["customer_id"])
    customer_num,movie_num = len(set(data["customer_id"])),len(set(data["movie_id"]))

    # Work on plain positional arrays: the DataFrame's index labels are not row positions
    # once rows have been filtered out, so they must not be fed to iloc.
    users = np.fromiter((customer_dict[c] for c in data["customer_id"].tolist()),
                        dtype=np.intp,count=len(data))
    # Cast to a signed type before subtracting so unsigned movie ids cannot wrap around.
    items = data["movie_id"].to_numpy().astype(np.intp) - 1
    ratings = data["rating"].to_numpy().astype(float)

    loss = 0.0
    chunk = 100_000  # bounds the size of the temporary (chunk, rank) factor slices
    for start in range(0,len(ratings),chunk):
        stop = start + chunk
        # Row-wise dot product p_u . q_i for every rating in the chunk.
        predicted = np.einsum("ij,ij->i",p[users[start:stop]],q[items[start:stop]])
        loss += np.sum((ratings[start:stop] - predicted)**2)
    # add regularization
    loss += learning_rate*(np.sum(q[:movie_num]**2) + np.sum(p[:customer_num]**2))
    return loss

def recoding_customer_id(customer_id):
    '''
    Map every distinct customer id to a dense row index 0..n-1.

    customer_id: iterable of hashable customer ids (duplicates allowed)
    returns: dict {customer id: index}; indices follow the order in which each id first
        appears, so the mapping is reproducible for the same input
    '''
    # dict.fromkeys de-duplicates in a single pass while preserving first-appearance order,
    # unlike iterating a set, whose order is arbitrary.
    return {cid: idx for idx, cid in enumerate(dict.fromkeys(customer_id))}
# Report how many distinct customers and movies are left in the working sample.
print(
    "The data is with {} customers and {} movies.".format(
        *(len(set(data[column])) for column in ("customer_id", "movie_id"))
    )
)

## Stochastic Gradient Descent
>[1] Netflix Update: Try This at Home, Simon Funk, https://sifter.org/~simon/journal/20061211.html

$$
e_{u i} \stackrel{\operatorname{def}}{=} r_{u i}-q_{i}^{T} p_{u}
$$
After computing this prediction error for a training rating, the algorithm modifies the parameters by a magnitude proportional to the learning rate $\gamma$ in the opposite direction of the gradient, yielding:
- $\quad q_{i} \leftarrow q_{i}+\gamma \cdot\left(e_{u i} \cdot p_{u}-\lambda \cdot q_{i}\right)$
- $\quad p_{u} \leftarrow p_{u}+\gamma \cdot\left(e_{u i} \cdot q_{i}-\lambda \cdot p_{u}\right)$

In [29]:
def SGD(data,learning_rate,rank,lamb,iterations,loss = None,seed = 0):
    '''
    Fit the factor matrices p (customers) and q (movies) by stochastic gradient descent.

    data: DataFrame with "customer_id", "movie_id" and "rating" columns; the row used for
        movie_id m is q[m-1], so every movie id must lie in 1..number of distinct movies
    rank: the rank of SVD, factor number inferred from the ratings patterns
    learning_rate: step size gamma of each update, try 0.001 if you don't have a better choice
    lamb: lambda, controls the extent of regularization
    iterations: maximum number of passes over all ratings
    loss: optional threshold; training stops after the first pass whose loss falls below it
    seed: seed for the random initialization of p and q (None for a non-reproducible start)
    returns: (p, q, loss of the returned factors)
    '''
    customer_dict = recoding_customer_id(data["customer_id"])
    customer_num = len(customer_dict)
    movie_num = len(set(data["movie_id"]))

    # Extract positional arrays once: index labels of a filtered frame are not row positions,
    # and per-row iloc lookups are far slower than iterating arrays.
    users = np.fromiter((customer_dict[c] for c in data["customer_id"].tolist()),
                        dtype=np.intp,count=len(data))
    # movie_id = i + 1; cast to a signed type first so unsigned ids cannot wrap around
    items = data["movie_id"].to_numpy().astype(np.intp) - 1
    ratings = data["rating"].to_numpy().astype(float)

    # initialize
    # All-zero factors are a fixed point of the update rule (every step is exactly zero),
    # so start from small random values instead.
    rng = np.random.default_rng(seed)
    p = rng.normal(scale=0.1,size=(customer_num,rank))
    q = rng.normal(scale=0.1,size=(movie_num,rank))

    for _ in range(iterations):
        for u,i,rating in zip(users,items,ratings):
            p_u = p[u]  # views: the in-place additions below write into p and q
            q_i = q[i]
            error = rating - np.dot(p_u,q_i)
            # Compute both steps from the current values before applying either one,
            # so the p_u update does not see the already-updated q_i.
            q_step = learning_rate*(error*p_u - lamb*q_i)
            p_step = learning_rate*(error*q_i - lamb*p_u)
            q_i += q_step
            p_u += p_step
        if loss is not None:
            # The penalty weight of the loss is lamb, not the step size.
            temp_loss = get_loss(data,p,q,lamb)
            if temp_loss < loss:
                return p,q,temp_loss
    temp_loss = get_loss(data,p,q,lamb)
    return p,q,temp_loss

In [None]:
# Train the factor model on the working sample; no early-stopping threshold is supplied.
p, q, temp_loss = SGD(
    data,
    learning_rate=0.001,
    rank=40,
    lamb=1,
    iterations=10,
    loss=None,
)

In [31]:
# Save the working ratings table to the current directory, leaving out the row index.
data.to_csv(path_or_buf="output.csv", index=None)

## Alternating Least Squares