In [1]:
import pandas as pd
import numpy as np
from helpers import *
from sklearn.linear_model import Ridge

In [11]:
# num_users, num_movies = len(rating),len(rating.columns)

# user_features = pd.DataFrame(np.random.rand(num_users,num_features),index=range(1,num_users+1),columns=range(1,num_features+1))
# movie_features = pd.DataFrame(np.random.rand(num_movies,num_features),index=range(1,num_movies+1),columns=range(1,num_features+1))

# user_ft_data = train_df.set_index('Movie').join(Z).sort_values('User').set_index('User')
# movie_ft_data = train_df.set_index('User').join(W).sort_values('Movie').set_index('Movie')

# user_ft_data.loc[1,:].head(1)

# user_ft_data.loc[1,user_ft_data.columns!='Rating']

In [24]:
def compute_rmse_rr(pred, rating):
    err = rating - pred
    mse = err**2
    return np.sqrt(np.mean(mse.mean(skipna=True)))

In [18]:
def update_user_features(W,Z,train_df,lambda_,num_users):
    user_ft_data = train_df.set_index('Movie').join(Z).sort_values('User').set_index('User')
    model = Ridge(fit_intercept=False,alpha = lambda_)
    
    for i in range(1,num_users+1):
        X = user_ft_data.loc[i,user_ft_data.columns!='Rating']
        y = user_ft_data.loc[i,'Rating']
        model.fit(X,y)
        W.loc[i,:] = model.coef_
    return W


def update_movie_features(W,Z,train_df,lambda_,num_movies):
    movie_ft_data = train_df.set_index('User').join(W).sort_values('Movie').set_index('Movie')
    model = Ridge(fit_intercept=False,alpha = lambda_)
    
    for i in range(1,num_movies+1):
        X = movie_ft_data.loc[i,movie_ft_data.columns!='Rating']
        y = movie_ft_data.loc[i,'Rating']
        model.fit(X,y)
        Z.loc[i,:] = model.coef_
    return Z

In [26]:
def MF_RR(train_tr, rating,num_features,lambda_,iterations=20):
    """Alternating Least Squares (ALS) algorithm."""
    # define parameters
    stop_criterion = 1e-5
    change = 1
    error_list = [0, 0]
    it = 0
 
    num_users, num_movies = len(rating),len(rating.columns)
    # init matrix
    user_features = pd.DataFrame(np.random.rand(num_users,num_features),index=range(1,num_users+1),columns=range(1,num_features+1))
    movie_features = pd.DataFrame(np.random.rand(num_movies,num_features),index=range(1,num_movies+1),columns=range(1,num_features+1))
    
    train_rmse = 0
    W = user_features.copy()
    Z = movie_features.copy()
    # start ALS
    while(it < iterations):
        W = update_user_features(W,Z,train_tr,lambda_,num_users)
        
        Z = update_movie_features(W,Z,train_tr,lambda_,num_movies)
        pred = W.dot(Z.T)
        pred[pred > 5] = 5
        pred[pred < 1] = 1
        train_rmse = compute_rmse_rr(pred,rating)
        print("MF-RR training RMSE : {err}".format(err=train_rmse))
        error_list.append(train_rmse)
        change = np.fabs(error_list[-1] - error_list[-2])
        if (change < stop_criterion):
            print("Converge!")
            break;
            
        it += 1
        
#     print("MF-RR Final training RMSE : {err}".format(err=train_rmse))
    return W,Z

In [57]:
def mf_rr_algo(model,train_df,test_df):
    
    num_features = 20
    lambda_ = 19
    
    rating = train_df.pivot(index="User",columns="Movie",values="Rating")
    
    user_features,movie_features = MF_RR(train_df,rating,num_features,lambda_,iterations=20)
    
    pred = user_features.dot(movie_features.T)
    pred[pred > 5] = 5
    pred[pred < 1] = 1
    
    test_user = test_df['User'].values
    test_movie = test_df['Movie'].values
    
    pred_test =[]
    for user,movie in zip(test_user,test_movie):
        pred_rating = pred.loc[user,movie]
        pred_test.append(pred_rating)
        
    pred_test = np.asarray(pred_test)

    test_ret = test_df.copy()
    test_ret.drop(columns='Rating',inplace=True)
    test_ret['Rating'] = pred_test
    prediction = test_ret.copy()
#     prediction = create_csv_submission(test_ret)
    
    return prediction

## Test MFRR algo

In [54]:
train_dataset = "./data/data_train.csv"
test_dataset = "./data/sampleSubmission.csv"

train_df = load_dataset(train_dataset)
test_df = load_dataset(test_dataset)

[load_dataset] Valid: (1176952, 3)
[load_dataset] Valid: (1176952, 3)


In [55]:
train_tr, test_tr = split_dataset(train_df,p_test=0.5)

[split_dataset] Valid: (1176952, 3)


In [58]:
prediction = mf_rr_algo("",train_tr,test_tr)

MF-RR training RMSE : 0.9888417362091013
MF-RR training RMSE : 0.9756442891414643
MF-RR training RMSE : 0.9648709136415702
MF-RR training RMSE : 0.9582898528767403
MF-RR training RMSE : 0.9539443183324987
MF-RR training RMSE : 0.95109905813012
MF-RR training RMSE : 0.9493085565853918
MF-RR training RMSE : 0.9482580792515908
MF-RR training RMSE : 0.9477215880370957
MF-RR training RMSE : 0.9475373528892748
MF-RR training RMSE : 0.9475919504153082
MF-RR training RMSE : 0.9478056930300627
MF-RR training RMSE : 0.9481229623670366
MF-RR training RMSE : 0.9485047767673412
MF-RR training RMSE : 0.9489231249984413
MF-RR training RMSE : 0.9493588556197821
MF-RR training RMSE : 0.9497977406294663
MF-RR training RMSE : 0.9502297963659893
MF-RR training RMSE : 0.9506478322941487
MF-RR training RMSE : 0.9510469817429116


In [60]:
prediction.head()

Unnamed: 0,User,Movie,Rating
0,2135,741,3.845605
1,6850,357,2.125287
2,4393,440,3.244223
3,365,42,3.354287
4,9660,546,3.76417


## Debug for each

In [15]:
train_dataset = "./data/data_train.csv"
test_dataset = "./data/sampleSubmission.csv"

train_df = load_dataset(train_dataset)
test_df = load_dataset(test_dataset)
train_tr, test_tr = split_dataset(train_df)
# train_tr = train_tr.pivot(index="User",columns="Movie",values="Rating").fillna(0)
# test_tr = test_tr.pivot(index="User",columns="Movie",values="Rating").fillna(0)

[load_dataset] Valid: (1176952, 3)
[load_dataset] Valid: (1176952, 3)
[split_dataset] Valid: (1176952, 3)


In [16]:
rating = train_df.pivot(index="User",columns="Movie",values="Rating")

In [27]:
num_features = 20
lambda_ = 19
user_features,movie_features = MF_RR(train_tr,rating,num_features,lambda_,iterations=20)

MF-RR training RMSE : 0.9962715663977991
MF-RR training RMSE : 0.9821503744810715
MF-RR training RMSE : 0.9715191976415806
MF-RR training RMSE : 0.9648483388734127
MF-RR training RMSE : 0.9609606277479182
MF-RR training RMSE : 0.958657229973747
MF-RR training RMSE : 0.9572381437650561
MF-RR training RMSE : 0.9563431546319633
MF-RR training RMSE : 0.9557697728835788
MF-RR training RMSE : 0.9554186642932773
MF-RR training RMSE : 0.9552258745111935
MF-RR training RMSE : 0.9551474234591418
MF-RR training RMSE : 0.9551523752923804
Converge!


In [28]:
pred = user_features.dot(movie_features.T)
pred[pred > 5] = 5
pred[pred < 1] = 1

In [31]:
test_user = test_tr['User'].values
test_movie = test_tr['Movie'].values

In [43]:
pred_test =[]
for user,movie in zip(test_user,test_movie):
    pred_rating = pred.loc[user,movie]
    pred_test.append(pred_rating)
pred_test = np.asarray(pred_test)
pred_test[pred_test > 5] = 5
pred_test[pred_test < 1] = 1
test_ret = test_tr.copy()
test_ret['Pred'] = pred_test

In [45]:
err = test_ret['Rating']-test_ret['Pred']
mse = err**2
rmse = np.sqrt(mse.mean())

In [46]:
rmse

1.0082348576201594