# LibFM - Cross Validation

Here we rewrite the helper functions used to run libFM so that they are compatible with the cross-validation setup used in the rest of the project.

Usual imports first.

In [2]:
import itertools

import numpy as np
import pandas as pd
import pywFM
import scipy.sparse as sp
from scipy.sparse import csr_matrix


## 1. Loading and formatting functions.

In [3]:
def df_load(path):
    """
        Reads a ratings CSV and returns it as a parsed, sorted DataFrame.

        The file must contain an 'Id' column whose values look like
        '<x><user>_<y><movie>' (one prefix character before each number,
        e.g. 'r44_c1') and a 'Prediction' column.

        @param path : The path of the CSV file to load
        @return : a DataFrame with the integer columns 'User', 'Movie',
                  'Prediction' and the original 'Id', sorted by
                  ('Movie', 'User') in ascending order. The original row
                  index is kept.
    """
    raw = pd.read_csv(path)

    # Split every identifier once; piece 0 is the user part, piece 1 the movie part.
    pieces = [identifier.split('_') for identifier in raw['Id']]
    raw['User'] = [piece[0][1:] for piece in pieces]
    raw['Movie'] = [piece[1][1:] for piece in pieces]

    # Integer columns first, then the untouched Id column appended at the end.
    numeric = raw[['User', 'Movie', 'Prediction']].astype(int)
    numeric['Id'] = raw['Id']

    # Deterministic ordering so rows can be matched up again later on.
    return numeric.sort_values(['Movie', 'User'], ascending=[True, True])

In [4]:
def df_to_sparse(df, n_users=None):
    """
        Encodes a parsed ratings DataFrame as the one-hot design matrix libFM
        expects: one row per rating, with a 1 in the column of the user and a 1
        in the column of the movie. The output matrix takes the form

         Users  |     Movies
        A  B  C | TI  NH  SW  ST
        [1, 0, 0,  1,  0,  0,  0],
        [1, 0, 0,  0,  1,  0,  0],
        [1, 0, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  0,  1],
        [0, 0, 1,  1,  0,  0,  0],
        [0, 0, 1,  0,  0,  1,  0]

        target = [5, 3, 1, 4, 5, 1, 5]

        Column layout: user u goes to column u - 1, movie m goes to column
        m + n_users - 1 (ids are treated as 1-based, as in the rest of this
        notebook). The matrix has exactly (largest used column + 1) columns.

        @param df : DataFrame with integer columns 'User', 'Movie', 'Prediction'
        @param n_users : optional number of user columns to reserve. Defaults to
                         the largest user id found in df. Pass the same value for
                         the training and the testing frame to guarantee that
                         movie columns line up even when the two frames do not
                         contain the same largest user id.
        @return features, ratings : the sparse (CSR) matrix and the 1-D array
                                    of target values
        @raise ValueError : if n_users is smaller than the largest user id in df
    """
    # 1. Extracting the info from the input DF.
    # reshape(-1) keeps the arrays 1-D even for a single-row frame
    # (np.squeeze would collapse them to 0-d and break max()/len()).
    user_index = np.asarray(df['User']).reshape(-1) - 1
    movie_ids = np.asarray(df['Movie']).reshape(-1)
    ratings = np.asarray(df['Prediction']).reshape(-1)
    n_rows = len(user_index)

    # Nothing to encode: return an empty matrix instead of failing on max().
    if n_rows == 0:
        return csr_matrix((0, 0 if n_users is None else n_users)), ratings

    seen_users = int(user_index.max()) + 1
    if n_users is None:
        n_users = seen_users
    elif n_users < seen_users:
        raise ValueError("n_users=%d is smaller than the largest user id (%d)"
                         % (n_users, seen_users))

    # Movies are placed right after the user columns.
    movie_index = movie_ids + (n_users - 1)

    # 2. Formatting now the way we need to use libFM: two entries per row.
    col_entries = np.r_[user_index, movie_index]
    row_entries = np.tile(np.arange(n_rows), 2)
    entries = np.ones(2 * n_rows)

    # The width is driven by the largest column actually used, not by the
    # number of rows, so small frames no longer overflow the declared shape.
    n_cols = int(col_entries.max()) + 1
    features = csr_matrix((entries, (row_entries, col_entries)), shape=(n_rows, n_cols))

    return features, ratings

In [5]:
def df_to_sparse_split(df, p_test=0.1, seed=None):
    """
        Encodes a parsed ratings DataFrame as libFM one-hot design matrices and
        splits the rows at random into a testing and a training part. Each
        matrix has one row per rating, with a 1 in the column of the user and a
        1 in the column of the movie:

         Users  |     Movies
        A  B  C | TI  NH  SW  ST
        [1, 0, 0,  1,  0,  0,  0],
        [1, 0, 0,  0,  1,  0,  0],
        [1, 0, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  0,  1],
        [0, 0, 1,  1,  0,  0,  0],
        [0, 0, 1,  0,  0,  1,  0]

        target = [5, 3, 1, 4, 5, 1, 5]

        Column positions are computed on the whole frame before splitting, and
        both matrices share the same number of columns.

        @param df : DataFrame with integer columns 'User', 'Movie', 'Prediction'
        @param p_test : The fraction of rows that goes to the TESTING set.
                        Values strictly between 0 and 1 give a random split,
                        1 puts every row in the testing set, anything else puts
                        every row in the training set (no shuffling in the two
                        last cases).
        @param seed : optional seed for a reproducible split. When None the
                      global numpy random state is used.
        @return features_te, target_te : the testing matrix and testing target values
        @return features_tr, target_tr : the training matrix and the training target values
    """
    # 1. Extracting the info from the input DF (kept 1-D even for one row).
    user_index = np.asarray(df['User']).reshape(-1) - 1
    movie_ids = np.asarray(df['Movie']).reshape(-1)
    ratings = np.asarray(df['Prediction']).reshape(-1)
    n_rows = len(user_index)

    if n_rows > 0:
        # Movies are placed right after the user columns.
        movie_index = movie_ids + user_index.max()
        n_cols = int(max(user_index.max(), movie_index.max())) + 1
    else:
        movie_index = movie_ids
        n_cols = 0

    # 2. Test train split
    # We make no permutation if either everything is a train or a test split.
    nothing = np.arange(0)
    if 0 < p_test < 1:
        if seed is None:
            order = np.random.permutation(np.arange(0, n_rows))
        else:
            order = np.random.RandomState(seed).permutation(np.arange(0, n_rows))
        n_test = int(n_rows * p_test)
        idx_te, idx_tr = order[:n_test], order[n_test:]
    elif p_test == 1:
        idx_te, idx_tr = np.arange(0, n_rows), nothing
    else:
        idx_te, idx_tr = nothing, np.arange(0, n_rows)

    # 3. Formatting now the way we need to use libFM
    def encode(idx):
        # Two entries per selected rating: its user column and its movie column.
        n_part = len(idx)
        rows = np.tile(np.arange(n_part), 2)
        cols = np.r_[user_index[idx], movie_index[idx]]
        matrix = csr_matrix((np.ones(2 * n_part), (rows, cols)), shape=(n_part, n_cols))
        return matrix, ratings[idx]

    features_te, target_te = encode(idx_te)
    features_tr, target_tr = encode(idx_tr)

    return features_te, target_te, features_tr, target_tr

In [6]:
def write_submission(submission_data_path, prediction, out_path):
    """
        Writes predictions to a CSV file with the columns 'Id' and 'Prediction'.

        The Ids are read from an existing CSV (its 'Id' values must look like
        '<x><user>_<y><movie>', e.g. 'r44_c1'), sorted by ('Movie', 'User') in
        ascending order, and the predictions are attached POSITIONALLY to that
        sorted order. This is the same ordering df_load produces.

        @param submission_data_path : The path of the CSV providing the Ids
        @param prediction : array-like of numbers (list, numpy array, Series),
                            one per row, in (Movie, User) order. Values are
                            rounded to the nearest integer before writing.
        @param out_path : The path of the CSV file to write
        @raise ValueError : if the number of predictions does not match the
                            number of rows, or a prediction is NaN/infinite
    """
    df = pd.read_csv(submission_data_path)

    # Split every identifier once; piece 0 is the user part, piece 1 the movie part.
    pieces = [identifier.split('_') for identifier in df['Id']]
    df['User'] = pd.to_numeric([piece[0][1:] for piece in pieces])
    df['Movie'] = pd.to_numeric([piece[1][1:] for piece in pieces])
    df = df.sort_values(['Movie', 'User'], ascending=[True, True])

    # np.asarray accepts plain lists too and drops any pandas index, so the
    # assignment below is always positional.
    values = np.asarray(prediction, dtype=float).reshape(-1)
    if len(values) != len(df):
        raise ValueError("got %d predictions for %d rows" % (len(values), len(df)))
    if not np.all(np.isfinite(values)):
        raise ValueError("predictions contain NaN or infinite values")

    # Round to the nearest integer; a bare int cast would truncate 3.9 to 3.
    df['Prediction'] = np.rint(values).astype(int)

    df[['Id', 'Prediction']].to_csv(out_path, index=False)

## 2. Running methods (proper run and CV functions)


In [7]:
# CSV file that run_ALS and run_MCMC load (through df_load) as their test set.
TEST_PATH = "submission.csv"

### a. Alternating Least Squares (ALS)

In [8]:
def run_ALS(train_df, num_iter=25, std_init = 0.1, rank = 8, r0_reg = 1.5, r1_reg = 2, r2_reg = 2, test_path=None):
    """
        Trains a libFM regression model with the ALS learning method on
        train_df and predicts the rows of the test file.

        @param train_df : parsed training DataFrame (as returned by df_load)
        @param num_iter : number of iterations passed to pywFM
        @param std_init : passed to pywFM as init_stdev
        @param rank : passed to pywFM as k2
        @param r0_reg, r1_reg, r2_reg : passed to pywFM as r0/r1/r2_regularization
        @param test_path : CSV file to predict; defaults to the module-level
                           TEST_PATH (looked up at call time)
        @return pred, error : the model predictions and the entry num_iter-1 of
                              the rmse column logged by pywFM. That error is
                              measured against the 'Prediction' column of the
                              test file, so it is only meaningful if that
                              column holds real ratings.
    """
    if test_path is None:
        test_path = TEST_PATH

    # 1. Defining the model
    fm = pywFM.FM(task='regression', learning_method='als', num_iter=num_iter,
                  init_stdev=std_init, k2=rank, r0_regularization=r0_reg,
                  r1_regularization=r1_reg, r2_regularization=r2_reg)

    # 2. Formatting the data
    # NOTE(review): train and test are encoded independently; the movie column
    # offset depends on the largest user id of each frame, so both frames are
    # assumed to contain the same largest user id - confirm for new data.
    features_tr, target_tr = df_to_sparse(train_df)
    features_te, target_te = df_to_sparse(df_load(test_path))

    # 3. Running the model
    model = fm.run(features_tr, target_tr, features_te, target_te)

    # 4. Outputs
    error = model.rlog.rmse[num_iter-1]
    pred = model.predictions

    print("Error = ",error," (for ALS with ", num_iter, "iterations, std_init =",std_init, ", k=",rank, ", r0_reg=",r0_reg,
         ", r1_reg=",r1_reg,", r2_reg =",r2_reg,")")

    return pred, error

In [9]:
def ALS_CV(features_tr, target_tr, features_te, target_test, num_iter, std_init_vec, rank_vec, r0_reg_vec, r1_reg_vec,r2_reg_vec):
    """
        Grid search over the ALS hyper-parameters of libFM.

        Every combination of the candidate values is trained on the training
        split and scored on the testing split; the combination with the
        smallest error is returned. On ties the first combination tried wins.

        @param features_tr, target_tr : training matrix and training targets
        @param features_te, target_test : testing matrix and testing targets
        @param num_iter : number of iterations passed to pywFM
        @param std_init_vec : candidate values for init_stdev
        @param rank_vec : candidate values for k2
        @param r0_reg_vec, r1_reg_vec, r2_reg_vec : candidate values for the
                                                    r0/r1/r2 regularizations
        @return best_error, best_std, best_rank, best_r0, best_r1, best_r2 :
                the smallest error and the parameters that produced it. If no
                combination is evaluated the error is inf and the parameters 0.
    """
    best_error = float('inf')
    best_params = (0, 0, 0, 0, 0)

    # Same visiting order as five nested loops: the last vector varies fastest.
    grid = itertools.product(std_init_vec, rank_vec, r0_reg_vec, r1_reg_vec, r2_reg_vec)
    for std_init, rank, r0_reg, r1_reg, r2_reg in grid:
        fm = pywFM.FM(task='regression', learning_method='als', num_iter=num_iter,
                      init_stdev=std_init, k2=rank, r0_regularization=r0_reg,
                      r1_regularization=r1_reg, r2_regularization=r2_reg)
        # Score against the targets passed as argument, not a global variable.
        model = fm.run(features_tr, target_tr, features_te, target_test)
        error = model.rlog.rmse[num_iter-1]

        print("Error = ",error," (for ALS with ", num_iter, "iterations, std_init =",
            std_init, ", k=",rank, ", r0_reg=",r0_reg,
             ", r1_reg=",r1_reg,", r2_reg =",r2_reg,")")

        if error < best_error:
            best_error = error
            best_params = (std_init, rank, r0_reg, r1_reg, r2_reg)

    best_std, best_rank, best_r0, best_r1, best_r2 = best_params
    return best_error, best_std, best_rank, best_r0, best_r1, best_r2

### b. MCMC

In [10]:
def run_MCMC(train_df, num_iter=25, std_init = 0.1, test_path=None):
    """
        Trains a libFM regression model on train_df and predicts the rows of
        the test file. No learning_method is passed to pywFM, so its default
        is used (presumably MCMC - confirm against the installed pywFM).

        @param train_df : parsed training DataFrame (as returned by df_load)
        @param num_iter : number of iterations passed to pywFM
        @param std_init : passed to pywFM as init_stdev
        @param test_path : CSV file to predict; defaults to the module-level
                           TEST_PATH (looked up at call time)
        @return pred, error : the model predictions and the entry num_iter-1 of
                              the rmse column logged by pywFM. That error is
                              measured against the 'Prediction' column of the
                              test file, so it is only meaningful if that
                              column holds real ratings.
    """
    if test_path is None:
        test_path = TEST_PATH

    # 1. Defining the model
    fm = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)

    # 2. Formatting the data
    # NOTE(review): train and test are encoded independently; the movie column
    # offset depends on the largest user id of each frame, so both frames are
    # assumed to contain the same largest user id - confirm for new data.
    features_tr, target_tr = df_to_sparse(train_df)
    features_te, target_te = df_to_sparse(df_load(test_path))

    # 3. Running the model
    model = fm.run(features_tr, target_tr, features_te, target_te)

    # 4. Outputs
    error = model.rlog.rmse[num_iter-1]
    pred = model.predictions

    # The message used to say "ALS" although this is the MCMC runner.
    print("Error = ",error," (for MCMC with ", num_iter, "iterations, std_init =",std_init,")")

    return pred, error


In [11]:
def MCMC_CV(features_tr, target_tr, features_te, target_test, num_iter, std_init_vec):
    """
        Searches the best init_stdev for the default libFM learning method
        (no learning_method is passed to pywFM; presumably MCMC - confirm).

        Every candidate is trained on the training split and scored on the
        testing split; the candidate with the smallest error is returned. On
        ties the first candidate tried wins.

        @param features_tr, target_tr : training matrix and training targets
        @param features_te, target_test : testing matrix and testing targets
        @param num_iter : number of iterations passed to pywFM
        @param std_init_vec : candidate values for init_stdev
        @return best_error, best_std : the smallest error and the init_stdev
                that produced it. If no candidate is evaluated the error is
                inf and best_std is 0.
    """
    best_error = float('inf')
    best_std = 0
    for std_init in std_init_vec:
        fm = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)
        # Score against the targets passed as argument, not a global variable.
        model = fm.run(features_tr, target_tr, features_te, target_test)
        error = model.rlog.rmse[num_iter-1]
        print("Error = ",error," (for MCMC with ", num_iter, "iterations, std_init =",std_init,")")
        if error < best_error:
            best_error = error
            best_std = std_init

    # The search result used to be computed and then dropped.
    return best_error, best_std

## 3. Running it all

In [33]:
# Baseline: train ALS with run_ALS's default hyper-parameters on the full training file.
# NOTE(review): `error` is measured against the Prediction column of TEST_PATH -
# confirm that column holds real ratings and not placeholder values.
pred, error =  run_ALS(df_load("data_train.csv"))

Cross-validation: hyper-parameter search on a held-out 10% split of the training data.

In [17]:
# Hold out 10% of the training ratings as a test split, then grid-search the
# ALS hyper-parameters (2 x 1 x 1 x 2 x 2 = 8 combinations) on that split.
features_te, target_te, features_tr, target_tr =  df_to_sparse_split(df_load("data_train.csv"),0.1)
best_error, best_std, best_rank, best_r0, best_r1, best_r2 = ALS_CV(features_tr, target_tr, features_te, target_te,
        num_iter = 40, std_init_vec=[0.375,0.43], rank_vec=[7], r0_reg_vec=[0.5], r1_reg_vec=[15,20],r2_reg_vec=[20,25])

Error =  0.984539  (for ALS with  40 iterations, std_init = 0.375 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 20 )
Error =  0.984155  (for ALS with  40 iterations, std_init = 0.375 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 25 )
Error =  0.983993  (for ALS with  40 iterations, std_init = 0.375 , k= 7 , r0_reg= 0.5 , r1_reg= 20 , r2_reg = 20 )
Error =  0.984125  (for ALS with  40 iterations, std_init = 0.375 , k= 7 , r0_reg= 0.5 , r1_reg= 20 , r2_reg = 25 )
Error =  0.983576  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 20 )
Error =  0.983401  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 25 )
Error =  0.984214  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 20 , r2_reg = 20 )
Error =  0.983691  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 20 , r2_reg = 25 )


In [28]:
# Final ALS run on the full training file, using the hyper-parameters that gave
# the lowest error (0.983401) in the grid search.
# NOTE(review): the recorded output of this cell reports 40 iterations while the
# call asks for num_iter=100 - the output looks stale, re-run to confirm.
pred, error = run_ALS(df_load("data_train.csv"), num_iter=100, std_init = 0.43, rank = 7, r0_reg = 0.5, r1_reg = 15, r2_reg = 25)

Error =  0.681716  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 25 )


In [29]:
# Round the predictions to whole ratings and write them in submission format.
# NOTE(review): the file name encodes "40" iterations and "034" while the run_ALS
# call in this notebook uses num_iter=100 and std_init=0.43 - confirm the intended name.
write_submission("submission.csv", np.round(pred), "out_ALS_40_034_7_05_15_25.csv")

In [25]:
# Quick look at the rounded predictions.
np.round(pred)

array([ 3.,  3.,  4., ...,  3.,  3.,  4.])

In [15]:
# New random 10% hold-out split, then compare several init_stdev values for MCMC_CV.
features_te, target_te, features_tr, target_tr =  df_to_sparse_split(df_load("data_train.csv"),0.1)
MCMC_CV(features_tr, target_tr, features_te, target_te, num_iter = 60, std_init_vec = [0.5,0.8,1,1.5,2,5])

Error =  0.979054  (for MCMC with  60 iterations, std_init = 0.5 )
Error =  0.979325  (for MCMC with  60 iterations, std_init = 0.8 )
Error =  0.978956  (for MCMC with  60 iterations, std_init = 1 )
Error =  0.979788  (for MCMC with  60 iterations, std_init = 1.5 )
Error =  0.986542  (for MCMC with  60 iterations, std_init = 2 )
Error =  1.01712  (for MCMC with  60 iterations, std_init = 5 )


```
features_te, target_te, features_tr, target_tr =  df_to_sparse_split(df_load("data_train.csv"),0.1)
best_error, best_std, best_rank, best_r0, best_r1, best_r2 = ALS_CV(features_tr, target_tr, features_te, target_te, num_iter = 40, std_init_vec=[0.01,0.05,0.1], rank_vec=[7,9], r0_reg_vec=[0.5, 2], r1_reg_vec=[0.5, 2],r2_reg_vec=[0.5, 2])
```
Error =  0.997084  (for ALS with  40 iterations, std_init = 0.1 , k= 7 , r0_reg= 0.5 , r1_reg= 2 , r2_reg = 2 )


```
features_te, target_te, features_tr, target_tr =  df_to_sparse_split(df_load("data_train.csv"),0.1)
best_error, best_std, best_rank, best_r0, best_r1, best_r2 = ALS_CV(features_tr, target_tr, features_te, target_te,
        num_iter = 40, std_init_vec=[0.5,1], rank_vec=[7,8], r0_reg_vec=[0.5, 2], r1_reg_vec=[2,3],r2_reg_vec=[2,3])
```

Error =  0.993506  (for ALS with  40 iterations, std_init = 0.5 , k= 7 , r0_reg= 0.5 , r1_reg= 3 , r2_reg = 3 )

```
best_error, best_std, best_rank, best_r0, best_r1, best_r2 = ALS_CV(features_tr, target_tr, features_te, target_te,
        num_iter = 40, std_init_vec=[0.375,0.43], rank_vec=[7], r0_reg_vec=[0.5], r1_reg_vec=[15,20],r2_reg_vec=[20,25])
```
Error =  0.983401  (for ALS with  40 iterations, std_init = 0.43 , k= 7 , r0_reg= 0.5 , r1_reg= 15 , r2_reg = 25 )

```
features_te, target_te, features_tr, target_tr =  df_to_sparse_split(df_load("data_train.csv"),0.1)
MCMC_CV(features_tr, target_tr, features_te, target_te, num_iter = 60, std_init_vec = [0.5,0.8,1,1.5,2,5])
```

- Error =  0.979054  (for MCMC with  60 iterations, std_init = 0.5 )
- Error =  0.979325  (for MCMC with  60 iterations, std_init = 0.8 )
- Error =  0.978956  (for MCMC with  60 iterations, std_init = 1 )
- Error =  0.979788  (for MCMC with  60 iterations, std_init = 1.5 )
- Error =  0.986542  (for MCMC with  60 iterations, std_init = 2 )
- Error =  1.01712  (for MCMC with  60 iterations, std_init = 5 )
