See the [pywFM GitHub repository](https://github.com/jfloff/pywFM) (a Python wrapper for libFM) for installation and usage details.

In [None]:
import pywFM
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix


Convert the ratings DataFrame into a sparse matrix, which is the input format libFM requires.

In [None]:
def _one_hot_features(user_cols, movie_cols, n_features):
    """
        Builds the libFM design matrix: one row per rating, with a 1 in the column
        of the rating's user and a 1 in the column of the rating's movie.
    """
    n_rows = len(user_cols)
    row_ids = np.arange(0, n_rows)
    # Every row appears twice: once for its user entry and once for its movie entry.
    rows = np.r_[row_ids, row_ids]
    cols = np.r_[user_cols, movie_cols]
    return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(n_rows, n_features))


def df_to_sparse_split(path, p_test=0.1, n_users=None, n_movies=None):
    """
        Reads a 2 column csv file (1st column 'Id' : user id and movie id mixed, e.g. "r3_c7",
        2nd column 'Prediction' : rating) and rewrites it as the sparse one-hot matrix expected by libFM.
        Rows are ordered by movie id, then by user id. For 3 users (A, B, C) and 4 movies the output
        takes the form

         Users  |     Movies
        A  B  C | TI  NH  SW  ST
        [1, 0, 0,  1,  0,  0,  0],
        [0, 0, 1,  1,  0,  0,  0],
        [1, 0, 0,  0,  1,  0,  0],
        [1, 0, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  1,  0],
        [0, 0, 1,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  0,  1]

        target = [5, 1, 3, 1, 4, 5, 5]

        Both returned matrices have n_users + n_movies columns, so the training and the testing
        matrix built from the same file always share the same column layout.

        @param path : The path of the training/testing data
        @param p_test : The fraction (between 0 and 1) of the ratings that goes into the testing set.
                        0 puts everything in the training set, 1 puts everything in the testing set;
                        in both cases the rows are not shuffled.
        @param n_users : Number of user columns. Defaults to the largest user id found in the file.
                         Pass it explicitly to get the same layout for different files.
        @param n_movies : Number of movie columns. Defaults to the largest movie id found in the file.
        @return features_te, target_te : the testing matrix and testing target values
        @return features_tr, target_tr : the training matrix and the training target values
                (returned in the order features_te, target_te, features_tr, target_tr)
        @raise ValueError : if p_test is outside [0, 1] or an id does not fit in n_users / n_movies
    """
    if not 0 <= p_test <= 1:
        raise ValueError("p_test must be between 0 and 1, got %r" % (p_test,))

    # 1. Loading the DF and formatting it
    df = pd.read_csv(path)
    # An Id looks like "<letter><user>_<letter><movie>": split on '_' and drop the leading letter.
    id_parts = df['Id'].str.split('_')
    parsed_df = pd.DataFrame({
        'User': id_parts.str[0].str[1:],
        'Movie': id_parts.str[1].str[1:],
        'Prediction': df['Prediction'],
    }).astype(int)
    parsed_df = parsed_df.sort_values(['Movie', 'User'], ascending=[True, True])

    # to_numpy() keeps the arrays 1-d even for a single rating (np.squeeze would make them 0-d).
    user_index = parsed_df['User'].to_numpy() - 1
    movie_ids = parsed_df['Movie'].to_numpy()
    ratings = parsed_df['Prediction'].to_numpy()
    n_obs = len(ratings)

    if n_users is None:
        n_users = int(user_index.max()) + 1 if n_obs else 0
    if n_movies is None:
        n_movies = int(movie_ids.max()) if n_obs else 0
    if n_obs and (user_index.min() < 0 or movie_ids.min() < 1
                  or user_index.max() >= n_users or movie_ids.max() > n_movies):
        raise ValueError("user/movie ids must lie in 1..n_users / 1..n_movies")

    # Movie columns come right after the user columns.
    movie_index = movie_ids - 1 + n_users
    n_features = n_users + n_movies

    # 2. Test train split
    # We make no permutation if either everything is a train or a test split.
    if 0 < p_test < 1:
        indices = np.random.permutation(np.arange(0, n_obs))
        n_te = int(n_obs * p_test)
        idx_te = indices[:n_te]
        idx_tr = indices[n_te:]
    elif p_test == 1:
        idx_te = np.arange(0, n_obs)
        idx_tr = np.empty(0, dtype=int)
    else:
        idx_te = np.empty(0, dtype=int)
        idx_tr = np.arange(0, n_obs)

    # 3. Formatting now the way we need to use libFM
    # a. Testing set
    features_te = _one_hot_features(user_index[idx_te], movie_index[idx_te], n_features)
    target_te = ratings[idx_te]

    # b. Training set
    features_tr = _one_hot_features(user_index[idx_tr], movie_index[idx_tr], n_features)
    target_tr = ratings[idx_tr]

    return features_te, target_te, features_tr, target_tr

Example run using ALS with libFM (parameters still to be tuned)

In [None]:
# Factorization machine for regression, trained with ALS.
# All hyper-parameters below are starting values that still need tuning.
fm = pywFM.FM(
    task='regression',
    learning_method='als',
    num_iter=5,
    init_stdev=0.1,
    k2=8,
    r0_regularization=1.5,
    r1_regularization=2,
    r2_regularization=2,
)

Example run using MCMC with libFM (parameters still to be tuned)

In [None]:
# mcmc
# No learning_method is passed here, so pywFM's default is used
# (presumably MCMC — confirm against the pywFM documentation).
# NOTE: this rebinds `fm`, replacing the ALS configuration if that cell was run before.
fm = pywFM.FM(
    task='regression',
    num_iter=500,
    init_stdev=1,
)

In [None]:
# Hold out 10% of the training ratings as a validation set (usable for cross validation).
features_te, target_te, features_tr, target_tr = df_to_sparse_split(
    "data_train.csv", p_test=0.1
)

# Fit on the training part and predict the held-out part.
model = fm.run(features_tr, target_tr, features_te, target_te)

# Predictions for the held-out rows, rounded to the nearest whole rating.
pred = np.round(model.predictions)
# The fitted model weights are available as well.
weights = model.weights

In [None]:
# Show the `rmse` entry of the run log for the validation fit
# (presumably the RMSE per iteration — confirm against the pywFM documentation).
model.rlog.rmse

Real run: train on the full training set and predict the entries of the submission file.

In [None]:
# p_test=1 puts every row of the submission file into the "test" outputs (no shuffling).
real_te_features, real_te_target, _, _ = df_to_sparse_split(
    "submission.csv", p_test=1.0
)
# p_test=0 puts every row of the training file into the "train" outputs (no shuffling).
_, _, features_tr, target_tr = df_to_sparse_split(
    "data_train.csv", p_test=0.0
)

# Fit on all training ratings and predict the submission rows.
# NOTE: this rebinds `model` and `pred` from the validation run above.
model = fm.run(features_tr, target_tr, real_te_features, real_te_target)
pred = model.predictions

In [None]:
# Show the full run log of the last fit (the real run above when cells are executed in order).
model.rlog

In [None]:
def write_submission(submission_data_path, prediction, out_path):
    """
        Writes a submission csv with the columns 'Id' and 'Prediction'.

        The rows of the sample submission file are sorted by movie id, then by user id, and the
        predictions are assigned positionally in that order. `prediction` must therefore be ordered
        by (movie, user) as well, which is the row order df_to_sparse_split produces with p_test=1.
        The output file is written in that sorted order.

        @param submission_data_path : path of the sample submission csv (needs an 'Id' column, e.g. "r3_c7")
        @param prediction : array-like (numpy array, list, ...) with one value per submission row;
                            values are cast to int, so round them beforehand if needed
        @param out_path : path of the csv file to write
        @raise ValueError : if the number of predictions differs from the number of submission rows
    """
    df = pd.read_csv(submission_data_path)

    # Accept any array-like, not only numpy arrays (plain lists have no .astype).
    values = np.asarray(prediction).astype(int)
    if len(values) != len(df):
        raise ValueError("got %d predictions for %d submission rows" % (len(values), len(df)))

    # An Id looks like "<letter><user>_<letter><movie>": split on '_' and drop the leading letter.
    id_parts = df['Id'].str.split('_')
    df['User'] = pd.to_numeric(id_parts.str[0].str[1:])
    df['Movie'] = pd.to_numeric(id_parts.str[1].str[1:])

    df = df.sort_values(['Movie', 'User'], ascending=[True, True])
    # Positional assignment: the i-th prediction goes to the i-th row of the sorted frame.
    df['Prediction'] = values

    df[['Id', 'Prediction']].to_csv(out_path, index=False)
# Round the predictions of the last fit to whole ratings, then write the submission file.
pred = np.round(model.predictions)
write_submission(
    submission_data_path="submission.csv",
    prediction=pred,
    out_path="thomas_submission_mcmc_std_1_iter_500.csv",
)

In [None]:
# Sanity check: attach the predictions to the parsed submission rows for inspection.
df = pd.read_csv("submission.csv")
# An Id looks like "<letter><user>_<letter><movie>": split on '_' and drop the leading letter.
df['User'] = [(ID.split('_')[0])[1:] for ID in df['Id']]
df['Movie'] = [(ID.split('_')[1])[1:] for ID in df['Id']]
parsed_df = df[['User', 'Movie', 'Prediction']].astype(int)
parsed_df[['Id']] = df[['Id']]

# `pred` follows the (Movie, User) row order that df_to_sparse_split builds, and the assignment
# below is positional. The frame must be in that same order first; sorting by 'User' alone
# would attach predictions to the wrong Ids.
parsed_df2 = parsed_df.sort_values(['Movie','User'],ascending=[True,True])
parsed_df2['Prediction'] = pred.astype(int)

In [None]:
# Display the parsed submission rows ordered by movie id.
# sort_values returns a new frame; parsed_df itself is left unchanged.
parsed_df.sort_values(['Movie'],ascending=True)

In [None]:
# First 50 rows of the frame that carries the model predictions.
parsed_df2.head(50)

In [None]:
# First 50 rows of the submission file as loaded, plus the parsed User/Movie columns, for comparison.
df.head(50)