This notebook uses [pywFM](https://github.com/jfloff/pywFM), a Python wrapper for libFM; see its GitHub repository for installation and usage details.

In [1]:
import pywFM
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix


Convert the ratings DataFrame into a sparse one-hot matrix, which is the input format libFM needs.

In [2]:
def df_to_sparse_split(path, p_test=0.1, n_users=None, n_movies=None, seed=None):
    """
        Rewrites our matrix of user movie association in the one-hot format used by libFM,
        starting from a 2 column csv file with :
        1st column ('Id') : user id and movie id mixed (e.g. "r12_c7": the first character of each
        '_'-separated part is dropped and the rest is read as an integer),
        2nd column ('Prediction') : rating.

        Rows are ordered by (Movie, User). Each row holds exactly two ones: one in the user
        block and one in the movie block. The output matrix will take the form

         Users  |     Movies
        A  B  C | TI  NH  SW  ST
        [1, 0, 0,  1,  0,  0,  0],
        [1, 0, 0,  0,  1,  0,  0],
        [1, 0, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  1,  0],
        [0, 1, 0,  0,  0,  0,  1],
        [0, 0, 1,  1,  0,  0,  0],
        [0, 0, 1,  0,  0,  1,  0]

        target = [5, 3, 1, 4, 5, 1, 5]

        Both returned matrices have the same number of columns, n_users + n_movies.

        @param path : The path of the training/testing data
        @param p_test : The fraction (between 0 and 1) of elements that should be in the TESTING set.
                        0 puts everything in the training set, 1 puts everything in the testing set;
                        in both cases no permutation is applied.
        @param n_users : Width of the user block. Defaults to the highest user id found in the file.
                         Pass it explicitly so that matrices built from different files line up.
        @param n_movies : Width of the movie block. Defaults to the highest movie id found in the file.
        @param seed : Optional seed for the random split. When None the global numpy random
                      state is used, exactly as before.
        @return features_te, target_te : the testing matrix and testing target values
        @return features_tr, target_tr : the training matrix and the training target values
        @raise ValueError : if p_test is outside [0, 1], or n_users / n_movies is smaller than
                            the highest id present in the file
    """
    if not 0 <= p_test <= 1:
        raise ValueError("p_test must lie in [0, 1], got %r" % (p_test,))

    # 1. Loading the DF and formatting it
    df = pd.read_csv(path)
    users = np.array([int(ID.split('_')[0][1:]) for ID in df['Id']], dtype=np.int64)
    movies = np.array([int(ID.split('_')[1][1:]) for ID in df['Id']], dtype=np.int64)
    ratings = np.asarray(df['Prediction']).astype(int)

    # Stable sort by Movie first, then User (lexsort takes the primary key last).
    order = np.lexsort((users, movies))
    user_index = users[order] - 1      # ids are 1-based, columns are 0-based
    movie_number = movies[order]
    ratings = ratings[order]
    n_rows = len(user_index)

    seen_users = int(user_index.max()) + 1 if n_rows else 0
    seen_movies = int(movie_number.max()) if n_rows else 0
    if n_users is None:
        n_users = seen_users
    elif n_users < seen_users:
        raise ValueError("n_users=%d is smaller than the highest user id %d" % (n_users, seen_users))
    if n_movies is None:
        n_movies = seen_movies
    elif n_movies < seen_movies:
        raise ValueError("n_movies=%d is smaller than the highest movie id %d" % (n_movies, seen_movies))

    # Movie columns start right after the user block.
    movie_index = movie_number - 1 + n_users
    n_features = n_users + n_movies

    # 2. Test train split
    # We make no permutation if either everything is a train or a test split.
    if 0 < p_test < 1:
        if seed is None:
            indices = np.random.permutation(n_rows)
        else:
            indices = np.random.RandomState(seed).permutation(n_rows)
        n_test = int(n_rows * p_test)
        idx_te = indices[:n_test]
        idx_tr = indices[n_test:]
    elif p_test == 1:
        idx_te = np.arange(n_rows)
        idx_tr = np.arange(0)
    else:
        idx_te = np.arange(0)
        idx_tr = np.arange(n_rows)

    # 3. Formatting now the way we need to use libFM
    def _one_hot(selected):
        # One row per selected rating, with a 1 in its user column and a 1 in its movie column.
        n_selected = len(selected)
        row_ids = np.arange(n_selected)
        rows = np.r_[row_ids, row_ids]
        cols = np.r_[user_index[selected], movie_index[selected]]
        # The width is the feature count, NOT the number of entries, so that the
        # train and test matrices always share the same columns.
        return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(n_selected, n_features))

    features_te = _one_hot(idx_te)
    target_te = ratings[idx_te]

    features_tr = _one_hot(idx_tr)
    target_tr = ratings[idx_tr]

    return features_te, target_te, features_tr, target_tr

Example run with ALS in libFM (the parameters still need to be tuned).

In [3]:
# ALS: build the pywFM model object for a regression task trained with
# alternating least squares. Assigning to `fm` replaces any earlier `fm`.
fm = pywFM.FM(
    task='regression',
    learning_method='als',
    num_iter=25,
    init_stdev=0.1,
    k2=8,
    r0_regularization=1.5,
    r1_regularization=2,
    r2_regularization=2,
)

Example run with MCMC (the parameters still need to be tuned).

In [11]:
# MCMC: no learning_method is passed here, so pywFM's default is used
# (presumably MCMC, as the heading says -- confirm against the pywFM docs).
# Assigning to `fm` replaces any earlier `fm`.
fm = pywFM.FM(
    task='regression',
    num_iter=50,
    init_stdev=0.1,
)

In [None]:
# Hold out 10% of the training file as a test set (usable for cross validation).
features_te, target_te, features_tr, target_tr = df_to_sparse_split(
    "data_train.csv", 0.1
)

# Fit on the training part and predict the held-out part.
model = fm.run(features_tr, target_tr, features_te, target_te)

# Predictions rounded to the nearest integer.
pred = np.round(model.predictions)
# The fitted model weights are available as well.
weights = model.weights

Real run: train on the full training file and predict the entries of the submission file.

In [12]:
# The whole submission file becomes the test set (p_test=1) and the whole
# training file becomes the training set (p_test=0).
# The 4-tuple is sliced rather than unpacked into `_`, because assigning to `_`
# in IPython overrides its "last output" variable.
# NOTE: each call derives the movie column offset from the highest user id in
# its own file, so both files must contain the same highest user id for the
# columns of the two matrices to line up.
real_te_features, real_te_target = df_to_sparse_split("submission.csv", 1.)[:2]
features_tr, target_tr = df_to_sparse_split("data_train.csv", 0.0)[2:]

model = fm.run(features_tr, target_tr, real_te_features, real_te_target)
pred = model.predictions

In [13]:
# Display the run log of the fitted model: one row per iteration with error
# metrics (rmse, mae), timings and hyper-parameter columns.
model.rlog

Unnamed: 0,rmse,mae,time_pred,time_learn,time_learn2,time_learn4,alpha,rmse_mcmc_this,rmse_mcmc_all,rmse_mcmc_all_but5,...,"vmu[0,3]","vlambda[0,3]","vmu[0,4]","vlambda[0,4]","vmu[0,5]","vlambda[0,5]","vmu[0,6]","vlambda[0,6]","vmu[0,7]","vlambda[0,7]"
0,0.63104,0.468367,,0.94,0.942774,1,0.061761,0.63104,0.63104,2.87937,...,0.00171,99.0547,-0.002162,97.7679,0.001043,100.407,0.000495,98.6531,0.00084,98.9798
1,0.564,0.428446,,0.948,0.946666,1,0.890315,0.529639,0.564,2.87937,...,0.001171,100.057,-0.004145,96.1689,0.000312,102.681,0.000793,97.9615,0.000778,99.0749
2,0.53818,0.412661,,1.0,0.998546,1,0.997997,0.516506,0.53818,2.87937,...,-0.001113,99.0805,-0.004916,95.8775,0.001573,101.932,0.000706,98.1422,-8.9e-05,100.458
3,0.526408,0.405531,,0.984,0.985871,1,1.00737,0.514983,0.526408,2.87937,...,0.000895,98.6135,-0.005058,95.9183,0.001483,97.3909,0.002191,95.6182,0.001702,98.875
4,0.520205,0.401774,,0.92,0.920211,1,1.01252,0.514672,0.520205,1.4047,...,-0.000703,97.6125,-0.005056,94.1877,-0.00104,100.322,0.00398,92.4325,0.000811,98.1542
5,0.516667,0.399602,,0.856,0.856329,1,1.01239,0.514903,0.516667,0.514903,...,0.001143,94.0455,-0.005036,94.3933,-0.000156,100.095,0.003398,91.2027,-9.5e-05,96.8944
6,0.514354,0.398113,,0.996,0.994595,1,1.01408,0.514561,0.514354,0.510208,...,0.000289,94.019,-0.005712,90.2164,-0.001431,102.495,0.00416,92.1695,-0.000233,97.8156
7,0.512799,0.397107,,0.988,0.989197,1,1.01419,0.51462,0.512799,0.508665,...,0.000223,91.3632,-0.007711,88.4683,-0.000499,100.405,0.00371,90.1568,-0.000304,91.7894
8,0.511704,0.396446,,0.94,0.938633,1,1.01306,0.515045,0.511704,0.508014,...,-0.001444,93.1633,-0.007787,89.0879,-0.001105,95.9997,0.003379,91.4066,-0.000365,90.9538
9,0.510863,0.39594,,0.952,0.953596,1,1.01003,0.514765,0.510863,0.50755,...,-0.003046,93.461,-0.008171,92.6813,-0.002563,93.7879,0.004522,89.8223,-0.000109,86.944


In [14]:
def write_submission(submission_data_path, prediction, out_path):
    """
        Writes predictions to a csv file with the columns 'Id' and 'Prediction'.

        The sample submission is read, its rows are sorted by (Movie, User) and the
        predictions are assigned to the sorted rows BY POSITION, so `prediction` must be
        in (Movie, User) order. The file is written in that sorted order, without the index.

        @param submission_data_path : path of the sample submission csv; needs an 'Id' column
                                      (e.g. "r12_c7": the first character of each '_'-separated
                                      part is dropped and the rest is read as an integer)
        @param prediction : one predicted rating per submission row; any array-like
                            (list, numpy array, pandas Series). Values are rounded to the
                            nearest integer before being written.
        @param out_path : path of the csv file to write
        @raise ValueError : if the number of predictions differs from the number of rows
    """
    df = pd.read_csv(submission_data_path)
    df['User'] = [int(ID.split('_')[0][1:]) for ID in df['Id']]
    df['Movie'] = [int(ID.split('_')[1][1:]) for ID in df['Id']]
    df = df.sort_values(['Movie', 'User'], ascending=[True, True])

    # Go through a plain array: a pandas Series would be aligned on index labels
    # (which the sort has shuffled) instead of by position, and a list has no .astype.
    values = np.asarray(prediction, dtype=float).ravel()
    if len(values) != len(df):
        raise ValueError("got %d predictions for %d submission rows" % (len(values), len(df)))

    # Round rather than truncate, so that unrounded input such as 3.9 becomes 4.
    df['Prediction'] = np.rint(values).astype(int)

    df[['Id', 'Prediction']].to_csv(out_path, index=False)
# Round the model predictions to the nearest integer and write the submission file.
pred = np.round(model.predictions)
write_submission(
    submission_data_path="submission.csv",
    prediction=pred,
    out_path="thomas_submission_mcmc_new.csv",
)

In [None]:
# Rebuild the parsed submission table to inspect the predictions next to their ids.
df = pd.read_csv("submission.csv")
df['User'] = [(ID.split('_')[0])[1:] for ID in df['Id']]
df['Movie'] = [(ID.split('_')[1])[1:] for ID in df['Id']]
parsed_df = df[['User', 'Movie', 'Prediction']].astype(int)
parsed_df[['Id']] = df[['Id']]

# `pred` is assigned by position, so the rows must first be put in the order the
# predictions were produced in: df_to_sparse_split and write_submission both sort
# by (Movie, User). Sorting by User alone attaches ratings to the wrong rows.
parsed_df2 = parsed_df.sort_values(['Movie', 'User'], ascending=[True, True])
parsed_df2['Prediction'] = pred.astype(int)

In [None]:
# Show the parsed submission table ordered by movie id (ascending).
parsed_df.sort_values(by='Movie')

In [None]:
# Show the first 50 rows of the table that carries the assigned predictions.
parsed_df2.head(50)

In [None]:
# Show the first 50 rows of the submission file as loaded, for comparison.
df.head(50)