In [3]:
import pandas as pd
import numpy as np
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer

In [4]:
train_dataset = "./datas/data_train.csv"
pred_dataset = "./datas/sampleSubmission.csv"

In [5]:
def load_dataset(path):
    """Load dataset as a (User, Movie, Rating) pandas dataframe"""
    df = pd.read_csv(path)
    parsed_df = pd.DataFrame()
    # Get all pairs of (r44_c1) -> (44, 1) (user, movie)
    user_movie_indices = df.Id.apply(lambda x: x.split('_'))
    parsed_df['User'] =  [int(i[0][1:]) for i in user_movie_indices]
    parsed_df['Movie'] = [int(i[1][1:]) for i in user_movie_indices]
    parsed_df['Rating'] = df['Prediction']
    
    num_items = parsed_df.Movie.nunique()
    num_users = parsed_df.User.nunique()
    print("USERS: {} ITEMS: {}".format(num_users, num_items))
    return parsed_df

In [6]:
train_df = load_dataset(train_dataset)

USERS: 10000 ITEMS: 1000


In [7]:
def split_dataset(parsed_df, p_test=0.1, min_num_ratings=0):
    movies_per_user = parsed_df.User.value_counts()
    users_per_movie = parsed_df.Movie.value_counts()

    valid_users = movies_per_user[movies_per_user > min_num_ratings].index.values
    valid_movies = users_per_movie[users_per_movie > min_num_ratings].index.values
    valid_parsed_df = parsed_df[parsed_df.User.isin(valid_users) & parsed_df.Movie.isin(valid_movies)].reset_index(drop=True)
    
    print("movies per user: min[{a}], max[{b}], users per movie: min[{c}], max[{d}].".
          format(a=movies_per_user.min(), b=movies_per_user.max(), c=users_per_movie.min(), d=users_per_movie.max()))

    size = valid_parsed_df.shape[0]
    indexes = list(range(size))
    np.random.shuffle(indexes)

    test_ind = indexes[:int(size*p_test)]
    train_ind = indexes[int(size*p_test):]

    test = valid_parsed_df.loc[test_ind].reset_index(drop=True)
    train = valid_parsed_df.loc[train_ind].reset_index(drop=True)
    print("The shape of test_dataset: {test}, train_dataset: {train}".format(test=test.shape, train=train.shape))
    
    return train, test

train, test = split_dataset(train_df)
# print(train.iloc[0,2])
# type(train.iloc[0,2])

movies per user: min[3], max[522], users per movie: min[8], max[4590].
The shape of test_dataset: (117695, 3), train_dataset: (1059257, 3)


In [8]:
def compute_rmse(pred, real):
    pred_sorted = pred.sort_values(['Movie', 'User']).reset_index(drop=True)
    real_sorted = real.sort_values(['Movie', 'User']).reset_index(drop=True)

    mse = np.square(pred_sorted.Rating - real_sorted.Rating).mean()
    rmse = np.sqrt(mse)

    return rmse

In [9]:
def toPyFMData(df):
    """Transform pandas dataframe into the dataformat PyFM needs"""
    data = []
    users = set(df.User.unique())
    movies = set(df.Movie.unique())
    ratings = df.Rating.astype(float).tolist()
    for row in df.iterrows():
        data.append({"user_id": str(row[1].User), "movie_id": str(row[1].Movie)})
    return (data, np.array(ratings), users, movies)

In [10]:
(train_data, y_train, train_users, train_items) = toPyFMData(train)
(test_data, y_test, test_users, test_items) = toPyFMData(test)
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

In [11]:
fm = pylibfm.FM(num_factors=20, num_iter=200, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")
fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54478
-- Epoch 2
Training MSE: 0.51169
-- Epoch 3
Training MSE: 0.50290
-- Epoch 4
Training MSE: 0.49799
-- Epoch 5
Training MSE: 0.49478
-- Epoch 6
Training MSE: 0.49248
-- Epoch 7
Training MSE: 0.49073
-- Epoch 8
Training MSE: 0.48928
-- Epoch 9
Training MSE: 0.48798
-- Epoch 10
Training MSE: 0.48684
-- Epoch 11
Training MSE: 0.48573
-- Epoch 12
Training MSE: 0.48469
-- Epoch 13
Training MSE: 0.48357
-- Epoch 14
Training MSE: 0.48248
-- Epoch 15
Training MSE: 0.48131
-- Epoch 16
Training MSE: 0.48005
-- Epoch 17
Training MSE: 0.47884
-- Epoch 18
Training MSE: 0.47743
-- Epoch 19
Training MSE: 0.47609
-- Epoch 20
Training MSE: 0.47462
-- Epoch 21
Training MSE: 0.47317
-- Epoch 22
Training MSE: 0.47165
-- Epoch 23
Training MSE: 0.47017
-- Epoch 24
Training MSE: 0.46874
-- Epoch 25
Training MSE: 0.46731
-- Epoch 26
Training MSE: 0.46590
-- Epoch 27
Training MSE: 0.46464
-- Epoch 28
Tra

In [12]:
preds = fm.predict(X_test)
from sklearn.metrics import mean_squared_error
print("FM MSE: %.4f" % mean_squared_error(y_test,preds))

FM MSE: 0.9756


In [13]:
preds = fm.predict(X_test)
for i in range(len(preds)):
    if preds[i] > 5:
        preds[i] = 5
    elif preds[i] < 1:
        preds[i] = 1
predictions = test.copy()
predictions['Rating'] = preds

rmse = compute_rmse(predictions, test)
print(rmse)

0.9877478639054355


In [6]:
import pandas as pd
import numpy as np
from helpers import *
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold

In [7]:
def pyFM_cv_algo(algo, k_fold=5, verbose=True):
    
    kf = KFold(n_splits=k_fold)
    rmse_ = 0
    
    for trainset_ind, testset_ind in kf.split(train_df):
        
        trainset = train_df.iloc[trainset_ind]
        testset = train_df.iloc[testset_ind]
        
        (train_data, y_train, train_users, train_items) = toPyFMData(trainset)
        (test_data, y_test, test_users, test_items) = toPyFMData(testset)
        v = DictVectorizer()
        X_train = v.fit_transform(train_data)
        X_test = v.transform(test_data)
    
        algo.fit(X_train,y_train)
        preds = algo.predict(X_test)
        for i in range(len(preds)):
            if preds[i] > 5:
                preds[i] = 5
            elif preds[i] < 1:
                preds[i] = 1
        predictions = testset.copy()
        predictions['Rating'] = preds

        rmse_ += compute_rmse(predictions, testset)
        
    rmse_mean = rmse_/k_fold
    return rmse_mean
        
def pyFM_cv(verbose=True, t = Timer()): 
    #pyFM parameters
    factors = np.linspace(20, 200, 9, dtype=np.int64)
    learning_rates = np.logspace(-2, -5, 4)
    params = dict()
    rmses = dict()
    
    for k in factors:
        params['k'] = k
        for rate in learning_rates:
            params['rate'] = rate
            algo = pylibfm.FM(num_factors=k, num_iter=200, verbose=True, task="regression", initial_learning_rate=rate, learning_rate_schedule="optimal")
            rmse = pyFM_cv_algo(algo)
            print("------Time:{}, rmse: {}, factors: {}, learning_rates: {}------\n\n".format(t.now(), rmse, k, rate))
            rmses[rmse] = params
    
    # Find the model with least RMSE
    lowest_rmse = min(rmses.keys())
    best_params = rmses[lowest_rmse]
    
    print("Best pyFM rmse: {}. Params: factors: {}, learning_rates: {}".format(lowest_rmse, best_params['k'], best_params['rate']))

In [None]:
train_dataset = "./datas/data_train.csv"
train_df = load_dataset(train_dataset)

t = Timer()
t.start()
pyFM_cv()
t.stop(verbose=True)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.50892
-- Epoch 2
Training MSE: 0.49226
-- Epoch 3
Training MSE: 0.48633
-- Epoch 4
Training MSE: 0.47756
-- Epoch 5
Training MSE: 0.46932
-- Epoch 6
Training MSE: 0.46355
-- Epoch 7
Training MSE: 0.45967
-- Epoch 8
Training MSE: 0.45701
-- Epoch 9
Training MSE: 0.45415
-- Epoch 10
Training MSE: 0.45137
-- Epoch 11
Training MSE: 0.44841
-- Epoch 12
Training MSE: 0.44768
-- Epoch 13
Training MSE: 0.44580
-- Epoch 14
Training MSE: 0.44467
-- Epoch 15
Training MSE: 0.44405
-- Epoch 16
Training MSE: 0.44272
-- Epoch 17
Training MSE: 0.44264
-- Epoch 18
Training MSE: 0.44253
-- Epoch 19
Training MSE: 0.44248
-- Epoch 20
Training MSE: 0.44198
-- Epoch 21
Training MSE: 0.44090
-- Epoch 22
Training MSE: 0.44163
-- Epoch 23
Training MSE: 0.44088
-- Epoch 24
Training MSE: 0.44080
-- Epoch 25
Training MSE: 0.44010
-- Epoch 26


In [None]:
import pandas as pd
import numpy as np
from helpers import *
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def compute_rmse_test(pred, truth):
    """ compute RMSE for pandas dataframes """
    truth_sorted = truth.sort_values(['User', 'Movie']).reset_index(drop=True)
    pred_sorted = pred.sort_values(['User', 'Movie']).reset_index(drop=True)

    truth_sorted['square_error'] = np.square(truth_sorted['Rating'] - pred_sorted['Rating'])

    mse = truth_sorted['square_error'].mean()
    rmse = np.sqrt(mse)

    return rmse

def pyFM_cv_algo(algo, k_fold=5, verbose=True):
    
    kf = KFold(n_splits=k_fold)
    rmse_ = 0
    
    for trainset_ind, testset_ind in kf.split(train_df):
        
        trainset = train_df.iloc[trainset_ind]
        testset = train_df.iloc[testset_ind]
        
        (train_data, y_train, train_users, train_items) = toPyFMData(trainset)
        (test_data, y_test, test_users, test_items) = toPyFMData(testset)
        v = DictVectorizer()
        X_train = v.fit_transform(train_data)
        X_test = v.transform(test_data)
    
        algo.fit(X_train,y_train)
        preds = algo.predict(X_test)
        for i in range(len(preds)):
            if preds[i] > 5:
                preds[i] = 5
            elif preds[i] < 1:
                preds[i] = 1
        predictions = testset.copy()
        predictions['Rating'] = preds
        print("predictions['Rating']: {}".format(predictions['Rating'].iloc[0]))
        rmse = compute_rmse_test(predictions, testset)
        print("rmse: {}".format(rmse))
        print("FM MSE: %.4f" % mean_squared_error(y_test,preds))
        print(preds)
        rmse_ += rmse
        
    rmse_mean = rmse_/k_fold
    return rmse_mean
    
train_dataset = "./datas/data_train.csv"
train_df = load_dataset(train_dataset)

t = Timer()
t.start()
algo = pylibfm.FM(num_factors=40, num_iter=3, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")
rmse = pyFM_cv_algo(algo)
print("------Time:{}, rmse: {}, factors: {}, learning_rates: {}------\n\n".format(t.now(), rmse,20, 0.001))
t.stop(verbose=True)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.53516
-- Epoch 2
Training MSE: 0.50306
-- Epoch 3
Training MSE: 0.49337
predictions['Rating']: 3.749767021609181
rmse: 1.1636358262542048
FM MSE: 1.3540
[3.74976702 3.93636196 3.52732125 ... 3.29428063 3.47435704 3.26056977]
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.55050
-- Epoch 2
Training MSE: 0.51776
-- Epoch 3
Training MSE: 0.50824
predictions['Rating']: 4.118604735906454
rmse: 1.1109504440837479
FM MSE: 1.2342
[4.11860474 4.26118996 3.74162908 ... 3.4761124  3.4774119  3.50195492]
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54992
-- Epoch 2
Training MSE: 0.51694
-- Epoch 3
Training MSE: 0.50749
predictions['Rating']: 3.4944698156105924
rmse: 1.107797901943358
FM MSE: 1.2272
[3.49446982 3.55253983 3.49709276 ... 4.20767293 4.29337989 4.13934643]
Creating validat

In [None]:
preds