In [1]:
from pyfm import pylibfm

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
import numpy as np

In [4]:
def loadData(filename,path="/home/kai/data/shiyi/data/ml-100k/"):
    """Parse a tab-separated ratings file into records for a factorization machine.

    Every non-blank line must contain exactly four tab-separated fields:
    ``user``, ``movie``, ``rating`` and ``timestamp``. The timestamp is read
    but not used. Blank lines (for example a trailing empty line at the end
    of the file) are skipped.

    Parameters
    ----------
    filename : str
        Name of the ratings file, e.g. ``"ua.base"``.
    path : str
        Prefix concatenated directly in front of ``filename``; it therefore
        has to end with a path separator.

    Returns
    -------
    tuple
        ``(data, y, users, items)`` where ``data`` is a list with one
        ``{"user_id": str, "movie_id": str}`` dict per rating, ``y`` is a
        NumPy array with the ratings as floats in file order, and ``users`` /
        ``items`` are the sets of distinct user and movie ids (as strings).

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four fields, or if its
        rating field cannot be converted to ``float``.
    """
    data = []
    y = []
    users = set()
    items = set()
    with open(path + filename) as f:
        for lineno, raw_line in enumerate(f, start=1):
            # Remove the line terminator so it does not end up in the last field.
            line = raw_line.rstrip("\r\n")
            # A blank line carries no rating; skipping it avoids an unpack error.
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) != 4:
                raise ValueError(
                    "%s%s, line %d: expected 4 tab-separated fields, got %d"
                    % (path, filename, lineno, len(fields))
                )
            user, movieid, rating, _timestamp = fields
            # The ids are kept as strings so they are treated as categorical labels.
            data.append({"user_id": user, "movie_id": movieid})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

In [5]:
# Parse the training file into feature dicts, ratings and the sets of ids seen.
train_data, y_train, train_users, train_items = loadData("ua.base")

In [8]:
# Peek at the first three training records to check the parsed structure.
train_data[:3]

[{'movie_id': '1', 'user_id': '1'},
 {'movie_id': '2', 'user_id': '1'},
 {'movie_id': '3', 'user_id': '1'}]

In [6]:
# Parse the test file the same way as the training file.
test_data, y_test, test_users, test_items = loadData("ua.test")

In [9]:
# Peek at the first three test records to check the parsed structure.
test_data[:3]

[{'movie_id': '20', 'user_id': '1'},
 {'movie_id': '33', 'user_id': '1'},
 {'movie_id': '61', 'user_id': '1'}]

In [12]:
# Turn each {"user_id": ..., "movie_id": ...} dict into a feature row.
# The vectorizer is fitted on the training records only and then reused for the
# test records, so both matrices are built from the same fitted mapping.
# NOTE(review): ids that occur only in the test file presumably get no feature
# column here — confirm against the DictVectorizer documentation.
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

In [13]:
# Configure the factorization machine as a regressor on the ratings.
# This cell only builds the model; fitting happens in a separate cell.
fm = pylibfm.FM(
    num_factors=10,
    num_iter=100,
    verbose=True,
    task="regression",
    initial_learning_rate=0.001,
    learning_rate_schedule="optimal",
)

In [14]:
# Train the factorization machine on the vectorized training data. Because the
# model was created with verbose=True, it logs the training MSE after each epoch.
fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.59498
-- Epoch 2
Training MSE: 0.51787
-- Epoch 3
Training MSE: 0.49039
-- Epoch 4
Training MSE: 0.47447
-- Epoch 5
Training MSE: 0.46409
-- Epoch 6
Training MSE: 0.45662
-- Epoch 7
Training MSE: 0.45091
-- Epoch 8
Training MSE: 0.44640
-- Epoch 9
Training MSE: 0.44259
-- Epoch 10
Training MSE: 0.43949
-- Epoch 11
Training MSE: 0.43668
-- Epoch 12
Training MSE: 0.43429
-- Epoch 13
Training MSE: 0.43218
-- Epoch 14
Training MSE: 0.43029
-- Epoch 15
Training MSE: 0.42849
-- Epoch 16
Training MSE: 0.42691
-- Epoch 17
Training MSE: 0.42532
-- Epoch 18
Training MSE: 0.42392
-- Epoch 19
Training MSE: 0.42252
-- Epoch 20
Training MSE: 0.42119
-- Epoch 21
Training MSE: 0.42000
-- Epoch 22
Training MSE: 0.41870
-- Epoch 23
Training MSE: 0.41752
-- Epoch 24
Training MSE: 0.41634
-- Epoch 25
Training MSE: 0.41520
-- Epoch 26
Training MSE: 0.41399
-- Epoch 27
Training MSE: 0.41287
-- Epoch 28
Tra

In [15]:
# Score the factorization machine on the test set with mean squared error.
from sklearn.metrics import mean_squared_error

preds = fm.predict(X_test)
print("FM MSE: %.4f" % mean_squared_error(y_test, preds))

FM MSE: 0.8857


In [16]:
import pandas as pd

In [22]:
# The same training records as a DataFrame; both columns hold the ids as strings.
train_df = pd.DataFrame(train_data)

In [24]:
# The same test records as a DataFrame; both columns hold the ids as strings.
test_df = pd.DataFrame(test_data)

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
# Plain linear regression with default settings, used as a baseline for the FM.
lr = LinearRegression()

In [23]:
# Fit the baseline directly on the two raw id columns.
# NOTE(review): user_id / movie_id are categorical labels stored as strings, yet
# they are passed to a linear model as if they were ordinary numeric features
# (unlike the vectorized X_train used for the FM) — confirm this is the intended
# baseline before comparing the two scores.
lr.fit(train_df, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
# Predict test ratings with the linear-regression baseline.
preds_lr = lr.predict(test_df)

In [27]:
# Report the test error of the linear-regression baseline. The label says "LR"
# because this value comes from `preds_lr`, not from the factorization machine
# whose score is printed in an earlier cell.
print("LR MSE: %.4f" % mean_squared_error(y_test,preds_lr))

FM MSE: 1.2285


In [29]:
from lightfm import LightFM

In [30]:
# NOTE(review): `lfm` is not referenced again in the cells visible here; the model
# that is actually trained later is created separately under the name `model`.
# Confirm whether this cell is still needed.
lfm = LightFM(loss='warp')

In [31]:
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Load the MovieLens 100k dataset. Only five-star ratings are treated as
# positive (min_rating=5.0).
data = fetch_movielens(min_rating=5.0)

In [35]:
# Inspect the container returned by fetch_movielens: its size, type and keys.
len(data), type(data), data.keys()

(5,
 dict,
 dict_keys(['train', 'test', 'item_features', 'item_feature_labels', 'item_labels']))

In [34]:
# Show the summary (shape, dtype, stored elements) of the training interactions.
data['train']

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 19048 stored elements in COOrdinate format>

In [37]:
# NOTE(review): wrapping the sparse matrix in np.array() does not densify it; the
# displayed result is an object-dtype array that simply holds the sparse matrix.
# Confirm whether a plain `data['test']` (or an explicit dense conversion) was
# what was intended here.
np.array(data['test'])

array(<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 2153 stored elements in COOrdinate format>, dtype=object)

In [38]:
# Train a LightFM model (loss='warp') on the training interactions
# for 30 epochs using 2 threads.
model = LightFM(loss='warp')
model.fit(data['train'], epochs=30, num_threads=2)

# Evaluate the trained model: precision at k=5 on the test interactions,
# reduced to a single number with .mean().
# NOTE(review): no random seed is passed to LightFM here, so the score
# presumably varies between runs — confirm if reproducibility matters.
test_precision = precision_at_k(model, data['test'], k=5).mean()

In [39]:
# Display the mean precision@5 computed on the test interactions.
test_precision

0.049141344