# Factorization Machines scoring

## Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise import SVD, NMF
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

import functions as f
import river
from river import datasets, stream, compose, facto, optim, reco, metrics, preprocessing, feature_extraction
def parse_list(x:str):
    lst = x.strip('][').split('\', ')
    return list(map(lambda x: x.strip('\''), lst))

## Training model
In this section, we trained MF and FM models using River, a python incremeantal learning package, specialize in working with data stream

In [2]:
# Read Data
def parse_list(x:str):
    lst = x.strip('][').split('\', ')
    return list(map(lambda x: x.strip('\''), lst))
def readData(random_state = None, frac = 0.8):
    df = pd.read_csv('data/db_rating.csv', header=0,usecols=[0,2,12], names=['user', 'item', 'rating'])
    r_df = df.sample(frac=frac, random_state=random_state) if random_state else df.head(int(frac* df.shape[0]))
    test = df.drop(r_df.index)
    y = r_df.pop('rating')
    y_test = test.pop('rating')
    return stream.iter_pandas(r_df, y), stream.iter_pandas(test, y_test)


In [3]:

RANDOM_STATE = 5


In [4]:
biased_mf_params = {
    'n_factors': 20,
    'bias_optimizer': optim.SGD(0.025),
    'latent_optimizer': optim.SGD(0.05),
    'weight_initializer': optim.initializers.Zeros(),
    'latent_initializer': optim.initializers.Normal(mu=0., sigma=0.1, seed=73),
    'l2_bias': 0.,
    'l2_latent': 0.
}
def trainBiasedMF():
    print('training MF on seed:', RANDOM_STATE)
    res = []
    model = reco.BiasedMF(**biased_mf_params)
    # model = reco.RandomNormal(seed=42)
    metric = metrics.MAE() + metrics.RMSE()
    X_y, X_y_test = readData(RANDOM_STATE)
    cnt = 0
    for x, y in X_y:
        y_pred = model.predict_one(user=x['user'], item=x['item'])
        
        metric.update(y_pred=y_pred, y_true=y)
        _ = model.learn_one(user=x['user'], item=x['item'], x=x, y=y)
        cnt+= 1 
    print('On training', cnt, metric)
    res.extend(metric.get())
    metric = metrics.MAE() + metrics.RMSE()
    cnt = 0
    for x, y in X_y_test:
        y_pred = model.predict_one(user=x['user'], item=x['item'])
        
        metric.update(y_pred=y_pred, y_true=y)
        cnt+= 1 
    print('On test', cnt, metric)
    res.extend(metric.get())
    return model, res

In [6]:
train_res = pd.DataFrame(columns=['MF_train_MAE', 'MF_train_RMSE', 'MF_test_MAE', 'MF_test_RMSE'])

for RANDOM_STATE in [5,10,15,20,25,30,35,100,200,150,200,500]:
    res = trainBiasedMF()[1]
    train_res.loc[RANDOM_STATE] = res
    
train_res
    

training MF on seed: 5
On training 149759 MAE: 0.620416, RMSE: 0.818568
On test 37440 MAE: 0.607284, RMSE: 0.801555
training MF on seed: 10
On training 149759 MAE: 0.620107, RMSE: 0.818377
On test 37440 MAE: 0.614207, RMSE: 0.805696
training MF on seed: 15
On training 149759 MAE: 0.621516, RMSE: 0.81966
On test 37440 MAE: 0.608094, RMSE: 0.800898
training MF on seed: 20
On training 149759 MAE: 0.620903, RMSE: 0.819001
On test 37440 MAE: 0.60645, RMSE: 0.799268
training MF on seed: 25
On training 149759 MAE: 0.620517, RMSE: 0.817663
On test 37440 MAE: 0.609443, RMSE: 0.80709
training MF on seed: 30
On training 149759 MAE: 0.619244, RMSE: 0.817925
On test 37440 MAE: 0.613147, RMSE: 0.80488
training MF on seed: 35
On training 149759 MAE: 0.619922, RMSE: 0.81843
On test 37440 MAE: 0.608789, RMSE: 0.80126
training MF on seed: 100
On training 149759 MAE: 0.620263, RMSE: 0.818742
On test 37440 MAE: 0.610782, RMSE: 0.80193
training MF on seed: 200
On training 149759 MAE: 0.620487, RMSE: 0.8188

Unnamed: 0,MF_train_MAE,MF_train_RMSE,MF_test_MAE,MF_test_RMSE
5,0.620416,0.818568,0.607284,0.801555
10,0.620107,0.818377,0.614207,0.805696
15,0.621516,0.81966,0.608094,0.800898
20,0.620903,0.819001,0.60645,0.799268
25,0.620517,0.817663,0.609443,0.80709
30,0.619244,0.817925,0.613147,0.80488
35,0.619922,0.81843,0.608789,0.80126
100,0.620263,0.818742,0.610782,0.80193
200,0.620481,0.818785,0.60849,0.800666
150,0.620645,0.818601,0.609083,0.802001


In [6]:
import pickle
model, res= trainBiasedMF()
with open('MF_model.pkl','wb') as f:
    pickle.dump(model,f)
# fm_model, res = trainFFM()
# with open('FM_model.pkl', 'wb') as f:
#     pickle.dump(fm_model, f)

training MF on seed: 10
On training 149759 MAE: 0.618716, RMSE: 0.817656
On test 37440 MAE: 0.612784, RMSE: 0.804262


## Recommendation for user
In this section, we used the trained model to filtered and present a list of recommendation for a specific user
In this example, if the user with id = 193458

In [3]:
df = pd.read_csv('data/ratings_top_w_features.csv', header=0, names=['user','age', 'item', 'rating', 'book_title', 'book_author', 'year_of_publication', 'language', 'categories','country', 'age_group'],dtype={'country': str})
df['categories'] = df['categories'].apply(parse_list)

In [4]:
import pickle
with open('MF_model.pkl', 'rb') as f:
    model = pickle.load(f)
    user_id = 193458
    rated_books = df[df['user'] == user_id]
    rated_books = rated_books[['item', 'book_title', 'categories', 'rating']].drop_duplicates('item')
    # rated_books
    predicted_ratings = []
    for index, row in rated_books.iterrows():
        pred = model.predict_one(user_id, row['item'])
        predicted_ratings.append(pred)
    rated_books['pred_rating'] = predicted_ratings
    rated_books = rated_books.sort_values(by='pred_rating', ascending=False)
rated_books

Unnamed: 0,item,book_title,categories,rating,pred_rating
6227,0064471047,"The Lion, the Witch, and the Wardrobe (The Chr...",[Juvenile Fiction],9,8.981209
23249,0345361792,A Prayer for Owen Meany,[Fiction],10,8.88434
95506,0064471101,The Magician's Nephew (rack) (Narnia),[Juvenile Fiction],9,8.688841
69872,0671880314,Schindler's List,[Fiction],9,8.631567
108469,006447108X,The Last Battle,[Juvenile Fiction],9,8.458833
17518,1853260002,Pride & Prejudice (Wordsworth Classics),[Fiction],10,8.438066
108422,0064471055,Prince Caspian (rack) : The Return to Narnia (...,[Juvenile Fiction],9,8.409073
138327,014011369X,"And the Band Played on: Politics, People, and ...",[History],9,8.356536
6701,0142001740,The Secret Life of Bees,[Fiction],9,8.326005
108447,0064471063,The Horse and His Boy,[Juvenile Fiction],9,8.275317


In [5]:
books = df[['item', 'book_title','categories']].drop_duplicates(subset=['item', 'book_title'])
unrated_books = books[~books['item'].isin(rated_books['item'].tolist())].copy()
print(f'all books: {books.shape[0]}, rated books: {rated_books.shape[0]}, not yet rated books: {unrated_books.shape[0]}')

all books: 63819, rated books: 55, not yet rated books: 63764


First we use Matrix Factorization to filterd through the 63764 not yet rated books, and keep 2000 highest predicted rating

In [7]:
with open('MF_model.pkl', 'rb') as f:
    model = pickle.load(f)
    user_id = 193458
    predicted_ratings = []
    for index, row in unrated_books.iterrows():
        pred = model.predict_one(user_id, row['item'])
        predicted_ratings.append(pred)
    unrated_books['pred_rating'] = predicted_ratings
    unrated_books = unrated_books.sort_values(by='pred_rating', ascending=False)
filtered_books = unrated_books.head(2000).copy()
filtered_books.head(10)

Unnamed: 0,item,book_title,categories,pred_rating
8139,0439139597,Harry Potter and the Goblet of Fire (Book 4),[Juvenile Fiction],9.391017
9330,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),[Juvenile Fiction],9.326653
111120,0439425220,Harry Potter and the Chamber of Secrets Postca...,[Juvenile Fiction],9.176215
8276,043935806X,Harry Potter and the Order of the Phoenix (Boo...,[Juvenile Fiction],9.173875
22345,067168390X,Lonesome Dove,[Fiction],9.169922
1200,0446310786,To Kill a Mockingbird,[Fiction],9.157064
9488,0618002227,The Fellowship of the Ring (The Lord of the Ri...,[Fiction],9.14115
44252,043936213X,Harry Potter and the Sorcerer's Stone (Book 1),[Juvenile Fiction],9.059316
72547,0877017883,Griffin & Sabine: An Extraordinary Correspondence,[Fiction],9.045657
32439,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),[Fiction],9.039317


Then we run Factorization Machine on the 2000 highest predicted rating to get the last recommendation list

In [16]:
with open('FM_model.pkl', 'rb') as f:
    model = pickle.load(f)
    regressor = compose.Select('user') | compose.FuncTransformer(lambda x: {'user': str(x)})
    regressor += (
        compose.Select('item')
    )
    regressor += (
        compose.Select('book_title') | compose.FuncTransformer(lambda x: x['book_title']) | feature_extraction.TFIDF() | preprocessing.Normalizer()
    )
    regressor += (
        compose.Select('age_group')
    )
    regressor += (
        compose.Select('categories') | compose.FuncTransformer(split_categories)
    )

    regressor |= model
    books = df[['item', 'book_title']].drop_duplicates()
    user_id = 193458
    user = df[df['user'] == user_id]
    user = user[['user','age', 'country', 'age_group']].iloc[0].squeeze()
    user_rated = df[df['user'] == user_id]
    user_rated = user_rated[['item', 'book_title', 'rating', 'categories']]
    predicted_ratings = []
    for index, row in filtered_books.iterrows():
        row = pd.concat([row, user])
        pred = regressor.predict_one(row)
        predicted_ratings.append(pred)
    filtered_books['pred_rating'] = predicted_ratings
    filtered_books = filtered_books.sort_values(by='pred_rating', ascending=False)
filtered_books.drop_duplicates('book_title').head(10)

Unnamed: 0,item,book_title,categories,pred_rating
35337,0836220889,Calvin and Hobbes,[Humor],9.528266
86363,0679879269,"The Amber Spyglass (His Dark Materials, Book 3)",[Juvenile Fiction],9.510108
8139,0439139597,Harry Potter and the Goblet of Fire (Book 4),[Juvenile Fiction],9.427033
22345,067168390X,Lonesome Dove,[Fiction],9.407547
19677,0060256672,Where the Sidewalk Ends : Poems and Drawings,[Juvenile Nonfiction],9.377482
1200,0446310786,To Kill a Mockingbird,[Fiction],9.357175
49478,0836218221,The Authoritative Calvin and Hobbes (Calvin an...,[Humor],9.327345
32439,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),[Fiction],9.323133
45211,0064400557,Charlotte's Web (Trophy Newbery),[Juvenile Fiction],9.314458
114920,0310205719,The Purpose-Driven Life: What on Earth Am I He...,[Religion],9.314183


In [22]:
with open('FM_model.pkl', 'rb') as f:
    model = pickle.load(f)
    regressor = compose.Select('user') | compose.FuncTransformer(lambda x: {'user': str(x)})
    regressor += (
        compose.Select('item')
    )
    regressor += (
        compose.Select('book_title') | compose.FuncTransformer(lambda x: x['book_title']) | feature_extraction.TFIDF() | preprocessing.Normalizer()
    )
    regressor += (
        compose.Select('age_group')
    )
    regressor += (
        compose.Select('categories') | compose.FuncTransformer(split_categories)
    )

    regressor |= model
    books = df[['item', 'book_title']].drop_duplicates()
    user_id = 193458
    user = df[df['user'] == user_id]
    user = user[['user','age', 'country', 'age_group']].iloc[0].squeeze()
    print(user)
    user_rated = df[df['user'] == user_id]
    user_rated = user_rated[['item', 'book_title', 'rating', 'categories']]
    predicted_ratings = []
    for index, row in user_rated.iterrows():
        row = pd.concat([row, user])
        pred = regressor.predict_one(row)
        predicted_ratings.append(pred)
    user_rated['pred_rating'] = predicted_ratings
    user_rated = user_rated.sort_values(by='pred_rating', ascending=False)
user_rated

user         193458
age            40.0
country      canada
age_group     30-40
Name: 2500, dtype: object


Unnamed: 0,item,book_title,rating,categories,pred_rating
23249,0345361792,A Prayer for Owen Meany,10,[Fiction],9.230143
6227,0064471047,"The Lion, the Witch, and the Wardrobe (The Chr...",9,[Juvenile Fiction],9.213678
95506,0064471101,The Magician's Nephew (rack) (Narnia),9,[Juvenile Fiction],9.137354
6701,0142001740,The Secret Life of Bees,9,[Fiction],9.016339
17518,1853260002,Pride & Prejudice (Wordsworth Classics),10,[Fiction],8.859538
108422,0064471055,Prince Caspian (rack) : The Return to Narnia (...,9,[Juvenile Fiction],8.77526
11238,1853260622,War and Peace (Wordsworth Classics),5,[Fiction],8.73735
69872,0671880314,Schindler's List,9,[Fiction],8.704941
146306,1853260169,Sense and Sensibility (Wordsworth Classics),10,[Fiction],8.539659
41250,0064471098,The Silver Chair,9,[Juvenile Fiction],8.523407


## Bonus: Implement MF using FM
FM is a generalization of MF, so in theory, we can use FM to reimplementing MF with a similar result

In [17]:
fm_params = {
    'n_factors': 10,
    'weight_optimizer': optim.SGD(0.01),
    'latent_optimizer': optim.SGD(0.02),
    'sample_normalization': False,
    'l1_weight': 0.,
    'l2_weight': 0.,
    'l1_latent': 0.,
    'l2_latent': 0.,
    'intercept': 3,
    'intercept_lr': .01,
    'weight_initializer': optim.initializers.Zeros(),
    'latent_initializer': optim.initializers.Normal(mu=0., sigma=0.1, seed=73),
}

regressor = compose.Select('user')  | compose.FuncTransformer(lambda x: {'user': str(x)})
regressor += (compose.Select('item'))
regressor |= facto.FMRegressor(**fm_params)

model = preprocessing.PredClipper(
    regressor=regressor,
    y_min=1,
    y_max=10
)
metric = metrics.MAE() + metrics.RMSE()
X_y, z = readData()
cnt = 0
for x, y in X_y:
    y_pred = model.predict_one(x=x)
    
    metric.update(y_pred=y_pred, y_true=y)
    _ = model.learn_one(x=x, y=y)
    if cnt % 10000 == 0:
        print(cnt, metric)
    cnt+= 1 
print(metric)
metric = metrics.MAE() + metrics.RMSE()
for x, y in z:
    y_pred = model.predict_one(x=x)
    
    metric.update(y_pred=y_pred, y_true=y)
    if cnt % 10000 == 0:
        print(cnt, metric)
    cnt+= 1 
print(metric)


0 MAE: 6.019074, RMSE: 6.019074
10000 MAE: 1.400258, RMSE: 1.776039
20000 MAE: 1.389467, RMSE: 1.760143
30000 MAE: 1.372615, RMSE: 1.740441
40000 MAE: 1.363295, RMSE: 1.724303
50000 MAE: 1.361456, RMSE: 1.72043
60000 MAE: 1.362864, RMSE: 1.723444
70000 MAE: 1.36103, RMSE: 1.719765
80000 MAE: 1.358217, RMSE: 1.715602
90000 MAE: 1.356265, RMSE: 1.713454
100000 MAE: 1.357406, RMSE: 1.715939
110000 MAE: 1.358356, RMSE: 1.717383
120000 MAE: 1.35715, RMSE: 1.715042
130000 MAE: 1.359383, RMSE: 1.71698
140000 MAE: 1.356906, RMSE: 1.714539
150000 MAE: 1.357568, RMSE: 1.715796
MAE: 1.356625, RMSE: 1.714362
160000 MAE: 1.499413, RMSE: 1.838634
170000 MAE: 1.411787, RMSE: 1.775231
180000 MAE: 1.349974, RMSE: 1.712063
190000 MAE: 1.369604, RMSE: 1.720804
MAE: 1.379612, RMSE: 1.735303
