# Factorization Machines scoring

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise import SVD, NMF
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

import functions as f
import river
from river import datasets, stream, compose, facto, optim, reco, metrics, preprocessing, feature_extraction, dummy, stats
def parse_list(x: str):
    """Convert the text form of a list of single-quoted strings into a list.

    The surrounding square brackets are stripped, the remainder is cut at
    every ``', `` (closing quote, comma, space) and quotes left on either end
    of each piece are removed.  Note that ``'[]'`` yields ``['']``.
    """
    body = x.strip('][')
    return [piece.strip('\'') for piece in body.split('\', ')]

## Training model
In this section, we train MF and FM models using River, a Python incremental-learning package that specializes in working with data streams.

In [2]:
# Read Data
def parse_list(x: str):
    """Parse a stringified list of single-quoted items back into a list.

    Brackets are stripped from both ends, the text is split on ``', `` and
    any remaining quote characters are trimmed from each item.  An empty
    list literal (``'[]'``) comes back as ``['']``.
    """
    pieces = x.strip('][').split('\', ')
    return [item.strip('\'') for item in pieces]
def readData(random_state=None, frac=0.8):
    """Load the ratings CSV and split it into a training and a test stream.

    Parameters
    ----------
    random_state : int or None
        Seed for a shuffled split (``DataFrame.sample``).  When ``None`` the
        rows are not shuffled and the first ``frac`` share of the file is
        used for training.
    frac : float
        Share of the rows that goes into the training part.

    Returns
    -------
    tuple
        ``(train_stream, test_stream)``, each built with
        ``stream.iter_pandas`` from the features and the ``rating`` column.
    """
    columns = [
        'user', 'age', 'item', 'rating', 'book_title', 'book_author',
        'year_of_publication', 'language', 'categories', 'country', 'age_group',
    ]
    df = pd.read_csv('data/ratings_top_w_features.csv', header=0, names=columns)
    df['categories'] = df['categories'].apply(parse_list)
    df['country'] = df['country'].apply(str)
    # Compress the rating scale: r -> (r + 1) // 2.
    df['rating'] = (df['rating'] + 1) // 2

    # Compare against None explicitly: a seed of 0 is a valid seed and must
    # still select the shuffled split instead of the head split.
    if random_state is not None:
        r_df = df.sample(frac=frac, random_state=random_state)
    else:
        r_df = df.head(int(frac * df.shape[0]))

    # Everything that was not picked for training becomes the test part.
    test = df.drop(r_df.index)
    y = r_df.pop('rating')
    y_test = test.pop('rating')
    return stream.iter_pandas(r_df, y), stream.iter_pandas(test, y_test)


In [3]:
# Inverse-frequency weights for book categories and for languages.
df = pd.read_csv(
    'data/ratings_top_w_features.csv',
    header=0,
    names=[
        'user', 'age', 'item', 'rating', 'book_title', 'book_author',
        'year_of_publication', 'language', 'categories', 'country', 'age_group',
    ],
    dtype={'country': str},
)
df['categories'] = df['categories'].apply(parse_list)

# One row per (rating, category) pair, then the number of rows per category.
category_counts = df.explode('categories').groupby('categories')['categories'].count()
# Rare categories get the larger weight; dividing by the total makes the
# weights sum to one.
weights = 1 / category_counts
total_weights = weights.sum()
normalized_weights = weights / total_weights

# Same scheme for the language column.
languages_counts = df.groupby('language')['language'].count()
l_weights = 1 / languages_counts
total_l_weights = l_weights.sum()
normalized_l_weights = l_weights / total_l_weights

# Cell output: both count tables, most frequent first.
category_counts.sort_values(ascending=False), languages_counts.sort_values(ascending=False)

(categories
 Fiction                        135429
 Juvenile Fiction                15062
 Biography & Autobiography        9437
 Humor                            3934
 History                          3354
 Religion                         2981
 Body, Mind & Spirit              2169
 Juvenile Nonfiction              2071
 Social Science                   2059
 Business & Economics             1902
 Family & Relationships           1811
 Self-Help                        1742
 Health & Fitness                 1597
 Cooking                          1377
 Travel                           1230
 Poetry                           1069
 True Crime                       1007
 Psychology                       1002
 Science                           974
 Computers                         945
 Literary Criticism                933
 Drama                             896
 Political Science                 862
 Nature                            821
 Comics & Graphic Novels           816
 Philosophy  

In [4]:

# Seed the training functions pass to readData() for the train/test split.
# The experiment loops further down rebind this same global to try other seeds.
RANDOM_STATE = 5

In [12]:
# Keyword arguments unpacked into reco.BiasedMF by trainBiasedMF().
biased_mf_params = {
    'n_factors': 20,
    # Separate SGD learning rates for the bias terms and the latent vectors.
    'bias_optimizer': optim.SGD(0.025),
    'latent_optimizer': optim.SGD(0.05),
    'weight_initializer': optim.initializers.Zeros(),
    'latent_initializer': optim.initializers.Normal(mu=0.0, sigma=0.1, seed=73),
    # L2 regularisation is switched off for both parts.
    'l2_bias': 0.0,
    'l2_latent': 0.0,
}
def trainBiasedMF():
    """Fit a ``reco.BiasedMF`` model on the training stream and score it.

    The split seed is read from the module-level ``RANDOM_STATE``.  During
    training every rating is predicted *before* the model learns from it,
    and those predictions feed the training metrics.  The test stream is
    only predicted, never learned from.

    Returns
    -------
    tuple
        ``(model, scores)``; ``scores`` lists the values of the combined
        MAE + RMSE metric for the training pass followed by the values for
        the test pass.
    """
    print('training MF on seed:', RANDOM_STATE)
    model = reco.BiasedMF(**biased_mf_params)
    train_metric = metrics.MAE() + metrics.RMSE()
    train_stream, test_stream = readData(RANDOM_STATE)

    seen = 0
    for features, rating in train_stream:
        user, item = features['user'], features['item']
        estimate = model.predict_one(user=user, item=item)
        train_metric.update(y_pred=estimate, y_true=rating)
        model.learn_one(user=user, item=item, x=features, y=rating)
        seen += 1
    print('On training', seen, train_metric)
    scores = list(train_metric.get())

    # Fresh metric so the test scores are not mixed with the training ones.
    test_metric = metrics.MAE() + metrics.RMSE()
    seen = 0
    for features, rating in test_stream:
        estimate = model.predict_one(user=features['user'], item=features['item'])
        test_metric.update(y_pred=estimate, y_true=rating)
        seen += 1
    print('On test', seen, test_metric)
    scores.extend(test_metric.get())
    return model, scores

In [20]:
# NOTE: trainBaseline() builds its model without these settings; nothing in
# the visible cells reads this dict.
baseline_params = {
    'optimizer': optim.SGD(0.025),
    'l2': 0.0,
    'initializer': optim.initializers.Zeros(),
}

def trainBaseline():
    """Score a ``dummy.StatisticRegressor(stats.Mean())`` baseline.

    Uses the same split (seeded by the module-level ``RANDOM_STATE``) and the
    same predict-then-learn protocol as the other training functions, so the
    numbers are directly comparable.  The test stream is only predicted.

    Returns
    -------
    tuple
        ``(model, res)``; ``res`` lists the values of the combined
        MAE + RMSE metric for the training pass followed by the values for
        the test pass.
    """
    # The label names the baseline so the log is not confused with MF runs.
    print('training baseline on seed:', RANDOM_STATE)
    res = []
    model = dummy.StatisticRegressor(stats.Mean())
    metric = metrics.MAE() + metrics.RMSE()
    X_y, X_y_test = readData(RANDOM_STATE)

    cnt = 0
    for x, y in X_y:
        # Predict first, then learn, so each sample is scored before it is seen.
        y_pred = model.predict_one(x)
        metric.update(y_pred=y_pred, y_true=y)
        model.learn_one(x, y=y)
        cnt += 1
    print('On training', cnt, metric)
    res.extend(metric.get())

    # Fresh metric for the held-out part.
    metric = metrics.MAE() + metrics.RMSE()
    cnt = 0
    for x, y in X_y_test:
        y_pred = model.predict_one(x)
        metric.update(y_pred=y_pred, y_true=y)
        cnt += 1
    print('On test', cnt, metric)
    res.extend(metric.get())
    return model, res

In [6]:
# Keyword arguments unpacked into facto.FFMRegressor by trainFFM().
ffm_params = {
    'n_factors': 10,
    # Separate SGD learning rates for the linear weights and the latent vectors.
    'weight_optimizer': optim.SGD(0.01),
    'latent_optimizer': optim.SGD(0.02),
    'intercept': 3,
    'latent_initializer': optim.initializers.Normal(mu=0.0, sigma=0.05, seed=73),
}

def split_categories(x):
    """Expand ``x['categories']`` into one weighted feature per category.

    Each category ``c`` becomes the key ``'categorie_<c>'`` whose value is
    looked up in the module-level ``normalized_weights``.
    """
    features = {}
    for name in x['categories']:
        features[f'categorie_{name}'] = normalized_weights[name]
    return features
def weight_language(x):
    """Return a single ``'lang_<language>'`` feature for ``x['language']``.

    The value is looked up in the module-level ``normalized_l_weights``.
    None of the pipelines in the visible cells uses this helper.
    """
    language = x['language']
    key = f'lang_{language}'
    return {key: normalized_l_weights[language]}
def trainFFM():
    """Fit a ``facto.FFMRegressor`` behind a feature pipeline and score it.

    Features are the union of: user, item, TF-IDF of the book title
    (normalised), age group, and the weighted category indicators from
    ``split_categories``.  The split seed is the module-level
    ``RANDOM_STATE``.  Training is predict-then-learn; the test stream is
    only predicted.

    Returns
    -------
    tuple
        ``(model, scores)``; ``model`` is the bare ``FFMRegressor`` (the
        transformers in front of it are NOT part of the return value) and
        ``scores`` lists the combined MAE + RMSE values for the training
        pass followed by those for the test pass.
    """
    print('training FM on seed:', RANDOM_STATE)

    # NOTE(review): `x` below appears to be the selected mapping (compare the
    # book_title branch, which indexes it), so str(x) stringifies that whole
    # mapping rather than just the id. The inference cells build the same
    # feature, so it must stay in sync with them.
    regressor = compose.Select('user') | compose.FuncTransformer(lambda x: {'user': str(x)})
    regressor += compose.Select('item')
    regressor += (
        compose.Select('book_title')
        | compose.FuncTransformer(lambda x: x['book_title'])
        | feature_extraction.TFIDF()
        | preprocessing.Normalizer()
    )
    regressor += compose.Select('age_group')
    regressor += (
        compose.Select('categories')
        | compose.FuncTransformer(split_categories)
    )

    model = facto.FFMRegressor(**ffm_params)
    regressor |= model

    train_stream, test_stream = readData(RANDOM_STATE)

    train_metric = metrics.MAE() + metrics.RMSE()
    seen = 0
    for features, rating in train_stream:
        estimate = regressor.predict_one(x=features)
        train_metric.update(y_pred=estimate, y_true=rating)
        regressor.learn_one(x=features, y=rating)
        seen += 1
    print('On training', seen, train_metric)
    scores = list(train_metric.get())

    # Fresh metric so the test scores are not mixed with the training ones.
    test_metric = metrics.MAE() + metrics.RMSE()
    seen = 0
    for features, rating in test_stream:
        estimate = regressor.predict_one(x=features)
        test_metric.update(y_pred=estimate, y_true=rating)
        seen += 1
    print('On test', seen, test_metric)
    scores.extend(test_metric.get())
    return model, scores

In [15]:
# Repeat the MF experiment over several split seeds, one result row per seed.
# The loop variable is the global RANDOM_STATE on purpose: trainBiasedMF()
# reads it.  To record FM scores as well, extend `res` with trainFFM()[1] and
# add the four matching 'FM_*' columns.
train_res = pd.DataFrame(
    columns=['MF_train_MAE', 'MF_train_RMSE', 'MF_test_MAE', 'MF_test_RMSE'],
)

for RANDOM_STATE in range(5, 31, 5):
    res = trainBiasedMF()[1]
    train_res.loc[RANDOM_STATE] = res

train_res
    

training MF on seed: 5
On training 159273 MAE: 0.621114, RMSE: 0.819217
On test 39818 MAE: 0.607907, RMSE: 0.798725
training MF on seed: 10
On training 159273 MAE: 0.620504, RMSE: 0.818346
On test 39818 MAE: 0.606297, RMSE: 0.800271
training MF on seed: 15
On training 159273 MAE: 0.619495, RMSE: 0.818175
On test 39818 MAE: 0.60809, RMSE: 0.800701
training MF on seed: 20


KeyboardInterrupt: 

In [19]:
# Same table layout as the MF experiment, filled with baseline scores.
# NOTE: the columns keep their 'MF_' prefix although the rows come from
# trainBaseline().
train_res = pd.DataFrame(
    columns=['MF_train_MAE', 'MF_train_RMSE', 'MF_test_MAE', 'MF_test_RMSE'],
)

for RANDOM_STATE in (5,):
    res = trainBaseline()[1]
    train_res.loc[RANDOM_STATE] = res

train_res
    

training MF on seed: 5
On training 159273 MAE: 0.6583, RMSE: 0.875538
On test 39818 MAE: 0.65208, RMSE: 0.871775


Unnamed: 0,MF_train_MAE,MF_train_RMSE,MF_test_MAE,MF_test_RMSE
5,0.6583,0.875538,0.65208,0.871775


In [15]:
import pickle

# Train both models with the current RANDOM_STATE and persist them.
# The file handles are deliberately not named `f`: that name is the alias of
# the imported `functions` module and must not be rebound.
model, res = trainBiasedMF()
with open('MF_model.pkl', 'wb') as mf_file:
    pickle.dump(model, mf_file)

# trainFFM() returns only the FFMRegressor, so that is what gets pickled.
fm_model, res = trainFFM()
with open('FM_model.pkl', 'wb') as fm_file:
    pickle.dump(fm_model, fm_file)

training MF on seed: 5
On training 159273 MAE: 1.329124, RMSE: 1.697562
On test 39818 MAE: 1.280593, RMSE: 1.649141
training FM on seed: 5
On training 159273 MAE: 1.328087, RMSE: 1.699
On test 39818 MAE: 1.270415, RMSE: 1.635319


## Recommendation for user
In this section, we use the trained models to filter and present a list of recommendations for a specific user.
In this example, the user is the one with id = 193458.

In [69]:
# Reload the full ratings table for the recommendation demo.
df = pd.read_csv(
    'data/ratings_top_w_features.csv',
    header=0,
    names=[
        'user', 'age', 'item', 'rating', 'book_title', 'book_author',
        'year_of_publication', 'language', 'categories', 'country', 'age_group',
    ],
    dtype={'country': str},
)
# The categories column is stored as text; turn it back into lists.
df['categories'] = df['categories'].apply(parse_list)

In [92]:
# Sanity check: MF predictions for the books this user has already rated.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files you produced yourself.
# The handle is not named `f` (alias of the `functions` module) and the file
# is closed as soon as the model is loaded.
with open('MF_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

user_id = 193458
rated_books = df[df['user'] == user_id]
rated_books = rated_books[['item', 'book_title', 'categories', 'rating']].drop_duplicates('item')

# One prediction per rated item, in row order.
predicted_ratings = [
    model.predict_one(user_id, item) for item in rated_books['item']
]
rated_books['pred_rating'] = predicted_ratings
rated_books = rated_books.sort_values(by='pred_rating', ascending=False)
rated_books

Unnamed: 0,item,book_title,categories,rating,pred_rating
6227,0064471047,"The Lion, the Witch, and the Wardrobe (The Chr...",[Juvenile Fiction],9,8.984755
23249,0345361792,A Prayer for Owen Meany,[Fiction],10,8.739835
69872,0671880314,Schindler's List,[Fiction],9,8.688743
95506,0064471101,The Magician's Nephew (rack) (Narnia),[Juvenile Fiction],9,8.672462
6701,0142001740,The Secret Life of Bees,[Fiction],9,8.595924
108469,006447108X,The Last Battle,[Juvenile Fiction],9,8.52053
17518,1853260002,Pride & Prejudice (Wordsworth Classics),[Fiction],10,8.47363
138327,014011369X,"And the Band Played on: Politics, People, and ...",[History],9,8.389543
108422,0064471055,Prince Caspian (rack) : The Return to Narnia (...,[Juvenile Fiction],9,8.277526
108447,0064471063,The Horse and His Boy,[Juvenile Fiction],9,8.260891


In [89]:
# Catalogue of distinct (item, title) pairs, minus what the user already rated.
books = df[['item', 'book_title', 'categories']].drop_duplicates(subset=['item', 'book_title'])
rated_items = rated_books['item'].tolist()
not_rated = ~books['item'].isin(rated_items)
# Copy so that columns can be added later without touching `books`.
unrated_books = books[not_rated].copy()
print(f'all books: {books.shape[0]}, rated books: {rated_books.shape[0]}, not yet rated books: {unrated_books.shape[0]}')

all books: 63819, rated books: 55, not yet rated books: 63764


First, we use Matrix Factorization to filter the 63764 not-yet-rated books and keep the 2000 with the highest predicted rating.

In [83]:
# Stage 1: score every not-yet-rated book with MF and keep the best candidates.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files you produced yourself.
# The handle is not named `f` (alias of the `functions` module) and the file
# is closed as soon as the model is loaded.
with open('MF_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

user_id = 193458
# Number of MF candidates handed on to the next stage.
N_CANDIDATES = 2000

predicted_ratings = [
    model.predict_one(user_id, item) for item in unrated_books['item']
]
unrated_books['pred_rating'] = predicted_ratings
unrated_books = unrated_books.sort_values(by='pred_rating', ascending=False)
# Copy so the next stage can overwrite 'pred_rating' without touching unrated_books.
filtered_books = unrated_books.head(N_CANDIDATES).copy()

Then we run the Factorization Machine on those 2000 highest-rated candidates to get the final recommendation list.

In [84]:
# Stage 2: re-rank the MF candidates with the FM model.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files you produced yourself.
# The handle is not named `f` (alias of the `functions` module) and the file
# is closed as soon as the model is loaded.
with open('FM_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Rebuild the feature pipeline used in trainFFM() and put the loaded model at
# its end.  The transformers here are new instances; only the regressor comes
# from the pickle.
regressor = compose.Select('user') | compose.FuncTransformer(lambda x: {'user': str(x)})
regressor += (
    compose.Select('item')
)
regressor += (
    compose.Select('book_title') | compose.FuncTransformer(lambda x: x['book_title']) | feature_extraction.TFIDF() | preprocessing.Normalizer()
)
regressor += (
    compose.Select('age_group')
)
regressor += (
    compose.Select('categories') | compose.FuncTransformer(split_categories)
)
regressor |= model

# User-side features, taken from the first row recorded for this user.
user_id = 193458
user = df[df['user'] == user_id]
user = user[['user','age', 'country', 'age_group']].iloc[0].squeeze()

# Each candidate row is joined with the user features before prediction.
predicted_ratings = []
for index, row in filtered_books.iterrows():
    row = pd.concat([row, user])
    pred = regressor.predict_one(row)
    predicted_ratings.append(pred)

# Replace the MF score with the FM score and rank by it.
filtered_books['pred_rating'] = predicted_ratings
filtered_books = filtered_books.sort_values(by='pred_rating', ascending=False)
filtered_books.drop_duplicates('book_title').head(10)

Unnamed: 0,item,book_title,categories,pred_rating
86363,0679879269,"The Amber Spyglass (His Dark Materials, Book 3)",[Juvenile Fiction],9.426824
35337,0836220889,Calvin and Hobbes,[Humor],9.419324
45211,0064400557,Charlotte's Web (Trophy Newbery),[Juvenile Fiction],9.341966
8139,0439139597,Harry Potter and the Goblet of Fire (Book 4),[Juvenile Fiction],9.313588
19677,0060256672,Where the Sidewalk Ends : Poems and Drawings,[Juvenile Nonfiction],9.297272
84512,1853261580,The Little Prince (Wordsworth Collection),[Juvenile Fiction],9.293492
32439,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),[Fiction],9.291647
49478,0836218221,The Authoritative Calvin and Hobbes (Calvin an...,[Humor],9.27505
41199,0385199570,The Stand (The Complete and Uncut Edition),[Fiction],9.27301
22345,067168390X,Lonesome Dove,[Fiction],9.256769


In [95]:
# Sanity check: FM predictions for the books this user has already rated.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files you produced yourself.
# The handle is not named `f` (alias of the `functions` module) and the file
# is closed as soon as the model is loaded.
with open('FM_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Rebuild the feature pipeline used in trainFFM() and put the loaded model at
# its end.  The transformers here are new instances; only the regressor comes
# from the pickle.
regressor = compose.Select('user') | compose.FuncTransformer(lambda x: {'user': str(x)})
regressor += (
    compose.Select('item')
)
regressor += (
    compose.Select('book_title') | compose.FuncTransformer(lambda x: x['book_title']) | feature_extraction.TFIDF() | preprocessing.Normalizer()
)
regressor += (
    compose.Select('age_group')
)
regressor += (
    compose.Select('categories') | compose.FuncTransformer(split_categories)
)
regressor |= model

# User-side features, taken from the first row recorded for this user.
user_id = 193458
user = df[df['user'] == user_id]
user = user[['user','age', 'country', 'age_group']].iloc[0].squeeze()
print(user)

user_rated = df[df['user'] == user_id]
user_rated = user_rated[['item', 'book_title', 'rating', 'categories']]

# Each rated row is joined with the user features before prediction.
predicted_ratings = []
for index, row in user_rated.iterrows():
    row = pd.concat([row, user])
    pred = regressor.predict_one(row)
    predicted_ratings.append(pred)

user_rated['pred_rating'] = predicted_ratings
user_rated = user_rated.sort_values(by='pred_rating', ascending=False)
user_rated

user         193458
age            40.0
country      canada
age_group     30-40
Name: 2500, dtype: object


Unnamed: 0,item,book_title,rating,categories,pred_rating
23249,0345361792,A Prayer for Owen Meany,10,[Fiction],9.222751
6227,0064471047,"The Lion, the Witch, and the Wardrobe (The Chr...",9,[Juvenile Fiction],9.214003
95506,0064471101,The Magician's Nephew (rack) (Narnia),9,[Juvenile Fiction],9.141694
6701,0142001740,The Secret Life of Bees,9,[Fiction],9.015545
17518,1853260002,Pride & Prejudice (Wordsworth Classics),10,[Fiction],8.857416
108422,0064471055,Prince Caspian (rack) : The Return to Narnia (...,9,[Juvenile Fiction],8.77872
11238,1853260622,War and Peace (Wordsworth Classics),5,[Fiction],8.734933
69872,0671880314,Schindler's List,9,[Fiction],8.705058
146306,1853260169,Sense and Sensibility (Wordsworth Classics),10,[Fiction],8.538697
41250,0064471098,The Silver Chair,9,[Juvenile Fiction],8.52324


## Bonus: Implement MF using FM
FM is a generalization of MF, so in theory we can use FM to reimplement MF and obtain similar results.

In [15]:
# Keyword arguments unpacked into facto.FMRegressor in the script below.
# Optimizer rates and latent initializer mirror biased_mf_params.
fm_params = {
    'n_factors': 10,
    'weight_optimizer': optim.SGD(0.025),
    'latent_optimizer': optim.SGD(0.05),
    'sample_normalization': False,
    # All L1/L2 penalties are switched off.
    'l1_weight': 0.0,
    'l2_weight': 0.0,
    'l1_latent': 0.0,
    'l2_latent': 0.0,
    'intercept': 3,
    'intercept_lr': 0.01,
    'weight_initializer': optim.initializers.Zeros(),
    'latent_initializer': optim.initializers.Normal(mu=0.0, sigma=0.1, seed=73),
}

# Emulate matrix factorization with a plain FM: the only inputs are the user
# and the item features.
regressor = compose.Select('user')  | compose.FuncTransformer(lambda x: {'user': str(x)})
regressor += (compose.Select('item'))
regressor |= facto.FMRegressor(**fm_params)

# Clamp predictions to [1, 10].
# NOTE(review): readData() rescales ratings with (r + 1) // 2 — confirm this
# range still matches the target scale.
model = preprocessing.PredClipper(
    regressor=regressor,
    y_min=1,
    y_max=10
)
metric = metrics.MAE() + metrics.RMSE()
# No seed is passed, so readData() takes the first 80% of the rows for
# training (X_y) and the remainder as the test stream (z).
X_y, z = readData()
cnt = 0
for x, y in X_y:
    # Predict first, then learn, so each sample is scored before it is seen.
    y_pred = model.predict_one(x=x)

    metric.update(y_pred=y_pred, y_true=y)
    _ = model.learn_one(x=x, y=y)
    # Progress line every 10000 samples.
    if cnt % 10000 == 0:
        print(cnt, metric)
    cnt+= 1 
print(metric)
# Fresh metric for the test stream. `cnt` is NOT reset, so the progress lines
# below continue the training count.
metric = metrics.MAE() + metrics.RMSE()
for x, y in z:
    y_pred = model.predict_one(x=x)

    metric.update(y_pred=y_pred, y_true=y)
    # No learn_one here: the test stream is only predicted.
    if cnt % 10000 == 0:
        print(cnt, metric)
    cnt+= 1 
print(metric)


0 MAE: 6.019074, RMSE: 6.019074
10000 MAE: 1.395521, RMSE: 1.770745
20000 MAE: 1.380373, RMSE: 1.751652
30000 MAE: 1.360002, RMSE: 1.728276
40000 MAE: 1.346654, RMSE: 1.708966
50000 MAE: 1.341534, RMSE: 1.701696
60000 MAE: 1.341415, RMSE: 1.703185
70000 MAE: 1.337916, RMSE: 1.698332
80000 MAE: 1.332936, RMSE: 1.692468
90000 MAE: 1.329329, RMSE: 1.688843
100000 MAE: 1.329272, RMSE: 1.690474
110000 MAE: 1.329752, RMSE: 1.691807
120000 MAE: 1.32742, RMSE: 1.688552
130000 MAE: 1.328501, RMSE: 1.689557
140000 MAE: 1.325267, RMSE: 1.686469
150000 MAE: 1.32515, RMSE: 1.687107
MAE: 1.323294, RMSE: 1.685036
160000 MAE: 1.451595, RMSE: 1.799405
170000 MAE: 1.355835, RMSE: 1.740138
180000 MAE: 1.297076, RMSE: 1.676742
190000 MAE: 1.316396, RMSE: 1.683654
MAE: 1.325851, RMSE: 1.697495
