In [1]:
%load_ext Cython

import pandas as pd
import numpy as np
import datetime
import pickle
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
from recommender_algo.editable_svd import EditableSVD

# This is a notebook for training the model, calculating association rules and model fidelity

Because these steps take a long time to calculate, results can be cached locally using .pickle files. To choose whether a calculation should be repeated or its results should be loaded from cache, use the selections below

In [9]:
development_set = False
retrain_model = True
recalculate_topn = False
recalculate_association_rules = False

In [4]:
if retrain_model:
    if development_set:
        filename = "../data/ml_100k/ratings.csv"
    else:
        filename = "../data/ml-20m/ratings.csv"
    ratings_df = pd.read_csv(filename, dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float32,
        'timestamp': np.int32,
    })

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    trainset, testset = train_test_split(data, test_size=.25, random_state=42)
    algo = EditableSVD(n_factors=50, n_epochs=20)

    train_start_time = datetime.datetime.now()
    algo.fit(trainset)
    train_end_time = datetime.datetime.now()
    print("Training duration: " + str(train_end_time - train_start_time))
    
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    print(accuracy)
    
    if development_set:
        with open("algo-100k.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
    else:
        with open("algo-20m.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
else:
    if development_set:
        with open("algo-100k.pickle", "rb") as fp:
            algo = pickle.load(fp)
    else:
        with open("algo-20m.pickle", "rb") as fp:
            algo = pickle.load(fp)
    

Training duration: 0:11:31.868355
RMSE: 0.7897
<module 'surprise.accuracy' from '/home/ville/uni/SHProject/venv/lib64/python3.7/site-packages/surprise/accuracy.py'>


In [5]:
accuracy.mae(predictions)

MAE:  0.6010


0.6009594593556168

In [7]:
# Calculate top-n recommendations to all users
if recalculate_topn:
    n = 30
    top_n = {}
    all_users = algo.trainset._raw2inner_id_users.keys()
    all_items = algo.trainset._raw2inner_id_items.keys()
    
    top_n_start_time = datetime.datetime.now()
    for index, u in enumerate(all_users):
        user_recommendations = []
        for i in all_items:
            prediction = algo.predict(u, i)
            user_recommendations.append((prediction.iid, prediction.est))
        user_recommendations.sort(key=lambda x: x[1], reverse=True)
        top_n[u] = user_recommendations[:n]
        if index == 100 or index % 1000 == 0: # Debug
            print(str(index) + ": " + str(datetime.datetime.now()))
    top_n_end_time = datetime.datetime.now()
    print("Top-n calculation duration: " + str(top_n_end_time - top_n_start_time))
    
    if development_set:
        with open("top_n-100k.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
    else:
        with open("top_n30-20m.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
else:
    if development_set:
        with open("top_n-100k.pickle", "rb") as fp:
            top_n = pickle.load(fp)
    else:
        with open("top_n20-20m.pickle", "rb") as fp:
            top_n = pickle.load(fp)

In [8]:
top_n_items = [ [x[0] for x in row] for row in top_n.values()]
top_n_items[1]

[356,
 2329,
 3949,
 318,
 527,
 2571,
 2959,
 3578,
 2324,
 48394,
 3147,
 48780,
 26674,
 110,
 47,
 1704,
 4226,
 1584,
 7254,
 4995]

In [10]:
if recalculate_association_rules:
    # Calculate association rules from top-n
    te = TransactionEncoder()
    te_ary = te.fit(top_n_items).transform(top_n_items, sparse=True)
    topn_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    print("Sparse df created")

    apriori_start_time = datetime.datetime.now()
    frequent_itemsets = apriori(topn_df, min_support=0.005, verbose=1, low_memory=True, use_colnames=True, max_len=4)
    apriori_end_time = datetime.datetime.now()
    print("Training duration: " + str(apriori_end_time - apriori_start_time))

    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    print(frequent_itemsets)

    frequent_itemsets[(frequent_itemsets['length'] == 2)]
    rules = association_rules(frequent_itemsets)
    if development_set:
        with open("association-rules-100k.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
    else:
        with open("association-rules-25m-n30.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
else:
    if development_set:
        with open("association-rules-100k.pickle", "rb") as fp:
            rules = pickle.load(fp)
    else:
        with open("association-rules-20m.pickle", "rb") as fp:
            rules = pickle.load(fp)

In [14]:
rules['consequents_length'] = rules['consequents'].apply(lambda x: len(x))
rules['antecedents_length'] = rules['antecedents'].apply(lambda x: len(x))
filtered_rules = rules[(rules['support'] > 0.05) &
      (rules['confidence'] > 0.3) & (rules['antecedents_length'] < 4) & (rules['consequents_length'] == 1)]
print(filtered_rules)

               antecedents consequents  antecedent support  \
3                   (4226)        (50)            0.059952   
5                   (6016)        (50)            0.086748   
6                    (111)       (858)            0.073599   
7                   (1196)       (260)            0.124916   
8                   (1198)       (260)            0.082372   
...                    ...         ...                 ...   
22437   (5952, 4993, 7502)      (7153)            0.065520   
22438   (7153, 4993, 7502)      (5952)            0.064595   
22506  (5952, 7153, 58559)      (4993)            0.075787   
22507  (5952, 4993, 58559)      (7153)            0.064617   
22508  (7153, 4993, 58559)      (5952)            0.064133   

       consequent support   support  confidence      lift  leverage  \
3                0.380120  0.054797    0.914007  2.404520  0.032008   
5                0.380120  0.071202    0.820792  2.159296  0.038227   
6                0.395161  0.059830    0.8

In [10]:
movieId = 2959
filtered_rules.loc[filtered_rules['consequents'].apply(lambda cons: True if movieId in cons else False)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,consequents_length,antecedents_length
14,(47),(2959),0.029929,0.335851,0.029157,0.974186,2.900649,0.019105,25.728019,1,1
81,(223),(2959),0.026861,0.335851,0.024030,0.894624,2.663752,0.015009,6.302639,1,1
101,(288),(2959),0.012066,0.335851,0.010246,0.849192,2.528479,0.006194,4.403941,1,1
417,(665),(2959),0.018015,0.335851,0.016275,0.903407,2.689904,0.010225,6.875734,1,1
465,(778),(2959),0.088481,0.335851,0.074278,0.839481,2.499564,0.044562,4.137510,1,1
...,...,...,...,...,...,...,...,...,...,...,...
992860,"(100553, 94466, 108583)",(2959),0.121544,0.335851,0.106171,0.873522,2.600923,0.065351,5.251114,1,3
992864,"(94466, 105250, 104069)",(2959),0.009185,0.335851,0.008621,0.938679,2.794928,0.005537,10.830739,1,3
992866,"(94466, 104069, 108583)",(2959),0.012391,0.335851,0.011849,0.956294,2.847376,0.007688,15.195731,1,3
992868,"(94466, 105250, 108583)",(2959),0.053916,0.335851,0.043165,0.800589,2.383764,0.025057,3.330558,1,3


In [21]:
# Calculate model fidelity
mf_start_time = datetime.datetime.now()
recommendations_amount = 0
explainable_amount = 0
for k, (u, recommendations) in enumerate(top_n.items()):
    if k % 10 == 3: # Only take a small sample
        if k % 10000 == 3:
            print(k)
        for (i, rating) in recommendations:
            if (i not in [row[0] for row in algo.trainset.ur[u]]):
                recommendations_amount += 1
                rows = filtered_rules.loc[filtered_rules['consequents']
                    .apply(lambda cons: True if i in cons else False)]
                for index, row in rows.iterrows():
                    antecedents = list(row['antecedents'])
                    user_ratings = [ algo.trainset.to_raw_iid(x[0]) for x in algo.trainset.ur[algo.trainset.to_inner_uid(u)] ]
                    if all([x in user_ratings for x in antecedents]):
                        explainable_amount += 1
                        break;
                
mf_end_time = datetime.datetime.now()
print("Model fidelity calculation duration: " + str(mf_end_time - mf_start_time))

model_fidelity = explainable_amount / recommendations_amount
print(model_fidelity)

3
10003
20003
30003
40003
50003
60003
70003
80003
90003
100003
110003
120003
130003
Model fidelity calculation duration: 0:08:14.717013
0.0649594417536473
