In [2]:
%load_ext Cython

import pandas as pd
import numpy as np
import datetime
import pickle
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
from recommender_algo.editable_svd import EditableSVD

In [4]:
development_set = False
retrain_model = False
recalculate_topn = True
recalculate_association_rules = True

In [5]:
if retrain_model:
    if development_set:
        filename = "../data/ml_100k/ratings.csv"
    else:
        filename = "../data/ml-25m/ratings.csv"
    ratings_df = pd.read_csv(filename, dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float32,
        'timestamp': np.int32,
    })

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    trainset, testset = train_test_split(data, test_size=.25, random_state=42)
    algo = EditableSVD(n_factors=10)

    train_start_time = datetime.datetime.now()
    algo.fit(trainset)
    train_end_time = datetime.datetime.now()
    print("Training duration: " + str(train_end_time - train_start_time))
    
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    print(accuracy)
    
    if development_set:
        with open("algo-100k.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
    else:
        with open("algo-25m.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
else:
    if development_set:
        with open("algo-100k.pickle", "rb") as fp:
            algo = pickle.load(fp)
    else:
        with open("algo-25m.pickle", "rb") as fp:
            algo = pickle.load(fp)
    

In [None]:
# Calculate top-n recommendations to all users
if recalculate_topn:
    n = 30
    top_n = {}
    all_users = algo.trainset._raw2inner_id_users.keys()
    all_items = algo.trainset._raw2inner_id_items.keys()
    
    top_n_start_time = datetime.datetime.now()
    for index, u in enumerate(all_users):
        user_recommendations = []
        for i in all_items:
            prediction = algo.predict(u, i)
            user_recommendations.append((prediction.iid, prediction.est))
        user_recommendations.sort(key=lambda x: x[1], reverse=True)
        top_n[u] = user_recommendations[:n]
        if index == 100 or index % 1000 == 0: # Debug
            print(str(index) + ": " + str(datetime.datetime.now()))
    top_n_end_time = datetime.datetime.now()
    print("Top-n calculation duration: " + str(top_n_end_time - top_n_start_time))
    
    if development_set:
        with open("top_n-100k.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
    else:
        with open("top_n30-25m.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
else:
    if development_set:
        with open("top_n-100k.pickle", "rb") as fp:
            top_n = pickle.load(fp)
    else:
        with open("top_n20-20m.pickle", "rb") as fp:
            top_n = pickle.load(fp)

0: 2020-02-25 15:49:39.805675
100: 2020-02-25 15:50:28.562455
1000: 2020-02-25 15:58:08.804340
2000: 2020-02-25 16:06:42.183617
3000: 2020-02-25 16:16:40.254445
4000: 2020-02-25 16:25:34.792893
5000: 2020-02-25 16:35:04.617478
6000: 2020-02-25 16:48:05.280548
7000: 2020-02-25 17:02:08.277470
8000: 2020-02-25 17:11:14.227242
9000: 2020-02-25 17:20:17.122729
10000: 2020-02-25 17:29:22.014507
11000: 2020-02-25 17:38:22.485881
12000: 2020-02-25 17:47:21.191107
13000: 2020-02-25 17:56:20.837243
14000: 2020-02-25 18:05:16.776579
15000: 2020-02-25 18:14:12.671045
16000: 2020-02-25 18:23:10.843381
17000: 2020-02-25 18:32:07.180465
18000: 2020-02-25 18:41:04.395176
19000: 2020-02-25 18:50:02.363179
20000: 2020-02-25 18:59:03.644262
21000: 2020-02-25 19:08:01.396308
22000: 2020-02-25 19:16:58.890631
23000: 2020-02-25 19:25:57.199601
24000: 2020-02-25 19:34:53.160128
25000: 2020-02-25 19:43:50.036412
26000: 2020-02-25 19:52:46.386559
27000: 2020-02-25 20:01:41.670630
28000: 2020-02-25 20:10:35.96

In [None]:
top_n_items = [ [x[0] for x in row] for row in top_n.values()]
top_n_items[1]

In [None]:
if recalculate_association_rules:
    # Calculate association rules from top-n
    te = TransactionEncoder()
    te_ary = te.fit(top_n_items).transform(top_n_items, sparse=True)
    topn_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    print("Sparse df created")

    apriori_start_time = datetime.datetime.now()
    frequent_itemsets = apriori(topn_df, min_support=0.005, verbose=1, low_memory=True, use_colnames=True, max_len=4)
    apriori_end_time = datetime.datetime.now()
    print("Training duration: " + str(apriori_end_time - apriori_start_time))

    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    print(frequent_itemsets)

    frequent_itemsets[(frequent_itemsets['length'] == 2)]
    rules = association_rules(frequent_itemsets)
    if development_set:
        with open("association-rules-100k.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
    else:
        with open("association-rules-25m-n30.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
else:
    if development_set:
        with open("association-rules-100k.pickle", "rb") as fp:
            rules = pickle.load(fp)
    else:
        with open("association-rules-20m.pickle", "rb") as fp:
            rules = pickle.load(fp)

In [15]:
rules['consequents_length'] = rules['consequents'].apply(lambda x: len(x))
rules['antecedents_length'] = rules['antecedents'].apply(lambda x: len(x))
filtered_rules = rules[(rules['support'] > 0.05) &
      (rules['confidence'] > 0.3) & (rules['antecedents_length'] < 4) & (rules['consequents_length'] == 1)]
print(filtered_rules)

                      antecedents consequents  antecedent support  \
5                            (17)       (527)            0.078379   
15                           (50)       (318)            0.393811   
16                         (2858)        (50)            0.202776   
17                         (4226)        (50)            0.165590   
18                         (6016)        (50)            0.132815   
...                           ...         ...                 ...   
1177150    (93040, 93988, 108583)    (100553)            0.062306   
1177239   (100553, 101850, 93988)    (108583)            0.065982   
1177240   (101850, 93988, 108583)    (100553)            0.063758   
1177337  (100553, 105250, 101850)    (108583)            0.087680   
1177338  (101850, 105250, 108583)    (100553)            0.085701   

         consequent support   support  confidence      lift  leverage  \
5                  0.421278  0.064718    0.825702  1.959996  0.031699   
15                 0.7993

In [10]:
movieId = 2959
filtered_rules.loc[filtered_rules['consequents'].apply(lambda cons: True if movieId in cons else False)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,consequents_length,antecedents_length
14,(47),(2959),0.029929,0.335851,0.029157,0.974186,2.900649,0.019105,25.728019,1,1
81,(223),(2959),0.026861,0.335851,0.024030,0.894624,2.663752,0.015009,6.302639,1,1
101,(288),(2959),0.012066,0.335851,0.010246,0.849192,2.528479,0.006194,4.403941,1,1
417,(665),(2959),0.018015,0.335851,0.016275,0.903407,2.689904,0.010225,6.875734,1,1
465,(778),(2959),0.088481,0.335851,0.074278,0.839481,2.499564,0.044562,4.137510,1,1
...,...,...,...,...,...,...,...,...,...,...,...
992860,"(100553, 94466, 108583)",(2959),0.121544,0.335851,0.106171,0.873522,2.600923,0.065351,5.251114,1,3
992864,"(94466, 105250, 104069)",(2959),0.009185,0.335851,0.008621,0.938679,2.794928,0.005537,10.830739,1,3
992866,"(94466, 104069, 108583)",(2959),0.012391,0.335851,0.011849,0.956294,2.847376,0.007688,15.195731,1,3
992868,"(94466, 105250, 108583)",(2959),0.053916,0.335851,0.043165,0.800589,2.383764,0.025057,3.330558,1,3


In [16]:
# Calculate model fidelity
mf_start_time = datetime.datetime.now()
recommendations_amount = 0
explainable_amount = 0
for k, (u, recommendations) in enumerate(top_n.items()):
    if k % 2000 == 3: # Only take a small sample
        print(str(k))
        for (i, rating) in recommendations:
            recommendations_amount += 1
            rows = filtered_rules.loc[filtered_rules['consequents']
                .apply(lambda cons: True if i in cons else False)]
            for index, row in rows.iterrows():
                antecedents = list(row['antecedents'])
                user_ratings = [ algo.trainset.to_raw_iid(x[0]) for x in algo.trainset.ur[algo.trainset.to_inner_uid(u)] ]
                if all([x in user_ratings for x in antecedents]):
                    explainable_amount += 1
                    break;
                
mf_end_time = datetime.datetime.now()
print("Model fidelity calculation duration: " + str(top_n_end_time - top_n_start_time))

model_fidelity = explainable_amount / recommendations_amount
print(model_fidelity)

3
2003
4003
6003
8003
10003
12003
14003
16003
18003
20003
22003
24003
26003
28003
30003
32003
34003
36003
38003
40003
42003
44003
46003
48003
50003
52003
54003
56003
58003
60003
62003
64003
66003
68003
70003
72003
74003
76003
78003
80003
82003
84003
86003
88003
90003
92003
94003
96003
98003
100003
102003
104003
106003
108003
110003
112003
114003
116003
118003
120003
122003
124003
126003
128003
130003
132003
134003
136003
138003
Model fidelity calculation duration: 9:19:48.485246
0.2180952380952381
