In [1]:
%load_ext Cython

import pandas as pd
import numpy as np
import datetime
import pickle
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
from recommender_algo.editable_svd import EditableSVD

In [3]:
development_set = False
retrain_model = False
recalculate_topn = False
recalculate_association_rules = False

In [4]:
if retrain_model:
    if development_set:
        filename = "../data/ml_100k/ratings.csv"
    else:
        filename = "../data/ml-20m/ratings.csv"
    ratings_df = pd.read_csv(filename, dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float32,
        'timestamp': np.int32,
    })

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    trainset, testset = train_test_split(data, test_size=.25)
    algo = EditableSVD(n_factors=10)

    train_start_time = datetime.datetime.now()
    algo.fit(trainset)
    train_end_time = datetime.datetime.now()
    print("Training duration: " + str(train_end_time - train_start_time))
    
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    print(accuracy)
    
    if development_set:
        with open("algo-100k.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
    else:
        with open("algo-20m.pickle", "wb+") as fp:
            pickle.dump(algo, fp)
else:
    if development_set:
        with open("algo-100k.pickle", "rb") as fp:
            algo = pickle.load(fp)
    else:
        with open("algo-20m.pickle", "rb") as fp:
            algo = pickle.load(fp)
    

In [10]:
# Calculate top-n recommendations to all users
if recalculate_topn:
    n = 20
    top_n = {}
    all_users = algo.trainset._raw2inner_id_users.keys()
    all_items = algo.trainset._raw2inner_id_items.keys()
    
    top_n_start_time = datetime.datetime.now()
    for index, u in enumerate(all_users):
        user_recommendations = []
        for i in all_items:
            prediction = algo.predict(u, i)
            user_recommendations.append((prediction.iid, prediction.est))
        user_recommendations.sort(key=lambda x: x[1], reverse=True)
        top_n[u] = user_recommendations[:n]
        if index == 100 or index % 10000 == 0: # Debug
            print(str(index))
    top_n_end_time = datetime.datetime.now()
    print("Top-n calculation duration: " + str(top_n_end_time - top_n_start_time))
    
    if development_set:
        with open("top_n-100k.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
    else:
        with open("top_n20-20m.pickle", "wb+") as fp:
            pickle.dump(top_n, fp)
else:
    if development_set:
        with open("top_n-100k.pickle", "rb") as fp:
            top_n = pickle.load(fp)
    else:
        with open("top_n20-20m.pickle", "rb") as fp:
            top_n = pickle.load(fp)

In [11]:
top_n_items = [ [x[0] for x in row] for row in top_n.values()]
top_n_items[1]

[356,
 2329,
 3949,
 318,
 527,
 2571,
 2959,
 3578,
 2324,
 48394,
 3147,
 48780,
 26674,
 110,
 47,
 1704,
 4226,
 1584,
 7254,
 4995]

In [12]:
if recalculate_association_rules:
    # Calculate association rules from top-n
    te = TransactionEncoder()
    te_ary = te.fit(top_n_items).transform(top_n_items, sparse=True)
    topn_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    print("Sparse df created")

    apriori_start_time = datetime.datetime.now()
    frequent_itemsets = apriori(topn_df, min_support=0.005, verbose=1, low_memory=True, use_colnames=True)
    apriori_end_time = datetime.datetime.now()
    print("Training duration: " + str(apriori_end_time - apriori_start_time))

    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    print(frequent_itemsets)

    frequent_itemsets[(frequent_itemsets['length'] == 2)]
    rules = association_rules(frequent_itemsets)
    if development_set:
        with open("association-rules-100k.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
    else:
        with open("association-rules-20m.pickle", "wb+") as fp:
            pickle.dump(rules, fp)
else:
    if development_set:
        with open("association-rules-100k.pickle", "rb") as fp:
            rules = pickle.load(fp)
    else:
        with open("association-rules-20m.pickle", "rb") as fp:
            rules = pickle.load(fp)

In [13]:
rules['consequents_length'] = rules['consequents'].apply(lambda x: len(x))
rules['antecedents_length'] = rules['antecedents'].apply(lambda x: len(x))
filtered_rules = rules[(rules['support'] > 0.005) &
      (rules['confidence'] > 0.3) & (rules['antecedents_length'] < 4) & (rules['consequents_length'] == 1)]
print(filtered_rules)

                   antecedents consequents  antecedent support  \
0                         (25)       (296)            0.012058   
1                         (36)       (527)            0.007791   
2                         (47)      (2959)            0.037626   
3                       (4226)        (50)            0.059952   
4                       (5782)        (50)            0.007264   
...                        ...         ...                 ...   
22985  (81736, 108583, 117192)     (26109)            0.006795   
22986    (81834, 69844, 40815)     (88125)            0.006758   
22987    (69844, 88125, 40815)     (81834)            0.007387   
22989    (86345, 86347, 92535)     (86377)            0.009871   
22990    (86377, 86347, 92535)     (86345)            0.010549   

       consequent support   support  confidence       lift  leverage  \
0                0.251832  0.010022    0.831138   3.300363  0.006985   
1                0.311987  0.006636    0.851715   2.729969  0.0

In [14]:
movieId = 1221
filtered_rules.loc[filtered_rules['consequents'].apply(lambda cons: True if movieId in cons else False)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,consequents_length,antecedents_length
63,(1201),(1221),0.017055,0.317489,0.014355,0.841660,2.650989,0.008940,4.310404,1,1
64,(1208),(1221),0.027070,0.317489,0.024160,0.892505,2.811136,0.015566,6.349216,1,1
65,(1213),(1221),0.054479,0.317489,0.051902,0.952684,3.000683,0.034605,14.424498,1,1
66,(1219),(1221),0.007365,0.317489,0.006289,0.853922,2.689610,0.003951,4.672223,1,1
67,(1222),(1221),0.006787,0.317489,0.005697,0.839362,2.643751,0.003542,4.248744,1,1
...,...,...,...,...,...,...,...,...,...,...,...
21464,"(6016, 3949, 7502)",(1221),0.006730,0.317489,0.005437,0.807940,2.544781,0.003301,3.553633,1,3
21491,"(6016, 5952, 7502)",(1221),0.007625,0.317489,0.006145,0.805871,2.538265,0.003724,3.515764,1,3
21509,"(6016, 7502, 58559)",(1221),0.007040,0.317489,0.006130,0.870769,2.742676,0.003895,5.281335,1,3
21510,"(6016, 77658, 7502)",(1221),0.007062,0.317489,0.006080,0.860941,2.711718,0.003838,4.908057,1,3


In [38]:
# Calculate model fidelity
mf_start_time = datetime.datetime.now()
recommendations_amount = 0
explainable_amount = 0
for k, (u, recommendations) in enumerate(top_n.items()):
    if k % 100 == 0: # Only take a small sample
        for (i, rating) in recommendations:
            recommendations_amount += 1
            rows = filtered_rules.loc[filtered_rules['consequents']
                .apply(lambda cons: True if i in cons else False)]
            for index, row in rows.iterrows():
                antecedents = list(row['antecedents'])
                user_ratings = [ algo.trainset.to_raw_iid(x[0]) for x in algo.trainset.ur[algo.trainset.to_inner_uid(u)] ]
                if all([x in user_ratings for x in antecedents]):
                    explainable_amount += 1
                    break;
                
mf_end_time = datetime.datetime.now()
print("Model fidelity calculation duration: " + str(top_n_end_time - top_n_start_time))

model_fidelity = explainable_amount / recommendations_amount
print(model_fidelity)

Model fidelity calculation duration: 9:57:00.473035
0.15422382671480145
