In [1]:
%load_ext Cython

import pandas as pd
import numpy as np
import datetime
import pickle
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
from recommender_algo.editable_svd import EditableSVD

In [3]:
# Run-mode switches read by the cells below.
development_set = False                 # True: 100k ratings file and "*-100k" pickles; False: 20M equivalents
retrain_model = False                   # True: fit the model and pickle it; False: load the pickled model
recalculate_topn = False                # True: recompute the top-n lists and pickle them; False: load them
recalculate_association_rules = False   # True: re-mine the rules and pickle them; False: load them

In [4]:
# Train the recommender on the selected ratings file, or load a previously
# pickled model. Either branch leaves the model bound to `algo`.
# The cache file name depends only on `development_set`, so it is chosen once.
model_cache = "algo-100k.pickle" if development_set else "algo-20m.pickle"

if retrain_model:
    if development_set:
        filename = "../data/ml_100k/ratings.csv"
    else:
        filename = "../data/ml-20m/ratings.csv"
    ratings_df = pd.read_csv(filename, dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float32,
        'timestamp': np.int32,
    })

    # MovieLens ratings use half-star steps from 0.5 to 5.0; the reader's scale
    # bounds the estimates, so the lower bound must be 0.5 rather than 1.
    # NOTE(review): assumes the CSVs are the standard MovieLens exports - confirm.
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    # Seed the 75/25 split so the held-out set (and the RMSE printed below) is
    # comparable between runs. Only the split is seeded here; whether the
    # model's own fitting is deterministic is not visible from this notebook.
    split_seed = 42
    trainset, testset = train_test_split(data, test_size=.25, random_state=split_seed)
    algo = EditableSVD()

    train_start_time = datetime.datetime.now()
    algo.fit(trainset)
    train_end_time = datetime.datetime.now()
    print("Training duration: " + str(train_end_time - train_start_time))

    # accuracy.rmse prints the RMSE on the held-out 25%.
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    with open(model_cache, "wb") as fp:
        pickle.dump(algo, fp)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(model_cache, "rb") as fp:
        algo = pickle.load(fp)
    

In [46]:
# Calculate top-n recommendations to all users, or load them from the cache.
# `top_n` holds one list per user (in the iteration order of the trainset's
# user mapping); each list has at most n (prediction.iid, prediction.est)
# pairs sorted by estimate, highest first.
top_n_cache = "top_n-100k.pickle" if development_set else "top_n-20m.pickle"

if recalculate_topn:
    n = 10
    progress_every = 10000  # users between two progress lines
    top_n = []
    # The keys of these (private) trainset mappings are what gets passed to
    # algo.predict below.
    all_users = algo.trainset._raw2inner_id_users.keys()
    print(str(len(all_users)))
    all_items = algo.trainset._raw2inner_id_items.keys()

    top_n_start_time = datetime.datetime.now()
    for k, u in enumerate(all_users):
        # Score every item for this user, then keep the n best estimates.
        user_recommendations = [
            (prediction.iid, prediction.est)
            for prediction in (algo.predict(u, i) for i in all_items)
        ]
        user_recommendations.sort(key=lambda x: x[1], reverse=True)
        top_n.append(user_recommendations[:n])
        # Periodic progress instead of a single one-off debug print, so a long
        # run shows how far it has got and how long it has taken.
        if (k + 1) % progress_every == 0:
            elapsed = datetime.datetime.now() - top_n_start_time
            print(str(k + 1) + " users processed, elapsed: " + str(elapsed))
    top_n_end_time = datetime.datetime.now()
    print("Top-n calculation duration: " + str(top_n_end_time - top_n_start_time))

    with open(top_n_cache, "wb") as fp:
        pickle.dump(top_n, fp)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(top_n_cache, "rb") as fp:
        top_n = pickle.load(fp)

In [47]:
# Drop the estimates: keep only the first element of every entry, one list per
# row of top_n. These lists are the transactions used for rule mining below.
top_n_items = [
    [entry[0] for entry in user_row]
    for user_row in top_n
]
# Show the second row as a quick sanity check.
top_n_items[1]

[356, 2329, 3949, 318, 527, 2571, 2959, 3578, 2324, 48394]

In [48]:
# Mine association rules from the per-user top-n item lists, or load the
# cached rules. Either branch leaves the rules DataFrame bound to `rules`.
rules_cache = (
    "association-rules-100k.pickle" if development_set
    else "association-rules-20m.pickle"
)

if recalculate_association_rules:
    # One-hot encode the lists: one row per list, one (sparse) column per item.
    te = TransactionEncoder()
    te_ary = te.fit(top_n_items).transform(top_n_items, sparse=True)
    topn_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    print("Sparse df created")

    apriori_start_time = datetime.datetime.now()
    # use_colnames=True makes the itemsets carry the column labels (the item
    # ids from top_n_items) instead of positional column indices, so the rules
    # can be queried by movie id later in the notebook.
    frequent_itemsets = apriori(topn_df, min_support=0.01, use_colnames=True, verbose=1)
    apriori_end_time = datetime.datetime.now()
    print("Apriori duration: " + str(apriori_end_time - apriori_start_time))

    # Number of items in each frequent itemset, for inspection.
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
    print(frequent_itemsets)

    # Called with the library's default metric and threshold.
    rules = association_rules(frequent_itemsets)
    with open(rules_cache, "wb") as fp:
        pickle.dump(rules, fp)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(rules_cache, "rb") as fp:
        rules = pickle.load(fp)

In [49]:
# Keep only the rules that clear both cut-offs (strictly greater than).
support_ok = rules['support'] > 0.005
confidence_ok = rules['confidence'] > 0.3
filtered_rules = rules[support_ok & confidence_ok]
print(filtered_rules)

                         antecedents        consequents  antecedent support  \
0                             (4226)               (50)            0.015849   
1                             (1196)              (260)            0.076437   
2                             (1198)              (260)            0.036240   
3                             (1210)              (260)            0.035338   
5                              (778)              (296)            0.017445   
...                              ...                ...                 ...   
16453  (4993, 1221, 7502, 7153, 858)             (5952)            0.005394   
16454       (5952, 4993, 1221, 7502)        (7153, 858)            0.006152   
16455       (7153, 4993, 1221, 7502)        (5952, 858)            0.005914   
16456        (4993, 858, 1221, 7502)       (5952, 7153)            0.005856   
16457             (4993, 1221, 7502)  (5952, 7153, 858)            0.006397   

       consequent support   support  confidence    

In [45]:
# Show every filtered rule whose consequents contain the chosen movie id.
movieId = 1221
has_movie = filtered_rules['consequents'].apply(lambda cons: movieId in cons)
filtered_rules.loc[has_movie]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
15,(1208),(1221),0.008592,0.222827,0.007127,0.829412,3.722220,0.005212,4.555841
16,(1213),(1221),0.018788,0.222827,0.017207,0.915834,4.110065,0.013020,9.233807
40,"(50, 1213)",(1221),0.007856,0.222827,0.007141,0.909007,4.079428,0.005391,8.541051
108,"(296, 593)",(1221),0.013300,0.222827,0.011199,0.842020,3.778801,0.008235,4.919424
128,"(296, 858)",(1221),0.106756,0.222827,0.086763,0.812716,3.647292,0.062974,4.149693
...,...,...,...,...,...,...,...,...,...
1493,"(593, 858, 1193, 318)",(1221),0.008044,0.222827,0.006542,0.813285,3.649849,0.004749,4.162358
1496,"(1193, 858, 7502, 318)",(1221),0.009221,0.222827,0.007632,0.827721,3.714634,0.005578,4.511135
1516,"(593, 858, 1193, 527)",(1221),0.007842,0.222827,0.006484,0.826888,3.710893,0.004737,4.489413
1521,"(1193, 858, 7502, 527)",(1221),0.010484,0.222827,0.008643,0.824380,3.699640,0.006307,4.425314


In [33]:
# Debug run: build top-20 lists for the first 11 users only and print them.
# NOTE: this rebinds `n` and `top_n`, replacing whatever the top-n cell
# earlier in the notebook computed or loaded.
n = 20
all_users = algo.trainset._raw2inner_id_users.keys()
print(str(len(all_users)))
all_items = algo.trainset._raw2inner_id_items.keys()

top_n = []
top_n_start_time = datetime.datetime.now()
for k, u in enumerate(all_users):
    # Score every item for this user, then rank by estimate, best first.
    scored = [algo.predict(u, i) for i in all_items]
    ranked = sorted(
        ((p.iid, p.est) for p in scored),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_n.append(ranked[:n])
    if k == 10:  # Debug: stop once the user at index 10 has been added
        break
print(top_n)

138493
[[(858, 4.215223830711985), (2324, 4.211003289190636), (1221, 4.183114383967304), (318, 4.173749882382581), (6016, 4.163693153751168), (3949, 4.1616164255726344), (1193, 4.150517708476085), (527, 4.13852418974071), (296, 4.130790446729069), (593, 4.12982833749897), (4973, 4.129341484669373), (7361, 4.1192479027512965), (48394, 4.099757297867061), (2858, 4.095806526153439), (50, 4.093276867518577), (2959, 4.089279398825941), (93040, 4.051945795162437), (55820, 4.04878321864231), (55247, 4.033375435495483), (1209, 4.03075054952057)], [(356, 4.6898414706009195), (2329, 4.603877430222737), (3949, 4.580710983439617), (318, 4.5690847208279175), (527, 4.541656473578388), (2571, 4.533214069267075), (2959, 4.524187092864716), (3578, 4.51631832551747), (2324, 4.503870916182875), (48394, 4.502967817785687), (3147, 4.485171484970195), (48780, 4.4810730473706855), (26674, 4.469488601839008), (110, 4.469473131266811), (47, 4.441107047477216), (1704, 4.431388687214991), (4226, 4.4201993237585)