In [1]:
import datetime
import pickle
import pandas as pd
import tensorflow as tf
import numpy as np
from scipy.sparse import coo_matrix
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import wals
import model

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Notebook switches. Edit these before running the cells below.

# True: work with the small 100k development files; False: work with the 20m files.
development_dataset = False

# True: clean the ratings data and train a new model; False: load the saved model.
retrain_model = False

# True: regenerate the top-N recommendation lists and pickle them;
# False: read the lists back from a pickle file.
recalculate_topn = True

In [3]:
# Start from empty placeholders; the branch taken below fills in the values it produces.
# train_sparse / test_sparse stay None when the saved model is loaded.
user_map = item_map = train_sparse = test_sparse = unique_items = unique_users = None

if not retrain_model:
    # Reuse the previously saved model artifacts instead of training again.
    user_map, item_map, unique_items, unique_users, output_row, output_col = model.load_saved_model(development_dataset)
else:
    # Stage 1: clean the ratings file of the selected dataset, timing the step.
    clean_start_time = datetime.datetime.now()
    ratings_path = "../data/ml_100k/ratings.csv" if development_dataset else "../data/ml-20m/ratings.csv"
    user_map, item_map, train_sparse, test_sparse, unique_items, unique_users = model.clean_data(ratings_path)
    clean_end_time = datetime.datetime.now()
    print("Data cleaning duration: " + str(clean_end_time - clean_start_time))

    # Hyperparameters handed to model.train_model.
    latent_factors = 14
    num_iters = 20

    # Stage 2: train on the training matrix, timing the step, then persist the result.
    train_start_time = datetime.datetime.now()
    output_row, output_col = model.train_model(train_sparse, latent_factors, num_iters)
    train_end_time = datetime.datetime.now()
    print("Training duration: " + str(train_end_time - train_start_time))
    model.save_model(development_dataset, user_map, item_map, unique_items, unique_users, output_row, output_col)

    # Stage 3: report RMSE on the training and the held-out matrices.
    train_rmse = wals.get_rmse(output_row, output_col, train_sparse)
    test_rmse = wals.get_rmse(output_row, output_col, test_sparse)
    print('Train: ' + str(train_rmse) + ', Test: ' + str(test_rmse))

In [4]:
# Scratch cell: time a single recommendation call for one hard-coded user.
user = 18

# Collect item_map entries at every position where user_map holds this user.
user_rated = [
    item_map[position]
    for position, rating_user in enumerate(user_map)
    if rating_user == user
]

# Print the row count of output_row next to the number of entries collected above.
row_count = output_row.shape[0]
print(str(row_count) + ", " + str(len(user_rated)))

# Time one call requesting 6 recommendations; the returned value is not used here.
pred_start_time = datetime.datetime.now()
model.generate_recommendations(user, user_rated, output_row, output_col, 6)
pred_end_time = datetime.datetime.now()
pred_duration = pred_end_time - pred_start_time
print("Prediction duration: " + str(pred_duration))

138493, 50
Prediction duration: 0:00:00.003350


In [14]:
# Build (or reload) the top-N recommendation list for every user.
topn_recommendations = []
print(str(len(unique_users)))
print(user_map)

# One cache file per dataset, shared by the write and the read path so a
# development run never reads the 20m cache (the read path used to open
# "topn_20m.pickle" for both datasets).
topn_path = "topn_100k.pickle" if development_dataset else "topn_20m.pickle"

if recalculate_topn:
    # Generate top-n...
    topn_start_time = datetime.datetime.now()
    last_user = user_map[-1]  # loop-invariant upper bound for the user filter below
    for k, u in enumerate(unique_users):
        if (k % 20000 == 0):
            print(k)  # progress marker
        # Users that are not below the last entry of user_map are skipped, so the
        # result can be shorter than unique_users.
        if u < last_user:
            # TODO: pass the items the user already rated instead of [] (the
            # per-user lookup was disabled in the original cell and is still missing).
            topn_recommendations.append(model.generate_recommendations(u, [], output_row, output_col, 6))
    topn_end_time = datetime.datetime.now()
    print("Top N calculation duration: " + str(topn_end_time - topn_start_time))
    print("length: " + str(len(topn_recommendations)))
    with open(topn_path, "wb+") as fp:
        pickle.dump(topn_recommendations, fp)
else:
    # ...or read them from a file...
    # NOTE: pickle.load executes arbitrary code from the file; only load caches
    # written by this notebook.
    with open(topn_path, "rb") as fp:
        topn_recommendations = pickle.load(fp)

138493
[0 0 0 ... 138492 138492 138492]
0
20000
40000
60000
80000
100000
120000
Top N calculation duration: 0:09:49.195773
length: 138491


In [15]:
# Mine association rules from the top-N recommendation lists.
# Each list is treated as one transaction and one-hot encoded into a frame
# with one column per distinct recommended item.
te = TransactionEncoder()
te_ary = te.fit(topn_recommendations).transform(topn_recommendations)
topn_df = pd.DataFrame(te_ary, columns=te.columns_)

# use_colnames=True makes the itemsets carry the item labels from te.columns_.
# The default (False) reports positional column indices, which cannot be
# compared with item ids later on.
frequent_itemsets = apriori(topn_df, min_support=0.05, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# association_rules defaults to metric="confidence" with min_threshold=0.8, which
# would silently discard every rule between 0.2 and 0.8 before the filter below
# runs. Request the 0.2 threshold explicitly so the filter is what decides.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Keep only rules that clear all three thresholds.
rules = rules[(rules['support'] > 0.05) &
      (rules['confidence'] > 0.2) &
      (rules['lift'] > 3.0)]

# print(rules)

In [20]:
# This generates the explainable recommendations for every user: a rule applies to a
# user when the user has rated all of its antecedents and none of its consequents.
# NOTE(review): assumes the rule items and the item_map entries share one id space;
# confirm against model.generate_recommendations.

explainableRecommendations = []
exp_start_time = datetime.datetime.now()

# The rules do not change per user, so materialise them once instead of calling
# rules.iterrows() inside the user loop.
rule_pairs = [
    (frozenset(rule_antecedents), frozenset(rule_consequents))
    for rule_antecedents, rule_consequents in zip(rules['antecedents'], rules['consequents'])
]
rule_items = set()
for rule_antecedents, rule_consequents in rule_pairs:
    rule_items |= rule_antecedents
    rule_items |= rule_consequents

# Group rated items by user in a single pass over user_map (same pairing as
# item_map[i] where user_map[i] == user). Only items that occur in some rule are
# kept, because no other item can change the outcome of the checks below.
rated_by_user = {}
if rule_items:
    for rating_idx, rating_user in enumerate(user_map):
        rated_item = item_map[rating_idx]
        if rated_item in rule_items:
            rated_by_user.setdefault(rating_user, set()).add(rated_item)

last_user = user_map[-1]  # loop-invariant upper bound for the user filter
no_rated_items = frozenset()
for i, u in enumerate(unique_users):
    recommendations = []
    if u < last_user:
        # Use this user's own rated items; the loop previously tested every user
        # against whatever value another cell had left in the global user_rated.
        user_rated = rated_by_user.get(u, no_rated_items)
        for antecedents, consequents in rule_pairs:
            if antecedents <= user_rated and consequents.isdisjoint(user_rated):
                recommendations.append({"explanation": tuple(antecedents), "recommendation": tuple(consequents)})
    # Users failing the filter still get an (empty) entry, so the list stays
    # aligned with unique_users.
    explainableRecommendations.append(recommendations)
    if (i % 10000 == 0):
        print(i)  # progress marker
exp_end_time = datetime.datetime.now()
# The message used to read "Top N calculation duration", copied from the top-N cell.
print("Explanation generation duration: " + str(exp_end_time - exp_start_time))

if development_dataset:
    with open("explainable_100k.pickle", "wb+") as fp:
        pickle.dump(explainableRecommendations, fp)
else:
    with open("explainable_20m.pickle", "wb+") as fp:
        pickle.dump(explainableRecommendations, fp)
    


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
Top N calculation duration: 0:14:15.420208
