In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from CDAE import AutoEncoder
from tqdm import trange
from utils import *
import clustering

from sklearn.cluster import KMeans, spectral_clustering
from sklearn.decomposition import PCA

  return f(*args, **kwds)


In [2]:
# Load the interaction log and attach to every row the number of rows that
# share its uid.
df = pd.read_csv('../data/class/rating_data.csv')
df['freq'] = df.groupby('uid')['uid'].transform('count')

# Keep only rows whose user has MORE than five rows (freq of exactly 5 is dropped).
df = df.loc[df['freq'] > 5]

# Distinct user / item identifiers, in order of first appearance.
userList = df['uid'].unique()
itemList = df['iid'].unique()
total_usr = len(userList)
total_item = len(itemList)

# Despite its name, `sparsity` is the share of user/item cells that are filled.
sparsity = len(df) / (total_usr * total_item)
print("sparsity of ratings is %.2f%%" % (sparsity * 100))
print("num. of users: %d, num. of items: %d" % (total_usr, total_item))
print(len(df))

sparsity of ratings is 10.49%
num. of users: 839, num. of items: 99
8709


In [3]:
import math
def get_map(list_):
    """Return a dict mapping each identifier in ``list_`` to its position.

    When an identifier occurs more than once, the position of its last
    occurrence is the one that is kept.
    """
    return {ident: position for position, ident in enumerate(list_)}

def get_matrix(data):
    """Build a binary user-by-item matrix from interaction records.

    Each record's first field is looked up in the module-level ``user_map``
    and its second field in ``item_map``; the addressed cell is set to 1.
    The result has shape ``(total_usr, total_item)`` and dtype float32.
    A record with an unknown identifier raises ``KeyError``.
    """
    interactions = np.zeros((total_usr, total_item), dtype=np.float32)
    for record in data:
        row, col = user_map[record[0]], item_map[record[1]]
        interactions[row, col] = 1

    return interactions

def train_test_split(df, time_interval, split_rate=0.5):
    """Yield rolling ``(train, recent, upcoming)`` slices of ``df`` by timestamp.

    The first split point lies ``floor(n * (1 - split_rate))`` intervals after
    the earliest timestamp, where ``n`` is the number of whole
    ``time_interval`` steps between the earliest and latest timestamps.  The
    split point then advances by one interval per iteration for as long as it
    is earlier than the latest timestamp.

    For a split point ``t`` the three yielded frames contain the rows with

    * ``t - 6 * time_interval <= timestamp < t``   (train)
    * ``t - 3 * time_interval <= timestamp < t``   (recent)
    * ``t <= timestamp < t + time_interval``       (upcoming)
    """
    stamps = df['timestamp']
    first_seen = min(stamps)
    last_seen = max(stamps)
    whole_intervals = (last_seen - first_seen) // time_interval
    cut = first_seen + math.floor(whole_intervals * (1 - split_rate)) * time_interval

    while cut < last_seen:
        before_cut = stamps < cut
        history = df[before_cut & (stamps >= cut - 6 * time_interval)]
        recent = df[before_cut & (stamps >= cut - 3 * time_interval)]
        upcoming = df[(stamps >= cut) & (stamps < cut + time_interval)]

        yield history, recent, upcoming

        cut += time_interval

In [4]:
# Identifier -> dense row/column index lookups used by get_matrix.
user_map, item_map = get_map(userList), get_map(itemList)

# Window length for the rolling split: 90 days' worth of seconds
# (presumably the timestamp column is in Unix seconds -- TODO confirm).
user_time_interval = 90 * 24 * 60 * 60

# New

## Train

In [None]:
# Rolling evaluation of the cluster-based recommender.  For every time window:
# train a user autoencoder, cluster its user vectors, score items per cluster,
# and record the predicted top-30 next to the observed top-30 items.
generator = train_test_split(df, user_time_interval)

NUM_CLUSTER = 10
top_items_list_75 = []
top_gt_list_next = []
top_gt_list_now = []

# Iterating the generator directly (instead of `while True` + `next()` inside
# one big try block) keeps a StopIteration raised by the training code from
# being silently mistaken for "no more windows".
for df_train, df_test_now, df_test_next in generator:
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    train_data = df_train.values
    test_data_now = df_test_now.values
    test_data_next = df_test_next.values

    user_train_matrix = get_matrix(train_data)
    user_test_matrix_now = get_matrix(test_data_now)
    user_test_matrix_next = get_matrix(test_data_next)

    # Row indices of users with at least one interaction in the window.
    train_user = np.nonzero(np.count_nonzero(user_train_matrix, axis=1))[0]
    test_user_now = np.nonzero(np.count_nonzero(user_test_matrix_now, axis=1))[0]

    # The 30 items with the most interactions in the training window, and the
    # remaining item indices (`others` is not used further down in this cell).
    top_n = np.count_nonzero(user_train_matrix, axis=0).argsort()[::-1][:30]
    others = [k for k in range(total_item) if k not in top_n]

    # Train a fresh model for this window to get the user vectors.
    tf.reset_default_graph()
    autoencoder = AutoEncoder(user_num=total_usr, item_num=total_item, mode='user',
                              denoise_function=None, loss_function='cross_entropy',
                              denoising=False, batch_size=1, epochs=200)

    autoencoder.train_all(rating=user_train_matrix, train_idents=train_user, topN=top_n, weight=30)

    autoencoder.model_save(1)

    # Read the vector matrix out of the session and keep the rows of the
    # users that appeared in training.
    vector_matrices = autoencoder.sess.run(autoencoder.vector_matrix)
    # Disabled alternative that ran `autoencoder.code` instead:
    # vector_matrices = autoencoder.sess.run(
    #     autoencoder.code,
    #     feed_dict={
    #         autoencoder.input: user_train_matrix,
    #         autoencoder.ident: [x for x in range(total_usr)]
    #     })
    exist_vectors = np.take(vector_matrices, train_user, axis=0)

    # Clustering
    pca_out = clustering.get_pca_out(exist_vectors)
    kmeans = clustering.calculate_kmeans(pca_out, NUM_CLUSTER=NUM_CLUSTER)

    label_index, label_count = clustering.get_cluster_attributes(kmeans, NUM_CLUSTER=NUM_CLUSTER)

    # Gather the inputs expected by the clustering helpers.
    data = {
        'TRAIN_MATRIX': user_train_matrix,
        'TEST_MATRIX_NOW': user_test_matrix_now,
        'TEST_MATRIX_NEXT': user_test_matrix_next,
        'TRAIN_USER': train_user,
        'TEST_USER_NOW': test_user_now,
        'LABEL_INDEX': label_index,
    }

    # Top-N items for each cluster.
    cluster_top = clustering.calculate_cluster_top(
        data,
        total_usr,
        total_item,
        NUM_CLUSTER=NUM_CLUSTER,
        batch_size=1,
        weight=30,
        denoise_function=None,
        loss_function='cross_entropy')

    # Item distribution of each cluster.
    cluster_distribution = clustering.get_distribution(data, NUM_CLUSTER=NUM_CLUSTER)

    # Score the per-cluster top items and pick the overall top 30.
    score_map = clustering.count_score(cluster_top, label_count, len(test_user_now), cluster_distribution, exponent=1.0001, alpha=100)
    top_N = clustering.get_score_top(score_map, N=30)

    # Collect the predicted top items.
    top_items_list_75.append(top_N)

    # Collect the ground truth: the 30 most-interacted items of the "next"
    # and "now" test windows.
    ground_truth_next = np.count_nonzero(user_test_matrix_next, axis=0).argsort()[::-1][:30]
    ground_truth_now = np.count_nonzero(user_test_matrix_now, axis=0).argsort()[::-1][:30]
    top_gt_list_next.append(ground_truth_next)
    top_gt_list_now.append(ground_truth_now)


top_items_list_75 = np.asarray(top_items_list_75)
top_gt_list_now = np.asarray(top_gt_list_now)
top_gt_list_next = np.asarray(top_gt_list_next)

np.save('./rec_lists/class_clustering_rec_lists.npy', top_items_list_75)
np.save('./rec_lists/class_gt_now.npy', top_gt_list_now)
np.save('./rec_lists/class_gt_next.npy', top_gt_list_next)

In [None]:
# Hit ratio and F1 of the recommended lists at cut-offs 30, 10 and 5.
hit_ratio_top_30 = []
hit_ratio_top_10 = []
hit_ratio_top_5 = []
f1_top_30 = []
f1_top_10 = []
f1_top_5 = []

# (cut-off, hit-ratio list, F1 list)
top_sinks = (
    (30, hit_ratio_top_30, f1_top_30),
    (10, hit_ratio_top_10, f1_top_10),
    (5, hit_ratio_top_5, f1_top_5),
)

for predicted, truth_next in zip(top_items_list_75, top_gt_list_next):
    for n, hit_sink, f1_sink in top_sinks:
        # hit_recall is evaluated once per cut-off and the value reused.
        hit = hit_recall(predicted, truth_next, N=n)
        hit_sink.append(hit)
        # Precision and recall are both `hit` here, so F1 = 2*hit^2 / (2*hit).
        # With no hits the ratio is 0/0; it is recorded as 0.0 at every
        # cut-off (previously 30 and 10 raised ZeroDivisionError while 5
        # stored None, which broke the later averaging).
        f1_sink.append(2 * hit ** 2 / (2 * hit) if hit else 0.0)


# Same metrics restricted to items that are in the top list of BOTH the
# current and the next window ("still in" items).
hit_ratio_still_top30 = []
hit_ratio_still_top10 = []
hit_ratio_still_top5 = []
f1_still_30 = []
f1_still_10 = []
f1_still_5 = []

# (slice depth for the ground-truth lists, cut-off, hit-ratio list, F1 list);
# a depth of None means "use the whole list", as the original top-30 case did.
still_sinks = (
    (None, 30, hit_ratio_still_top30, f1_still_30),
    (10, 10, hit_ratio_still_top10, f1_still_10),
    (5, 5, hit_ratio_still_top5, f1_still_5),
)

for predicted, truth_now, truth_next in zip(top_items_list_75, top_gt_list_now, top_gt_list_next):
    for depth, n, hit_sink, f1_sink in still_sinks:
        now_slice = truth_now[:depth]
        still_in_items = [item for item in truth_next[:depth] if item in now_slice]

        recall = hit_recall(predicted, still_in_items, N=n)
        hit_sink.append(recall)

        # Precision is the recall rescaled by the share of the cut-off that
        # the "still in" list fills.
        precision = recall * min(n, len(still_in_items)) / n
        denominator = recall + precision
        # Only the zero-denominator case is special-cased; any other error now
        # propagates instead of being swallowed by a bare `except`.
        f1_sink.append(2 * recall * precision / denominator if denominator else 0.0)

In [None]:
# Hit ratio at cut-off 30 for every evaluation window.
window_axis = range(len(hit_ratio_top_30))
plt.plot(window_axis, hit_ratio_top_30, color='blue', label='hit_ratios')
plt.title('NOW')
plt.xlabel('Time')
plt.ylabel('ratio')
plt.legend(loc="upper right")
plt.show()
plt.gcf().clear()

# The averages leave out the last window.
for template, ratios in (
    ("Hit ratio top 30: %f", hit_ratio_top_30),
    ("Hit ratio top 10: %f", hit_ratio_top_10),
    ("Hit ratio top 5: %f", hit_ratio_top_5),
):
    completed = ratios[:-1]
    print(template % (sum(completed) / len(completed)))

In [None]:
# "Still in" hit ratio at cut-off 10 for every evaluation window.
window_axis = range(len(hit_ratio_still_top10))
plt.plot(window_axis, hit_ratio_still_top10, color='blue', label='hit_ratios')
plt.title('NEXT')
plt.xlabel('Time')
plt.ylabel('ratio')
plt.legend(loc="upper right")
plt.show()
plt.gcf().clear()

# The averages leave out the last window.
for template, ratios in (
    ("Still in hit ratio top 30: %f", hit_ratio_still_top30),
    ("Still in hit ratio top 10: %f", hit_ratio_still_top10),
    ("Still in hit ratio top 5: %f", hit_ratio_still_top5),
):
    completed = ratios[:-1]
    print(template % (sum(completed) / len(completed)))

In [None]:
# Average F1 per cut-off, leaving out the last window; a blank line separates
# the plain scores from the "still in" scores.
plain_f1_rows = (
    ("F1 score top 30: %f", f1_top_30),
    ("F1 score top 10: %f", f1_top_10),
    ("F1 score top 5: %f", f1_top_5),
)
still_f1_rows = (
    ("F1 score still top 30: %f", f1_still_30),
    ("F1 score still top 10: %f", f1_still_10),
    ("F1 score still top 5: %f", f1_still_5),
)

for template, scores in plain_f1_rows:
    completed = scores[:-1]
    print(template % (sum(completed) / len(completed)))
print()
for template, scores in still_f1_rows:
    completed = scores[:-1]
    print(template % (sum(completed) / len(completed)))

In [None]:
# For items that are in the current window's top list but NOT in the next
# window's, record 1 - hit_recall of the recommendation against them.
hit_ratio_not_top30 = []
hit_ratio_not_top10 = []
hit_ratio_not_top5 = []

# (slice depth for the ground-truth lists, cut-off, result list);
# a depth of None means "use the whole list".
not_in_sinks = (
    (None, 30, hit_ratio_not_top30),
    (10, 10, hit_ratio_not_top10),
    (5, 5, hit_ratio_not_top5),
)

for predicted, truth_now, truth_next in zip(top_items_list_75, top_gt_list_now, top_gt_list_next):
    for depth, n, sink in not_in_sinks:
        next_slice = truth_next[:depth]
        dropped_items = [item for item in truth_now[:depth] if item not in next_slice]
        sink.append(1 - hit_recall(predicted, dropped_items, N=n))

In [None]:
# Average "not in" hit ratio per cut-off, leaving out the last window.
for template, ratios in (
    ("Not in hit ratio top 30: %f", hit_ratio_not_top30),
    ("Not in hit ratio top 10: %f", hit_ratio_not_top10),
    ("Not in hit ratio top 5: %f", hit_ratio_not_top5),
):
    completed = ratios[:-1]
    print(template % (sum(completed) / len(completed)))

# Old

# Still in ratio

In [None]:
# How much of the current window's top-30 is still in the next window's
# top-30, plus a few activity statistics of the current window.
generator = train_test_split(df, user_time_interval, split_rate=0.5)
count = 0
still_in_ratio = []
watched_people = []
watched_count = []
watched_item = []

# A plain for-loop ends when the generator is exhausted; unlike the former
# `while True` / `except StopIteration` wrapper it cannot swallow a
# StopIteration raised from inside the body.
for df_train, df_test_1, df_test_2 in generator:
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    test_data_1 = df_test_1.values
    test_data_2 = df_test_2.values
    user_test_matrix_1 = get_matrix(test_data_1)
    user_test_matrix_2 = get_matrix(test_data_2)

    # Interactions per item / per user in the current window, computed once.
    item_counts_now = np.count_nonzero(user_test_matrix_1, axis=0)
    user_counts_now = np.count_nonzero(user_test_matrix_1, axis=1)

    ground_truth_next = np.count_nonzero(user_test_matrix_2, axis=0).argsort()[::-1][:30]
    ground_truth_now = item_counts_now.argsort()[::-1][:30]

    watched_count.append(sum(item_counts_now) / len(item_counts_now))  # mean interactions per item
    watched_item.append(len(np.nonzero(item_counts_now)[0]))           # items with any interaction
    watched_people.append(len(np.nonzero(user_counts_now)[0]))         # users with any interaction

    still_in = hit_recall(ground_truth_next, ground_truth_now, N=30)
    still_in_ratio.append(still_in)

    count += 1
    print (count)

In [None]:
# Still-in ratio per window; the figure is written to disk before it is shown
# and cleared.
window_axis = range(len(still_in_ratio))
plt.plot(window_axis, still_in_ratio, color='blue', label='still_in_ratio')
plt.title("Netflix_top_30_still_in_ratio")
plt.xlabel('Users')
plt.ylabel('score')
plt.legend(loc="upper right")
plt.savefig("./prediction_weekly/Netflix_still_in_top_30.jpg")
plt.show()
plt.gcf().clear()

# Mean still-in ratio over all windows.
mean_still_in = sum(still_in_ratio) / len(still_in_ratio)
print(mean_still_in)

## Train

In [5]:
# Baseline: one autoencoder trained window after window; items are ranked by
# statistics of its predicted scores over the users of the current window.
generator = train_test_split(df, user_time_interval)

tf.reset_default_graph()

# The model is created once, outside the loop, and train_all is called on the
# same instance for every window.
autoencoder = AutoEncoder(user_num=total_usr, item_num=total_item, mode='user', loss_function='log_loss',
                          denoise_function='dropout', denoising=False, batch_size=1, epochs=200)

test_out_top = []
top_items_list_all = []
top_items_list_75 = []
top_items_list_85 = []
top_means_list = []
top_gt_list_next = []
top_gt_list_now = []


def _mean_above(scores, thresholds):
    """For each column of ``scores``, average the entries strictly above that
    column's threshold (NaN when no entry qualifies)."""
    return np.asarray([
        np.mean([x for x in scores.T[col] if x > thresholds[col]])
        for col in range(scores.shape[1])
    ])


# Iterating the generator directly keeps a StopIteration raised by the
# training code from being mistaken for the end of the data.
for df_train, df_test_1, df_test_2 in generator:
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    train_data = df_train.values
    test_data_1 = df_test_1.values
    test_data_2 = df_test_2.values

    user_train_matrix = get_matrix(train_data)
    user_test_matrix_1 = get_matrix(test_data_1)
    user_test_matrix_2 = get_matrix(test_data_2)

    # 30 most-interacted training items and the remaining item indices
    # (neither is passed to train_all in this cell).
    top_n = np.count_nonzero(user_train_matrix, axis=0).argsort()[::-1][:30]
    others = [k for k in range(total_item) if k not in top_n]

    # Row indices of users with at least one interaction in the window.
    train_user = np.nonzero(np.count_nonzero(user_train_matrix, axis=1))[0]
    test_user_1 = np.nonzero(np.count_nonzero(user_test_matrix_1, axis=1))[0]

    autoencoder.train_all(rating=user_train_matrix, train_idents=train_user, topN=None, weight=None)

    test_out = autoencoder.predict(user_test_matrix_1, test_user_1)

    # --- per-item statistics of the predicted scores ---
    # mean over all rows of test_out
    test_out_stat_all = np.mean(test_out, axis=0)

    # mean of the scores above the 75th percentile of each item
    quartile_75 = np.percentile(test_out, 75, axis=0)
    test_out_stat_75 = _mean_above(test_out, quartile_75)

    # mean of the scores above the 85th percentile of each item
    quartile_85 = np.percentile(test_out, 85, axis=0)
    test_out_stat_85 = _mean_above(test_out, quartile_85)

    # --- top-30 item indices under each statistic, and the ground truth ---
    test_out_rank_all = test_out_stat_all.argsort()[::-1][:30]
    test_out_rank_75 = test_out_stat_75.argsort()[::-1][:30]
    test_out_rank_85 = test_out_stat_85.argsort()[::-1][:30]
    ground_truth_next = np.count_nonzero(user_test_matrix_2, axis=0).argsort()[::-1][:30]
    ground_truth_now = np.count_nonzero(user_test_matrix_1, axis=0).argsort()[::-1][:30]

    # --- collect results ---
    # Disabled: per-item score traces (leaves test_out_top empty).
    # top_out = np.take(test_out, test_out_rank_all, axis=1).T
    # test_out_top.append(top_out)

    top_items_list_all.append(test_out_rank_all)
    top_items_list_75.append(test_out_rank_75)
    top_items_list_85.append(test_out_rank_85)
    top_gt_list_next.append(ground_truth_next)
    top_gt_list_now.append(ground_truth_now)

    # Disabled: (item, mean) pairs per window (leaves top_means_list empty).
    # means = []
    # for i in test_out_rank_all:
    #     means.append((i, test_out_stat_all[i]))
    # top_means_list.append(means)

# top_items_list_all = np.asarray(top_items_list_all)
top_items_list_75 = np.asarray(top_items_list_75)
top_items_list_85 = np.asarray(top_items_list_85)
top_means_list = np.asarray(top_means_list)
top_gt_list_now = np.asarray(top_gt_list_now)
top_gt_list_next = np.asarray(top_gt_list_next)

np.save('./rec_lists/itri_org_rec_lists.npy', top_items_list_75)

100%|██████████| 200/200 [00:28<00:00,  7.07it/s]
100%|██████████| 200/200 [00:36<00:00,  5.54it/s]
100%|██████████| 200/200 [00:43<00:00,  4.62it/s]
100%|██████████| 200/200 [00:48<00:00,  4.16it/s]
100%|██████████| 200/200 [00:55<00:00,  3.60it/s]
100%|██████████| 200/200 [01:08<00:00,  2.93it/s]
100%|██████████| 200/200 [01:22<00:00,  2.43it/s]
100%|██████████| 200/200 [01:33<00:00,  2.13it/s]
100%|██████████| 200/200 [01:39<00:00,  2.00it/s]
100%|██████████| 200/200 [01:48<00:00,  1.85it/s]
100%|██████████| 200/200 [01:54<00:00,  1.75it/s]
100%|██████████| 200/200 [02:00<00:00,  1.67it/s]
100%|██████████| 200/200 [01:58<00:00,  1.69it/s]


In [8]:
# Per-item mean of the scores above the 85th percentile, from the last window.
print(test_out_stat_85)

[ 0.38965526  0.05022873  0.59063607  0.04710175  0.47774079  0.21033356
  0.01748728  0.3823854   0.03187207  0.05401026  0.02712947  0.08351601
  0.01007607  0.03252231  0.03898494  0.18826996  0.1525865   0.38864198
  0.06566289  0.02996645  0.04550518  0.02160897  0.08674006  0.13260762
  0.0982428   0.21181668  0.08661996  0.52284533  0.0598496   0.799788
  0.02612696  0.14254397  0.26623634  0.01706529  0.08340617  0.11883468
  0.07308783  0.03995959  0.04486014  0.03757913  0.0176826   0.02552888
  0.02324141  0.28580835  0.08427937  0.03931291  0.18164438  0.22499211
  0.04502813  0.04803188  0.01379419  0.26758412  0.03275115  0.03697237
  0.10946263  0.19010942  0.24229836  0.7821936   0.02660504  0.02828271
  0.01920071  0.043816    0.06193104  0.01652925  0.03460154  0.01573307
  0.04040642  0.18195875  0.11608242  0.03440028  0.13441916  0.02430603
  0.12760074  0.07817332  0.02653882  0.26281956  0.43955365  0.08969804
  0.24037383  0.04641305  0.09413739  0.04402523  0.3

In [10]:
# One scatter plot per item: every user's predicted score plus three dashed
# reference lines (overall mean, upper-15% mean, upper-25% mean).
count = 0
n_users = test_out.shape[0]
user_axis = range(n_users)

for i in range(test_out.shape[1]):
    count += 1
    print (count, end="\t")

    item_scores = test_out.T[i]
    # The mean is computed once per item; it used to be re-summed for every
    # user, which made the cell quadratic in the number of users.
    mean = [sum(item_scores) / n_users] * n_users if n_users else []
    up_85 = [test_out_stat_85[i]] * n_users
    up_75 = [test_out_stat_75[i]] * n_users

    colors = np.random.rand(n_users)
    plt.scatter(user_axis, item_scores, c=colors, alpha=0.5)
    # linestyle='--' replaces the 'r--' format string, whose colour part
    # clashed with the explicit color= keyword on every line.
    plt.plot(user_axis, mean, linestyle='--', color='red', label='mean')
    plt.plot(user_axis, up_85, linestyle='--', color='green', label='up 15')
    plt.plot(user_axis, up_75, linestyle='--', color='blue', label='up 25')
    plt.xlabel('User')
    plt.ylabel('score')
    plt.legend(loc="upper right")
    plt.title('Video %d prediction scores' % (i))
    plt.savefig('./figs/Video_%d_scores' % (i))
    plt.gcf().clear()
    

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50	51	52	53	54	55	56	57	58	59	60	61	62	63	64	65	66	67	68	69	70	71	72	73	74	75	76	77	78	79	80	81	82	83	84	85	86	87	88	89	90	91	92	93	94	95	96	97	98	99	

# One line plot per (window, ranked item): the item's predicted scores with a
# horizontal line at its mean.  The curve is green when the item is in the
# current window's ground-truth top list and red otherwise.
for i in trange(top_means_list.shape[0]):
    for j in range(top_means_list.shape[1]):
        item = top_items_list_all[i][j]
        curve_color = 'green' if item in top_gt_list_now[i] else 'red'

        scores = test_out_top[i][j]
        plt.plot(range(len(scores)), scores, color=curve_color, label='pred. scores')

        item_mean = top_means_list[i][j][1]
        plt.hlines(item_mean, -5, len(scores)+5, linestyles='solid', color='blue')
        plt.legend(loc="upper right")
        plt.title("Time: %d, Item: %d, mean: %f" % (i, item, item_mean))
        plt.xlabel('Users')
        plt.ylabel('score')
        plt.savefig("plots_netflix_now/scores_%d_%d.jpg" % (i, j))

        plt.show()
        plt.gcf().clear()

# ITRI

In [None]:
# Hit ratio and F1 of the recommended lists against the next window's ground
# truth, at cut-offs 30, 10 and 5.
hit_ratio_top_30 = []
hit_ratio_top_10 = []
hit_ratio_top_5 = []
f1_top_30 = []
f1_top_10 = []
f1_top_5 = []

# (cut-off, hit-ratio list, F1 list)
top_sinks = (
    (30, hit_ratio_top_30, f1_top_30),
    (10, hit_ratio_top_10, f1_top_10),
    (5, hit_ratio_top_5, f1_top_5),
)

for predicted, truth_next in zip(top_items_list_75, top_gt_list_next):
    for n, hit_sink, f1_sink in top_sinks:
        # hit_recall is evaluated once per cut-off and the value reused.
        hit = hit_recall(predicted, truth_next, N=n)
        hit_sink.append(hit)
        # Precision and recall are both `hit` here, so F1 = 2*hit^2 / (2*hit).
        # With no hits the ratio is 0/0; it is recorded as 0.0 at every
        # cut-off (previously 30 and 10 raised ZeroDivisionError while 5
        # stored None, which broke the later averaging).
        f1_sink.append(2 * hit ** 2 / (2 * hit) if hit else 0.0)

In [None]:
# Hit ratio and F1 restricted to items that are in the top list of BOTH the
# current and the next window ("still in" items).
hit_ratio_still_top30 = []
hit_ratio_still_top10 = []
hit_ratio_still_top5 = []
f1_still_30 = []
f1_still_10 = []
f1_still_5 = []

# (slice depth for the ground-truth lists, cut-off, hit-ratio list, F1 list);
# a depth of None means "use the whole list", as the original top-30 case did.
still_sinks = (
    (None, 30, hit_ratio_still_top30, f1_still_30),
    (10, 10, hit_ratio_still_top10, f1_still_10),
    (5, 5, hit_ratio_still_top5, f1_still_5),
)

for predicted, truth_now, truth_next in zip(top_items_list_75, top_gt_list_now, top_gt_list_next):
    for depth, n, hit_sink, f1_sink in still_sinks:
        now_slice = truth_now[:depth]
        still_in_items = [item for item in truth_next[:depth] if item in now_slice]

        # hit_recall is evaluated once per cut-off and the value reused.
        recall = hit_recall(predicted, still_in_items, N=n)
        hit_sink.append(recall)

        # Precision is the recall rescaled by the share of the cut-off that
        # the "still in" list fills.
        precision = recall * min(n, len(still_in_items)) / n
        denominator = recall + precision
        # A zero denominator (no hits) is recorded as 0.0 at every cut-off.
        # Any other error now propagates instead of being swallowed by the
        # bare `except` that used to guard only the top-5 case.
        f1_sink.append(2 * recall * precision / denominator if denominator else 0.0)

## upper quartile mean

In [None]:
# Hit ratio at cut-off 30 for every evaluation window.
window_axis = range(len(hit_ratio_top_30))
plt.plot(window_axis, hit_ratio_top_30, color='blue', label='hit_ratios')
plt.title('NOW')
plt.xlabel('Time')
plt.ylabel('ratio')
plt.legend(loc="upper right")
plt.show()
plt.gcf().clear()

# The averages leave out the last window.
for template, ratios in (
    ("Hit ratio top 30: %f", hit_ratio_top_30),
    ("Hit ratio top 10: %f", hit_ratio_top_10),
    ("Hit ratio top 5: %f", hit_ratio_top_5),
):
    completed = ratios[:-1]
    print(template % (sum(completed) / len(completed)))

In [None]:
# "Still in" hit ratio at cut-off 10 for every evaluation window.
window_axis = range(len(hit_ratio_still_top10))
plt.plot(window_axis, hit_ratio_still_top10, color='blue', label='hit_ratios')
plt.title('NEXT')
plt.xlabel('Time')
plt.ylabel('ratio')
plt.legend(loc="upper right")
plt.show()
plt.gcf().clear()

# The averages leave out the last window.
for template, ratios in (
    ("Still in hit ratio top 30: %f", hit_ratio_still_top30),
    ("Still in hit ratio top 10: %f", hit_ratio_still_top10),
    ("Still in hit ratio top 5: %f", hit_ratio_still_top5),
):
    completed = ratios[:-1]
    print(template % (sum(completed) / len(completed)))

In [None]:
# Average F1 per cut-off, leaving out the last window; a blank line separates
# the plain scores from the "still in" scores.
plain_f1_rows = (
    ("F1 score top 30: %f", f1_top_30),
    ("F1 score top 10: %f", f1_top_10),
    ("F1 score top 5: %f", f1_top_5),
)
still_f1_rows = (
    ("F1 score still top 30: %f", f1_still_30),
    ("F1 score still top 10: %f", f1_still_10),
    ("F1 score still top 5: %f", f1_still_5),
)

for template, scores in plain_f1_rows:
    completed = scores[:-1]
    print(template % (sum(completed) / len(completed)))
print()
for template, scores in still_f1_rows:
    completed = scores[:-1]
    print(template % (sum(completed) / len(completed)))

# Debug

In [None]:
# Debug helper: rebuild the binary training matrices for a single window.
# Note: sklearn's splitter is intentionally left un-imported, since it would
# replace the train_test_split generator defined earlier in this notebook.
# from sklearn.model_selection import train_test_split

# train_test_split yields exactly three frames, so three names are unpacked
# (unpacking four raised ValueError).  A fresh generator is created because
# the shared `generator` has already been run to exhaustion by the training
# loop, so next() on it would raise StopIteration.
df_train, df_test, _df_test_next = next(train_test_split(df, user_time_interval))

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
train_data = df_train.values
test_data = df_test.values

# Same construction as the hand-written loop it replaces: a float32
# (total_usr, total_item) matrix with 1 at every (user, item) seen in training.
user_train_rating = get_matrix(train_data)

item_train_rating = user_train_rating.T

In [None]:
# Per item (column of test_out): mean of the scores strictly above that
# item's 75th percentile.
a = np.percentile(test_out, 75, axis=0)
k = [
    np.mean([score for score in test_out.T[col] if score > a[col]])
    for col in range(test_out.shape[1])
]

In [None]:
# Number of per-item means; sorting the list first (as before) cannot change
# its length, so the sort is dropped.
print(len(k))