In [1]:
# some basic libs
import os
import sys
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from sklearn.cluster import KMeans, DBSCAN

In [2]:
# numba
import warnings
from numba.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning

warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)
warnings.filterwarnings("ignore", category=NumbaPendingDeprecationWarning)

In [3]:
%load_ext autoreload
%autoreload 2

sys.path.append('../src')

import FPMC
import STDBSCAN
import preprocessing, params_tuning, visualization, calc_metrics

In [4]:
print('Python       :', sys.version.split('\n')[0])
!pip freeze > ../requirements.txt

Python       : 3.8.1 (default, Jan  8 2020, 16:15:59) 


In [5]:
path = '../data/BetaUser Data V1.1'
case_list = sorted([x for x in os.listdir(path) if x.endswith('.csv')])
case_list

['lxb.csv', 'lyc.csv', 'mzp.csv', 'spp.csv', 'zlw.csv', 'zsb.csv']

In [6]:
case = case_list[1]

In [7]:
target_column = 'packageName'
train_period = 5

In [8]:
pd.read_csv(os.path.join(path, case))

Unnamed: 0,rowid,timestamp,packageName,activityName,cellID,MCC,MNC,LAC,BSSID,longitude,latitude,city
0,1,1564588809054,com.tencent.mm,com.tencent.mm.ui.LauncherUI,,,,,,,,
1,2,1564588866253,com.tencent.mm,com.tencent.mm.plugin.sns.ui.SnsTimeLineUI,,,,,,,,
2,3,1564588867248,com.tencent.mm,com.tencent.mm.plugin.sns.ui.SnsBrowseUI,,,,,,,,
3,4,1564588869788,com.tencent.mm,com.tencent.mm.plugin.sns.ui.SnsTimeLineUI,,,,,,,,
4,5,1564588870292,com.tencent.mm,com.tencent.mm.plugin.sns.ui.SnsBrowseUI,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
15632,15633,1567267022149,com.dianping.v1,com.dianping.foodshop.preview.ShopHeadPreviewA...,54423181.0,460.0,0.0,10168.0,20:a6:80:7d:86:48,-1.0,-1.0,0
15633,15634,1567267023332,com.dianping.v1,com.dianping.ugc.album.AlbumActivity,54423181.0,460.0,0.0,10168.0,20:a6:80:7d:86:48,-1.0,-1.0,0
15634,15635,1567267135136,com.dianping.v1,com.dianping.foodshop.preview.ShopHeadPreviewA...,54423181.0,460.0,0.0,10168.0,20:a6:80:7d:86:48,-1.0,-1.0,0
15635,15636,1567267137132,com.dianping.v1,com.dianping.shopshell.ShopInfoActivity,54423181.0,460.0,0.0,10168.0,20:a6:80:7d:86:48,-1.0,-1.0,0


In [9]:
%%time
df, app_list, idx_to_app, app_to_idx = preprocessing.get_df(
    os.path.join(path, case))
loc_cols = ['longitude', 'latitude']
time_cols = [x for x in df.columns if (('timeDay' in x) or ('timeWeek' in x))]

CPU times: user 2.19 s, sys: 74.3 ms, total: 2.26 s
Wall time: 2.26 s


In [10]:
df_train = df[df.index.date < (df.iloc[-1].name.date() - pd.Timedelta(7, unit='days'))]
df_test = df[df.index.date >= (df.iloc[-1].name.date() - pd.Timedelta(7, unit='days'))]

In [11]:
df_test.head()

Unnamed: 0_level_0,packageName,cellID,MCC,MNC,LAC,BSSID,longitude,latitude,timeDay_sin,timeDay_cos,timeWeek_sin,timeWeek_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-08-24 00:01:08+08:00,com.tencent.mm,46807297.0,460.0,0.0,10135.0,0,114.117131,22.544654,0.004363,0.99999,-0.975066,-0.221913
2019-08-24 00:03:45+08:00,com.tencent.mm,46807297.0,460.0,0.0,10135.0,0,114.117131,22.544654,0.01309,0.999914,-0.975342,-0.220697
2019-08-24 00:03:48+08:00,com.tencent.mm,46807297.0,460.0,0.0,10135.0,0,114.117131,22.544654,0.01309,0.999914,-0.975342,-0.220697
2019-08-24 00:03:52+08:00,com.tencent.mm,46807297.0,460.0,0.0,10135.0,0,114.117131,22.544654,0.01309,0.999914,-0.975342,-0.220697
2019-08-24 00:03:52+08:00,com.tencent.mm,46807297.0,460.0,0.0,10135.0,0,114.117131,22.544654,0.01309,0.999914,-0.975342,-0.220697


In [12]:
len(df_train), len(df_test)

(11938, 3670)

# Заполнение пропущенных местоположений

In [13]:
%%time
df_train, gps_wifi, gps = preprocessing.fill_missing_locations(df_train, mode='train')

CPU times: user 705 ms, sys: 9.25 ms, total: 715 ms
Wall time: 724 ms


In [14]:
%%time
df_test = preprocessing.fill_missing_locations(df_test, mode='test', gps_wifi=gps_wifi, gps=gps)

CPU times: user 10.2 s, sys: 322 ms, total: 10.5 s
Wall time: 10.4 s


In [15]:
len(df_train), len(df_test)

(11938, 3670)

In [16]:
# мы не предиктим то же приложение!!!
df_train = df_train[df_train['packageName'].shift() != df_train['packageName']]
df_test = df_test[df_test['packageName'].shift() != df_test['packageName']]

# Выделение кластеров
## ST-DBSCAN (очень медленно)

In [17]:
# df_train

In [18]:
# %%time
# spatial_threshold = 200  # meters
# temporal_threshold = 0.1
# min_neighbors = 1

# df_clustering = STDBSCAN.ST_DBSCAN(df_train.reset_index(), spatial_threshold, temporal_threshold, min_neighbors, 
#                                    time_cols = ['timeDay_sin', 'timeDay_cos', 'timeWeek_sin', 'timeWeek_cos'])

In [19]:
# df_clustering

## Кластеризация по времени

In [20]:
eps_time = 0.1
n_time = 5

In [21]:
%%time
df_train, time_clust, time_cl = preprocessing.clusterization(df_train, time_cols, 
                                                                  eps_time, n_time, dim='time')

CPU times: user 352 ms, sys: 94.4 ms, total: 446 ms
Wall time: 135 ms


In [22]:
len(np.unique(df_train.time_cluster)), np.unique(df_train.time_cluster)

(45,
 array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]))

In [23]:
# for c in np.unique(df_train.time_cluster):
#     print(c, len(df_train[df_train.time_cluster==c]), np.unique(df_train[df_train.time_cluster==c].index.hour), np.unique(df_train[df_train.time_cluster==c].index.weekday))

In [24]:
%%time
df_test = preprocessing.clusterization(df_test, time_cols, eps_time, n_time, 'time', 'test', time_cl)

CPU times: user 173 ms, sys: 49.4 ms, total: 222 ms
Wall time: 64.9 ms


In [25]:
np.unique(df_test.time_cluster)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42])

In [26]:
visualization.plot_temporal_clusters(df_train, 'train')

In [27]:
visualization.plot_temporal_clusters(df_test, 'test')

## Кластеризация местоположения

In [28]:
eps_loc = 0.2
n_loc = 1

In [29]:
%%time
cluster = 0
for i in df_train.time_cluster.unique():
    cur_train = df_train[df_train['time_cluster'] == i]
    cur_train, loc_clust, loc_cl = preprocessing.clusterization(cur_train, loc_cols, eps_loc, n_loc, 'loc') 
    for idx, row in df_train[df_train['time_cluster'] == i].iterrows():
        df_train.loc[idx,'cluster'] = cur_train.loc[idx, 'loc_cluster'] + cluster
       
    
    cur_test = df_test[df_test['time_cluster'] == i]
    if len(cur_test) == 0:
        continue
#     print(len(cur_test))
    cur_test = preprocessing.clusterization(cur_test, loc_cols, eps_loc, n_loc, 'loc', 'test', loc_cl)
    for idx, row in df_test[df_test['time_cluster'] == i].iterrows():
        df_test.loc[idx,'cluster'] = cur_test.loc[idx, 'loc_cluster'] + cluster
    cluster += cur_train['loc_cluster'].max() + 1

CPU times: user 2.5 s, sys: 24.9 ms, total: 2.52 s
Wall time: 2.55 s


In [30]:
df_train.cluster.unique().max(), df_test.cluster.unique().max()

(131.0, 130.0)

In [31]:
# %%time
# df_train, loc_clust, loc_cl = fpmc_preprocessing.clusterization(df_train, loc_cols, eps_loc, n_loc, 'loc')

In [32]:
# len(np.unique(df_train.loc_cluster)), np.unique(df_train.loc_cluster)

In [33]:
# %%time
# df_test = fpmc_preprocessing.clusterization(df_test, loc_cols, eps_loc, n_loc, 'loc', 'test', loc_cl)

In [34]:
# np.unique(df_test.loc_cluster)

In [35]:
df_train.cluster.min()

0.0

In [36]:
visualization.plot_map(pd.concat(
    [df_train, df_test]), 'cluster', df_train.cluster.min(), df_train.cluster.max())

## Выделение spatio-temporal кластеров <BR> (больше размерность и может возникнуть новый кластер в тесте)

In [37]:
# %%time
# df_train, cluster_dict = fpmc_preprocessing.final_clusterization(df_train, time_cols, loc_cols)

In [38]:
# %%time
# df_test = fpmc_preprocessing.final_clusterization(df_test, time_cols, loc_cols, 
#                                                   mode='test', clucter_dict=cluster_dict)

In [39]:
# len(cluster_dict)

# Генерируем историю приложений (по времени)

In [40]:
# df_clustering.cluster = df_clustering.cluster.astype(int)
df_train.cluster = df_train.cluster.astype(int)
df_test.cluster = df_test.cluster.astype(int)

In [41]:
%%time
test = preprocessing.generate_previous_events(
    pd.concat([
        df_train[df_train.index >= df_train.index[-1] - pd.Timedelta(train_period, unit='minutes')], 
        df_test
    ]), 
    train_period, app_to_idx, mode='test', from_=df_test.index[0])

CPU times: user 106 ms, sys: 29.1 ms, total: 135 ms
Wall time: 137 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [42]:
%%time
ttest = preprocessing.generate_next_events(df_test, train_period, app_to_idx)

CPU times: user 92.5 ms, sys: 15.8 ms, total: 108 ms
Wall time: 109 ms


In [43]:
test = test.merge(ttest, on=['cluster', 'cur_app_idx'], right_index=True, left_index=True)

In [44]:
%%time
train = preprocessing.generate_previous_events(df_train, train_period, app_to_idx)

CPU times: user 655 ms, sys: 297 ms, total: 952 ms
Wall time: 960 ms


In [45]:
# train_clustering = preprocessing.generate_previous_events(df_clustering.set_index('timestamp'), train_period, app_to_idx)

# Обучение FPMC <br> (используем кластер как "юзер")

In [47]:
# # Раскомментировать, если проверяем без "персонализации"
# train['cluster'] = 0
# train.head()

In [48]:
lr_list=np.logspace(-5,-1,5)
reg_list=np.logspace(-5,-1,5)
n_neg_list=[0,2,5,8]
# std_list=np.logspace(-5,-2,4)
n_epochs = 20

best_model = params_tuning.tuning(train.cluster.unique(), idx_to_app.keys(), 
                       train, 
                       factor=min(len(idx_to_app.keys()), 64), 
                       train_ratio=0.9, n_epoch=n_epochs,
                       lr_list=lr_list, reg_list=reg_list, n_neg_list=n_neg_list)

{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 0, 'std': 0.01}
		 mrr = 0.0629617461374191; acc = 0.027522935779816515
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 2, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 5, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 8, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 0, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 2, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 5, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 1e-05, 'neg_batch_size': 8, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 0.0001, 'neg_batch_size': 0, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular': 0.0001, 'neg_batch_size': 2, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 1e-05, 'regular

{'n_factor': 64, 'learn_rate': 0.001, 'regular': 1e-05, 'neg_batch_size': 8, 'std': 0.1}
		 mrr = 0.3915483503476902; acc = 0.21559633027522937
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 0, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 2, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 5, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 8, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 0, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 2, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 5, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.0001, 'neg_batch_size': 8, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.001, 'regular': 0.001, 'neg_batch_size': 0, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.001, 'reg

{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 5, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 8, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 0, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 2, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 5, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.001, 'neg_batch_size': 8, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.01, 'neg_batch_size': 0, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.01, 'neg_batch_size': 2, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.01, 'neg_batch_size': 5, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.01, 'neg_batch_size': 8, 'std': 0.01}
{'n_factor': 64, 'learn_rate': 0.1, 'regular': 0.01, 'neg_batch_size': 0, 'std': 0.1}
{'n_factor': 64, 'learn_rate': 0.1, 'regul

In [49]:
print('Final parameters for case {}: {}'.format(case, best_model.params))

Final parameters for case lyc.csv: {'n_factor': 64, 'learn_rate': 0.01, 'regular': 0.01, 'neg_batch_size': 8, 'std': 0.01}


# Пример обучения и прогнозирования

In [50]:
fpmc = FPMC.FPMC_numba(train.cluster.unique(), idx_to_app.keys(), **best_model.params)

In [52]:
%%time
_ = fpmc.learnSBPR_FPMC(train, n_epoch=20, verbose=True)

epoch 0 done
epoch 1 done
epoch 2 done
epoch 3 done
epoch 4 done
epoch 5 done
epoch 6 done
epoch 7 done
epoch 8 done
epoch 9 done
epoch 10 done
epoch 11 done
epoch 12 done
epoch 13 done
epoch 14 done
epoch 15 done
epoch 16 done
epoch 17 done
epoch 18 done
epoch 19 done
	In sample:0.2376	0.4104
CPU times: user 1.63 s, sys: 21.8 ms, total: 1.65 s
Wall time: 1.6 s


In [53]:
scores, preds = [], []
for i, row in test.iterrows():
    score = np.array([FPMC.sigmoid_jit(x) for x in FPMC.compute_x_batch_jit(
        row['cluster'], row['prev_apps_idx'], fpmc.VUI_m_VIU, fpmc.VIL_m_VLI)])
    pred = [k for k in np.argsort(score)[::-1]]

    fpmc.folding_in(row[:3], 10)

    scores.append(score)
    preds.append(pred)

In [54]:
test['scores'] = scores
test['preds'] = preds

In [62]:
with open('../output/predictions_5min.pkl', 'wb') as f:
    pickle.dump(test, f)

# Эксперименты для разных временных отрезков

In [None]:
results = pd.DataFrame()
for i, train_period in tqdm(enumerate([1, 2, 5, 10, 15, 30, 60])):
    for pred_period in tqdm([1, 2, 5, 10, 15, 30]):
        test = preprocessing.generate_previous_events(pd.concat([df_train[df_train.index >= df_train.index[-1] - pd.Timedelta(
            train_period, unit='minutes')], df_test]), train_period, app_to_idx, mode='test', from_=df_test.index[0])
        ttest = preprocessing.generate_next_events(
            df_test, pred_period, app_to_idx)
        test = test.merge(
            ttest, on=['cluster', 'cur_app_idx'], right_index=True, left_index=True)
        train = preprocessing.generate_previous_events(
            df_train, train_period, app_to_idx)
#         test['cluster'] = 0
#         train['cluster'] = 0

        fpmc = FPMC_numba.FPMC(train.cluster.unique(), idx_to_app.keys(
        ), n_factor=64, learn_rate=0.01, regular=0.01, neg_batch_size=5, std=0.1)
        _ = fpmc.learnSBPR_FPMC(train, n_epoch=20, verbose=False)

        scores, preds = [], []
        for i, row in test.iterrows():
            score = np.array([FPMC_numba.sigmoid_jit(x)
                              for x in FPMC_numba.compute_x_batch_jit(
                                  row['cluster'], row['prev_apps_idx'], fpmc.VUI_m_VIU, fpmc.VIL_m_VLI)
                              ])
            pred = [k for k in np.argsort(score)[::-1]]
            fpmc.folding_in(row[:3], 10)
            scores.append(score)
            preds.append(pred)

        test['scores'] = scores
        test['preds'] = preds
        test['MRR'] = test.apply(find_rank, axis=1)
        acc, prec, recall, f1 = accuracy_precision_recall(
            test['next_apps_idx'],
            test['preds'].values,
            K=10
        )
        cur_res = pd.DataFrame(
            [acc.mean(axis=0), prec.mean(axis=0),
             recall.mean(axis=0), f1.mean(axis=0)],
            index=['acc@K', 'prec@K', 'recall@K', 'F1@K'],
            columns=[f'K={x+1}' for x in range(acc.shape[1])]
        )
        cur_res['train_period, min'] = train_period
        cur_res['pred_period, min'] = pred_period
        cur_res['MRR'] = np.round(test['MRR'].mean(), 3)

        results = pd.concat([results, cur_res])

In [None]:
print(results.set_index(['train_period, min', 'pred_period, min', 'MRR', 'metric']).iloc[:,:5].to_latex())

In [None]:
results_noclust[[x for x in results_noclust.columns if 'K' in x]] = results_noclust[[x for x in results_noclust.columns if 'K' in x]].apply(lambda x: np.round(x,3))

In [None]:
best_fmc = results_noclust[results_noclust['MRR'] == results_noclust['MRR'].max()] \
    .reset_index() \
    .rename(columns={'index':'metric'}) \
    .set_index(['train_period, min', 'pred_period, min', 'MRR', 'metric'])

print(best_fmc.to_latex())

In [None]:
results_noclust = results_noclust.reset_index() \
    .rename(columns={'index':'metric'}) \
    .set_index(['train_period, min', 'pred_period, min', 'MRR'])

In [None]:
results_noclust.loc[results_noclust[results_noclust['metric'] == 'prec@K']['K=2'].argmax()]

In [None]:
results_noclust = results_noclust.reset_index()

In [None]:
results_noclust.head()

In [None]:
[np.arange(1,11)] * 10

In [None]:
cur_m = results_noclust[results_noclust['metric'] == m]
cur_m[[col for col in cur_m.columns if 'K' in col]].to_numpy().max()

In [None]:
from bokeh.palettes import T

In [None]:
from bokeh.palettes import Spectral11, Set3_12
from bokeh.io import export_png

for m in ['acc@K', 'prec@K', 'recall@K', 'F1@K']:
    output_file(f"FMC_{m}.html")
    ss = []
    cur_m = results_noclust[results_noclust['metric'] == m]
    for train_period in results_noclust['train_period, min'].unique():
        cur = cur_m[cur_m['train_period, min'] == train_period]
        s = figure(width=800, height=600, 
                   x_range=((1,10)),
                   y_range=((cur_m[[col for col in cur_m.columns if 'K' in col]].to_numpy().min()-0.01,
                             cur_m[[col for col in cur_m.columns if 'K' in col]].to_numpy().max()+0.01)),
                    title=f'train_period = {train_period} min',
                    toolbar_location=None, tools="")
        mypalette=Set3_12[1:len(cur)+2][::-1]

    #     s.circle(4, 0.2, size=0.00000001, color= "#ffffff", legend='prediction')
    #     s.circle(4, 0.3, size=0.00000001, color= "#ffffff", legend='period, min')
        if train_period == 60:
            continue
        for j, (i, row) in enumerate(cur.iterrows()):
            s.line(np.arange(1,11), row.values[-10:], line_color=mypalette[j], line_width=3, legend_label=f'{row.values[1]} min')
        s.xaxis.axis_label = 'K'
        s.xaxis.axis_label_text_font_style = 'normal'
        s.yaxis.axis_label_text_font_style = 'normal'
        s.xaxis.axis_label_text_font = "times"
        s.yaxis.axis_label_text_font = "times"
        s.xaxis.axis_label_text_font_size = "14pt"
        s.xaxis.major_label_text_font_size = "11pt"
        s.yaxis.axis_label_text_font_size = "14pt"
        s.yaxis.major_label_text_font_size = "12pt"
        s.xaxis.major_label_text_font = "times"
        s.yaxis.major_label_text_font = "times"
        s.title.text_font_size = '18pt'
        s.title.text_font = "times"
        s.title.text_font_style = 'normal'
        s.legend.label_text_font_size = '12pt'
        s.legend.label_text_font = 'times'
        s.legend.border_line_alpha = 0.2
        ss.append(s)
    p = gridplot([ss[:3], ss[3:]])
    show(p)
#     export_png(p, filename=f"FMC_{m}.png")
#         s.line(np.range(1,11), row.values[-10:], color="#c9d9d3", line_width=2)

In [None]:
from bokeh.layouts import gridplot, column

In [None]:
# i=1
# output_file(f"MP_{app}.html")
# for idx in results_noclust[results_noclust['metric'] == 'acc@K'].index:
#     s1 = figure(width=1250, height=250, 
#                 title=f'App {k}',
#                 toolbar_location=None, tools="")
#     s1.line(timestamps, X_app, color="#c9d9d3", line_width=2)
#     s1.xaxis.formatter = DatetimeTickFormatter(seconds = ['%s'])
#     plt.plot(results_noclust[results_noclust['metric'] == 'acc@K'].loc[idx].values[1:])
#     if idx[0] != i:
#         plt.title(f'train_period = {i} min')
#         plt.show()
#     i = idx[0]
#     if idx[0] == 60:
#         break

In [None]:
# i = 1
# for idx in results_noclust[results_noclust['metric'] == 'F1@K'].index:
#     if idx[0] != i:
        
#     break

In [None]:
# for i, row in results_noclust[results_noclust['metric'] == 'F1@K'].iterrows():
#     print(i)
#     break

In [None]:
results = pd.DataFrame()
for i, train_period in tqdm(enumerate([1,2,5,10,15,30])):
    for pred_period in tqdm([1,2,5,10,15,30]):
        test = fpmc_preprocessing.generate_previous_events(
            pd.concat([
                df_train[df_train.index >= df_train.index[-1] - pd.Timedelta(train_period, unit='minutes')], 
                df_test
            ]), 
            train_period, app_to_idx, mode='test', from_=df_test.index[0])
        ttest = fpmc_preprocessing.generate_next_events(df_test, pred_period, app_to_idx)
        test = test.merge(ttest, on=['cluster', 'cur_app_idx'], right_index=True, left_index=True)
        
        train = fpmc_preprocessing.generate_previous_events(df_train, train_period, app_to_idx)


        fpmc = FPMC_numba.FPMC(train.cluster.unique(), idx_to_app.keys(),
                               n_factor=64, learn_rate=0.01, regular=0.01, neg_batch_size=8, std=0.1)
        _ = fpmc.learnSBPR_FPMC(train, n_epoch=20, verbose=False)

        scores, preds = [], []
        for i, row in test.iterrows():
            score = np.array([FPMC_numba.sigmoid_jit(x) 
                              for x in FPMC_numba.compute_x_batch_jit(
                                  row['cluster'], row['prev_apps_idx'], fpmc.VUI_m_VIU, fpmc.VIL_m_VLI)
                             ])
            pred = [k for k in np.argsort(score)[::-1]]
            fpmc.folding_in(row[:3], 10)
            scores.append(score)
            preds.append(pred)
        test['scores'] = scores
        test['preds'] = preds

        test['MRR'] = test.apply(find_rank, axis=1)
        acc, prec, recall, f1 = accuracy_precision_recall(
            test['next_apps_idx'], 
            test['preds'].values, 
            K=10
        )
        cur_res = pd.DataFrame(
            [acc.mean(axis=0), prec.mean(axis=0), recall.mean(axis=0), f1.mean(axis=0)], 
            index=['acc@K', 'prec@K', 'recall@K', 'F1@K'], 
            columns=[f'K={x+1}' for x in range(acc.shape[1])]
        )
        cur_res['train_period, min'] = train_period
        cur_res['pred_period, min'] = pred_period
        cur_res['MRR'] = np.round(test['MRR'].mean(), 3)

        results = pd.concat([results, cur_res])

In [None]:
results

In [None]:
results = results.reset_index() \
    .rename(columns={'index':'metric'}) \
    .set_index(['train_period, min', 'pred_period, min', 'MRR'])
results = results.reset_index()
results

In [None]:
from bokeh.palettes import Spectral11, Set3_12
from bokeh.io import export_png

for m in ['acc@K', 'prec@K', 'recall@K', 'F1@K']:
    output_file(f"FCMC_{m}.html")
    ss = []
    cur_m = results[results['metric'] == m]
    for train_period in results['train_period, min'].unique():
        cur = cur_m[cur_m['train_period, min'] == train_period]
        s = figure(width=800, height=600, 
                   x_range=((1,10)),
                   y_range=((cur_m[[col for col in cur_m.columns if 'K' in col]].to_numpy().min()-0.01,
                             cur_m[[col for col in cur_m.columns if 'K' in col]].to_numpy().max()+0.01)),
                    title=f'train_period = {train_period} min',
                    toolbar_location=None, tools="")
        mypalette=Set3_12[1:len(cur)+2][::-1]

    #     s.circle(4, 0.2, size=0.00000001, color= "#ffffff", legend='prediction')
    #     s.circle(4, 0.3, size=0.00000001, color= "#ffffff", legend='period, min')
        if train_period == 60:
            continue
        for j, (i, row) in enumerate(cur.iterrows()):
            s.line(np.arange(1,11), row.values[-10:], line_color=mypalette[j], line_width=3, legend_label=f'{row.values[1]} min')
        s.xaxis.axis_label = 'K'
        s.xaxis.axis_label_text_font_style = 'normal'
        s.yaxis.axis_label_text_font_style = 'normal'
        s.xaxis.axis_label_text_font = "times"
        s.yaxis.axis_label_text_font = "times"
        s.xaxis.axis_label_text_font_size = "14pt"
        s.xaxis.major_label_text_font_size = "11pt"
        s.yaxis.axis_label_text_font_size = "14pt"
        s.yaxis.major_label_text_font_size = "12pt"
        s.xaxis.major_label_text_font = "times"
        s.yaxis.major_label_text_font = "times"
        s.title.text_font_size = '18pt'
        s.title.text_font = "times"
        s.title.text_font_style = 'normal'
        s.legend.label_text_font_size = '12pt'
        s.legend.label_text_font = 'times'
        s.legend.border_line_alpha = 0.2
        ss.append(s)
    p = gridplot([ss[:3], ss[3:]])
#     show(p)
    export_png(p, filename=f"FСMC_{m}.png")
#         s.line(np.range(1,11), row.values[-10:], color="#c9d9d3", line_width=2)

In [None]:
results[[col for col in results.columns if 'K' in col]] = results[[col for col in results.columns if 'K' in col]].apply(lambda x: np.round(x,3))

In [None]:
results.set_index(['train_period, min', 'pred_period, min', 'MRR', 'metric']).iloc[:,:5]

In [None]:
print(results.set_index(['train_period, min', 'pred_period, min', 'MRR', 'metric']).iloc[:,:5].to_latex())

In [None]:
best_fcmc = results[(results['train_period, min'] == 2) & (results['pred_period, min'] == 15)] \
    .set_index(['train_period, min', 'pred_period, min', 'MRR', 'metric'])

# print(best_fcmc.to_latex())
best_fcmc

In [None]:
best_fcmc

In [None]:
results[results['MRR'] == results['MRR'].max()]

In [None]:
for i in (range(len(results[results.index == 'acc@K']))):
    plt.plot(np.arange(1,6,1), results[results.index == 'acc@K'].drop(['MRR','time_delta, min'], axis=1).iloc[i][:5].values, 
             label=f'{results[results.index == "acc@K"]["time_delta, min"].unique()[i]} min')
plt.title('FCMC')
plt.xlabel('K')
plt.ylabel('Accuracy@K')
# plt.xlim(-5,66)
plt.legend()
plt.grid(True)
plt.savefig('acc@K_cluster.png', dpi=300, quality=95)

In [None]:
results.reset_index().set_index(['time_delta, min', 'MRR'])

In [None]:
plt.plot(results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'acc@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'acc@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "o-", label='FMC')
plt.plot(results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'prec@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'prec@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "o-", label='FMC')
plt.plot(results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'recall@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results_one_cluster[['time_delta, min', 'K=2']][results_one_cluster.index == 'recall@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "o-", label='FMC')
plt.plot(results[['time_delta, min', 'K=2']][results_one_cluster.index == 'acc@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results[['time_delta, min', 'K=2']][results_one_cluster.index == 'acc@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "x-", label='FCMC')
plt.plot(results[['time_delta, min', 'K=2']][results_one_cluster.index == 'prec@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results[['time_delta, min', 'K=2']][results_one_cluster.index == 'prec@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "x-", label='FCMC')
plt.plot(results[['time_delta, min', 'K=1']][results_one_cluster.index == 'recall@K'].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results[['time_delta, min', 'K=1']][results_one_cluster.index == 'recall@K'].reset_index(drop=True).drop_duplicates()['K=2'], 
         "x-", label='FCMC')
plt.title('Accuracy(∆t) dependency')
plt.xlabel('∆t, min')
plt.ylabel('Accuracy@1')
plt.legend()
plt.grid(True)
# plt.savefig('acc.png', dpi=300, quality=95)

In [None]:
pd.DataFrame(results.reset_index().set_index('time_delta, min')['MRR'].drop_duplicates()).transpose()

In [None]:
plt.plot(results_one_cluster[['time_delta, min', 'MRR']].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results_one_cluster[['time_delta, min', 'MRR']].reset_index(drop=True).drop_duplicates()['MRR'], 
         "o-", label='FMC')
plt.plot(results[['time_delta, min', 'MRR']].reset_index(drop=True).drop_duplicates()['time_delta, min'],
        results[['time_delta, min', 'MRR']].reset_index(drop=True).drop_duplicates()['MRR'], 
         "o-", label='FCMC')
plt.title('MRR(∆t) dependency')
plt.xlabel('∆t, min')
plt.ylabel('MRR')
plt.xlim(-5,66)
plt.legend()
plt.grid(True)
plt.savefig('mrr.png', dpi=300, quality=95)

In [None]:
for i in (range(len(results_one_cluster[results.index == 'acc@K']))):
    plt.plot(np.arange(1,6,1), results_one_cluster[results_one_cluster.index == 'acc@K'].drop(['MRR','time_delta, min'], axis=1).iloc[i][:5].values, 
             label=f'{results_one_cluster[results_one_cluster.index == "acc@K"]["time_delta, min"].unique()[i]} min')
    if i == 4:
        break
plt.title('FMC')
plt.xlabel('K')
plt.ylabel('Accuracy@K')
# plt.xlim(-5,66)
plt.legend()
plt.grid(True)
plt.savefig('acc@K.png', dpi=300, quality=95)

In [None]:
results_one_cluster.set_index(['time_delta, min', 'MRR'])

In [None]:
results_one_cluster = results_one_cluster.reset_index().set_index(['time_delta, min', 'MRR'])

In [None]:
results_one_cluster['∆t, min'] = results_one_cluster.reset_index(0)['time_delta, min'].values

In [None]:
results_one_cluster = results_one_cluster.reset_index().drop('time_delta, min', axis=1)

In [None]:
results_one_cluster = results_one_cluster.reset_index().set_index(['∆t, min','index'])

In [None]:
results_one_cluster

In [None]:
pd.DataFrame(results_one_cluster.reset_index().set_index('∆t, min')['MRR'].drop_duplicates()).transpose()

In [None]:
acc_1clust = results_one_cluster[results_one_cluster['index'] == 'acc@K'].copy()
acc_1clust.index = acc_1clust.index.droplevel(1)
acc_1clust = acc_1clust.transpose().drop('index',0)
acc_1clust.index = ['accuracy@'+str(x.split("=")[-1]) for x in acc_1clust.index]
acc_1clust

In [None]:
prec_1clust = results_one_cluster[results_one_cluster['index'] == 'prec@K'].copy()
prec_1clust.index = prec_1clust.index.droplevel(1)
prec_1clust = prec_1clust.transpose().drop('index', 0)
prec_1clust.index = ['precision@'+str(x.split("=")[-1]) for x in prec_1clust.index]
prec_1clust

In [None]:
recall_1clust = results_one_cluster[results_one_cluster['index'] == 'recall@K'].copy()
recall_1clust.index = recall_1clust.index.droplevel(1)
recall_1clust = recall_1clust.transpose().drop('index',0)
recall_1clust.index = ['recall@'+str(x.split("=")[-1]) for x in recall_1clust.index]
recall_1clust

In [None]:
test[test['best'].apply(lambda x: x[0])!=test['cur_app_idx']]

In [None]:
%%time
train_period = 5
test = fpmc_preprocessing.generate_previous_events(
    pd.concat([
        df_train[df_train.index >= df_train.index[-1] - pd.Timedelta(train_period, unit='minutes')], 
        df_test
    ]), 
    train_period, app_to_idx, mode='test', from_=df_test.index[0])
train = fpmc_preprocessing.generate_previous_events(df_train, train_period, app_to_idx)
train['cluster'] = 0
test['cluster'] = 0
fpmc = FPMC.FPMC(cluster_dict.values(), idx_to_app.keys(),
                       n_factor=64, learn_rate=0.01, regular=1e-05, neg_batch_size=8, std=0.01)
_ = fpmc.learnSBPR_FPMC(train, n_epoch=20, verbose=False)
test['scores'] = test.apply(get_probs, axis=1)
test['best'] = test['scores'].apply(lambda x: np.argsort(x)[::-1])
test['MRR'] = test.apply(find_rank, axis=1)
acc, prec, recall = accuracy_precision_recall(
    test.cur_app_idx.apply(lambda x: [x]).values, 
    test['best'].values, 
    K=10
)
cur_res = pd.DataFrame(
    [acc.mean(axis=0), prec.mean(axis=0), recall.mean(axis=0)], 
    index=['acc@K', 'prec@K', 'recall@K'], 
    columns=[f'K={x+1}' for x in range(acc.shape[1])]
)
cur_res['time_delta, min'] = train_period
cur_res['MRR'] = np.round(test['MRR'].mean(), 3)

In [None]:
%%time
test['scores'] = test.apply(get_probs, axis=1)
test['best'] = test['scores'].apply(lambda x: np.argsort(x)[::-1])
# test['MRR'] = test.apply(find_rank, axis=1)
# acc, prec, recall = accuracy_precision_recall(
#     test.cur_app_idx.apply(lambda x: [x]).values, 
#     test['best'].values, 
#     K=10
# )
# cur_res = pd.DataFrame(
#     [acc.mean(axis=0), prec.mean(axis=0), recall.mean(axis=0)], 
#     index=['acc@K', 'prec@K', 'recall@K'], 
#     columns=[f'K={x+1}' for x in range(acc.shape[1])]
# )
# cur_res['time_delta, min'] = train_period
# cur_res['MRR'] = np.round(test['MRR'].mean(), 3)

In [None]:
df_test

In [None]:
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.models.ranges import FactorRange
from bokeh.models import ColumnDataSource
from bokeh.io import export_png
from bokeh.transform import dodge

In [None]:
years = [2017,2018,2019]
data = {'years' : years,
        'SMARTPHONE USERS'   : [2.7, 2.9, 3.2],
        'SMARTPHONE DEVICES USED'   : [3.3,3.6,3.8]
       }

source = ColumnDataSource(data=data)

p = figure(plot_width=800, plot_height=600,
           toolbar_location=None, tools="")

# p = figure(plot_width=800, plot_height=600,
#            toolbar_location=None, tools="")

# p.vbar(x=stats.index, top=stats.iloc[0].values/1e6, width=0.5)
# p.vbar(x=stats.index, top=stats.iloc[1].values/1e6, width=0.5)
p.vbar(x=dodge('years', -0.25, range=p.x_range), top='SMARTPHONE USERS', width=0.2, source=source,
       color="#c9d9d3", legend_label="Smartphone users")

p.vbar(x=dodge('years',  0.0,  range=p.x_range), top='SMARTPHONE DEVICES USED', width=0.2, source=source,
       color="#718dbf", legend_label="Smartphone devices used")

p.xgrid.grid_line_color = None
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.legend.location = "top_left"

# p.title.text_font_size = '18pt'

p.xaxis.ticker = [2017, 2018, 2019]
p.xaxis.axis_label = 'Year'
p.xaxis.axis_label_text_font_style = 'normal'
p.axis.axis_label_text_font_style = 'normal'
p.xaxis.axis_label_text_font = "times"
p.xaxis.axis_label_text_font_size = "24pt"
p.xaxis.major_label_text_font_size = "18pt"
p.xaxis.major_label_text_font = "times"


p.yaxis.axis_label = 'Smartphone users (in billions)'
p.yaxis.axis_label_text_font_style = 'normal'
p.yaxis.axis_label_text_font = "times"
p.yaxis.axis_label_text_font_size = "24pt"
p.yaxis.major_label_text_font_size = "18pt"
p.yaxis.major_label_text_font = "times"

p.legend.label_text_font_size = '18pt'
p.legend.label_text_font = 'times'
p.legend.border_line_alpha = 0.2

# p.output_backend = "svg"
# export_png(p, filename="years_stats.png")
show(p)

In [None]:
companies = ['Samsung','Apple','Huawei', 'Oppo', 'Vivo']
data = {'companies' : companies,
        'smartphones'   : [898.5, 832.9, 384.0, 427.9, 300.8],
        'tablets'   : [36.3,160.0,13.8,0,0,]
       }

source = ColumnDataSource(data=data)

p = figure(x_range=companies, plot_width=800, plot_height=600,
           toolbar_location=None, tools="")

# p = figure(plot_width=800, plot_height=600,
#            toolbar_location=None, tools="")

# p.vbar(x=data['companies'], top=data['smartphones'], width=0.5)
# p.vbar(x=stats.index, top=stats.iloc[1].values/1e6, width=0.5)
p.hbar(x=dodge('companies', -0.25, range=p.x_range), top='smartphones', width=0.25, source=source,
       color="#c9d9d3", legend_label="Smartphones")

p.bar(x=dodge('companies',  0.0,  range=p.x_range), top='tablets', width=0.25, source=source,
       color="#718dbf", legend_label="Tablets")

p.xgrid.grid_line_color = None
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.legend.location = "top_right"

# p.title.text_font_size = '18pt'

p.xaxis.axis_label = 'Brand'
p.xaxis.axis_label_text_font_style = 'normal'
p.xaxis.axis_label_text_font = "times"
p.xaxis.axis_label_text_font_size = "24pt"
p.xaxis.major_label_text_font_size = "18pt"
p.xaxis.major_label_text_font = "times"


p.yaxis.axis_label = 'Users (in millions)'
p.yaxis.axis_label_text_font_style = 'normal'
p.yaxis.axis_label_text_font = "times"
p.yaxis.axis_label_text_font_size = "24pt"
p.yaxis.major_label_text_font_size = "18pt"
p.yaxis.major_label_text_font = "times"

p.legend.label_text_font_size = '18pt'
p.legend.label_text_font = 'times'
p.legend.border_line_alpha = 0.2

# p.output_backend = "svg"
export_png(p, filename="brands.png")
# show(p)

In [None]:
p = figure(x_range=companies, plot_width=800, plot_height=600,
           toolbar_location=None, tools="")

In [None]:
p.h