# Base model for baseline comparison
## Implement on 1 user sample

In [1]:
%load_ext autotime

import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from heapq import nlargest
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from scipy import stats
from tqdm.notebook import tqdm

pd.set_option("display.max_colwidth",200)
pd.set_option("display.max_columns",20)
pd.set_option('float_format', '{:.3f}'.format)

time: 1.75 s


# Data Preprocessing

From the DeepApp paper, they preprocess the data according to their definition. <br>
They use 30-min interval as a session. A window is 24 hours which consists of 48 intervals. <br>
They did not implement a dataloader. Rather they treat each window as a batch. (**Can be fine-tuned**)<br>
<br>
The preprocessed data should look like this: <br>

<br>data: {
    <br>user: 
        <br>'20-Apr': {
            <br>'tim' : [list of time in the window Shape(48,1)],
            <br>'loc' : [lsit of loc in the window Shape(48,1)],
            <br>'app': [list of multi-hot-code vector in the window. Shape(48, 2000)]    
        <br>},
        <br>'21-Apr': {
            <br>'tim' : [list of time in the window Shape(48,1)],
            <br>'loc' : [lsit of loc in the window Shape(48,1)],
            <br>'app': [list of multi-hot-code vector in the window. Shape(48, 2000)]
        <br>}, ........
    <br>},
    <br>user2: {
        <br>Same pattern........
    <br>}
<br>}
<br>
<br>
**Brief summary of the helper functions**
- generate_input returns a train set and a test set, adds more field like ptim, app_target, loc_target, uid, tim_o, loc_o, topk, etc.
- generate_queue returns a queue so it pops a window of a user everytime and feed to the model

<br>
<br>
In other words, if we use a dataloader instead. For each user, trainloader has 3 batches, Vall has 1. Each batch has 48 samples. If we want to use dataloader instead, we have to make sure each iteration of it contains the information of one user?? Or order does not matter at all.

In [2]:
#import preprocessed per user dataset from onedrive prickled file
import pickle
import os
import numpy as np

path = r"C:/Users/natha/OneDrive - The University of Sydney (Students)/CS48-CAPSTONE Project 2021 Sem1/dataset"
#create output dictionary
#enter number of user here
N_user = 1

data = {}
for u in os.listdir(path+'/user_preprocessed_pickle')[:N_user]: #import N users

    uid = int(u[:-7])
    file_name = '/'.join([path,'/user_preprocessed_pickle', u])

    try:
        with open(file_name, 'rb') as f:
            dic = pickle.load(f)
            data[uid] = dic[uid]
    except:
        print(dic)

time: 0 ns


In [3]:
#Print out the data
data

{0: {20: array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  21: array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  22: array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  25: array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],

time: 0 ns


In [4]:
#we want to make use of the app count of prev session, therefore we will need to concate 2 vector
list_a = [1,2,3]
list_b = [4,5,6]
list_c = np.concatenate((list_a, list_b), axis = 0)
list_c

array([1, 2, 3, 4, 5, 6])

time: 0 ns


## The following shows the way we wanted to concate data, with value represent the index of rows
## every row: session, loc + prev app count as feature, current app count as label

In [5]:
# from IPython import display
# display.Image(r"C:/Users/natha/Desktop/Base_model.png")

time: 0 ns


In [6]:
#original dataset on 21
df_orignal_21 = pd.DataFrame(data = data[0][21])
df_orignal_21

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 31 ms


In [7]:
#data manipulation, refer to the png
timeloc = data[0][21][1:,:2]
prev_app_count = data[0][21][:-1,2:]
target_app_count = data[0][21][1:,2:]
feature = np.concatenate((timeloc, prev_app_count),axis = 1)
# print(feature.shape)
# print(prev_app_count.shape)
# print(target_app_count.shape)
# print(feature.shape)
df_trans_21 = pd.DataFrame(feature)
df_trans_21

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 31 ms


In [8]:
df_label = pd.DataFrame(target_app_count)
df_label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 31 ms


In [9]:
X_train = feature
y_train = target_app_count
print(X_train.shape)
print(y_train.shape)

(47, 2002)
(47, 2000)
time: 0 ns


## Apply above idea in generat_input function

In [10]:
#create 2 function to extract the feature and label from array
#just to keep every clean as possible

def extract_feature(lst):
    #extract first 2 columns: session and mode location 
    return [item[:2] for item in lst]

def extract_label(lst):
    #extract the remaning columns
    return [item[2:] for item in lst]



# function to generate input for baseline model
# return(no.of users, rows of a users, feature dimension)
def generate_baseline_input(data, mode):
    data_neural = data
    data_feature = []
    data_label = []

    if mode == 'train':
        day_id = [20, 21, 22, 25] #the day for training

    elif mode == 'test':
        day_id = [26] # the day for testing

    # if candidate is None:
    candidate = data_neural.keys() #filter, and get user id

    #iterate all the users
    for u in candidate:
        #seperate feature and label list
        user_X_train = []
        user_y_label = []
        #get user's record
        sessions = data_neural[u]
        #seperate days for training and testing
        for i in day_id:
            #call specific day
            session = data_neural[u][i]
            #extract the part we want
            timeloc = extract_feature(session)[1:]
            prev_app_count = extract_label(session)[:-1]
            target = extract_label(session)[1:]
            #concate feature
            user_feature = np.concatenate((timeloc,prev_app_count),axis = 1)
            #append feature and label to corresponding list
            user_X_train.append(user_feature)
            user_y_label.append(target)
        #reason for reshape here: group 1 user at 1 dimension when append to all_user list
        user_X_train = np.array(user_X_train).reshape(-1,2002)
        user_y_label = np.array(user_y_label).reshape(-1,2000)
        #append user_data to all_use list
        data_feature.append(user_X_train)
        data_label.append(user_y_label)
    # no reshape here because when calling the variables it is easy to check how many user have been imported\
    # therefore it returns 3 dimenionsal data : (no.of users, rows of a users, feature dimension)
    data_feature = np.array(data_feature)
    data_label = np.array(data_label)

    return data_feature, data_label


time: 0 ns


In [11]:
X_train, y_train = generate_baseline_input(data, 'train')
X_test, y_test = generate_baseline_input(data,'test')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print("X_train has {} user, each user has {} rows, each row has {} columns". format(X_train.shape[0], X_train.shape[1], X_train.shape[2]))

(1, 188, 2002)
(1, 188, 2000)
(1, 47, 2002)
(1, 47, 2000)
X_train has 1 user, each user has 188 rows, each row has 2002 columns
time: 16 ms


In [12]:
X_train = X_train.reshape(-1,2002)
y_train = y_train.reshape(-1,2000)
print(X_train.shape)
print(y_train.shape)

(188, 2002)
(188, 2000)
time: 0 ns


In [13]:
X_test = X_test.reshape(-1,2002)
y_test = y_test.reshape(-1,2000)
print(X_test.shape)
print(y_test.shape)

(47, 2002)
(47, 2000)
time: 0 ns


# Multinomial Naive Bayes
## This means that we are using the count to predict the count of app usage in next session
## In prediction, every class is the occurred app 'count' in the previous session

In [14]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf_MNB = MultiOutputClassifier(MultinomialNB()).fit(X_train, y_train)
# pred_MNB = clf_MNB.predict(X_test)
# pred_MNB_prob = clf_MNB.predict_proba(X_test)

time: 1.78 s


In [15]:
pred_MNB = clf_MNB.predict(X_test)
pred_MNB_prob = clf_MNB.predict_proba(X_test)

time: 703 ms


In [16]:
#reason for extract prob:
#deepapp output is a list of vector of value being the prob of user using a app in the next session or not
#in order to match the output, we need the prob of class [0] < no matter you train with multinomial or binary
#in all cases 1- class [0] = prob of the user using a app
def extract_prob(lst):
    return [(1-item[0]) for item in lst]

time: 0 ns


In [17]:
#pred_MNB_prob returns (2000,47), first we extract every prob class [0] in rows then we do a transpose
for i in range(2000):
    pred_MNB_prob[i] =  extract_prob(pred_MNB_prob[i])

time: 63 ms


In [18]:
#transpose
pred_MNB_prob = np.array(pred_MNB_prob).T
pred_MNB_prob.shape

(47, 2000)

time: 15 ms


# MLR

In [19]:
#one hot code y_train and y_test
y_train = np.where(y_train > 0, 1, 0)
y_test = np.where(y_test > 0, 1, 0)

time: 0 ns


In [20]:
all0_column_ind = np.argwhere(np.all(y_train == 0, axis = 0))

time: 0 ns


In [22]:
len(all0_column_ind)

1903

time: 0 ns


In [18]:
y_train_rm = np.delete(y_train, all0_column_ind, axis=1)
y_test_rm = np.delete(y_test, all0_column_ind, axis=1)

time: 0 ns


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
clf_MLR = MultiOutputClassifier(LogisticRegression(multi_class = 'ovr')).fit(X_train, y_train_rm)

time: 4.41 s


In [23]:
pred_MLR = clf_MLR.predict(X_test)
pred_MLR_prob = clf_MLR.predict_proba(X_test)

AttributeError: 'list' object has no attribute 'shape'

In [28]:
def extract_prob(lst):
    return [(1-item[0]) for item in lst]
for i in range(len(pred_MLR_prob)):
    pred_MLR_prob[i] =  extract_prob(pred_MLR_prob[i])

time: 0 ns


In [29]:
#transpose
pred_MLR_prob = np.array(pred_MLR_prob).T
pred_MLR_prob.shape

(47, 97)

time: 0 ns


# MLR Evaluation


In [35]:
from sklearn import metrics as skmetrics

avg_auc = 0
avg_map = 0
avg_recall = 0
count = 0

for i in range(y_test_rm.shape[0]):
    if np.sum(y_test_rm[i])> 0:
        fpr, tpr, thresholds = skmetrics.roc_curve(y_test_rm[i], pred_MLR_prob[i], pos_label=1)
        avg_auc += skmetrics.auc(fpr, tpr)
        actual, pred_o, pred = r_k(y_test_rm[i], pred_MLR_prob[i] ,5, 5)
        avg_recall += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5
        avg_map += cal_ap(y_test_rm[i], pred_MLR_prob[i], 5)

    else:
        count +=1
        pass
    
print('AUC: ', avg_auc / y_test_rm.shape[0])
print('MAP: ', avg_map / y_test_rm.shape[0])
print('Recall@5: ', avg_recall / y_test_rm.shape[0])

print('{} rows has been skipped'.format(count))

# fpr, tpr, thresholds = skmetrics.roc_curve(y_test, pred_MNB_prob, pos_label=1) # Collect the recall and false positive rate from all 2000 predictions
# acc[0] += skmetrics.auc(fpr, tpr)
# # acc[1] += cal_ap(truth, predict, 5)

AUC:  0.20809638705624273
MAP:  0.21536643026004726
Recall@5:  0.2978723404255319
33 rows has been skipped
time: 16 ms


# Evaluation

In [32]:
def cal_ap( y_actual, y_pred, k ):
    topK = min( len(y_pred), k ) # set top k
    l_zip = list(zip(y_actual,y_pred))
    # sort y_pred by the probability of the model
    s_zip = sorted( l_zip, key=lambda x: x[1], reverse=True )
    # topk of sorted result
    s_zip_topk = s_zip[:topK] # Shape (5,2)
    # Calculation of precision
    num = 0
    rank = 0
    sumP = 0.0
    for item in s_zip_topk:
        rank += 1
        if item[0] == 1:
            num += 1
            sumP += (num*1.0)/(rank*1.0)
    ap = 0.0
    if num > 0:
        ap = sumP/(num*1.0)
    return ap   # average precision
# Take topk prediction and the ground truth


def r_k(y_actual, y_pred, k, threshold):
    topK = min( len(y_pred), k ) # set top k
    l_zip = list(zip(y_actual,y_pred))
    # sort y_pred by the probability of the model
    s_zip = sorted( l_zip, key=lambda x: x[1], reverse=True )
    # topk of sorted result
    s_zip_topk = s_zip[:topK] # Shape (5,2)
    # print(s_zip_topk)
    actual, pred = zip(*s_zip_topk)
    actual = np.where(np.array(actual) > threshold, 1, 0)
    pred = np.array(pred)
    pred_o = np.where(pred > threshold, 1, 0)
    return actual, pred_o, pred

# acc[3] += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5

time: 0 ns


In [63]:
pred_MNB_prob = np.where(pred_MNB_prob > 0, 1, 0)
y_test = np.where(y_test > 0, 1, 0)

time: 0 ns


In [70]:
# df_y_test = pd.DataFrame(y_test)
# df_y_test

time: 0 ns


# Multinomial Naive Bayes

In [68]:
from sklearn import metrics as skmetrics

avg_auc = 0
avg_map = 0
avg_recall = 0
count = 0

for i in range(y_test.shape[0]):
    if np.sum(y_test[i])> 0:
        fpr, tpr, thresholds = skmetrics.roc_curve(y_test[i], pred_MNB_prob[i], pos_label=1)
        avg_auc += skmetrics.auc(fpr, tpr)
        actual, pred_o, pred = r_k(y_test[i], pred_MNB_prob[i] ,5, 5)
        avg_recall += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5
        avg_map += cal_ap(y_test[i], pred_MNB_prob[i], 5)

    else:
        count +=1
        pass
    
print('AUC: ', avg_auc / y_test.shape[0])
print('MAP: ', avg_map / y_test.shape[0])
print('Recall@5: ', avg_recall / y_test.shape[0])

print('{} rows has been skipped'.format(count))

# fpr, tpr, thresholds = skmetrics.roc_curve(y_test, pred_MNB_prob, pos_label=1) # Collect the recall and false positive rate from all 2000 predictions
# acc[0] += skmetrics.auc(fpr, tpr)
# # acc[1] += cal_ap(truth, predict, 5)

AUC:  0.16121329285559488
MAP:  0.20319148936170212
Recall@5:  0.2978723404255319
33 rows has been skipped
time: 31 ms


In [69]:
from sklearn import metrics as skmetrics

avg_auc = 0
avg_map = 0
avg_recall = 0
count = 0

for i in range(y_test.shape[0]):
    if np.sum(pred_MNB_prob[i])> 0:
        fpr, tpr, thresholds = skmetrics.roc_curve(y_test[i], pred_MNB_prob[i], pos_label=1)
        avg_auc += skmetrics.auc(fpr, tpr)
        actual, pred_o, pred = r_k(y_test[i], pred_MNB_prob[i] ,5, 5)
        avg_recall += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5
        avg_map += cal_ap(y_test[i], pred_MNB_prob[i], 5)

    else:
        count +=1
        pass
    
print('AUC: ', avg_auc / y_test.shape[0])
print('MAP: ', avg_map / y_test.shape[0])
print('Recall@5: ', avg_recall / y_test.shape[0])

print('{} rows has been skipped'.format(count))

# fpr, tpr, thresholds = skmetrics.roc_curve(y_test, pred_MNB_prob, pos_label=1) # Collect the recall and false positive rate from all 2000 predictions
# acc[0] += skmetrics.auc(fpr, tpr)
# # acc[1] += cal_ap(truth, predict, 5)

AUC:  nan
MAP:  0.04680851063829788
Recall@5:  0.7872340425531915
10 rows has been skipped
time: 79 ms


# BernoulliNB (0/1 NB)
## This section, the label are all one-hot coded, either 1(use) or 0(not used)

In [82]:
X_train, y_train = generate_baseline_input(data, 'train')
X_test, y_test = generate_baseline_input(data,'test')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

print("X_train has {} user, each user has {} rows, each row has {} columns". format(X_train.shape[0], X_train.shape[1], X_train.shape[2]))

(1, 188, 2002)
(1, 188, 2000)
(1, 47, 2002)
(1, 47, 2000)
X_train has 1 user, each user has 188 rows, each row has 2002 columns
time: 15 ms


In [72]:
X_train = X_train.reshape(-1,2002)
y_train = y_train.reshape(-1,2000)
print(X_train.shape)
print(y_train.shape)
X_test = X_test.reshape(-1,2002)
y_test = y_test.reshape(-1,2000)
print(X_test.shape)
print(y_test.shape)

(188, 2002)
(188, 2000)
(47, 2002)
(47, 2000)
time: 0 ns


## One-hot code label

In [73]:
#one hot first before train
y_train = np.where(y_train > 0, 1, 0)
y_test = np.where(y_test > 0, 1, 0)

time: 16 ms


In [74]:
#user the MultiOutputClassifier to fit one classifier to one target (so every rows of lable are trained with a seperate model)
#pick Naive bayes purely because it is fast to train

from sklearn.naive_bayes import BernoulliNB
from sklearn.multioutput import MultiOutputClassifier

clf_BNB = MultiOutputClassifier(BernoulliNB()).fit(X_train, y_train)
# pred_BNB = clf_BNB.predict(X_test)
# pred_BNB_prob = clf_BNB.predict_proba(X_test)
# need the predict_proba to generate prob for each class(count)
#as one hot coded it is just 2 class(0 for not use, 1 for use)

time: 4.84 s


In [75]:
pred_BNB = clf_BNB.predict(X_test)
pred_BNB_prob = clf_BNB.predict_proba(X_test)

time: 1.58 s


In [76]:
def extract_prob(lst):
    return [(1-item[0]) for item in lst]

for i in range(2000):
    pred_BNB_prob[i] =  extract_prob(pred_BNB_prob[i])

time: 47 ms


In [77]:
pred_BNB_prob = np.array(pred_BNB_prob).T
pred_BNB_prob.shape

(47, 2000)

time: 16 ms


# Evaluation

In [78]:
from sklearn import metrics as skmetrics

avg_auc = 0
avg_map = 0
avg_recall = 0
count = 0

for i in range(y_test.shape[0]):
    if np.sum(y_test[i])> 0:
        fpr, tpr, thresholds = skmetrics.roc_curve(y_test[i], pred_BNB_prob[i], pos_label=1)
        avg_auc += skmetrics.auc(fpr, tpr)
        actual, pred_o, pred = r_k(y_test[i], pred_BNB_prob[i] ,5, 5)
        avg_recall += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5
        avg_map += cal_ap(y_test[i], pred_BNB_prob[i], 5)

    else:
        count +=1
        pass
    
print('AUC: ', avg_auc / y_test.shape[0])
print('MAP: ', avg_map / y_test.shape[0])
print('Recall@5: ', avg_recall / y_test.shape[0])

print('{} rows has been skipped'.format(count))

# fpr, tpr, thresholds = skmetrics.roc_curve(y_test, pred_MNB_prob, pos_label=1) # Collect the recall and false positive rate from all 2000 predictions
# acc[0] += skmetrics.auc(fpr, tpr)
# # acc[1] += cal_ap(truth, predict, 5)

AUC:  0.18500307355559156
MAP:  0.1978723404255319
Recall@5:  0.2978723404255319
33 rows has been skipped
time: 31 ms


In [79]:
from sklearn import metrics as skmetrics

avg_auc = 0
avg_map = 0
avg_recall = 0
count = 0

for i in range(y_test.shape[0]):
    if np.sum(pred_BNB_prob[i])> 0:
        fpr, tpr, thresholds = skmetrics.roc_curve(y_test[i], pred_BNB_prob[i], pos_label=1)
        avg_auc += skmetrics.auc(fpr, tpr)
        actual, pred_o, pred = r_k(y_test[i], pred_BNB_prob[i] ,5, 5)
        avg_recall += skmetrics.recall_score(actual, pred_o, average='macro') # Recall@5
        avg_map += cal_ap(y_test[i], pred_BNB_prob[i], 5)

    else:
        count +=1
        pass
    
print('AUC: ', avg_auc / y_test.shape[0])
print('MAP: ', avg_map / y_test.shape[0])
print('Recall@5: ', avg_recall / y_test.shape[0])

print('{} rows has been skipped'.format(count))

# fpr, tpr, thresholds = skmetrics.roc_curve(y_test, pred_MNB_prob, pos_label=1) # Collect the recall and false positive rate from all 2000 predictions
# acc[0] += skmetrics.auc(fpr, tpr)
# # acc[1] += cal_ap(truth, predict, 5)

AUC:  nan
MAP:  0.1978723404255319
Recall@5:  0.3191489361702128
32 rows has been skipped
time: 31 ms


# Ignoring the following for now, MLP takes a lot of time to train

# MLP

In [39]:
all_acc_session = []
y_test = y_test.astype(np.int64)
pred = pred_MNB.astype(np.int64)

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision tanh: {:.5f}".format(np.mean(all_acc_session)))

Precision tanh: 0.71250


In [12]:
#just random picked one model for testing, should have a lot of models
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.neighbors import KNeighborsClassifier ##/
# from sklearn.linear_model import LogisticRegression ##X
from sklearn.neural_network import MLPClassifier

clf_tanh = MultiOutputClassifier(MLPClassifier(random_state=1, max_iter=300, activation = 'tanh')).fit(X_train, y_train)
pred_tanh_prob = clf_tanh.predict_proba(X_test)
pred_tanh = clf_tanh.predict(X_test)

time: 5min 34s


In [13]:
#just random picked one model for testing, should have a lot of models
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.neighbors import KNeighborsClassifier ##/
# from sklearn.linear_model import LogisticRegression ##X
from sklearn.neural_network import MLPClassifier

clf_iden = MultiOutputClassifier(MLPClassifier(random_state=1, max_iter=300, activation = 'identity')).fit(X_train, y_train)
pred_iden_prob = clf_iden.predict_proba(X_test)
pred_iden = clf_iden.predict(X_test)

time: 3min 52s


In [14]:
#just random picked one model for testing, should have a lot of models
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.neighbors import KNeighborsClassifier ##/
# from sklearn.linear_model import LogisticRegression ##X
from sklearn.neural_network import MLPClassifier

clf_log = MultiOutputClassifier(MLPClassifier(random_state=1, max_iter=300, activation = 'logistic')).fit(X_train, y_train)
pred_log_prob = clf_log.predict_proba(X_test)
pred_log = clf_log.predict(X_test)

time: 5min 6s


In [15]:
#just random picked one model for testing, should have a lot of models
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.neighbors import KNeighborsClassifier ##/
# from sklearn.linear_model import LogisticRegression ##X
from sklearn.neural_network import MLPClassifier

clf_relu = MultiOutputClassifier(MLPClassifier(random_state=1, max_iter=300, activation = 'relu')).fit(X_train, y_train)
pred_relu_prob = clf_relu.predict_proba(X_test)
pred_relu = clf_relu.predict(X_test)

time: 8min 50s


In [46]:
clf_relu.classes_[0]

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       14., 15., 16., 17., 19., 23., 41., 63.])

time: 0 ns


In [43]:
pred_relu_prob[0][0]

array([0.20809085, 0.04716014, 0.03867527, 0.0515416 , 0.04779884,
       0.03731243, 0.04797006, 0.03997754, 0.0330743 , 0.03488414,
       0.0464032 , 0.03301819, 0.03031492, 0.03453107, 0.02653504,
       0.02731394, 0.05035559, 0.03835128, 0.03516546, 0.05290762,
       0.03861852])

time: 0 ns


# Evaluation

In [17]:
# all_acc_session = []

# y_test = y_test.astype(np.int64)
# pred = pred_1.astype(np.int64)
# #compare and calculate matrics row by row, finally take the loss of the matrics list
# for i in range(y_test.shape[0]):
#     correct_count = 0
#     pred_app_session = [app_ind for app_ind,app_count in enumerate(pred[i]) if app_count !=0]
#     true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
#     if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
#         all_acc_session.append(1)
#     else:
#         for j in pred_app_session:
#             if j in true_app_session:
#                 correct_count +=1
#         acc = correct_count/len(true_app_session)
#         all_acc_session.append(acc)

# print("Precision: {:.5f}".format(np.mean(all_acc_session)))

time: 0 ns


In [18]:
all_acc_session = []
y_test = y_test.astype(np.int64)
pred = pred_tanh.astype(np.int64)

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred_tanh[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision tanh: {:.5f}".format(np.mean(all_acc_session)))

Precision tanh: 0.70833
time: 78 ms


In [19]:
all_acc_session = []
y_test = y_test.astype(np.int64)
pred = pred_iden.astype(np.int64)

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred_iden[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision tanh: {:.5f}".format(np.mean(all_acc_session)))

Precision tanh: 0.79683
time: 78 ms


In [20]:
all_acc_session = []
y_test = y_test.astype(np.int64)
pred = pred_log.astype(np.int64)

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred_log[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision tanh: {:.5f}".format(np.mean(all_acc_session)))

Precision tanh: 0.70833
time: 62 ms


In [21]:
all_acc_session = []
y_test = y_test.astype(np.int64)
pred = pred_relu.astype(np.int64)

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred_relu[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision tanh: {:.5f}".format(np.mean(all_acc_session)))

Precision tanh: 0.79860
time: 62 ms


In [22]:
print(np.where(pred_tanh > 0)) #no output > 0
print(np.where(pred_iden > 0))
print(np.where(pred_log > 0)) #no output > 0
print(np.where(pred_relu > 0))

(array([], dtype=int64), array([], dtype=int64))
(array([17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19,
       19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21,
       21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24,
       25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27,
       27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29,
       29, 30, 30, 30, 30, 30, 30, 30, 39, 39, 39, 39, 39, 39, 39],
      dtype=int64), array([   0,    4,   73,   75,  125,  341,  611, 1427,    0,    4,   73,
         75,  125,  341,  611, 1427,    0,   73,   75,  125,  341,  611,
       1427,    0,   73,   75,  125,  341,  611, 1427,    0,   73,   75,
        125,  341,  611, 1427,    0,   73,   75,  125,  341,  611, 1427,
          0,   73,   75,  125,  341,  611, 1427,    0,   73,   75,  125,
        341,  611, 1427,    0,   73,   75,  125,  341,  611, 1427,    0,
         73,   75,  125,  341,  611, 1427,    0, 

## KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier
clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X_train, y_train)
pred = clf.predict(X_test)

time: 3.52 s


In [25]:
all_acc_session = []

y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
#compare and calculate matrics row by row, finally take the loss of the matrics list
for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
        all_acc_session.append(acc)

print("Precision: {:.5f}".format(np.mean(all_acc_session)))

Precision: 0.75417
time: 78 ms


In [26]:
all_acc_session

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1,
 0.0,
 1.0,
 0.2,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0.0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

time: 0 ns


In [13]:
from sklearn.neighbors import KNeighborsClassifier
clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X_train, y_train)
pred = clf.predict(X_test)

time: 3.44 s


In [14]:
all_acc_session = []

y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
#compare and calculate matrics row by row, finally take the loss of the matrics list
for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
    all_acc_session.append(acc)

print("Precision: {:.5f}".format(np.mean(all_acc_session))

Accuracy: 0.99871, Zero/One Loss: 0.00129, Hamming Loss: 0.00129
time: 16 ms


# Another method for evaluation by calculating correct item in sessions

In [66]:
pred_app = [i for i,e in enumerate(pred[1]) if e !=0]
print(pred_app)

[]
time: 0 ns


In [67]:
true_app = [i for i,e in enumerate(y_test[1]) if e !=0]
print(true_app)

[]
time: 0 ns


In [58]:
count = 0
for i in pred_app:
    if i in true_app:
        count += 1
adv_acc = count/len(true_app)
print(adv_acc)

0.0
time: 0 ns


In [68]:
y_test = y_test.astype(np.int64)
pred = pred.astype(np.int64)
all_acc_session = []

for i in range(y_test.shape[0]):
    correct_count = 0
    pred_app_session = [app_ind for app_ind,app_count in enumerate(pred[i]) if app_count !=0]
    true_app_session = [app_ind for app_ind,app_count in enumerate(y_test[i]) if app_count !=0]
    if ((len(true_app_session) == 0) & (len(pred_app_session) == 0)):
        all_acc_session.append(1)
    else:
        for j in pred_app_session:
            if j in true_app_session:
                correct_count +=1
        acc = correct_count/len(true_app_session)
    all_acc_session.append(acc)
print(np.mean(all_acc_session))

0.4414634146341464
time: 63 ms
