In [None]:
import math
import random
import datetime
from collections import Counter

import pandas as pd
import numpy as np
import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Load the raw usage trace (space-separated, no header row).
df_usage = pd.read_csv('data/App_usage_trace.txt', sep=' ', names=['user', 'time', 'location', 'app', 'traffic'])
# Only user / time / app are used below. The explicit .copy() makes the subset
# an independent frame, so the column assignment that follows cannot raise
# pandas' SettingWithCopyWarning.
df_usage = df_usage[['user', 'time', 'app']].copy()

# Merge repeated usage records of the same app within one minute: strip the
# last two characters of the timestamp (later cells parse the result with
# '%Y%m%d%H%M', i.e. minute resolution) and drop the rows that became identical.
df_usage['time'] = df_usage['time'].astype(str).str[:-2]
df_usage = df_usage.drop_duplicates()

# Drop apps that occur fewer than 10 times across all users.
df_usage = df_usage[df_usage.groupby('app')['app'].transform('count').ge(10)]

In [None]:
seq_length = 4        # number of preceding apps that make up one input sequence
max_gap_minutes = 7   # a larger gap between two records starts a new sequence
max_recent_apps = 10  # size of the per-user "recently used" window


def build_app_sequences(rows, seq_length=4, max_gap_minutes=7, max_recent_apps=10):
    """Build, for every usage record, the apps that preceded it.

    Parameters
    ----------
    rows : iterable of (user, app, time)
        Usage records ordered by user and then by time; ``time`` must support
        subtraction yielding an object with ``total_seconds()``.
    seq_length : int
        Length of the preceding-app sequence attached to a record.
    max_gap_minutes : int
        Two consecutive records of the same user belong to the same sequence
        only if they are at most this many whole minutes apart.
    max_recent_apps : int
        Maximum number of apps kept in the recently-used window.

    Returns
    -------
    (all_app_seq, all_recent_apps) : tuple of lists, one entry per input row
        ``all_app_seq[i]`` is the ``seq_length`` apps used right before row i,
        or ``[]`` while the current sequence is still shorter than that.
        ``all_recent_apps[i]`` is the user's recently-used window before row i
        (``[]`` for a user's first row).
    """
    all_app_seq = []
    all_recent_apps = []

    prev_user = None
    prev_time = None
    app_seq = []
    recent_apps = []

    for user, app, time in rows:
        if user != prev_user:
            # First record of a user: nothing precedes it.
            app_seq = [app]
            recent_apps = [app]
            all_app_seq.append([])
            all_recent_apps.append([])
        else:
            gap_minutes = (time - prev_time).total_seconds() // 60
            if gap_minutes > max_gap_minutes:
                # Too long since the previous record: restart the sequence.
                app_seq = [app]
                all_app_seq.append([])
            elif len(app_seq) == seq_length:
                # Window is full: it is the input for this record, then slide.
                all_app_seq.append(app_seq)
                app_seq = app_seq[1:] + [app]
            else:
                # Still filling the window.
                app_seq.append(app)
                all_app_seq.append([])

            # Snapshot the recently-used window before adding the current app.
            all_recent_apps.append(recent_apps[:])
            recent_apps.append(app)
            if len(recent_apps) > max_recent_apps:
                recent_apps.pop(0)

        prev_user = user
        prev_time = time

    return all_app_seq, all_recent_apps


# df_usage is assumed to be already sorted by user and then by time.
# Timestamps are parsed once for the whole column and the columns are iterated
# directly; per-row .iloc lookups made this loop take minutes.
usage_times = pd.to_datetime(df_usage['time'], format='%Y%m%d%H%M')
usage_rows = zip(df_usage['user'].tolist(), df_usage['app'].tolist(), usage_times.tolist())

all_app_seq, all_recent_apps = build_app_sequences(
    tqdm.tqdm(usage_rows, total=len(df_usage)),
    seq_length=seq_length,
    max_gap_minutes=max_gap_minutes,
    max_recent_apps=max_recent_apps,
)

100%|██████████| 1123955/1123955 [11:39<00:00, 1606.52it/s]


In [None]:
# Fail with a clear message if this cell is re-run after df_usage was already
# filtered (the sequence lists are aligned with the unfiltered frame).
assert len(all_app_seq) == len(df_usage) == len(all_recent_apps), \
    'sequence lists are out of sync with df_usage; re-run the sequence-building cell'

# assign() returns a new frame instead of writing into the boolean-filtered
# df_usage, which avoids pandas' SettingWithCopyWarning.
df_usage = df_usage.assign(app_seq=all_app_seq, recent_apps=all_recent_apps)

# Only rows with a completely filled preceding sequence are treated as data.
df_usage = df_usage[df_usage['app_seq'].map(len) != 0]
# Drop users that have fewer than 50 such rows.
df_usage = df_usage[df_usage.groupby('user')['user'].transform('count').ge(50)]

df_usage.head()

Unnamed: 0,user,time,app,app_seq,recent_apps
8,0,201604200816,612,"[361, 361, 31, 360]","[361, 361, 31, 360]"
10,0,201604200817,31,"[361, 31, 360, 612]","[361, 361, 31, 360, 612]"
13,0,201604200817,360,"[31, 360, 612, 31]","[361, 361, 31, 360, 612, 31]"
14,0,201604200817,361,"[360, 612, 31, 360]","[361, 361, 31, 360, 612, 31, 360]"
16,0,201604200824,1,"[612, 31, 360, 361]","[361, 361, 31, 360, 612, 31, 360, 361]"


In [None]:
# Distinct apps per user, first as a two-column frame and then as a
# {user: [app, ...]} lookup.
user_apps = (
    df_usage
    .groupby('user')['app']
    .apply(lambda apps: list(set(apps)))
    .reset_index()
)
user_app_dict = {user: apps for user, apps in zip(user_apps['user'], user_apps['app'])}

## MFU (Most Frequently Used)

In [None]:
df_mfu = df_usage.copy()

# Running per-user app frequencies. For every row the prediction is the user's
# 10 most frequent apps *before* the row's own app is counted.
counter = {}
fu_apps = []
mfu_rows = zip(df_mfu['user'], df_mfu['app'], df_mfu['app_seq'])
for user, app, app_seq in tqdm.tqdm(mfu_rows, total=len(df_mfu)):
    if user not in counter:
        # The user's first row: seed the counts with its preceding sequence.
        counter[user] = Counter(app_seq)
    user_counts = counter[user]

    fu_apps.append([candidate for candidate, _ in user_counts.most_common(10)])

    # Counter treats missing keys as 0, so no membership test is needed.
    user_counts[app] += 1

df_mfu['mfu'] = fu_apps

# Split based on dates: rows after 2016-04-25 12:00 form the test set.
df_mfu['time'] = df_mfu['time'].astype('int64')
df_mfu_test = df_mfu[df_mfu['time'] > 201604251200]

df_mfu_test.head()

100%|██████████| 908770/908770 [05:23<00:00, 2813.27it/s]


In [None]:
# Hit counts of the MFU baseline at top-1 / top-5 / top-10.
correct = [0, 0, 0]
mfu_pairs = zip(df_mfu_test['mfu'], df_mfu_test['app'])
for mfu, app in tqdm.tqdm(mfu_pairs, total=len(df_mfu_test)):
    if app == mfu[0]:
        # A top-1 hit is also a top-5 and a top-10 hit.
        correct = [x + 1 for x in correct]
    elif app in mfu[:5]:
        correct[1] += 1
        correct[2] += 1
    elif app in mfu:
        correct[2] += 1

total = len(df_mfu_test)
print(correct[0]/total)
print(correct[1]/total)
print(correct[2]/total)

100%|██████████| 228626/228626 [01:02<00:00, 3647.46it/s]

0.15734868300193328
0.38241932238678017
0.4993220368636988





## MRU (Most Recently Used)

In [None]:
df_mru = df_usage.copy()

# Same date-based split as the MFU baseline: rows after 2016-04-25 12:00.
df_mru['time'] = df_mru['time'].astype('int64')
df_mru_test = df_mru[df_mru['time'] > 201604251200]

# Hit counts at top-1 / top-5 / top-10. recent_apps is ordered oldest -> newest,
# so the most recent candidates sit at the end of the list.
correct = [0, 0, 0]

mru_pairs = zip(df_mru_test['app'], df_mru_test['recent_apps'])
for app, mru in tqdm.tqdm(mru_pairs, total=len(df_mru_test)):
    if app == mru[-1]:
        # A top-1 hit is also a top-5 and a top-10 hit.
        correct = [x + 1 for x in correct]
    elif app in mru[-5:]:
        correct[1] += 1
        correct[2] += 1
    elif app in mru:
        correct[2] += 1

total = len(df_mru_test)
print(correct[0]/total)
print(correct[1]/total)
print(correct[2]/total)

0.13601252700917657
0.36279775703550776
0.4680307576566095


## NB (Naive Bayes)

In [None]:
# Work on a copy: the cells below overwrite 'time' and add an 'nb_input' column.
df_nb = df_usage.copy()

def prep_time(t):
    """Turn a minute-resolution timestamp string into a 'weekday_hour' token.

    ``t`` is expected to look like 'YYYYmmddHHMM'. The result is
    '<weekday>_<HH>', where weekday is 0 for Monday through 6 for Sunday.
    """
    date_part = t[:-4]    # everything before the hour and minute digits
    hour_part = t[-4:-2]  # the two hour digits
    weekday = datetime.datetime.strptime(date_part, '%Y%m%d').weekday()
    return f'{weekday}_{hour_part}'

# Replace the raw timestamp by its 'weekday_hour' token.
df_nb['time'] = df_nb['time'].apply(prep_time)

# One whitespace-separated "document" per row: the time token followed by the
# ids of the preceding apps.
app_seq_str = df_nb['app_seq'].apply(lambda app_seq: ' '.join(map(str, app_seq)))
df_nb['nb_input'] = df_nb['time'].str.cat(app_seq_str, sep=' ')

df_nb.head()

Unnamed: 0,user,time,app,app_seq
8,0,2_08,612,"[361, 361, 31, 360]"
10,0,2_08,31,"[361, 31, 360, 612]"
13,0,2_08,360,"[31, 360, 612, 31]"
14,0,2_08,361,"[360, 612, 31, 360]"
16,0,2_08,1,"[612, 31, 360, 361]"


In [None]:
# Random 80/20 split stratified by user.
# NOTE: unlike the MFU / MRU / Markov baselines this is not a date-based split;
# with data covering a longer period a split by time should be used instead.
train, test = train_test_split(df_nb, test_size=0.2, random_state=2021, stratify=df_nb['user'])

users = df_nb['user'].unique()
correct = [0,0,0]  # hits at top-1 / top-5 / top-10

# One bag-of-words Naive Bayes model per user.
for user in users:
    df_train = train[train['user']==user]
    df_test = test[test['user']==user]

    train_x = df_train['nb_input'].values.tolist()
    train_y = df_train['app'].values.tolist()

    # CountVectorizer's default token pattern only keeps tokens of two or more
    # characters, which silently drops single-digit app ids from the features.
    # Accept one-character tokens as well.
    cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    cv_train_x = cv.fit_transform(train_x)
    NBClassifier = MultinomialNB()
    NBClassifier.fit(cv_train_x, train_y)

    test_x = df_test['nb_input'].values.tolist()
    test_y = df_test['app'].values.tolist()

    cv_test_x = cv.transform(test_x)
    probs = NBClassifier.predict_proba(cv_test_x)
    # Column indices of the (up to) 10 most probable classes, best first,
    # mapped back to app ids.
    topn = np.argsort(probs, axis=1)[:, -10:]
    topn = np.flip(topn, axis=1)
    topn = NBClassifier.classes_[topn]

    for answer, predictions in zip(test_y, topn):
        if answer == predictions[0]:
            correct[0] += 1
            correct[1] += 1
            correct[2] += 1
        elif answer in predictions[:5]:
            correct[1] += 1
            correct[2] += 1
        elif answer in predictions:
            correct[2] += 1

print(correct)
print(correct[0]/len(test))
print(correct[1]/len(test))
print(correct[2]/len(test))

[45060, 102510, 125307]
0.24791751488275363
0.564004093444986
0.6894318694499159


## Markov chain

Reference: https://python.plainenglish.io/word-prediction-with-markov-chains-in-python-d685eed4b0b3

In [None]:
# Date-based split for the Markov chain: rows up to and including
# 2016-04-25 12:00 are used for training, later rows for testing.
df_mc = df_usage.copy()
df_mc['time'] = df_mc['time'].map(int)

is_train = df_mc['time'] <= 201604251200
train = df_mc[is_train]
test = df_mc[~is_train]

In [None]:
# Predictions below are *sampled* (random.sample / np.random.choice), so both
# generators are seeded to make the reported accuracies reproducible.
random.seed(2021)
np.random.seed(2021)

# chain[user][last_app] -> {next_app: transition probability}
chain = {}

users = df_mc['user'].unique()

correct = [0,0,0]  # hits at top-1 / top-5 / top-10

for user in tqdm.tqdm(users):
    # Count first-order transitions (last app of the sequence -> used app).
    train_per_user = train[train['user']==user]
    transition_counts = {}
    for current, app_seq in zip(train_per_user['app'], train_per_user['app_seq']):
        last = app_seq[-1]
        successors = transition_counts.setdefault(last, {})
        successors[current] = successors.get(current, 0) + 1

    # Normalise every row of counts into probabilities (row sum computed once).
    chain[user] = {}
    for last, successors in transition_counts.items():
        n_transitions = sum(successors.values())
        chain[user][last] = {nxt: count / n_transitions for nxt, count in successors.items()}

    # Evaluate on this user's test rows.
    test_per_user = test[test['user']==user]

    for answer, app_seq in zip(test_per_user['app'], test_per_user['app_seq']):
        last = app_seq[-1]

        transitions = chain[user].get(last)
        if transitions is None:
            # Unseen state: fall back to a uniform sample of the user's apps.
            candidates = user_app_dict[user]
            preds = random.sample(candidates, min(len(candidates), 10))
        else:
            # Draw up to 10 distinct successors, weighted by transition probability.
            preds = np.random.choice(
                list(transitions.keys()),
                size=min(10, len(transitions)),
                replace=False,
                p=list(transitions.values()),
            ).tolist()

        if answer == preds[0]:
            correct[0] += 1
            correct[1] += 1
            correct[2] += 1
        elif answer in preds[:5]:
            correct[1] += 1
            correct[2] += 1
        elif answer in preds:
            correct[2] += 1

print(correct)
print(correct[0]/len(test))
print(correct[1]/len(test))
print(correct[2]/len(test))

100%|██████████| 748/748 [05:11<00:00,  2.40it/s]

[34967, 92166, 114433]
0.15294410959383448
0.40313000271185256
0.5005248746861687





## DNN

In [None]:
class DNNDataset(Dataset):
    """Dataset that label-encodes one usage row at a time.

    Each item is a tuple of four LongTensors: (user, time, app, app_seq).
    The encoders only need a ``transform`` method accepting a list of raw
    values and returning their integer codes.
    """

    def __init__(self, df, user_encoder, time_encoder, app_encoder):
        self.df = df
        self.user_encoder = user_encoder
        self.time_encoder = time_encoder
        self.app_encoder = app_encoder

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        user_ids = self.user_encoder.transform([record['user']])
        time_ids = self.time_encoder.transform([record['time']])
        target_ids = self.app_encoder.transform([record['app']])
        history_ids = self.app_encoder.transform(record['app_seq'])

        return (
            torch.LongTensor(user_ids),
            torch.LongTensor(time_ids),
            torch.LongTensor(target_ids),
            torch.LongTensor(history_ids),
        )

class DNN(nn.Module):
    """Feed-forward next-app classifier.

    The user, time and preceding-app embeddings are concatenated, flattened
    and passed through two ReLU hidden layers; the output is one unnormalised
    score per app.
    """

    def __init__(self, n_users, n_times, n_apps, dim, hidden, seq_length):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, dim)
        self.time_emb = nn.Embedding(n_times, dim)
        self.app_emb = nn.Embedding(n_apps, dim)

        # One embedding each for user and time plus seq_length app embeddings.
        flat_features = (seq_length + 2) * dim
        self.nn1 = nn.Linear(flat_features, hidden)
        self.nn2 = nn.Linear(hidden, hidden)
        self.classifier = nn.Linear(hidden, n_apps)

    def forward(self, users, times, app_seq):
        """Score every app.

        users:   [batch_size, 1]
        times:   [batch_size, 1]
        app_seq: [batch_size, seq_length]
        returns: [batch_size, n_apps]
        """
        embedded = torch.cat(
            (
                self.user_emb(users),    # [batch_size, 1, dim]
                self.time_emb(times),    # [batch_size, 1, dim]
                self.app_emb(app_seq),   # [batch_size, seq_length, dim]
            ),
            dim=1,
        )  # [batch_size, seq_length + 2, dim]
        features = embedded.reshape(users.size(0), -1)

        activations = F.relu(self.nn1(features))
        activations = F.relu(self.nn2(activations))
        return self.classifier(activations)

In [None]:
df_dnn = df_usage.copy()
# Random 80/20 split, stratified by user.
train, test = train_test_split(df_dnn, test_size=0.2, random_state=2021, stratify=df_dnn['user'])

# Hyperparameters
epoch = 20
batch_size = 32
dim = 50
seq_length = 4
hidden = 100
lr = 0.001

random.seed(2021)
torch.manual_seed(2021)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
if torch.cuda.device_count() > 0:
    torch.cuda.manual_seed_all(2021)

# Label encoders map raw user / time / app values to embedding indices.
user_encoder = LabelEncoder()
time_encoder = LabelEncoder()
app_encoder = LabelEncoder()

user_encoder.fit(df_dnn['user'].unique())
time_encoder.fit(df_dnn['time'].unique())
# The app vocabulary must cover both the targets and every app in the input sequences.
all_apps = list(df_dnn['app'].unique())
for app_seq in df_dnn['app_seq']:
    all_apps.extend(app_seq)
app_encoder.fit(list(set(all_apps)))

train_dataset = DNNDataset(train, user_encoder, time_encoder, app_encoder)
test_dataset = DNNDataset(test, user_encoder, time_encoder, app_encoder)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

num_users = len(df_dnn['user'].unique())
num_times = len(df_dnn['time'].unique())
num_apps = len(app_encoder.classes_)

model = DNN(num_users, num_times, num_apps, dim, hidden, seq_length)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

total_loss = 0
itr = 1
p_itr = 100      # print the mean training loss every p_itr iterations
Ks = [1,5,10]    # cut-offs for top-k accuracy

for e in range(epoch):
    model.train()

    for user, time, app, app_seq in train_loader:
        user = user.to(device)
        time = time.to(device)
        target = app.to(device)
        app_seq = app_seq.to(device)

        optimizer.zero_grad()
        scores = model(user, time, app_seq) # [batch_size, num_apps]
        loss = F.cross_entropy(scores, target.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if itr % p_itr == 0:
            print("[TRAIN] Epoch: {} / Iter: {} Loss - {}".format(e+1, itr, total_loss/p_itr))
            total_loss = 0
        itr += 1

    model.eval()
    corrects = [0,0,0]
    with torch.no_grad():
        for user, time, app, app_seq in test_loader:
            user = user.to(device)
            time = time.to(device)
            target = app.to(device)  # [batch_size, 1]
            app_seq = app_seq.to(device)

            scores = model(user, time, app_seq)

            for idx, k in enumerate(Ks):
                # target broadcasts against the [batch_size, k] index matrix;
                # each row contains at most one match.
                topk_indices = torch.topk(scores, dim=1, k=k).indices
                corrects[idx] += torch.sum(torch.eq(topk_indices, target)).item()
    # Accuracy is hits per test *sample*; len(test_loader) would be the number of batches.
    accs = [x/len(test_dataset) for x in corrects]
    print("[EVALUATION] Epoch: {} - Acc: {:.5f} / {:.5f} / {:.5f}".format(e+1, accs[0], accs[1], accs[2]))


cuda


RuntimeError: ignored

## RNN + Attention

In [None]:
class RNNDataset(Dataset):
    """Row-wise dataset yielding label-encoded (user, time, app, app_seq).

    All four elements of an item are LongTensors; the first three hold a single
    code, the last one holds one code per app of the preceding sequence.
    """

    def __init__(self, df, user_encoder, time_encoder, app_encoder):
        self.df = df
        self.user_encoder = user_encoder
        self.time_encoder = time_encoder
        self.app_encoder = app_encoder

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Scalars are wrapped in a list because the encoders expect a sequence.
        encoded = (
            self.user_encoder.transform([row['user']]),
            self.time_encoder.transform([row['time']]),
            self.app_encoder.transform([row['app']]),
            self.app_encoder.transform(row['app_seq']),
        )
        return tuple(torch.LongTensor(codes) for codes in encoded)

class RNN(nn.Module):
    """LSTM next-app classifier with scaled dot-product attention.

    At every step the LSTM input is the concatenation of the user, time and
    app embeddings, so ``input_dim`` has to equal ``3 * dim``. The final cell
    state is used as the query that attends over all LSTM outputs; the
    attended vector is mapped to one unnormalised score per app.
    """

    def __init__(self, n_users, n_times, n_apps, input_dim, hidden_dim, dim, seq_length):
        super().__init__()

        self.user_emb = nn.Embedding(n_users, dim)
        self.time_emb = nn.Embedding(n_times, dim)
        self.app_emb = nn.Embedding(n_apps, dim)
        self.dim = dim
        self.seq_length = seq_length

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # Attention scores are scaled by 1 / sqrt(hidden_dim).
        self.scale = 1. / math.sqrt(hidden_dim)
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, n_apps)

    def forward(self, users, times, app_seq):
        """Score every app.

        users:   [batch_size, 1]
        times:   [batch_size, 1]
        app_seq: [batch_size, seq_length]
        returns: [batch_size, n_apps]
        """
        steps = self.seq_length

        # Broadcast the single user / time embedding over all time steps.
        user_steps = self.user_emb(users).expand(-1, steps, -1)  # [batch_size, seq_length, dim]
        time_steps = self.time_emb(times).expand(-1, steps, -1)  # [batch_size, seq_length, dim]
        app_steps = self.app_emb(app_seq)                        # [batch_size, seq_length, dim]

        lstm_input = torch.cat((user_steps, time_steps, app_steps), dim=2)  # [batch_size, seq_length, input_dim]
        output, (hidden_last, cell_last) = self.rnn(lstm_input)

        query = cell_last.permute(1, 0, 2)   # [batch_size, 1, hidden_dim]
        keys = output.transpose(1, 2)        # [batch_size, hidden_dim, seq_length]
        weights = F.softmax((query @ keys) * self.scale, dim=-1)  # [batch_size, 1, seq_length]

        context = (weights @ output).squeeze(1)  # [batch_size, hidden_dim]
        return self.classifier(context)

In [None]:
df_rnn = df_usage.copy()
# Random 80/20 split, stratified by user.
train, test = train_test_split(df_rnn, test_size=0.2, random_state=2021, stratify=df_rnn['user'])

# Hyperparameters
epoch = 20
batch_size = 32
dim = 50
seq_length = 4
hidden = 100
lr = 0.001

random.seed(2021)
torch.manual_seed(2021)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
if torch.cuda.device_count() > 0:
    torch.cuda.manual_seed_all(2021)

# Label encoders map raw user / time / app values to embedding indices.
user_encoder = LabelEncoder()
time_encoder = LabelEncoder()
app_encoder = LabelEncoder()

user_encoder.fit(df_rnn['user'].unique())
time_encoder.fit(df_rnn['time'].unique())
# The app vocabulary must cover both the targets and every app in the input sequences.
all_apps = list(df_rnn['app'].unique())
for app_seq in df_rnn['app_seq']:
    all_apps.extend(app_seq)
app_encoder.fit(list(set(all_apps)))

train_dataset = RNNDataset(train, user_encoder, time_encoder, app_encoder)
test_dataset = RNNDataset(test, user_encoder, time_encoder, app_encoder)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

num_users = len(df_rnn['user'].unique())
num_times = len(df_rnn['time'].unique())
num_apps = len(app_encoder.classes_)

# The LSTM input concatenates the user, time and app embeddings: 3 * dim features.
model = RNN(num_users, num_times, num_apps, dim*3, hidden, dim, seq_length)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

total_loss = 0
itr = 1
p_itr = 100      # print the mean training loss every p_itr iterations
Ks = [1,5,10]    # cut-offs for top-k accuracy

for e in range(epoch):
    model.train()

    for user, time, app, app_seq in train_loader:
        user = user.to(device)
        time = time.to(device)
        target = app.to(device)
        app_seq = app_seq.to(device)

        optimizer.zero_grad()
        scores = model(user, time, app_seq) # [batch_size, num_apps]
        loss = F.cross_entropy(scores, target.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if itr % p_itr == 0:
            print("[TRAIN] Epoch: {} / Iter: {} Loss - {}".format(e+1, itr, total_loss/p_itr))
            total_loss = 0
        itr += 1

    model.eval()
    corrects = [0,0,0]
    with torch.no_grad():
        for user, time, app, app_seq in test_loader:
            user = user.to(device)
            time = time.to(device)
            target = app.to(device)  # [batch_size, 1]
            app_seq = app_seq.to(device)

            scores = model(user, time, app_seq)

            for idx, k in enumerate(Ks):
                # target broadcasts against the [batch_size, k] index matrix;
                # each row contains at most one match.
                topk_indices = torch.topk(scores, dim=1, k=k).indices
                corrects[idx] += torch.sum(torch.eq(topk_indices, target)).item()
    # Accuracy is hits per test *sample*; len(test_loader) would be the number of batches.
    accs = [x/len(test_dataset) for x in corrects]
    print("[EVALUATION] Epoch: {} - Acc: {:.5f} / {:.5f} / {:.5f}".format(e+1, accs[0], accs[1], accs[2]))


cuda
[TRAIN] Epoch: 1 / Iter: 100 Loss - 6.4524541759490965
[TRAIN] Epoch: 1 / Iter: 200 Loss - 5.538028836250305
[TRAIN] Epoch: 1 / Iter: 300 Loss - 5.364945583343506
[TRAIN] Epoch: 1 / Iter: 400 Loss - 5.314250016212464
[TRAIN] Epoch: 1 / Iter: 500 Loss - 5.1043570876121525
[TRAIN] Epoch: 1 / Iter: 600 Loss - 5.12263757944107
[TRAIN] Epoch: 1 / Iter: 700 Loss - 4.944546093940735
[TRAIN] Epoch: 1 / Iter: 800 Loss - 4.9096127390861515
[TRAIN] Epoch: 1 / Iter: 900 Loss - 4.818425827026367
[TRAIN] Epoch: 1 / Iter: 1000 Loss - 4.752338418960571
[TRAIN] Epoch: 1 / Iter: 1100 Loss - 4.608730733394623
[TRAIN] Epoch: 1 / Iter: 1200 Loss - 4.615391592979432
[TRAIN] Epoch: 1 / Iter: 1300 Loss - 4.5561016035079955
[TRAIN] Epoch: 1 / Iter: 1400 Loss - 4.530325701236725
[TRAIN] Epoch: 1 / Iter: 1500 Loss - 4.356298532485962
[TRAIN] Epoch: 1 / Iter: 1600 Loss - 4.262339305877686
[TRAIN] Epoch: 1 / Iter: 1700 Loss - 4.3194933080673215
[TRAIN] Epoch: 1 / Iter: 1800 Loss - 4.338506741523743
[TRAIN] Ep