In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import sklearn
import tqdm
from sklearn.model_selection import train_test_split

In [2]:
df_usage = pd.read_csv('data/App_usage_trace.txt', sep=' ', names=['user', 'time', 'location', 'app', 'traffic'])
df_usage = df_usage[['user', 'time', 'app']]

# merging the consecutive usage records of the same app in one minute
df_usage['time'] = df_usage['time'].apply(lambda x: str(x)[:-2])
df_usage.drop_duplicates(inplace=True)

# delete apps used less than 10 times for all users
df_usage = df_usage[df_usage.groupby('app')['app'].transform('count').ge(10)]

In [3]:
df_usage.head()

Unnamed: 0,user,time,app
0,0,201604200813,361
5,0,201604200816,361
6,0,201604200816,31
7,0,201604200816,360
8,0,201604200816,612


In [4]:
prev_user = -1
prev_time = -1
app_seq = []
time_seq = []
all_app_seq= []
all_time_seq = []

seq_length = 4

# df_usage is already sorted based on user and time sequence
for i in tqdm.tqdm(range(len(df_usage))):
    user = df_usage.iloc[i]['user']
    app = df_usage.iloc[i]['app']
    time = df_usage.iloc[i]['time']
    time = datetime.datetime.strptime(time, '%Y%m%d%H%M')
    
    if prev_user != user:
        app_seq = [app]
        time_seq = [time]
        all_app_seq.append([])
        all_time_seq.append([])
        
    else:
        # same sequence if the time gap between them is equal to or less than 7 mins
        if (time - prev_time).total_seconds()//60 <= 7:
            if len(app_seq) == seq_length:
                all_app_seq.append(app_seq)
                all_time_seq.append([(prev_time - x).total_seconds()//60 for x in time_seq])
                app_seq = app_seq[1:] + [app]
                time_seq = time_seq[1:] + [time]
            else:
                app_seq.append(app)
                time_seq.append(time)
                all_app_seq.append([])
                all_time_seq.append([])
        else:
            app_seq = [app]
            time_seq = [time]
            all_app_seq.append([])
            all_time_seq.append([])
    
    prev_user = user
    prev_time = time

100%|██████████| 1123955/1123955 [05:38<00:00, 3322.00it/s]


In [5]:
df_usage['app_seq'] = all_app_seq
df_usage['time_seq'] = all_time_seq

# only filled sequences are treated as data
df_usage = df_usage[df_usage['app_seq'].map(len) != 0]
# delete users who have sequences less than 50
df_usage = df_usage[df_usage.groupby('user')['user'].transform('count').ge(50)]

df_usage[:10]

Unnamed: 0,user,time,app,app_seq,time_seq
8,0,201604200816,612,"[361, 361, 31, 360]","[3.0, 0.0, 0.0, 0.0]"
10,0,201604200817,31,"[361, 31, 360, 612]","[0.0, 0.0, 0.0, 0.0]"
13,0,201604200817,360,"[31, 360, 612, 31]","[1.0, 1.0, 1.0, 0.0]"
14,0,201604200817,361,"[360, 612, 31, 360]","[1.0, 1.0, 0.0, 0.0]"
16,0,201604200824,1,"[612, 31, 360, 361]","[1.0, 0.0, 0.0, 0.0]"
17,0,201604200829,31,"[31, 360, 361, 1]","[7.0, 7.0, 7.0, 0.0]"
19,0,201604200829,612,"[360, 361, 1, 31]","[12.0, 12.0, 5.0, 0.0]"
20,0,201604200829,360,"[361, 1, 31, 612]","[12.0, 5.0, 0.0, 0.0]"
22,0,201604200829,4,"[1, 31, 612, 360]","[5.0, 0.0, 0.0, 0.0]"
23,0,201604200831,31,"[31, 612, 360, 4]","[0.0, 0.0, 0.0, 0.0]"


In [6]:
# represent time as weekday_time
def prep_time(t):
    t = t[:-2]
    weekday = datetime.datetime.strptime(t[:-2], '%Y%m%d').weekday()
    return '{}_{}'.format(weekday, t[-2:])

df_usage['time'] = df_usage['time'].apply(lambda x: prep_time(x))

In [7]:
df_usage.head()

Unnamed: 0,user,time,app,app_seq,time_seq
8,0,2_08,612,"[361, 361, 31, 360]","[3.0, 0.0, 0.0, 0.0]"
10,0,2_08,31,"[361, 31, 360, 612]","[0.0, 0.0, 0.0, 0.0]"
13,0,2_08,360,"[31, 360, 612, 31]","[1.0, 1.0, 1.0, 0.0]"
14,0,2_08,361,"[360, 612, 31, 360]","[1.0, 1.0, 0.0, 0.0]"
16,0,2_08,1,"[612, 31, 360, 361]","[1.0, 0.0, 0.0, 0.0]"


In [8]:
user2id = {u: i for i, u in enumerate(sorted(df_usage['user'].unique()))}
app_set = set()
for s in df_usage['app_seq'].values:
    app_set.update(s)
app2id = {a: i for i, a in enumerate(sorted(app_set))}

In [9]:
def dict2file(dic, filename):
    with open(filename, 'w') as f:
        for k, v in dic.items():
            f.write("{}\t{}\n".format(k, v))

In [10]:
dict2file(user2id, "data/user2id.txt")
dict2file(app2id, "data/app2id.txt")

In [11]:
df_dataset = pd.DataFrame()
df_dataset['user'] = df_usage['user'].apply(lambda x: user2id[x])
df_dataset['time'] = df_usage['time']
df_dataset['app_seq'] = df_usage['app_seq'].apply(lambda x: [app2id[c] for c in x])
df_dataset['time_seq'] = df_usage['time_seq']
df_dataset['app'] = df_usage['app'].apply(lambda x: app2id[x])

In [12]:
train, test = train_test_split(df_dataset, test_size=0.2, random_state=2021, stratify=df_dataset['user'])
train.to_csv('data/train.txt', sep='\t', index=False)
test.to_csv('data/test.txt', sep='\t', index=False)