In [1]:
import numpy as np
import pandas as pd
import time
import ast
from tqdm import tqdm
import datetime
from multiprocessing import Pool, cpu_count
from itertools import zip_longest
from collections import defaultdict, OrderedDict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import Parallel, delayed
%matplotlib inline

In [2]:
# Directory prefix (relative to the notebook) for the HDF5 files read and written below.
base_dir = './dataset/'

In [3]:
# Load the app-usage table stored under key 'data' and preview its first rows.
user_app_usage_hot = pd.read_hdf(base_dir+'user_app_usage_hot_100.h5',key='data')
user_app_usage_hot.head()

Unnamed: 0,uId,duration,times,use_date,idx,dayofweek
30,20765863,701,2,2019-07-14,30,6
31,20765863,51,3,2019-07-23,30,1
32,20765863,701,2,2019-07-13,30,5
33,20765863,2819,2,2019-07-05,30,4
34,20765863,2819,2,2019-07-06,30,5


In [4]:
# Store both usage counters as 16-bit unsigned integers.
# NOTE: uint16 cannot represent values above 65535.
for counter_col in ('duration', 'times'):
    user_app_usage_hot[counter_col] = user_app_usage_hot[counter_col].astype(np.uint16)

In [4]:
# Inspect column dtypes and the total memory footprint of the usage table.
user_app_usage_hot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443257438 entries, 0 to 651007718
Data columns (total 6 columns):
uId          uint32
duration     uint16
times        uint16
use_date     datetime64[ns]
idx          uint8
dayofweek    uint8
dtypes: datetime64[ns](1), uint16(2), uint32(1), uint8(2)
memory usage: 10.7 GB


In [4]:
# Disabled variant, kept for reference only (nothing in this cell executes).
# It takes a ((uId, use_date), group) pair, uses 50 app slots and assigns the
# times/durations per idx directly instead of accumulating them.
# def build_usage_app_table(group_df):
#     (uId, use_date),group = group_df
#     app_times = np.zeros((50,),dtype=np.int16)
#     app_durations = np.zeros((50,),dtype=np.int16)
#     idx = list(group['idx'])
#     times = list(group['times'])
#     durations = list(group['duration'])
#     app_times[idx] = times
#     app_durations[idx] = durations
#     app_usage = np.concatenate([app_times, app_durations]).reshape((1,-1))
#     app_usage_df = pd.DataFrame(app_usage)
#     app_usage_df['uId'] = uId
#     app_usage_df['use_date'] = use_date
#     return app_usage_df

In [5]:
def build_usage_app_table_by_dayofweek(group_df, n_apps=100):
    """Collapse one (uId, dayofweek) group into a single wide row of per-app totals.

    Parameters
    ----------
    group_df : tuple
        ``((uId, dayofweek), group)`` where ``group`` is a DataFrame with the
        columns ``idx``, ``times`` and ``duration``.
    n_apps : int, default 100
        Number of app slots. Every value in ``group['idx']`` must be a valid
        index into an array of this length.

    Returns
    -------
    pandas.DataFrame
        One row. Columns ``0 .. n_apps-1`` hold the summed ``times`` per app
        slot, columns ``n_apps .. 2*n_apps-1`` hold the summed ``duration`` per
        app slot, followed by the columns ``'uId'`` and ``'dayofweek'``.

    Raises
    ------
    IndexError
        If an ``idx`` value is out of bounds for ``n_apps`` slots.
    """
    (uId, dayofweek), group = group_df

    # Accumulate in int32: the previous int16 accumulators overflowed as soon
    # as a per-slot total exceeded 32767, which a single uint16 input value can
    # already do.
    app_times = np.zeros((n_apps,), dtype=np.int32)
    app_durations = np.zeros((n_apps,), dtype=np.int32)

    slot = np.asarray(group['idx'], dtype=np.intp)
    # np.add.at is an unbuffered add, so repeated slots are summed correctly
    # (plain fancy-index `+=` would only count each slot once).
    np.add.at(app_times, slot, np.asarray(group['times'], dtype=np.int32))
    np.add.at(app_durations, slot, np.asarray(group['duration'], dtype=np.int32))

    app_usage = np.concatenate([app_times, app_durations]).reshape((1, -1))
    app_usage_df = pd.DataFrame(app_usage)
    app_usage_df['uId'] = uId
    app_usage_df['dayofweek'] = dayofweek
    return app_usage_df

In [6]:
# Load the saved uId array and report how many ids its element [1] contains.
uId_array = np.load('./uId_array.npy')
len(uId_array[1])

700000

In [7]:
# Wrap element [1] of the uId array in a DataFrame and check its shape.
uId_df = pd.DataFrame(uId_array[1])
uId_df.shape

(700000, 1)

In [8]:
# Name the single column 'uId'.
uId_df.columns = ['uId']

In [9]:
# Preview the first user ids.
uId_df.head()

Unnamed: 0,uId
0,8669774
1,8673581
2,8680332
3,8682427
4,8688337


In [10]:
# Left-join the usage rows onto the selected user ids (key: uId), then show
# the size of the joined table.
sub_user_app_usage_hot = pd.merge(
    uId_df,
    user_app_usage_hot,
    how='left',
    on='uId',
)
sub_user_app_usage_hot.shape

(114728990, 6)

In [12]:
# Inspect dtypes and memory use of the joined subset.
# NOTE(review): the recorded output of this cell reports a different row count
# than the merge cell's shape, so it looks like it came from an earlier run - confirm.
sub_user_app_usage_hot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44601108 entries, 0 to 44601107
Data columns (total 6 columns):
uId          int64
duration     uint16
times        uint16
use_date     datetime64[ns]
idx          uint8
dayofweek    uint8
dtypes: datetime64[ns](1), int64(1), uint16(2), uint8(2)
memory usage: 1.2 GB


In [11]:
# The joined frame carries uId as int64; cast it back to uint32.
sub_user_app_usage_hot['uId'] = sub_user_app_usage_hot['uId'].astype(np.uint32)

In [12]:
# Build one wide row per (uId, dayofweek) group and collect the rows in df_list;
# the running group count is printed every million groups.
df_list = []
i = 0
usage_groups = sub_user_app_usage_hot.groupby(['uId','dayofweek'])
for i, ((uId, dayofweek), group) in enumerate(usage_groups, start=1):
    a = build_usage_app_table_by_dayofweek(((uId, dayofweek), group))
    df_list.append(a)
    if not i % 1000000:
        print(i)

1000000
2000000
3000000
4000000


In [13]:
# Stack the single-row frames into one table. The index is not reset, so every
# row keeps the index label of its source frame.
top_100_app_usage_df = pd.concat(df_list)
top_100_app_usage_df.shape

(4768784, 202)

In [14]:
# Preview the first rows of the wide per-(uId, dayofweek) table.
top_100_app_usage_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,uId,dayofweek
0,1096,0,0,0,68,0,0,29,0,0,...,0,0,0,0,108,0,0,0,1000012,0
0,1529,0,0,0,95,0,0,15,0,0,...,0,0,0,0,371,0,0,0,1000012,1
0,1298,0,0,0,52,0,0,5,0,0,...,0,0,0,0,456,0,0,0,1000012,2
0,2265,0,0,0,94,0,0,20,0,0,...,0,0,0,0,97,0,0,0,1000012,3
0,1936,0,0,0,53,0,0,27,0,0,...,0,0,0,0,4,0,0,0,1000012,4


In [15]:
# Write the wide table to a fresh HDF5 file under key 'data'. The context
# manager closes the store even if the write raises, so the file handle is
# never leaked.
# NOTE: the column labels mix integers (0..199) with strings ('uId',
# 'dayofweek'), which is what triggers the PyTables pickling warning.
with pd.HDFStore(base_dir+'top_100_app_usage_df_1.h5','w') as h5:
    h5['data'] = top_100_app_usage_df

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  


In [None]:
# NOTE(review): looks like a leftover scratch cell - confirm whether it can be removed.
print('0')