In [1]:
import pandas as pd
import numpy as np
import datetime
import pickle
import logging
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
from tqdm import tqdm_notebook as tqdm
import time
import IPython.display as ipd


# df_master

In [26]:
df_master_records = pd.read_csv('../data/202002/master_records.csv')
df_map_income = pd.read_csv('../data/202002/map_income.csv')
mp_income_number = {}
for income, real_money in zip(df_map_income['income'], df_map_income['近12个月月收入平均区间']):
    real_money = str(real_money)
    two_ends = real_money.split('-')
    min_money = float(two_ends[0].replace('+', ''))
    if len(two_ends) > 1:
        mp_income_number[income] = [min_money, two_ends[1]] 
    else:
        mp_income_number[income] = [min_money, min_money] 
df_income_map = pd.DataFrame(pd.DataFrame(mp_income_number).T)
df_income_map.columns = ['min_income', 'max_income']
df_master_records = pd.merge(df_master_records, df_income_map, how = 'left', left_on = 'income', right_index = True)

df_master_records['loan_date'] = pd.to_datetime(df_master_records['loan_date'], format='%Y%m%d', errors='ignore')
loan_sequence = pd.read_csv('../data/202002/loan_sequence.csv')
loan_sequence.index = loan_sequence.id
df_master_records.index = df_master_records.id
df_master_records['loan_sequence'] = loan_sequence['loan_sequence']

target_update_0514 = pd.read_csv('../data/202005/target_update_0514.csv')
target_update_0514.index = target_update_0514.id
df_master_records = pd.merge(df_master_records, target_update_0514, how = 'left', left_index = True, right_index = True)
df_master_records = df_master_records.drop(['id_x', 'id_y', 'Unnamed: 11'], axis = 1)
pickle.dump(df_master_records, open('../data_sortout/df_master_records.pickle', 'wb'))

# app_install_list

In [2]:
df_app_install_records = pd.read_csv('../data/202002/app_installed_records.csv', header=None, names=['id', 'loan_date', 'pkg', 'date'])
df_app_install_records['loan_date'] = pd.to_datetime(df_app_install_records['loan_date'], format='%Y%m%d', errors='ignore')
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
pkg_id = le.fit_transform(df_app_install_records['pkg'])
df_app_install_records['pkg_id'] = pkg_id
df_app_install_records['pkg_id'] = df_app_install_records['pkg_id'] + 1

In [12]:
pickle.dump(le.classes_, open('../data_sortout/mp_install_list_pkg_id2pkg.pickle', 'wb'))
logging.info('start se_id_install_list')
se_id_install_list = df_app_install_records.groupby('id').apply(lambda x : list(x['pkg_id']))
pickle.dump(se_id_install_list, open('../data_sortout/se_id_install_list.pickle', 'wb'))
logging.info('finish se_id_install_list')

2020-08-24 14:46:44,125 - INFO - start se_id_install_list
2020-08-24 14:46:58,178 - INFO - finish se_id_install_list


# app install behave

In [2]:
df_install_behave = pd.read_csv('../data/202002/app_inunstall_records.csv', header=None, names=['id', 'loan_date', 'pkg', 'date', 'action_type'])
df_install_behave['loan_date'] = pd.to_datetime(df_install_behave['loan_date'], format='%Y%m%d', errors='ignore')
df_install_behave['date'] = pd.to_datetime(df_install_behave['date'], format='%Y%m%d', errors='ignore')
df_install_behave_sort = df_install_behave.sort_values(by = ['date'])

In [3]:
df_install_behave_sort_less = df_install_behave_sort[df_install_behave_sort.loan_date > df_install_behave_sort.date]

In [4]:
df_install_behave_sort_less.shape, df_install_behave_sort.shape

((26896240, 5), (54811025, 5))

In [5]:
len(set(df_install_behave_sort_less.id)), len(set(df_install_behave_sort.id)),

(197737, 288083)

In [6]:
df_install_behave_sort = df_install_behave_sort_less

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
pkg_id = le.fit_transform(df_install_behave_sort['pkg'])
pickle.dump(le.classes_, open('../data_sortout/mp_install_behave_pkg_id2pkg.pickle', 'wb'))

In [9]:
df_install_behave_sort['pkg_id'] = pkg_id + 1

In [10]:
logging.info('start df_install_behave_sort')
df_install_behave = df_install_behave_sort.groupby('id').apply(lambda x : pd.Series({
    'pkg_id' : list(x['pkg_id'].values),
    'date' : list(x['date'].values),
    'action' : list(x['action_type'].map({'install': 1, 'unstall':0}) ),
}))
logging.info('finish df_install_behave_sort')


2020-08-24 14:58:25,292 - INFO - start df_install_behave_sort
2020-08-24 15:03:26,004 - INFO - finish df_install_behave_sort


In [5]:
df_install_behave

Unnamed: 0_level_0,pkg_id,date,action
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[60891, 140058, 61058, 129506, 211465, 142427,...","[2018-03-30T00:00:00.000000000, 2018-03-30T00:...","[1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, ..."
2,"[271118, 277281, 78310, 78310, 250267, 210170,...","[2019-05-23T00:00:00.000000000, 2019-05-23T00:...","[1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[283371, 129506, 259322, 283371, 259322, 25932...","[2019-01-07T00:00:00.000000000, 2019-01-07T00:...","[1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, -..."
4,"[283371, 129506, 259322, 283371, 259322, 25932...","[2019-01-07T00:00:00.000000000, 2019-01-07T00:...","[1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, -..."
5,"[65893, 132149, 107776, 266168, 217755, 101846...","[2019-03-19T00:00:00.000000000, 2019-03-19T00:...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -..."
...,...,...,...
402835,"[125491, 104804, 97889, 104817, 104818, 104823...","[2018-01-02T00:00:00.000000000, 2018-01-19T00:...","[1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
402836,"[172615, 250356, 283371, 169411, 277168, 27716...","[2019-04-10T00:00:00.000000000, 2019-04-12T00:...","[1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1..."
402837,"[156535, 177297, 119216, 177297, 283371, 60891...","[2019-08-12T00:00:00.000000000, 2019-08-12T00:...","[-1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
402838,"[81795, 60891, 195253, 172615, 60891, 283748, ...","[2018-10-26T00:00:00.000000000, 2018-10-26T00:...","[-1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1,..."


In [11]:
logging.info('start dump df_install_behave_no_date')
pickle.dump(df_install_behave[['pkg_id', 'action']], open('../data_sortout/df_install_behave_no_date.pickle', 'wb'))
logging.info('finish dump df_install_behave_no_date')

logging.info('start dump df_install_behave')
pickle.dump(df_install_behave, open('../data_sortout/df_install_behave.pickle', 'wb'))
logging.info('finish dump df_install_behave')


2020-08-24 15:05:57,461 - INFO - start dump df_install_behave_no_date
2020-08-24 15:07:30,924 - INFO - finish dump df_install_behave_no_date
2020-08-24 15:07:30,927 - INFO - start dump df_install_behave
2020-08-24 15:17:15,888 - INFO - finish dump df_install_behave


In [9]:
df_day = df_install_behave['date'].apply(lambda date_list : pd.Series({
    'year' : list(pd.DatetimeIndex(date_list).year - 2000),
    'month' : list(pd.DatetimeIndex(date_list).month),
    'day' : list(pd.DatetimeIndex(date_list).day),
}))
logging.info('start dump df_install_behave_date')
pickle.dump(df_day, open('../data_sortout/df_install_behave_date.pickle', 'wb'))
logging.info('finish dump df_install_behave_date')

2020-07-01 18:22:39,697 - INFO - start dump df_install_behave_no_date
2020-07-01 18:24:24,903 - INFO - finish dump df_install_behave_no_date
2020-07-01 18:24:25,246 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-07-01 18:24:25,247 - INFO - NumExpr defaulting to 8 threads.
2020-07-01 18:30:21,189 - INFO - start dump df_install_behave_date
2020-07-01 18:30:24,339 - INFO - finish dump df_install_behave_date


In [2]:
df_install_behave = pickle.load(open('../data_sortout/df_install_behave_no_date.pickle', 'rb'))
new_actions = []
for actions in tqdm( df_install_behave['action']):
    new_action = list(map(lambda x : x if x > 0 else 0, actions) )
    new_actions.append(new_action)
df_install_behave['action'] = new_actions

HBox(children=(IntProgress(value=0, max=197737), HTML(value='')))




In [4]:
pickle.dump(df_install_behave, open('../data_sortout/df_install_behave_no_date.pickle', 'wb'))

## time

In [None]:
df_install_behave = pd.read_csv('../data/202002/app_inunstall_records.csv', header=None, names=['id', 'loan_date', 'pkg', 'date', 'action_type'])
df_install_behave['loan_date'] = pd.to_datetime(df_install_behave['loan_date'], format='%Y%m%d', errors='ignore')
df_install_behave['date'] = pd.to_datetime(df_install_behave['date'], format='%Y%m%d', errors='ignore')
df_install_behave_sort = df_install_behave.sort_values(by = ['date'])
df_install_behave_sort_less = df_install_behave_sort[df_install_behave_sort.loan_date > df_install_behave_sort.date]

In [21]:
qcut_time = pd.qcut(df_install_behave_sort_less['date'], 32)
cut_time = pd.cut(df_install_behave_sort_less['date'], 32)

In [39]:
mp_qcut_id = dict(zip(list(qcut_time.value_counts().sort_index().index), list(range(32))))
qcut_time_id = qcut_time.map(mp_qcut_id)

In [40]:
mp_cut_id = dict(zip(list(cut_time.value_counts().sort_index().index), list(range(32))))
cut_time_id = cut_time.map(mp_cut_id)

In [45]:
df_install_behave_sort_less['qcut_time_id'] = qcut_time_id
df_install_behave_sort_less['cut_time_id'] = cut_time_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [46]:
df_install_behave_sort_less

Unnamed: 0,id,loan_date,pkg,date,action_type,qcut_time_id,cut_time_id
48652634,128369,2019-05-16,86c50828e16d0bd301679f74e300bcd2ac18b70c.d2a39...,2018-01-01,install,0,0
7847169,210807,2019-07-17,86c50828e16d0bd301679f74e300bcd2ac18b70c.783e2...,2018-01-01,unstall,0,0
25507654,260063,2019-08-16,86c50828e16d0bd301679f74e300bcd2ac18b70c.62178...,2018-01-01,unstall,0,0
37400437,46113,2019-01-09,86c50828e16d0bd301679f74e300bcd2ac18b70c.8df0d...,2018-01-01,install,0,0
49391071,263646,2019-09-09,86c50828e16d0bd301679f74e300bcd2ac18b70c.e2b76...,2018-01-01,install,0,0
...,...,...,...,...,...,...,...
13948738,237696,2019-09-29,86c50828e16d0bd301679f74e300bcd2ac18b70c.77bbf...,2019-09-28,install,31,31
9315965,78902,2019-09-29,86c50828e16d0bd301679f74e300bcd2ac18b70c.e4198...,2019-09-28,unstall,31,31
9315968,78902,2019-09-29,86c50828e16d0bd301679f74e300bcd2ac18b70c.e4198...,2019-09-28,install,31,31
34114495,12548,2019-09-29,86c50828e16d0bd301679f74e300bcd2ac18b70c.6c5aa...,2019-09-28,install,31,31


In [49]:
df_time = df_install_behave_sort_less.groupby('id').apply(lambda x : pd.Series({
    'qcut_id' : list(x['qcut_time_id'].values),
    'cut_id' : list(x['cut_time_id'].values),
}))

In [50]:
pickle.dump(df_time, open('../data_sortout/df_time_cut.pickle', 'wb'))

In [51]:
df_time

Unnamed: 0_level_0,qcut_id,cut_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8]","[4, 4, 4, 4, 4, 4, 4, 4, 15, 15, 15]"
2,"[23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2..."
3,"[12, 12, 16]","[18, 18, 21]"
4,"[12, 12, 16]","[18, 18, 21]"
8,"[6, 6, 6, 6, 6, 6, 6, 10, 10, 11, 11, 11, 11, ...","[13, 13, 13, 13, 13, 13, 13, 17, 17, 17, 17, 1..."
...,...,...
402831,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
402833,"[10, 10, 10, 12, 12, 12, 14, 14, 14, 14, 14, 1...","[17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 19, 1..."
402835,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, ..."
402836,"[19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 2...","[23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2..."


# App user log

In [2]:
df_user_log = pickle.load(open('../data/202005/df_user_log.pickle','rb'))

In [3]:
df_user_log

Unnamed: 0,userid,time,order_in_day,order_in_session,session_id,page,tgt_event_id
0,23836,2018-04-30 18:27:58.582,18,18,8329882,,59.0
1,23836,2018-04-30 18:27:39.337,8,8,8329882,,59.0
2,23836,2018-04-30 18:27:33.282,1,1,8329882,,59.0
3,93197,2018-08-13 07:30:36.060,28,4,15291922,,59.0
4,71227,2018-11-29 21:16:52.902,146,146,21750936,,59.0
...,...,...,...,...,...,...,...
105134439,114016,2020-02-28 09:17:53.785,236,235,21436045,18.0,
105134440,114016,2020-02-28 09:17:45.620,229,228,21436045,18.0,
105134441,114016,2020-02-28 09:17:40.661,224,223,21436045,18.0,
105134442,180399,2020-02-24 07:50:35.274,110,110,2488024,18.0,


In [7]:
df_user_log['page'].count() / df_user_log['page'].shape[0], df_user_log['tgt_event_id'].count() / df_user_log['tgt_event_id'].shape[0]

(0.9240901541459543, 0.8705568085227181)

In [3]:
df_user_log['time'] = pd.to_datetime(df_user_log['time'], format='%Y-%m-%d %H:%M:%S', errors='ignore')

In [4]:
df_user_log = df_user_log.sort_values(by = ['time'])

In [5]:
df_master_records = pickle.load(open('../data_sortout/df_master_records.pickle', 'rb'))

In [6]:
df_user_log = df_user_log.fillna(0)

In [7]:
gby_userid_log = df_user_log.groupby('userid')
mp_userid_log = {}
for userid, df in gby_userid_log:
    mp_userid_log[userid] = df

In [8]:
df_master_records_with_log = df_master_records[df_master_records.userid.isin(set(df_user_log['userid']))]

In [9]:
df_master_records_with_log.shape, df_master_records.shape

((402033, 21), (402840, 21))

In [10]:
df_user_log['page'] = df_user_log['page'].apply(int)
df_user_log['tgt_event_id'] = df_user_log['tgt_event_id'].apply(int)

In [11]:
userlog_before_loan = []
useids_log = []
cnt = 0
for i in tqdm(range(df_master_records_with_log.shape[0])):
    se = df_master_records_with_log.iloc[i]
    userid = se['userid']
    loan_date = se['loan_date']
    df_item = mp_userid_log[userid].query('time <= @loan_date')
#     ipd.display(df_item)
    useids_log.append(userid)
    userlog_before_loan.append({
        'time' : df_item['time'].values,
        'session_id' : df_item['session_id'].values,
        'page' : df_item['page'].values,
        'tgt_event_id' : df_item['tgt_event_id'].values,
    })
    

HBox(children=(IntProgress(value=0, max=402033), HTML(value='')))

2020-08-06 14:04:14,950 - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-08-06 14:04:14,952 - INFO - NumExpr defaulting to 8 threads.





In [12]:
df_userlog_sequence = pd.DataFrame(userlog_before_loan, index = df_master_records_with_log.index)

In [13]:
pickle.dump(df_userlog_sequence, open('../data_sortout/df_userlog_sequence.pickle', 'wb'))

In [14]:
df_userlog_sequence = df_userlog_sequence[df_userlog_sequence['time'].apply(len) > 0]
pickle.dump(df_userlog_sequence[['page', 'tgt_event_id']], open('../data_sortout/df_userlog_sequence_less.pickle', 'wb'))

## Time

In [2]:
df_userlog_sequence = pickle.load(open('../data_sortout/df_userlog_sequence.pickle', 'rb'))

In [7]:
df_master_records = pickle.load(open('../data_sortout/df_master_records.pickle', 'rb'))

In [22]:
df_userlog_sequence_data = df_userlog_sequence[df_userlog_sequence['time'].apply(len) > 0]
df_userlog_sequence_with_loan_date = pd.merge(df_userlog_sequence_data, df_master_records[['loan_date']], how = 'left', left_index=True, right_index=True)

In [111]:
day_list, second_list, id_list = [], [], []
for i in tqdm(range(df_userlog_sequence_with_loan_date.shape[0])):
    a_row = df_userlog_sequence_with_loan_date.iloc[i]
    time_list = list(map(lambda x : ((a_row['loan_date'] - x).seconds, (a_row['loan_date'] - x).days), a_row['time']))
    time_list = list(zip(*time_list))
    second_list.extend(time_list[0])
    day_list.extend(time_list[1])
    id_list.extend([a_row.name] * len(time_list[1]))

HBox(children=(IntProgress(value=0, max=319977), HTML(value='')))




In [112]:
def cut(se_time, cut_piece):
    qcut_time = pd.qcut(se_time, cut_piece)
    mp_qcut_id = dict(zip(list(qcut_time.value_counts().sort_index().index), list(range(cut_piece))))
    qcut_time_id = qcut_time.map(mp_qcut_id)
    
    cut_time = pd.cut(se_time, cut_piece)
    mp_cut_id = dict(zip(list(cut_time.value_counts().sort_index().index), list(range(cut_piece))))
    cut_time_id = cut_time.map(mp_cut_id)
    
    return qcut_time_id, cut_time_id

df_time = pd.DataFrame({'day' :day_list, 'second' : second_list, 'id' : id_list})

qcut_day_id, cut_day_id = cut(df_time['day'], 8)
qcut_second_id, cut_second_id = cut(df_time['second'], 32)
df_time['qcut_day_id'] = qcut_day_id
df_time['cut_day_id'] = cut_day_id
df_time['qcut_second_id'] = qcut_second_id
df_time['cut_second_id'] = cut_second_id

In [113]:
df_time_seq = df_time.groupby('id').apply(lambda x : pd.Series({
    'qcut_day_id' : list(x['qcut_day_id']),
    'cut_day_id' : list(x['cut_day_id']),
    'qcut_second_id' : list(x['qcut_second_id']),
    'cut_second_id' : list(x['cut_second_id']),
}))

In [116]:
# df_time_seq

In [114]:
pickle.dump(df_time_seq, open('../data_sortout/df_userlog_time_seq.pickle', 'wb'))

## cross

In [2]:
df_userlog_sequence = pickle.load(open('../data_sortout/df_userlog_sequence_less.pickle', 'rb'))

In [3]:
cross_list = []
for page_list, tgt_list in tqdm(zip(df_userlog_sequence['page'], df_userlog_sequence['tgt_event_id'])):
    cross_list.append([str(int(page))+ '_' + str(int(tgt)) for (page, tgt) in zip(page_list, tgt_list)])
se_userlog_cross = pd.Series(cross_list)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




NameError: name 'df_userlog' is not defined

In [4]:
se_userlog_cross.index = df_userlog_sequence.index
pickle.dump(se_userlog_cross, open('../data_sortout/se_userlog_cross.pickle', 'wb'))

In [32]:
se_userlog_cross = pickle.load(open('../data_sortout/se_userlog_cross.pickle', 'rb'))

In [5]:
mp_cross_id = {}
cnt = 1
for i in range(se_userlog_cross.shape[0]):
    user_log_list = se_userlog_cross.iloc[i]
    for action in user_log_list: 
        if (action not in mp_cross_id):
            mp_cross_id[action] = cnt
            cnt += 1
    if(cnt == 138):
        break
#     print(user_log_list)
#     break

In [6]:
se_userlog_cross_id = se_userlog_cross.apply(lambda x : list(map(lambda item : mp_cross_id[item], x)))
pickle.dump(se_userlog_cross_id, open('../data_sortout/se_userlog_cross_id.pickle', 'wb'))

In [37]:
se_userlog_cross.apply(len).sum()

185482544

In [33]:
se_userlog_cross

id
1         [15_57, 0_0, 0_0, 20_47, 17_0, 24_0, 24_45, 29...
2         [0_0, 17_45, 17_0, 24_45, 29_59, 2_0, 16_0, 16...
3         [0_0, 0_0, 20_47, 24_45, 29_59, 29_39, 2_0, 2_...
4         [0_0, 0_0, 20_47, 24_45, 29_59, 29_39, 2_0, 2_...
5         [15_57, 0_0, 0_0, 20_47, 17_45, 17_45, 17_0, 1...
                                ...                        
402835    [15_57, 29_59, 15_57, 29_59, 29_59, 20_47, 0_0...
402836    [15_57, 0_0, 29_59, 20_47, 29_59, 25_35, 1_0, ...
402837    [15_57, 29_59, 29_39, 20_47, 24_45, 29_49, 29_...
402838    [2_0, 2_36, 17_0, 24_45, 29_39, 29_39, 8_36, 8...
402839    [15_57, 15_57, 0_0, 0_0, 20_47, 0_0, 0_0, 29_5...
Length: 319977, dtype: object

## load

In [2]:
df_userlog_sequence = pickle.load(open('../data_sortout/df_userlog_sequence_less.pickle', 'rb'))

In [5]:
pad_start_session = []
userids = []
start_token = '#'
for i in tqdm(range(df_userlog_sequence.shape[0])):
    se_item = df_userlog_sequence.iloc[i]
    
    session_ids = se_item['session_id']
    
    if(len(session_ids) == 0):
        continue
        
    time_list, page_list, tgt_list = [start_token], [start_token], [start_token]
    for j in range(len(session_ids)):
        if (j > 0 and session_ids[j] != session_ids[j-1]):
            time_list.append(start_token)
            page_list.append(start_token)
            tgt_list.append(start_token)
        time_list.append(se_item['time'][j])
        page_list.append(se_item['page'][j])
        tgt_list.append(se_item['tgt_event_id'][j])

    pad_start_session.append({
        'time' : time_list,
        'page' : page_list,
        'tgt' : tgt_list,
    })
    userids.append(se_item.name)

HBox(children=(IntProgress(value=0, max=402033), HTML(value='')))




In [None]:
df_userlog_sequence_start_token = pd.DataFrame(pad_start_session, index = userids)
pickle.dump(df_userlog_sequence_start_token, open('../data_sortout/df_userlog_sequence_start_token.pickle', 'wb'))

In [7]:
df_userlog_sequence_start_token

Unnamed: 0,time,page,tgt
8175,"[#, 2019-02-06T23:46:33.440000000, 2019-02-06T...","[#, 15.0, nan, nan, 20.0, 17.0, 24.0, 24.0, 29...","[#, 57.0, nan, nan, 47.0, nan, nan, 45.0, 59.0..."
2826,"[#, 2019-01-09T09:32:19.808000000, 2019-01-09T...","[#, nan, 17.0, 17.0, 24.0, 29.0, 2.0, 16.0, 16...","[#, nan, 45.0, nan, 45.0, 59.0, nan, nan, nan,..."
77490,"[#, 2019-03-20T22:55:48.505000000, 2019-03-20T...","[#, nan, nan, 20.0, 24.0, 29.0, 29.0, 2.0, 2.0...","[#, nan, nan, 47.0, 45.0, 59.0, 39.0, nan, 37...."
77490,"[#, 2019-03-20T22:55:48.505000000, 2019-03-20T...","[#, nan, nan, 20.0, 24.0, 29.0, 29.0, 2.0, 2.0...","[#, nan, nan, 47.0, 45.0, 59.0, 39.0, nan, 37...."
2119,"[#, 2019-02-22T21:16:32.436000000, 2019-02-22T...","[#, 15.0, nan, nan, 20.0, 17.0, 17.0, 17.0, 17...","[#, 57.0, nan, nan, 47.0, 45.0, 45.0, nan, 45...."
...,...,...,...
107763,"[#, 2018-12-23T18:05:44.506000000, 2018-12-23T...","[#, 15.0, 29.0, 15.0, 29.0, 29.0, 20.0, nan, n...","[#, 57.0, 59.0, 57.0, 59.0, 59.0, 47.0, nan, n..."
145737,"[#, 2019-05-19T09:43:46.777000000, 2019-05-19T...","[#, 15.0, nan, 29.0, 20.0, 29.0, 25.0, 1.0, 1....","[#, 57.0, nan, 59.0, 47.0, 59.0, 35.0, nan, na..."
102221,"[#, 2018-12-04T08:05:43.017000000, 2018-12-04T...","[#, 15.0, 29.0, 29.0, 20.0, 24.0, 29.0, 29.0, ...","[#, 57.0, 59.0, 39.0, 47.0, 45.0, 49.0, 39.0, ..."
48480,"[#, 2018-12-24T13:13:56.082000000, 2018-12-24T...","[#, 2.0, 2.0, 17.0, 24.0, 29.0, 29.0, 8.0, 8.0...","[#, nan, 36.0, nan, 45.0, 39.0, 39.0, 36.0, 36..."
