In [1]:
# mask v1 0.80448
import pandas as pd
import numpy as np
import gc
from base import Cache
from tqdm import tqdm
from gensim.models import Word2Vec
import sys
from tensorflow.keras.preprocessing.sequence import pad_sequences
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('max_colwidth', 200)
pd.set_option('display.width', 5000)
# # log
# class Logger(object):
#     def __init__(self, fileN="Default.log"):
#         self.terminal = sys.stdout
#         self.log = open(fileN, "a", encoding='utf-8')
# 
#     def write(self, message):
#         self.terminal.write(message)
#         self.log.write(message)
# 
#     def flush(self):
#         pass
# sys.stdout = Logger("zlh0918log.txt")

def reduce_mem(df, use_float16=False):
    start_mem = df.memory_usage().sum() / 1024**2
    tm_cols = df.select_dtypes('datetime').columns
    for col in df.columns:
        if col in tm_cols:
            continue
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def w2v_pro(df_raw, sentence_id, word_id, emb_size=128, 
            dropna=False, n_jobs=16, method='cbow', 
            hs=1,negative=0,epoch=10,return_model=True):
    if method.lower() in ['sg','skipgram']:
        sg=1
        logger.info("## Use skip-gram ##")
    elif method.lower() in ['cbow']:
        sg=0
        logger.info("## Use CBOW ##")
    else:
        raise NotImplementedError
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    logger.info(f"========== W2V:  {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id]].copy()
    if df[sentence_id].isnull().sum() > 0:
        logger.warning("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df = df.fillna('NULL')
    df = df.astype(str)
    tmp = df.groupby(sentence_id,
                     as_index=False)[word_id].agg({list_col_nm: list})
    sentences = tmp[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del tmp[list_col_nm]
    gc.collect()
    model = Word2Vec(
        sentences,
        size=emb_size,
        window=30,
        workers=n_jobs,
        min_count=1,  # 最低词频. min_count>1会出现OOV
        sg=sg,  # 1 for skip-gram; otherwise CBOW.
        hs=hs,  # If 1, hierarchical softmax will be used for model training
        negative=negative,  # hs=1 + negative 负采样
        iter=epoch,
        seed=0)
    
    # get word embedding matrix
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
            
    return {"word_emb_dict": emb_dict}

def get_sequence(data,col,max_len=None):
    key2index = {}
    def split(x):
        for key in x:
            if key not in key2index:
                # Notice : input value 0 is a special "padding", 
                # so we do not use 0 to encode valid feature for sequence input
                key2index[key] = len(key2index) + 1 # 从1开始，0用于padding
        return list(map(lambda x: key2index[x], x))
    
    # preprocess the sequence feature
    id_list = list(map(split, data[col].values))# 转index
    id_list_length = np.array(list(map(len, id_list)))
    # max_len = max(genres_length)
    if max_len is None:
        max_len = int(np.percentile(id_list_length,99))
    id_list = pad_sequences(id_list, maxlen=max_len, padding='post',truncating='post')
    return id_list,key2index

def gen_list_df(feature):
    print(f'{feature} start!')
    data = Cache.reload_cache('CACHE_data_step_1_feature_0917_r5.pkl')
    if feature =='label':
        data['label'] = data['label'].fillna(-1).astype(int)# mask
        data['label'] = data['label']+1# 因为0用于padding
    data = data[['uid',feature,'pt_d']]
    gc.collect()
    print(data.shape)
    data_group = data.groupby(['uid'])
    gc.collect()
    index_list = []
    feature_list = []
    print('index_list start')
    for name,group in tqdm(data_group):
        index_list.append(name)    
    print('feature_list start')
    for i in tqdm(index_list):
        index_get_group = data_group.get_group(i)
        ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
        for j in ptd_set:
            feature_list_ = []
            buf_list = []
            buf_list = index_get_group.query('pt_d < @j')[feature].values.flatten().tolist()
            buf_list.append(0)# padding 0
            feature_list_.append(buf_list)# 行为序列
            feature_list_.append(j)# pt_d
            feature_list_.append(i)# uid
            feature_list.append(feature_list_)

    list_df = pd.DataFrame(feature_list)
    del index_list,feature_list,feature_list_,data_group,index_get_group,ptd_set
    gc.collect()
    list_df.columns=['list','pt_d','uid']
    list_df['list'] = list_df['list'].map(lambda x: [str(i) for i in x])# 转str
    list_df = list_df.drop_duplicates(subset=['pt_d','uid'])
#     data_uid_ptd = data[['uid','pt_d']]
    list_df = data.merge(list_df,how='left',on=('uid','pt_d'))# 顺序还是用data的顺序
    # 加入当天本样本
    if feature!='label':
        list_df['list'] = list_df[feature].map(lambda x:[str(x)]) + list_df['list']
    print('w2v start!')
    emb_size = 32
    model = Word2Vec(
    list_df['list'].values.tolist(),
    size=emb_size,
    window=5,
    workers=5,
    min_count=1,  # 最低词频. min_count>1会出现OOV
    sg=0,  # 1 for skip-gram; otherwise CBOW.
    hs=0,  # If 1, hierarchical softmax will be used for model training
    negative=5,  # hs=1 + negative 负采样
    iter=5,
    seed=0)
    # 1 获取seq
    id_list,key2index = get_sequence(list_df,'list',max_len=40)
    # 2 获取key2index
    emb_dict = {}
    for word_i in list(model.wv.vocab.keys()):
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)
    # 3 保存
    id_list_dict={}
    id_list_dict['id_list'] = id_list
    id_list_dict['key2index'] = key2index
    id_list_dict['emb'] = emb_dict
    Cache.cache_data(id_list_dict, nm_marker=f'EMB_INPUTSEQ_{feature}')
    print(f'{feature} done!')

from multiprocessing import Pool
if __name__ == '__main__':
    # 获取过去的list + 当前的一行
    # 得到id_list_dict和tx一样
    poc_feature_list = ['creat_type_cd','tags','spread_app_id','task_id','adv_id','label']#'task_id','adv_id','dev_id','inter_type_cd','spread_app_id','tags','app_first_class','app_second_class','his_app_size','his_on_shelf_time','app_score',,'creat_type_cd','adv_prim_id','indu_name'
    with Pool(6) as p:
        p.map(gen_list_df, poc_feature_list)
#     # 获取过去的list + 当前的一行
#     # 得到id_list_dict和tx一样
#     poc_feature_list = ['label']#'task_id','adv_id','dev_id','inter_type_cd','spread_app_id','tags','app_first_class','app_second_class','his_app_size','his_on_shelf_time','app_score',,'creat_type_cd','adv_prim_id','indu_name'
#     with Pool(1) as p:
#         p.map(gen_list_df, poc_feature_list)



label start!


[2020-09-19 18:06:55] - __init__.py[line:126] - INFO: Successfully Reload: /home/zhangqibot/proj/digix/zlh/cached_data/CACHE_data_step_1_feature_0917_r5.pkl


(8601298, 3)
index_list start


100%|██████████| 1139171/1139171 [02:16<00:00, 8335.23it/s]


feature_list start


100%|██████████| 1139171/1139171 [1:23:23<00:00, 227.70it/s]


w2v start!


[2020-09-19 19:33:23] - word2vec.py[line:1399] - INFO: collecting all words and their counts
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #10000, processed 105381 words, keeping 3 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #20000, processed 202690 words, keeping 3 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #30000, processed 292708 words, keeping 3 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #40000, processed 396895 words, keeping 3 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #50000, processed 491839 words, keeping 3 word types
[2020-09-19 19:33:23] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #60000, processed 581207 words, keeping 3 word types
[2020-

[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #620000, processed 5897821 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #630000, processed 5995358 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #640000, processed 6087760 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #650000, processed 6194494 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #660000, processed 6292074 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #670000, processed 6386632 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #680000, processed 6478472 words, keeping 3 word types
[2020-09-19 19:33:24] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence

[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1240000, processed 11968977 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1250000, processed 12072515 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1260000, processed 12176829 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1270000, processed 12278509 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1280000, processed 12385096 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1290000, processed 12477857 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1300000, processed 12560042 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1860000, processed 17937361 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1870000, processed 18037501 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1880000, processed 18128779 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1890000, processed 18223067 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1900000, processed 18324951 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1910000, processed 18407453 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #1920000, processed 18510997 words, keeping 3 word types
[2020-09-19 19:33:25] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2480000, processed 23995196 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2490000, processed 24096289 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2500000, processed 24183470 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2510000, processed 24271682 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2520000, processed 24370590 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2530000, processed 24467406 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #2540000, processed 24553965 words, keeping 3 word types
[2020-09-19 19:33:26] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3100000, processed 30022139 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3110000, processed 30106865 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3120000, processed 30206189 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3130000, processed 30304884 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3140000, processed 30403120 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3150000, processed 30490547 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3160000, processed 30580852 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3720000, processed 36001055 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3730000, processed 36094296 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3740000, processed 36192500 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3750000, processed 36293032 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3760000, processed 36382889 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3770000, processed 36494073 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #3780000, processed 36586166 words, keeping 3 word types
[2020-09-19 19:33:27] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4340000, processed 41985443 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4350000, processed 42075379 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4360000, processed 42168729 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4370000, processed 42267669 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4380000, processed 42356401 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4390000, processed 42458081 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4400000, processed 42548630 words, keeping 3 word types
[2020-09-19 19:33:28] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4960000, processed 48057633 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4970000, processed 48163267 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4980000, processed 48254767 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #4990000, processed 48349002 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5000000, processed 48445932 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5010000, processed 48551295 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5020000, processed 48660282 words, keeping 3 word types
[2020-09-19 19:33:29] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5580000, processed 54122761 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5590000, processed 54213085 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5600000, processed 54307306 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5610000, processed 54394764 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5620000, processed 54495290 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5630000, processed 54591816 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #5640000, processed 54701514 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6200000, processed 60204729 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6210000, processed 60308872 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6220000, processed 60397992 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6230000, processed 60494137 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6240000, processed 60611194 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6250000, processed 60703476 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6260000, processed 60800076 words, keeping 3 word types
[2020-09-19 19:33:30] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6820000, processed 66098734 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6830000, processed 66198739 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6840000, processed 66287153 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6850000, processed 66398034 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6860000, processed 66491897 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6870000, processed 66598487 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #6880000, processed 66696135 words, keeping 3 word types
[2020-09-19 19:33:31] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7440000, processed 72244003 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7450000, processed 72349821 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7460000, processed 72439594 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7470000, processed 72534568 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7480000, processed 72632965 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7490000, processed 72719419 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #7500000, processed 72821655 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8060000, processed 78200222 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8070000, processed 78307331 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8080000, processed 78394275 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8090000, processed 78493171 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8100000, processed 78581416 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8110000, processed 78674547 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #8120000, processed 78776459 words, keeping 3 word types
[2020-09-19 19:33:32] - word2vec.py[line:1384] - INFO: PROGRES

[2020-09-19 19:33:33] - word2vec.py[line:1699] - INFO: resetting layer weights
[2020-09-19 19:33:33] - base_any2vec.py[line:1196] - INFO: training model with 5 workers on 3 vocabulary and 32 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
[2020-09-19 19:33:34] - base_any2vec.py[line:1291] - INFO: EPOCH 1 - PROGRESS: at 3.15% examples, 128018 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:33:35] - base_any2vec.py[line:1291] - INFO: EPOCH 1 - PROGRESS: at 6.14% examples, 124755 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:33:36] - base_any2vec.py[line:1291] - INFO: EPOCH 1 - PROGRESS: at 9.02% examples, 122980 words/s, in_qsize 3, out_qsize 8
[2020-09-19 19:33:37] - base_any2vec.py[line:1291] - INFO: EPOCH 1 - PROGRESS: at 11.98% examples, 123788 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:33:38] - base_any2vec.py[line:1291] - INFO: EPOCH 1 - PROGRESS: at 14.88% examples, 123465 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:33:39] - base_any2vec.py[line:1291] - INFO:

[2020-09-19 19:34:27] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 58.67% examples, 122230 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:34:28] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 61.59% examples, 122274 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:34:29] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 64.54% examples, 122330 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:34:30] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 67.50% examples, 122482 words/s, in_qsize 7, out_qsize 2
[2020-09-19 19:34:31] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 70.48% examples, 122580 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:34:32] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 73.46% examples, 122700 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:34:33] - base_any2vec.py[line:1291] - INFO: EPOCH 2 - PROGRESS: at 76.46% examples, 122706 words/s, in_qsize 10, out_qsize 1
[2020-09-19 19:34:34] - ba

[2020-09-19 19:35:18] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 8.79% examples, 121070 words/s, in_qsize 10, out_qsize 0
[2020-09-19 19:35:19] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 11.65% examples, 121163 words/s, in_qsize 10, out_qsize 0
[2020-09-19 19:35:20] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 14.48% examples, 120687 words/s, in_qsize 10, out_qsize 0
[2020-09-19 19:35:21] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 17.38% examples, 120814 words/s, in_qsize 10, out_qsize 0
[2020-09-19 19:35:22] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 20.27% examples, 120541 words/s, in_qsize 8, out_qsize 4
[2020-09-19 19:35:23] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 23.29% examples, 121354 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:35:24] - base_any2vec.py[line:1291] - INFO: EPOCH 4 - PROGRESS: at 26.23% examples, 121529 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:35:25] - 

[2020-09-19 19:36:13] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 72.40% examples, 126171 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:36:14] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 75.38% examples, 126049 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:36:15] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 78.39% examples, 125933 words/s, in_qsize 8, out_qsize 1
[2020-09-19 19:36:16] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 81.39% examples, 125920 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:36:17] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 84.31% examples, 125924 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:36:18] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 87.26% examples, 125833 words/s, in_qsize 10, out_qsize 0
[2020-09-19 19:36:19] - base_any2vec.py[line:1291] - INFO: EPOCH 5 - PROGRESS: at 90.29% examples, 125914 words/s, in_qsize 9, out_qsize 0
[2020-09-19 19:36:20] - ba

label done!


In [33]:
feature='label'
print(f'{feature} start!')
data = Cache.reload_cache('CACHE_data_step_1_feature_0917_r5.pkl')
if feature =='label':
    data['label'] = data['label'].fillna(2).astype(int)# mask到0
#     data['label'] = data['label']+1# 因为0用于padding
data = data[['uid',feature,'pt_d']]
gc.collect()
print(data.shape)
data_group = data.groupby(['uid'])
gc.collect()
index_list = []
feature_list = []
print('index_list start')
for name,group in tqdm(data_group):
    index_list.append(name)    
print('feature_list start')

[2020-09-19 12:12:19] - __init__.py[line:126] - INFO: Successfully Reload: /home/zhangqibot/proj/digix/zlh/cached_data/CACHE_data_step_1_feature_0917_r5.pkl
100%|██████████| 1139171/1139171 [03:36<00:00, 5251.53it/s]


In [37]:
feature_list=[]
index_get_group = data_group.get_group(index_list[2000])
ptd_set = set(index_get_group['pt_d'].values.flatten().tolist())
for j in ptd_set:
    feature_list_ = []
    buf_list = []
    buf_list = index_get_group.query('pt_d < @j')[feature].values.flatten().tolist()
    buf_list.append(2)# padding 1
    feature_list_.append(buf_list)# 行为序列
    feature_list_.append(j)# pt_d
    feature_list_.append(index_list[2000])# uid
    feature_list.append(feature_list_)

list_df = pd.DataFrame(feature_list)
list_df

Unnamed: 0,0,1,2
0,[2],1,1002170
1,"[0, 0, 1, 0, 0, 2]",2,1002170
2,"[0, 0, 0, 0, 0, 1, 0, 0, 2]",3,1002170
3,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]",4,1002170
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]",5,1002170
5,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]",6,1002170
6,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]",7,1002170
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]",8,1002170


In [38]:
list_df.columns=['list','pt_d','uid']
list_df['list'] = list_df['list'].map(lambda x: [str(i) for i in x])# 转str
list_df = list_df.drop_duplicates(subset=['pt_d','uid'])
#     data_uid_ptd = data[['uid','pt_d']]
list_df = data.query('uid==1002170').merge(list_df,how='left',on=('uid','pt_d'))# 顺序还是用data的顺序
# 加入当天本样本
list_df['list'] = list_df[feature].map(lambda x:[str(x)]) + list_df['list']
# list_df = list_df['list'].values.tolist()

In [40]:
list_df

Unnamed: 0,uid,label,pt_d,list
0,1002170,2,8,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
1,1002170,2,8,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
2,1002170,2,8,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
3,1002170,2,8,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
4,1002170,0,7,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
5,1002170,0,7,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
6,1002170,0,7,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
7,1002170,0,6,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
8,1002170,0,6,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"
9,1002170,0,6,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2]"


In [27]:
print('w2v start!')
from tensorflow.keras.preprocessing.sequence import pad_sequences
emb_size = 32
model = Word2Vec(
list_df['list'].values.tolist(),
size=emb_size,
window=5,
workers=5,
min_count=1,  # 最低词频. min_count>1会出现OOV
sg=0,  # 1 for skip-gram; otherwise CBOW.
hs=0,  # If 1, hierarchical softmax will be used for model training
negative=5,  # hs=1 + negative 负采样
iter=5,
seed=0)
# 1 获取seq
id_list,key2index = get_sequence(list_df,'list',max_len=40)
# 2 获取key2index
emb_dict = {}
for word_i in list(model.wv.vocab.keys()):
    if word_i in model.wv:
        emb_dict[word_i] = model.wv[word_i]
    else:
        emb_dict[word_i] = np.zeros(emb_size)
# 3 保存
id_list_dict={}
id_list_dict['id_list'] = id_list
id_list_dict['key2index'] = key2index
id_list_dict['emb'] = emb_dict

[2020-09-19 11:13:39] - word2vec.py[line:1399] - INFO: collecting all words and their counts
[2020-09-19 11:13:39] - word2vec.py[line:1384] - INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
[2020-09-19 11:13:39] - word2vec.py[line:1407] - INFO: collected 28 word types from a corpus of 476 raw words and 31 sentences
[2020-09-19 11:13:39] - word2vec.py[line:1458] - INFO: Loading a fresh vocabulary
[2020-09-19 11:13:39] - word2vec.py[line:1482] - INFO: effective_min_count=1 retains 28 unique words (100% of original 28, drops 0)
[2020-09-19 11:13:39] - word2vec.py[line:1488] - INFO: effective_min_count=1 leaves 476 word corpus (100% of original 476, drops 0)
[2020-09-19 11:13:39] - word2vec.py[line:1547] - INFO: deleting the raw counts dictionary of 28 items
[2020-09-19 11:13:39] - word2vec.py[line:1550] - INFO: sample=0.001 downsamples 24 most-common words
[2020-09-19 11:13:39] - word2vec.py[line:1553] - INFO: downsampling leaves estimated 86 word corpus (18.1% of 

In [28]:
id_list_dict

{'id_list': array([[ 0,  0,  0, ..., 23, 24, 25],
        [ 0,  0,  0, ..., 23, 24, 25],
        [ 0,  0,  0, ..., 23, 24, 25],
        ...,
        [ 0,  0,  0, ...,  0,  4, 25],
        [ 0,  0,  0, ...,  0, 23, 25],
        [ 0,  0,  0, ...,  0, 24, 25]], dtype=int32),
 'key2index': {'5875': 1,
  '4261': 2,
  '5511': 3,
  '3007': 4,
  '2023': 5,
  '1808': 6,
  '5004': 7,
  '1951': 8,
  '4449': 9,
  '2434': 10,
  '2205': 11,
  '5718': 12,
  '3401': 13,
  '3584': 14,
  '1952': 15,
  '2694': 16,
  '2502': 17,
  '3025': 18,
  '2325': 19,
  '5633': 20,
  '1665': 21,
  '3530': 22,
  '4923': 23,
  '1180': 24,
  '0': 25,
  '4530': 26,
  '4032': 27,
  '3006': 28},
 'emb': {'5875': array([-0.01075382,  0.01125624,  0.01399479,  0.01061861, -0.01499187,
         -0.00772008, -0.00963185,  0.00941925,  0.00043449,  0.00346754,
         -0.0087173 , -0.00214588, -0.00205218, -0.01232668, -0.0102672 ,
         -0.0119534 ,  0.01254403,  0.00972093, -0.01073186,  0.01248486,
          0.00673845, 

In [2]:
import pandas as pd
import numpy as np
import gc
from base import Cache
from tqdm import tqdm
from gensim.models import Word2Vec
data = Cache.reload_cache('CACHE_data_step_1_feature_0917_r5.pkl')
seq_emb = Cache.reload_cache('CACHE_EMB_INPUTSEQ_adv_id.pkl')

[2020-09-19 17:53:50] - __init__.py[line:126] - INFO: Successfully Reload: /home/zhangqibot/proj/digix/zlh/cached_data/CACHE_data_step_1_feature_0917_r5.pkl
[2020-09-19 17:53:52] - __init__.py[line:126] - INFO: Successfully Reload: /home/zhangqibot/proj/digix/zlh/cached_data/CACHE_EMB_INPUTSEQ_adv_id.pkl


In [7]:
data[['index','uid','pt_d','adv_id']].head()

Unnamed: 0,index,uid,pt_d,adv_id
0,17073310,2237673,5,6340
1,17636486,2237673,5,4501
2,35175266,2237672,6,3701
3,30784519,2237672,4,5389
4,11939625,2237672,4,4506


In [8]:
data.query('uid==2237673')[['index','uid','pt_d','adv_id']]

Unnamed: 0,index,uid,pt_d,adv_id
0,17073310,2237673,5,6340
1,17636486,2237673,5,4501


In [4]:
seq_emb.keys()

dict_keys(['id_list', 'key2index', 'emb'])

In [6]:
seq_emb['id_list'][:5,:]

array([[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [4, 5, 6, 7, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [5, 7, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [6, 7, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)

In [10]:
seq_emb['key2index']['6340']

1

In [11]:
seq_emb['key2index']['4501']

3

### 检查过了，seq 和 dense 可以对上！