# base features

In [1]:
import pandas as pd 
import numpy as np
import gc
from base import Cache
from tqdm import tqdm

from multiprocessing import Pool

def reduce_mem(df, use_float16=False):
    """Downcast numeric columns of ``df`` to smaller dtypes to save memory.

    Only columns backed by a plain NumPy signed-integer or floating dtype are
    touched. Everything else (object/string, bool, unsigned integers,
    datetimes, timedeltas, pandas extension dtypes such as category or
    nullable integers) is left exactly as it is; casting those through the
    float branch would either raise or silently lose information.

    Args:
        df: DataFrame to shrink. Columns are reassigned on this object.
        use_float16: When True, float columns whose range fits are stored as
            float16; otherwise float32 is the smallest float type used.

    Returns:
        The same DataFrame object, after downcasting.

    Side effects:
        Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Skip anything that is not a NumPy signed int ('i') or float ('f').
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            # Pick the narrowest signed type whose (exclusive) range holds
            # the column; an empty column yields NaN bounds and is left alone.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and c_min > f16.min and c_max < f16.max:
                df[col] = df[col].astype(np.float16)
            elif c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range or all-NaN columns stay in float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid a NaN percentage when the frame reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, saved_pct))
    return df

In [2]:
# Reload the cached frame by file name via the project's Cache helper
# (the name suggests a 1:5 positive/negative sampling — TODO confirm).
data = Cache.reload_cache('CACHE_data_sampling_pos1_neg5.pkl')

[2020-09-26 17:12:03] - __init__.py[line:126] - INFO: Successfully Reload: /home/zhangqibot/proj/digix/zlh/stage2/cached_data/CACHE_data_sampling_pos1_neg5.pkl


## count encode

In [3]:
from multiprocessing import Pool

# Categorical columns that get count-encoded: the fixed attribute columns
# followed by the 24 numbered communication_onlinerate_* columns.
cate_cols = [
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'city', 'device_name', 'career', 'gender', 'age',
    'net_type', 'residence', 'emui_dev', 'indu_name',
] + [f'communication_onlinerate_{n}' for n in range(1, 25)]

# One narrow (uid, pt_d, <column>) frame per categorical column, so each
# worker only receives the data it needs.
cate_cols_df = [data[['uid', 'pt_d', var]] for var in tqdm(cate_cols)]


def cls(df):
    """Count-encode the last column of ``df`` globally and per day.

    Two features are produced for the encoded column ``f = df.columns[-1]``:

    * ``f_count``: frequency of each value divided by the frequency of the
      most common value. Frequencies come from the train rows (``pt_d < 8``);
      values that only occur in the test rows (``pt_d >= 8``) fall back to
      the equivalent ratio computed on the test rows.
    * ``f_pt_d_count``: number of rows per (value, ``pt_d``) pair divided by
      the largest such count on the same ``pt_d``.

    Args:
        df: Frame with columns ``uid``, ``pt_d`` and, last, the column to
            encode. It is not modified.

    Returns:
        A frame with columns ``[f, 'pt_d', f_count, f_pt_d_count]``, with the
        same row order and the same index as ``df`` so that it can be
        concatenated column-wise with the source frame.

    Side effects:
        Prints the column index of the merged frame.
    """
    f = df.columns[-1]
    count_col = f + '_count'
    day_col = f'{f}_pt_d_count'

    # Normalised frequencies, computed once per partition.
    train_counts = df.query('pt_d<8')[f].value_counts()
    test_counts = df.query('pt_d>=8')[f].value_counts()
    # Start from the test ratios, then let the train ratios override them so
    # train statistics take priority wherever a value appears in both.
    mapping = (test_counts / test_counts.max()).to_dict()
    mapping.update((train_counts / train_counts.max()).to_dict())

    # assign() returns a new frame, leaving the caller's frame untouched.
    out = df.assign(**{count_col: df[f].map(mapping)})

    # Per-day count of each value, scaled by that day's maximum count.
    daily = df.groupby([f, 'pt_d'])['uid'].count().rename(day_col).reset_index()
    day_max = daily.groupby('pt_d')[day_col].transform('max')
    daily[day_col] = (daily[day_col] / day_max).fillna(0)

    # (f, pt_d) is unique in `daily`, so the left merge keeps one output row
    # per input row in the original order; only the index is reset by merge,
    # so put the caller's index back.
    out = out.merge(daily, on=[f, 'pt_d'], how='left')
    out.index = df.index
    print(out.columns)
    return out[[f, 'pt_d', count_col, day_col]]


# Encode every categorical column in parallel; results come back in the same
# order as cate_cols.
with Pool(10) as pool:
    result = pool.map(cls, cate_cols_df)

# Append the two new feature columns of each result to the main frame.
for f, fe in zip(cate_cols, result):
    new_cols = fe.columns[-2:]
    data = pd.concat([data, fe[new_cols]], axis=1)
    print(new_cols, f, data.shape)
    del fe
    gc.collect()

del result, f, cate_cols_df
gc.collect()
data = reduce_mem(data, use_float16=False)

# print(data)

100%|██████████| 44/44 [00:02<00:00, 16.49it/s]


Index(['uid', 'pt_d', 'slot_id', 'slot_id_count', 'slot_id_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'task_id', 'task_id_count', 'task_id_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'dev_id', 'dev_id_count', 'dev_id_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'creat_type_cd', 'creat_type_cd_count',
       'creat_type_cd_pt_d_count'],
      dtype='object')
Index(['uid', 'pt_d', 'tags', 'tags_count', 'tags_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'app_second_class', 'app_second_class_count',
       'app_second_class_pt_d_count'],
      dtype='object')
Index(['uid', 'pt_d', 'device_name', 'device_name_count',
       'device_name_pt_d_count'],
      dtype='object')
Index(['uid', 'pt_d', 'gender', 'gender_count', 'gender_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'net_type', 'net_type_count', 'net_type_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', 'emui_dev', 'emui_dev_count', 'emui_dev_pt_d_count'], dtype='object')
Index(['uid', 'pt_d', '

Index(['communication_onlinerate_4_count', 'communication_onlinerate_4_pt_d_count'], dtype='object') communication_onlinerate_4 (9672928, 108)
Index(['communication_onlinerate_5_count', 'communication_onlinerate_5_pt_d_count'], dtype='object') communication_onlinerate_5 (9672928, 110)
Index(['communication_onlinerate_6_count', 'communication_onlinerate_6_pt_d_count'], dtype='object') communication_onlinerate_6 (9672928, 112)
Index(['communication_onlinerate_7_count', 'communication_onlinerate_7_pt_d_count'], dtype='object') communication_onlinerate_7 (9672928, 114)
Index(['communication_onlinerate_8_count', 'communication_onlinerate_8_pt_d_count'], dtype='object') communication_onlinerate_8 (9672928, 116)
Index(['communication_onlinerate_9_count', 'communication_onlinerate_9_pt_d_count'], dtype='object') communication_onlinerate_9 (9672928, 118)
Index(['communication_onlinerate_10_count', 'communication_onlinerate_10_pt_d_count'], dtype='object') communication_onlinerate_10 (9672928, 1

## target encode

In [4]:
##########################groupby feature#######################
def group_fea(data, key, target):
    """Count the distinct ``target`` values seen for each ``key`` value.

    Args:
        data: Frame containing the ``key`` and ``target`` columns.
        key: Name of the grouping column.
        target: Name of the column whose distinct values are counted.

    Returns:
        A frame with one row per ``key`` value and two columns: ``key`` and
        ``key + target + '_nunique'``.
    """
    # Passing a {new_name: func} dict to SeriesGroupBy.agg is rejected by
    # pandas >= 1.0, so aggregate first and name the result afterwards.
    feature_name = key + target + '_nunique'
    return data.groupby(key)[target].nunique().rename(feature_name).reset_index()


def group_fea_pt_d(data, key, target):
    """Per-day distinct ``target`` count for each ``key``, scaled per day.

    For every (``key``, ``pt_d``) pair the number of distinct ``target``
    values is computed and divided by the largest such number on the same
    ``pt_d``.

    Args:
        data: Frame containing ``key``, ``target`` and ``pt_d`` columns.
        key: Name of the grouping column.
        target: Name of the column whose distinct values are counted.

    Returns:
        A frame with columns ``key``, ``'pt_d'`` and
        ``key + target + '_pt_d_nunique'``, one row per (key, pt_d) pair.

    Side effects:
        Prints a banner line containing ``target``.
    """
    # Passing a {new_name: func} dict to SeriesGroupBy.agg is rejected by
    # pandas >= 1.0, so aggregate first and name the result afterwards.
    feature_name = key + target + '_pt_d_nunique'
    tmp = (
        data.groupby([key, 'pt_d'])[target]
        .nunique()
        .rename(feature_name)
        .reset_index()
    )
    # Broadcast each day's maximum back onto its rows and normalise by it.
    day_max = tmp.groupby('pt_d')[feature_name].transform('max')
    tmp[feature_name] = tmp[feature_name] / day_max
    print("**************************{}**************************".format(target))
    return tmp


# Grouping columns and the columns whose distinct values are counted.
feature_key = ['uid', 'age', 'gender', 'career', 'city', 'slot_id', 'net_type']
feature_target = ['task_id', 'adv_id', 'dev_id', 'spread_app_id', 'indu_name']

# For every (key, target) pair attach the overall and the per-day
# distinct-count features to the main frame.
for key in tqdm(feature_key):
    for target in feature_target:
        data = data.merge(group_fea(data, key, target), on=key, how='left')
        data = data.merge(
            group_fea_pt_d(data, key, target), on=[key, 'pt_d'], how='left')
gc.collect()
data = reduce_mem(data, use_float16=False)

# Split by day; reset_index() keeps the row position in an 'index' column.
test_df = data[data["pt_d"] >= 8].copy().reset_index()
train_df = data[data["pt_d"] < 8].reset_index()
del data
gc.collect()

# Collect the names of the features created by the groupby step.
group_list = [s for s in train_df.columns if '_nunique' in s]
print(group_list)

##########################target_enc feature#######################
## Essentially the same as the open-source baseline.
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
enc_list = group_list + ['net_type', 'task_id', 'adv_id', 'adv_prim_id', 'age',
                         'app_first_class', 'app_second_class', 'career', 'city', 'consume_purchase', 'uid', 'dev_id',
                         'tags', 'slot_id']

# The folds depend only on the labels and the fixed seed, so they are the
# same for every feature: compute them once instead of once per feature.
folds = list(skf.split(train_df, train_df['label']))
# Fallback encoding for values that do not occur in a fold's training part.
label_mean = train_df['label'].mean()

for f in tqdm(enc_list):
    enc_col = f + '_target_enc'
    # Initialise as float so the fold means can be written without a
    # dtype-changing assignment.
    train_df[enc_col] = 0.0
    test_df[enc_col] = 0.0
    for trn_idx, val_idx in folds:
        trn_x = train_df[[f, 'label']].iloc[trn_idx].reset_index(drop=True)
        val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
        # Mean label per value on the training part of the fold. A
        # {new_name: func} dict passed to SeriesGroupBy.agg is rejected by
        # pandas >= 1.0, so aggregate first and rename afterwards.
        enc_df = trn_x.groupby(f)['label'].mean().rename(enc_col).reset_index()
        val_x = val_x.merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        val_x[enc_col] = val_x[enc_col].fillna(label_mean)
        test_x[enc_col] = test_x[enc_col].fillna(label_mean)
        # Out-of-fold value for the validation rows; the test rows receive
        # the average of the encodings from all folds.
        train_df.loc[val_idx, enc_col] = val_x[enc_col].values
        test_df[enc_col] += test_x[enc_col].values / skf.n_splits

del trn_x, val_x, enc_df, test_x, folds
gc.collect()
# all features: stack train and test again and restore the original row order
df_fe = pd.concat([train_df, test_df])
del train_df, test_df
df_fe = reduce_mem(
    df_fe.sort_values('index').reset_index(drop=True), use_float16=False)

# Drop every feature that has fewer than two distinct values, or fewer than
# two non-null values, on the test rows (pt_d >= 8).
protected = ['id', 'index', 'label', 'pt_d']
set_test = df_fe.query('pt_d>=8')
droplist = [
    var for var in df_fe.columns
    if var not in protected
    and (set_test[var].nunique() < 2 or set_test[var].count() < 2)
]
print('drop list:', droplist)
df_fe = df_fe.drop(columns=droplist)

  0%|          | 0/7 [00:00<?, ?it/s]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 14%|█▍        | 1/7 [02:28<14:52, 148.71s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 29%|██▊       | 2/7 [04:58<12:25, 149.13s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 43%|████▎     | 3/7 [08:34<11:16, 169.09s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 57%|█████▋    | 4/7 [13:59<10:47, 215.91s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 71%|███████▏  | 5/7 [21:47<09:42, 291.38s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


 86%|████████▌ | 6/7 [32:17<06:33, 393.09s/it]

**************************task_id**************************
**************************adv_id**************************
**************************dev_id**************************
**************************spread_app_id**************************
**************************indu_name**************************


100%|██████████| 7/7 [45:52<00:00, 393.20s/it]


9141.80 Mb, 5719.39 Mb (37.44 %)


  0%|          | 0/84 [00:00<?, ?it/s]

['uidtask_id_nunique', 'uidtask_id_pt_d_nunique', 'uidadv_id_nunique', 'uidadv_id_pt_d_nunique', 'uiddev_id_nunique', 'uiddev_id_pt_d_nunique', 'uidspread_app_id_nunique', 'uidspread_app_id_pt_d_nunique', 'uidindu_name_nunique', 'uidindu_name_pt_d_nunique', 'agetask_id_nunique', 'agetask_id_pt_d_nunique', 'ageadv_id_nunique', 'ageadv_id_pt_d_nunique', 'agedev_id_nunique', 'agedev_id_pt_d_nunique', 'agespread_app_id_nunique', 'agespread_app_id_pt_d_nunique', 'ageindu_name_nunique', 'ageindu_name_pt_d_nunique', 'gendertask_id_nunique', 'gendertask_id_pt_d_nunique', 'genderadv_id_nunique', 'genderadv_id_pt_d_nunique', 'genderdev_id_nunique', 'genderdev_id_pt_d_nunique', 'genderspread_app_id_nunique', 'genderspread_app_id_pt_d_nunique', 'genderindu_name_nunique', 'genderindu_name_pt_d_nunique', 'careertask_id_nunique', 'careertask_id_pt_d_nunique', 'careeradv_id_nunique', 'careeradv_id_pt_d_nunique', 'careerdev_id_nunique', 'careerdev_id_pt_d_nunique', 'careerspread_app_id_nunique', 'caree

100%|██████████| 84/84 [14:55<00:00, 10.66s/it]


11918.47 Mb, 8782.03 Mb (26.32 %)
drop list: ['communication_onlinerate_24', 'communication_onlinerate_24_count', 'communication_onlinerate_24_pt_d_count']


## data merge

In [5]:
# The helper 'index' column was only needed to restore row order; remove it
# before persisting the feature frame through the project's Cache helper.
df_fe = df_fe.drop('index', axis=1)
Cache.cache_data(df_fe, nm_marker='sampling_pro_feature')

[2020-09-26 18:23:39] - __init__.py[line:111] - INFO: Cache Successfully! File name: /home/zhangqibot/proj/digix/zlh/stage2/cached_data/CACHE_sampling_pro_feature.pkl
