In [1]:
# uid 当天特征
import pandas as pd
import numpy as np
import gc
from base import Cache
from tqdm import tqdm

import warnings

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")
# Widen pandas' display limits so wide / long frames are printed in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully-qualified key: the bare 'precision' pattern is ambiguous on
# pandas versions that also define 'styler.format.precision', and
# set_option raises OptionError when a pattern matches several keys.
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.width', 5000)

from multiprocessing import Pool
from tqdm import tqdm


def reduce_mem(df, use_float16=False):
    """Downcast the numeric columns of ``df`` to the smallest fitting dtype.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float32 when their range fits (or float16 when
    ``use_float16`` is true and the range fits), otherwise to float64.
    Columns that are not plain NumPy integer/float columns (object,
    datetime, bool, categorical and other extension dtypes) are left as is.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        use_float16: Allow float16 for float columns whose range fits.

    Returns:
        The same ``df`` object, after downcasting.

    Side effects:
        Prints the memory usage before and after (in Mb) and the percentage
        saved.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtypes
        # Restrict to NumPy int/uint/float dtypes. This skips min()/max() on
        # columns that would never be cast anyway and avoids the TypeError
        # that min() raises on an unordered categorical column.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            # Inclusive bounds: a value sitting exactly on a dtype's limit
            # (e.g. 127 for int8) still fits that dtype. If min/max are NaN
            # (empty column) no comparison holds and the column is untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, saved_pct))
    return df


# Load the working frame through the project's Cache helper.
data = Cache.reload_cache('CACHE_data_0912.pkl')

# The same-day count rank has already been encoded elsewhere; this step
# measures, per user and day, the share of exposures on each category value.
cate_fe = [
    'task_id', 'creat_type_cd', 'adv_id', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'indu_name', 'slot_id', 'net_type',
]
add_cols = []

# Number of rows per (uid, pt_d): the user's exposure count on that day.
count_fe = (
    data.groupby(['uid', 'pt_d'])['index']
    .count()
    .reset_index(name='uid_pt_d_total_counts')
)
# Largest per-user exposure count observed on each day.
count_fe_max = (
    count_fe.groupby(['pt_d'])['uid_pt_d_total_counts']
    .max()
    .reset_index(name='uid_pt_d_total_counts_max')
)

# Attach both aggregates back onto every row.
data = pd.merge(data, count_fe, on=['uid', 'pt_d'], how='left')
data = pd.merge(data, count_fe_max, on=['pt_d'], how='left')
add_cols.append('uid_pt_d_total_counts')

[2020-09-13 17:23:10] - __init__.py[line:126] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_data_0912.pkl


In [None]:
def _add_uid_day_share_features(data, keys, tag, map_rate, log_shape=False):
    """Add same-day share features for one key (or key combination).

    For every (uid, pt_d, *keys) group the rows are counted, and three
    columns are attached to ``data``:

    * ``uid_pt_d_{tag}_sm_curr_rate``: (count + 2) / (user's day total + 3)
    * ``uid_pt_d_{tag}_counts``: (count + 1) / (user's day max count + 1)
    * ``uid_pt_d_{tag}_rank_sm_curr_rate``: the column above / ``map_rate``

    Args:
        data: Frame holding 'uid', 'pt_d', 'index', 'uid_pt_d_total_counts'
            and the columns named in ``keys``.
        keys: Column names that, together with uid and pt_d, define a group.
        tag: Name fragment used in the generated column names.
        map_rate: Per-row array aligned positionally with ``data``.
        log_shape: Print the frame shape right after the merge.

    Returns:
        ``(data, new_cols)``: the merged frame and the three new column
        names in the order sm_curr_rate, rank_sm_curr_rate, counts.
    """
    group_keys = ['uid', 'pt_d'] + list(keys)
    cnt_col = f'uid_pt_d_{tag}_counts'
    max_col = f'uid_pt_d_{tag}_counts_max'
    sm_col = f'uid_pt_d_{tag}_sm_curr_rate'
    rank_col = f'uid_pt_d_{tag}_rank_sm_curr_rate'

    # Same-day row count of this key value for the user.
    fe = data.groupby(group_keys)['index'].count().rename(cnt_col).reset_index()
    # Largest such count for the user on that day.
    fe_max = fe.groupby(['uid', 'pt_d'])[cnt_col].max().rename(max_col).reset_index()
    fe = fe.merge(fe_max, on=['uid', 'pt_d'], how='left')

    data = data.merge(fe, on=group_keys, how='left')
    if log_shape:
        print(data.shape)

    # Smoothed share of the user's same-day exposures.
    data[sm_col] = (data[cnt_col].values + 2.0) / (data['uid_pt_d_total_counts'].values + 3.0)
    # Smoothed count relative to the user's same-day maximum; this overwrites
    # the raw count column.
    data[cnt_col] = (data[cnt_col].values + 1.0) / (data[max_col].values + 1.0)
    data[rank_col] = data[cnt_col].values / map_rate
    del data[max_col]
    return data, [sm_col, rank_col, cnt_col]


# Ratio of each row's user/day total to that day's maximum. It is computed
# once and reused positionally, which relies on the left merges below keeping
# the row order and row count of `data` unchanged.
map_rate = data['uid_pt_d_total_counts'].values / data['uid_pt_d_total_counts_max'].values
print(map_rate.shape)
for var in tqdm(cate_fe):
    data, new_cols = _add_uid_day_share_features(
        data, [var], var, map_rate, log_shape=True)
    gc.collect()
    add_cols.extend(new_cols)
    print(data.shape)

# Same features for the slot_id x net_type combination.
data, new_cols = _add_uid_day_share_features(
    data, ['slot_id', 'net_type'], 'slot_id_net_type', map_rate)
add_cols.extend(new_cols)

# Finally turn the total count itself into a value relative to the day's max.
data['uid_pt_d_total_counts'] = data['uid_pt_d_total_counts'] / data['uid_pt_d_total_counts_max']
del data['uid_pt_d_total_counts_max']
# Keep only the row id plus the features built here. The original assigned
# this selection to a misspelt name ('dara'), so the subset was discarded and
# the whole frame was cached. The copy detaches the subset from the parent
# frame before reduce_mem reassigns its columns.
data = data[['index'] + add_cols].copy()
gc.collect()
data = reduce_mem(data, use_float16=False)
Cache.cache_data(data, nm_marker='data_step_5_0913')  # includes 'index'