In [2]:
# 类别特征和连续性特征 一阶交叉

def cross_cat_num(df, num_col, cat_col):
    """Add per-category max/min/median statistics of numeric columns.

    For every categorical column ``f1`` in ``cat_col`` and every numeric
    column ``f2`` in ``num_col``, the rows are grouped by ``f1`` and the
    max, min and median of ``f2`` are left-merged back onto the frame as
    ``'{f1}_{f2}_max'``, ``'{f1}_{f2}_min'`` and ``'{f1}_{f2}_median'``.

    Args:
        df: Input DataFrame containing every column named in ``num_col``
            and ``cat_col``.
        num_col: Iterable of numeric column names to aggregate.
        cat_col: Iterable of categorical column names to group by.

    Returns:
        The DataFrame produced by the successive left merges, with three
        new columns per (categorical, numeric) pair.
    """
    for f1 in tqdm(cat_col):
        # Built once per categorical column; the merges below only append
        # columns, so the grouping stays valid for every numeric column.
        grouped = df.groupby(f1)
        for f2 in tqdm(num_col):
            # Named aggregation (keyword -> function). Passing a renaming
            # dict to SeriesGroupBy.agg was removed in pandas 1.0 and raises
            # SpecificationError there.
            feat = grouped[f2].agg(**{
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            }).reset_index()  # bring the group key back as a column for the merge
            df = df.merge(feat, on=f1, how='left')
    return df

# First-order crosses: per-category statistics of every numeric column.
data = cross_cat_num(df=data, num_col=num_col, cat_col=cat_col)
print('一阶交叉特征处理后：', data.shape)

In [None]:
# count 编码
def count_coding(df, col_cat):
    """Count-encode categorical columns in place.

    For each column ``f`` in ``col_cat`` a new column ``'{f}_count'`` is
    written onto ``df`` holding how often that row's value occurs in ``f``.

    Args:
        df: DataFrame to extend; it is modified in place.
        col_cat: Iterable of column names to count-encode.

    Returns:
        The same ``df`` object, with the added count columns.
    """
    for feature in col_cat:
        occurrences = df[feature].value_counts()
        df['{}_count'.format(feature)] = df[feature].map(occurrences)
    return df

# NOTE(review): this cell reads `col_cat`, while the first-order cross cell
# uses `cat_col`; confirm both names are defined earlier in the notebook.
data = count_coding(df=data, col_cat=col_cat)


# Second-order crosses between categorical features: the column pairs to combine.
f_pairs = [
    ['是否双频', '信用等级代码'],
    ['婚姻状况', '预计收入'],
    ['婚姻状况', '信用等级代码'],
    ['家庭成人人数', '家庭中唯一订阅者的数量'],
    ['信息库匹配', '账户消费限额'],
    ['信用等级代码', '信息库匹配'],
    ['地理区域', '家庭活跃用户数'],
    ['家庭活跃用户数', '信用等级代码'],
    ['是否翻新机', '家庭中唯一订阅者的数量'],
]

def cross_qua_cat_num(df, pairs=None):
    """Add second-order cross features for pairs of categorical columns.

    For every pair ``(a, b)`` the following columns are added:

    * ``'{a}_{b}_count'``: number of rows sharing the same ``(a, b)`` values
      (counted over the non-null ``'客户ID'`` entries of each group).
    * ``'{a}_{b}_nunique'`` / ``'{a}_{b}_ent'``: number of distinct ``b``
      values within each ``a`` group, and the entropy of their distribution.
    * ``'{b}_{a}_nunique'`` / ``'{b}_{a}_ent'``: the same with roles swapped.
    * ``'{a}_in_{b}_prop'`` / ``'{b}_in_{a}_prop'``: the pair count divided
      by ``'{b}_count'`` and ``'{a}_count'`` respectively.

    Args:
        df: DataFrame containing a ``'客户ID'`` column, every column named in
            the pairs, and a ``'{col}_count'`` column for each of them.
        pairs: Optional iterable of two-element column-name sequences.
            Defaults to the module-level ``f_pairs``.

    Returns:
        The DataFrame produced by the successive merges. The return value
        must be used: only the first pair's count column is written onto the
        frame passed in, before the first merge rebinds ``df`` to a new frame.
    """
    if pairs is None:
        pairs = f_pairs

    def _value_entropy(values):
        # Entropy of the empirical distribution of the values in one group.
        return entropy(values.value_counts() / values.shape[0])

    for f_pair in tqdm(pairs):
        first, second = f_pair[0], f_pair[1]
        pair_count = '_'.join(f_pair) + '_count'

        # Co-occurrence count of the pair.
        df[pair_count] = df.groupby(list(f_pair))['客户ID'].transform('count')

        # nunique and entropy, once per direction (first -> second, then
        # second -> first). Named aggregation replaces the renaming-dict form
        # of SeriesGroupBy.agg, which pandas removed in 1.0.
        for key, other in ((first, second), (second, first)):
            stats = df.groupby(key)[other].agg(**{
                '{}_{}_nunique'.format(key, other): 'nunique',
                '{}_{}_ent'.format(key, other): _value_entropy,
            }).reset_index()
            df = df.merge(stats, on=key, how='left')

        # Proportion preference: share of the pair within each single column.
        df['{}_in_{}_prop'.format(first, second)] = df[pair_count] / df[second + '_count']
        df['{}_in_{}_prop'.format(second, first)] = df[pair_count] / df[first + '_count']
    return df


# Keep the returned frame: cross_qua_cat_num rebinds its local frame on the
# first merge, so without the assignment `data` would only gain the first
# pair's count column and every other feature would be lost.
data = cross_qua_cat_num(data)
data.head()  # preview a few rows instead of dumping the whole frame

In [None]:
# 类别特征 -- 单个、组合count

# count_combinations = [
#         ['app'],
#         ['ip'],  # 3.03
#         ['ip', 'device'],  # 9.88
#         ['day', 'hour', 'app'],  # 4.08
#         ['app', 'channel'],  # 2.8
#         ['ip', 'day', 'in_test_hh'],  # 1.74
#         ['ip', 'day', 'hour'],  # 0.52
#         ['os', 'device'],  # 0.44
#         ['ip', 'os', 'day', 'hour'],  # 0.41
#     ]


def count_agg(df, group_cols):
    """Append a group-size column for each combination of key columns.

    Args:
        df: Input DataFrame containing every referenced column.
        group_cols: Iterable of column-name lists, one list per combination.

    Returns:
        The DataFrame obtained after left-merging, for each combination, a
        ``'<names joined by _>_count'`` column holding the number of rows
        that share the row's key values.
    """
    for position, key_cols in enumerate(group_cols):
        new_col = "_".join(key_cols) + '_count'
        print(position, new_col)
        group_sizes = df.groupby(key_cols).size()
        size_table = group_sizes.reset_index(name=new_col)
        df = df.merge(size_table, on=key_cols, how='left')
        # Release the temporaries before the next combination.
        del group_sizes, size_table
        gc.collect()
    return df

In [None]:
# 类别特征--累计count

# accum_combinations = [
#     # ['app'],
#     ['ip']  # 3.03
#     # ['day', 'hour', 'app']
# ]

def count_cum(df, group_cols):
    """Add a running within-group row counter for each key combination.

    Args:
        df: DataFrame to extend; the new columns are written onto it in place.
        group_cols: Iterable of column-name lists, one list per combination.

    Returns:
        The same ``df`` object with one ``'<names joined by _>_countAccum'``
        column per combination, numbering the rows of each group from 0 in
        their current order.
    """
    for position, key_cols in enumerate(group_cols):
        accum_col = "_".join(key_cols) + '_countAccum'
        print(position, accum_col)
        running_index = df.groupby(key_cols).cumcount()
        df[accum_col] = running_index
        gc.collect()
    return df

In [None]:
# 类别特征 -- 组合特征 col_nunique

# countUniq_combinations = [
#         # [['app'],'ip'],
#         # [['app', 'device', 'os', 'channel'], 'ip'],
#         [['ip'], 'channel'],  # 0.9
#         [['ip'], 'app'],  # 1.3
#         [['ip'], 'os']  # 0.45
#     ]

def count_uniq(df, group_uniq_cols):
    """Append distinct-value counts of one column within groups of others.

    Args:
        df: Input DataFrame containing every referenced column.
        group_uniq_cols: Iterable of ``[key_columns, target_column]`` entries,
            where ``key_columns`` is a list of column names to group by and
            ``target_column`` is the column whose distinct values are counted.

    Returns:
        The DataFrame obtained after left-merging, for each entry, a
        ``'<keys joined by _>_uniq_<target>_countUniq'`` column.
    """
    for position, spec in enumerate(group_uniq_cols):
        key_cols = spec[0]
        target_col = spec[1]
        new_col = "_".join(key_cols) + '_uniq_' + target_col + '_countUniq'
        print(position, new_col)
        distinct_counts = df.groupby(key_cols)[target_col].nunique()
        distinct_table = distinct_counts.reset_index(name=new_col)
        df = df.merge(distinct_table, on=key_cols, how='left')
        # Release the temporaries before the next entry.
        del distinct_counts, distinct_table
        gc.collect()
    return df

In [None]:
# 时序特征 -- 不同类别特征组合下的shift时间差

# nextClick_combinations = [
#         ['ip', 'os'],
#         ['ip', 'device', 'os'],
#         ['ip', 'app', 'device', 'os'],
#         ['ip', 'app', 'device', 'os', 'channel']
#     ]

def next_click(df, group_cols, time_col='click_time'):
    """Add the gap to the next row's timestamp within each group.

    For every combination of key columns, the value of ``time_col`` on the
    following row of the same group (in the frame's current row order) minus
    the row's own value is stored as float32. The last row of each group has
    no successor and receives NaN.

    Args:
        df: DataFrame to extend; the new columns are written onto it in place.
        group_cols: Iterable of column-name lists, one list per combination.
        time_col: Name of the column holding the time values. Defaults to
            ``'click_time'``, the column the function originally hard-coded.

    Returns:
        The same ``df`` object with one ``'<names joined by _>_nextClick'``
        column per combination.
    """
    for i, cols in enumerate(group_cols):
        col_name = "_".join(cols) + '_nextClick'
        print(i, col_name)
        # Bracket access (rather than attribute access) works for any column name.
        following = df.groupby(cols)[time_col].shift(-1)
        df[col_name] = (following - df[time_col]).astype(np.float32)
        gc.collect()
    return df

In [None]:
# 类别特征 -- 类别特征分组 统计连续值

# combination_col_mean = [
#         [['ip','os'], 'values'],  # 0.9
#     ]

def frequence(df, group_cols):
    """Append the per-group mean of a value column for each entry.

    Args:
        df: Input DataFrame containing every referenced column.
        group_cols: Iterable of ``[key_columns, value_column]`` entries, where
            ``key_columns`` is a list of column names to group by and
            ``value_column`` is the column to average.

    Returns:
        The DataFrame obtained after left-merging, for each entry, a
        ``'<keys joined by _>_<value>_mean'`` column.
    """
    for position, spec in enumerate(group_cols):
        # Fresh local names so the `group_cols` parameter is not rebound mid-loop.
        key_cols = spec[0]
        value_col = spec[1]
        mean_col = "_".join(key_cols) + '_' + value_col + '_mean'
        print(position, mean_col)
        group_means = df.groupby(key_cols)[value_col].mean()
        mean_table = group_means.reset_index(name=mean_col)
        df = df.merge(mean_table, on=key_cols, how='left')
        # Release the temporaries before the next entry.
        del group_means, mean_table
        gc.collect()
    return df