In [77]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", module="lightgbm")

import joblib
import re
import sys
import glob2
import config

In [82]:
def preprocess(file):
    """Load one campaign CSV and tag every row with the bank parsed from its file name.

    The file's base name is split into hyphen-delimited fields by a regex and
    the third field is used as the bank (e.g. ``20220326-<product>-<bank>-....csv``).

    Args:
        file: Path of the CSV file to load.

    Returns:
        DataFrame holding the CSV contents plus a constant ``bank`` column,
        restricted to rows whose ``mob`` value is not null.

    Raises:
        IndexError: If fewer than three fields can be parsed from the file name.
    """
    import os  # function-scope import so the block does not rely on a file-level one

    data = pd.read_csv(file, low_memory=False)
    pat = re.compile(r"\d+(?=-)|\d+\.\d+(?=-)|(?<=-)\w+")
    # Only the base name is parsed: hyphens or digits in a parent directory
    # would otherwise shift the field positions and yield the wrong bank.
    match = pat.findall(os.path.basename(file))
    bank = match[2]
    data = data.assign(bank=bank)
    data = data[pd.notnull(data['mob'])]
    return data


def reduce_memory_usage(matrix):
    """Shrink the memory footprint of *matrix* by casting columns to narrower dtypes.

    The frame is modified in place and the same object is returned.
    Numeric columns are cast to small integer / float32 types, three columns
    become ``category`` and four flag columns become nullable ``boolean``.
    """
    # Numeric columns and their target dtypes, converted in this order.
    numeric_targets = {
        'age': "int8",
        'mob': "int16",
        '核卡率': "float32",
        '被拒率': "float32",
        '距离上一次申请信用卡天数': "Int16",  # nullable integer: missing values are kept
        '下单用户': "int8",
    }
    for column, target in numeric_targets.items():
        matrix[column] = matrix[column].astype(target)

    # Missing push counts are treated as zero before the cast.
    push_counts = matrix['近一月推送次数'].fillna(0)
    matrix['近一月推送次数'] = push_counts.astype("int16")

    # Categorical columns.
    for column in ('最近一次申请产品', 'bank', 'gender'):
        matrix[column] = matrix[column].astype('category')

    # Flag columns, stored as nullable booleans.
    flag_columns = (
        '是否曾经申请过相同银行',
        '是否有该银行的核卡',
        '是否曾被该银行拒绝过',
        '是否和上一次申请的银行一致',
    )
    for column in flag_columns:
        matrix[column] = matrix[column].astype("boolean")

    return matrix


def feature_engineering(matrix):
    """Build the model feature matrix from a preprocessed, type-reduced customer frame.

    Steps, in order: fill missing application counts, derive per-month
    averages, drop out-of-range customers, integer-encode the categorical and
    flag columns, join the auxiliary tables, and select the output columns in
    a fixed order.

    Reads the module-level paths ``BANK_ENCODER_PATH``, ``CUSTOMER_SOURCE_PATH``,
    ``KMEANS_PATH``, ``POS_MERCHANT_PATH`` and ``QUESTIONNAIRE_PATH``.

    Args:
        matrix: Customer frame. The count fill and the derived columns of the
            first two steps are written onto this object before a new frame
            is created, so the caller's frame is modified.

    Returns:
        A new DataFrame containing ``customer_key``, ``下单用户`` and the
        feature columns in the order selected at the end of this function.
    """
    # Missing application counts are treated as zero.
    count_columns = ['信用卡申请数', '信贷申请数', '线上申请信用卡次数', '线上申请信贷次数']
    matrix[count_columns] = matrix[count_columns].fillna(0)

    # Derived features: applications per month on book.
    matrix['mob_month'] = ((matrix['mob'] / 30) + 1).astype("int")
    matrix['apply_credit_card_average_month'] = (matrix['信用卡申请数'] / matrix['mob_month']).astype(np.double)
    matrix['apply_loan_average_month'] = (matrix['信贷申请数'] / matrix['mob_month'])
    matrix['apply_credit_card_online_average_month'] = (matrix['线上申请信用卡次数'] / matrix['mob_month'])
    matrix['apply_loan_online_average_month'] = (matrix['线上申请信贷次数'] / matrix['mob_month'])
    matrix['new_customer'] = (matrix['mob_month'] == 1).astype("int8")
    matrix = matrix.drop(columns='mob_month')

    # Drop abnormal customers. One combined mask plus an explicit copy, so the
    # column assignments below write to an independent frame rather than a slice.
    keep = (matrix['信用卡申请数'] < 20) & (matrix['age'] > 18) & (matrix['age'] < 65)
    matrix = matrix[keep].copy()

    # Gender: male -> 0, female -> 1, missing -> 2.
    # The column is cast to object first because filling a categorical column
    # with a value that is not one of its categories raises.
    gender_map = {'male': 0, 'female': 1, 'NONE': 2}
    gender = matrix['gender'].astype(object).fillna("NONE")
    matrix['gender'] = gender.map(gender_map).astype('int8')

    # Days since the last credit-card application: missing -> -1.
    matrix['距离上一次申请信用卡天数'] = matrix['距离上一次申请信用卡天数'].fillna(-1).astype("int16")
    # Most recently applied product: missing -> 0.
    matrix['最近一次申请产品'] = matrix['最近一次申请产品'].astype("Int8").fillna(0).astype("int8")
    # Applied to the same bank before: missing -> 0. Goes through nullable Int8
    # like the other flag columns; NOTE(review): recent pandas appears to reject
    # an integer fill value on a nullable-boolean column - confirm.
    matrix['是否曾经申请过相同银行'] = matrix['是否曾经申请过相同银行'].astype("Int8").fillna(0).astype("int8")

    # For banks in the `feituo` list, or when the value is missing, these two
    # flags are set to 2.
    feituo = ['招商银行', '交通银行', '浦发银行']
    for column in ('是否有该银行的核卡', '是否曾被该银行拒绝过'):
        matrix[column] = matrix[column].astype("Int8")
        matrix.loc[matrix['bank'].isin(feituo), column] = np.nan
        matrix[column] = matrix[column].fillna(2).astype("int8")

    # Same bank as the previous application: missing -> 2.
    matrix['是否和上一次申请的银行一致'] = matrix['是否和上一次申请的银行一致'].astype("Int8").fillna(2).astype("int8")

    # Encode the bank; every bank outside `major_banks` is encoded as "rare".
    bank_encoder = joblib.load(BANK_ENCODER_PATH)
    major_banks = ['招商银行', '中信银行', '交通银行', '光大银行', '工商银行', '平安银行', '渤海银行', '广发银行',
                   '民生银行', '华夏银行', '浦发银行', '建设银行']
    matrix['bank_processed'] = matrix['bank'].copy().astype('str')
    matrix.loc[~matrix['bank'].isin(major_banks), 'bank_processed'] = "rare"
    matrix['bank_processed'] = bank_encoder.transform(matrix['bank_processed']).astype("int8")
    matrix = matrix.drop(columns=['bank'])

    # Customer source table (presumably the origin of `line_key`); missing -> 4.
    customer_source = pd.read_csv(CUSTOMER_SOURCE_PATH)
    matrix = pd.merge(matrix, customer_source, on='customer_key', how='left')
    matrix['line_key'] = matrix['line_key'].fillna(4)
    matrix['line_key'] = matrix['line_key'].astype("int8")

    # Cluster label predicted by the stored k-means model.
    kmeans = joblib.load(KMEANS_PATH)
    cluster_features = ['信用卡申请数', '信贷申请数', '线上申请信用卡次数', '线上申请信贷次数',
                        'apply_credit_card_average_month', 'apply_loan_average_month',
                        'apply_credit_card_online_average_month',
                        'apply_loan_online_average_month']
    matrix['cluster_n_3'] = kmeans.predict(matrix[cluster_features].astype("float32"))

    # 1 when the customer appears in the POS-merchant file, else 0.
    pos_merchant = pd.read_csv(POS_MERCHANT_PATH)
    matrix['pos_merchant'] = matrix['customer_key'].isin(pos_merchant['customer_key']).astype("int8")

    # Questionnaire answers (presumably the origin of the remaining columns).
    # Loan intention: missing -> 3.
    qst = pd.read_csv(QUESTIONNAIRE_PATH)
    matrix = pd.merge(matrix, qst, on='customer_key', how='left')
    loan_intention_map = {'没有': 0, '有': 1, '不确定，但我想了解': 2}
    matrix['是否需要贷款'] = matrix['是否需要贷款'].map(loan_intention_map)
    matrix['是否需要贷款'] = matrix['是否需要贷款'].fillna(3)

    # Education: merge equivalent levels, then encode; missing -> 0.
    degree_unification = {'高中/中专/技校': '高中及以下', '小学及以下': '高中及以下', '初中': '高中及以下', "硕士（含）及以上": "研究生及以上"}
    matrix['学历'] = matrix['学历'].replace(degree_unification)
    degree_map = {'高中及以下': 1, '大学专科': 2, '大学本科': 3, '研究生及以上': 4}
    matrix['学历'] = matrix['学历'].map(degree_map)
    matrix['学历'] = matrix['学历'].fillna(0)

    # "Owns a credit card" and "number of credit cards" fill each other's gaps.
    matrix.loc[(matrix['是否拥有信用卡'] == "否") &
               (matrix['有多少张信用卡'].isna()), "有多少张信用卡"] = "无信用卡"
    matrix.loc[(matrix['是否拥有信用卡'].isna()) &
               (matrix['有多少张信用卡'] == "无信用卡"), '是否拥有信用卡'] = "否"
    matrix.loc[(matrix['是否拥有信用卡'].isna()) &
               (matrix['有多少张信用卡'] != "无信用卡") &
               (pd.notnull(matrix['有多少张信用卡'])), '是否拥有信用卡'] = '是'
    matrix.loc[(matrix['是否拥有信用卡'] == "是") &
               (matrix['有多少张信用卡'].isna()), "有多少张信用卡"] = "至少一张信用卡"

    # Owns a credit card: missing -> 2.
    own_credit_card_map = {'否': 0, '是': 1}
    matrix['是否拥有信用卡'] = matrix['是否拥有信用卡'].map(own_credit_card_map)
    matrix['是否拥有信用卡'] = matrix['是否拥有信用卡'].fillna(2)

    # Number of credit cards: missing -> 6.
    credit_card_num_map = {'无信用卡': 0, '至少一张信用卡': 1, '1张': 2, '2张': 3, '3张': 4, '4张及以上': 5}
    matrix['有多少张信用卡'] = matrix['有多少张信用卡'].map(credit_card_num_map)
    matrix['有多少张信用卡'] = matrix['有多少张信用卡'].fillna(6)

    # Occupation / industry: missing -> 9.
    occupation_map = {'制造业/商业贸易/批发零售': 1, '旅游/酒店/餐饮等服务行业': 2, '自由职业': 3,
                      '金融/互联网/大众传媒': 4, '交通/运输/建筑': 5, '教育/科研/医疗卫生': 6, '政府机关/公共事业': 7,
                      '农林牧矿鱼': 8}
    matrix['职业'] = matrix['职业'].map(occupation_map)
    matrix['职业'] = matrix['职业'].fillna(9)

    # LightGBM picks features by column position rather than by name, so the
    # columns are reordered to match the order used for the training set.
    matrix = matrix[['customer_key', '下单用户', 'age', 'gender', 'mob', '信用卡申请数', '信贷申请数',
                     '核卡数', '被拒数', '核卡率', '被拒率', '最近一次申请产品',
                     '线上申请信用卡次数', '线上申请信贷次数', '距离上一次申请信用卡天数',
                     '是否曾经申请过相同银行', '是否有该银行的核卡', '是否曾被该银行拒绝过',
                     '是否和上一次申请的银行一致', '该银行申请次数','近一月推送次数', 'apply_credit_card_average_month',
                     'apply_loan_average_month', 'apply_credit_card_online_average_month',
                     'apply_loan_online_average_month', 'new_customer', 'bank_processed',
                     'line_key', 'cluster_n_3', 'pos_merchant', '是否需要贷款', '学历', '是否拥有信用卡',
                     '有多少张信用卡', '职业']]

    return matrix


def predict_apply_score(file_path, mode="pred"):
    """Score every customer in *file_path* with the stored LightGBM models.

    Args:
        file_path: CSV file handed to ``preprocess``.
        mode: With the default ``"pred"`` the label column ``下单用户`` is set
            to -1 for every row before feature engineering; with any other
            value the column read from the file is used as is.

    Returns:
        Long-format DataFrame with columns ``customer_key``, ``下单用户``,
        ``model`` (``model_0`` / ``model_1``) and ``score`` - one row per
        customer and model.
    """
    raw = preprocess(file_path)
    if mode == "pred":
        # Placeholder label for the prediction run.
        raw['下单用户'] = -1
    matrix = feature_engineering(reduce_memory_usage(raw))

    id_columns = ['customer_key', '下单用户']
    predictions = matrix[id_columns].copy()
    # The models are fed every column except the identifier and the label.
    features = matrix.drop(columns=id_columns, errors='ignore')

    score_columns = []
    for num_model in range(0, 2):
        lgbm = joblib.load(f'{LGBM_PATH}/gbm{num_model:1d}.pkl')
        score_column = f'model_{num_model}'
        predictions[score_column] = lgbm.predict(features)
        score_columns.append(score_column)

    return predictions.melt(id_vars=id_columns, value_vars=score_columns,
                            var_name='model', value_name='score')

In [83]:
# Locations of the fitted helper models, the auxiliary lookup tables and the
# LightGBM model directory, all taken from the project `config` module.
KMEANS_PATH = f"{config.MODEL_UTILS}/kmeans.pkl"
BANK_ENCODER_PATH = f"{config.MODEL_UTILS}/bank_encoder.pkl"
POS_MERCHANT_PATH = f"{config.AUXILIARY_DIR}/pos_merchant.csv"
QUESTIONNAIRE_PATH = f"{config.AUXILIARY_DIR}/questionnaire.csv"
CUSTOMER_SOURCE_PATH = f"{config.AUXILIARY_DIR}/customer_source.csv"
LGBM_PATH = config.MODEL_GBM

# For every file in the test directory, print precision and recall of the
# customers ranked in the top 10%, 20%, ..., 100% by score.
percentile = list(range(0, 101, 10))
for file in glob2.glob(f"{config.TEST_DIR}/*"):
    print(file.split("/")[-1])
    predictions = predict_apply_score(file, 'valid')
    # Rank within each model, best score first; pct=True yields a fraction in
    # (0, 1], which is scaled to a percentage.
    predictions['percentile'] = predictions.groupby('model')['score'].rank(ascending=False, pct=True) * 100

    # `predictions` holds one row per customer and model, so the label sum is
    # divided by the number of models to count each positive customer once.
    n_models = predictions['model'].nunique()
    total_positives = predictions['下单用户'].sum() / n_models if n_models else 0

    for p in percentile:
        if p == 0:
            continue
        # `<=` so that the row sitting exactly on the cut-off belongs to the
        # top p% (with `<`, "top 100%" always dropped the worst-ranked row).
        # A customer selected by either model is counted once.
        over_threshold = predictions[predictions['percentile'] <= p].drop_duplicates(['customer_key'])
        hits = over_threshold['下单用户'].sum()
        selected = over_threshold.shape[0]
        # Report NaN instead of dividing by zero when nothing is selected or
        # the file contains no positive label.
        precision = hits / selected if selected else float('nan')
        recall = hits / total_positives if total_positives else float('nan')
        print(f'top {p}%, precision {precision:.4f}, recall {recall:.4f}')

20220326-魔方E卡通-中信银行-0325魔方E卡通光大已核卡、被拒.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.1050, recall 0.5234
top 20%, precision 0.0693, recall 0.6495
top 30%, precision 0.0542, recall 0.7617
top 40%, precision 0.0412, recall 0.8178
top 50%, precision 0.0340, recall 0.8551
top 60%, precision 0.0299, recall 0.8925
top 70%, precision 0.0280, recall 0.9439
top 80%, precision 0.0267, recall 0.9813
top 90%, precision 0.0250, recall 0.9907
top 100%, precision 0.0231, recall 1.0000
20220326-魔方用卡小助手-中信银行-0325魔方用卡小助手近2月申请光大（剔除已核卡、被拒）.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.0949, recall 0.3822
top 20%, precision 0.0696, recall 0.5405
top 30%, precision 0.0586, recall 0.6718
top 40%, precision 0.0520, recall 0.7954
top 50%, precision 0.0462, recall 0.8726
top 60%, precision 0.0411, recall 0.9228
top 70%, precision 0.0365, recall 0.9459
top 80%, precision 0.0331, recall 0.9653
top 90%, precision 0.0307, recall 0.9923
top 100%, precision 0.0282, recall 1.0000
20220320-魔方办卡-交通银行-0320-魔方办卡-超2月申请【交通】.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.0637, recall 0.3643
top 20%, precision 0.0441, recall 0.4929
top 30%, precision 0.0352, recall 0.5786
top 40%, precision 0.0323, recall 0.7000
top 50%, precision 0.0272, recall 0.7357
top 60%, precision 0.0253, recall 0.8143
top 70%, precision 0.0239, recall 0.8857
top 80%, precision 0.0222, recall 0.9357
top 90%, precision 0.0214, recall 0.9929
top 100%, precision 0.0198, recall 1.0000
20220326-魔方E卡通-中信银行-0325魔方E卡通近2月申请光大（剔除已核卡、被拒）.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.2487, recall 0.4188
top 20%, precision 0.1772, recall 0.5983
top 30%, precision 0.1544, recall 0.7863
top 40%, precision 0.1275, recall 0.8632
top 50%, precision 0.1113, recall 0.9145
top 60%, precision 0.0980, recall 0.9658
top 70%, precision 0.0877, recall 0.9829
top 80%, precision 0.0798, recall 1.0000
top 90%, precision 0.0721, recall 1.0000
top 100%, precision 0.0656, recall 1.0000
20220320-魔方E卡通-中信银行-0320-魔方E卡-近2月申请【中信】.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.3167, recall 0.3564
top 20%, precision 0.2474, recall 0.5578
top 30%, precision 0.2014, recall 0.6865
top 40%, precision 0.1733, recall 0.7723
top 50%, precision 0.1531, recall 0.8383
top 60%, precision 0.1366, recall 0.8878
top 70%, precision 0.1248, recall 0.9307
top 80%, precision 0.1154, recall 0.9703
top 90%, precision 0.1070, recall 0.9934
top 100%, precision 0.0983, recall 1.0000
20220313-魔方办卡-广发银行-魔方办卡-平安已核【广发】.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.0816, recall 0.4909
top 20%, precision 0.0594, recall 0.6909
top 30%, precision 0.0467, recall 0.8364
top 40%, precision 0.0385, recall 0.9091
top 50%, precision 0.0314, recall 0.9273
top 60%, precision 0.0256, recall 0.9273
top 70%, precision 0.0227, recall 0.9455
top 80%, precision 0.0211, recall 0.9818
top 90%, precision 0.0197, recall 1.0000
top 100%, precision 0.0184, recall 1.0000
20220314-魔方办卡VIP-中信银行-魔方办卡VIP -近2月申请【中信】.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.3271, recall 0.3672
top 20%, precision 0.2942, recall 0.5313
top 30%, precision 0.2695, recall 0.6806
top 40%, precision 0.2289, recall 0.7791
top 50%, precision 0.2011, recall 0.8448
top 60%, precision 0.1759, recall 0.8896
top 70%, precision 0.1593, recall 0.9313
top 80%, precision 0.1471, recall 0.9672
top 90%, precision 0.1370, recall 0.9970
top 100%, precision 0.1258, recall 1.0000
20220309-魔方办卡VIP-招商银行-0309-魔方办卡VIP-超2月申请招商.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.0875, recall 0.3000
top 20%, precision 0.0833, recall 0.5714
top 30%, precision 0.0654, recall 0.6571
top 40%, precision 0.0543, recall 0.7000
top 50%, precision 0.0474, recall 0.7571
top 60%, precision 0.0431, recall 0.8000
top 70%, precision 0.0436, recall 0.9286
top 80%, precision 0.0399, recall 0.9571
top 90%, precision 0.0365, recall 0.9714
top 100%, precision 0.0345, recall 1.0000
20220310-魔方办卡-平安银行-0310-魔方办卡-近2月申请光大.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.0920, recall 0.2632
top 20%, precision 0.0737, recall 0.4035
top 30%, precision 0.0656, recall 0.5263
top 40%, precision 0.0689, recall 0.7368
top 50%, precision 0.0617, recall 0.8070
top 60%, precision 0.0544, recall 0.8421
top 70%, precision 0.0490, recall 0.8772
top 80%, precision 0.0466, recall 0.9298
top 90%, precision 0.0451, recall 0.9825
top 100%, precision 0.0416, recall 1.0000
20220309-魔方办卡VIP-招商银行-0309-魔方办卡VIP-未申请招商.csv


  qst = pd.read_csv(QUESTIONNAIRE_PATH)


top 10%, precision 0.2313, recall 0.4911
top 20%, precision 0.1610, recall 0.6696
top 30%, precision 0.1279, recall 0.7661
top 40%, precision 0.1080, recall 0.8179
top 50%, precision 0.0902, recall 0.8607
top 60%, precision 0.0791, recall 0.9000
top 70%, precision 0.0709, recall 0.9268
top 80%, precision 0.0658, recall 0.9696
top 90%, precision 0.0594, recall 0.9911
top 100%, precision 0.0551, recall 1.0000
