In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
import numpy as np
import os
from utils import peek
import config

## 加载样本数据

In [2]:
# Read the card-approval status labels and the Bairong sample list.
label = pd.read_csv(
    "/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/样本核卡状态.csv",
    parse_dates=['apply_date_key'],
)
sample = pd.read_csv(
    "/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/百融样本.csv",
    parse_dates=['申请日期'],
)

# Attach the label to every sample row on (application date, phone), keep only
# the three label-side columns, then discard rows that found no label and any
# exact duplicate rows.
sample = (
    pd.merge(
        sample,
        label,
        how='left',
        left_on=['申请日期', '手机号'],
        right_on=['apply_date_key', 'customer_phone'],
    )
    .loc[:, ['customer_phone', 'apply_date_key', 'order_status_key']]
    .dropna(subset=['order_status_key'])
    .drop_duplicates()
)
peek(sample, 2)

(10006, 3)
                     customer_phone apply_date_key  order_status_key
0  cc92befb0e0b9f7ef4736f0fabc86821     2023-02-22                 3
1  057331edd64b9002ddfc084e2d09b441     2023-03-25                 2


## 加载百融多头特征

In [3]:
# Load the Bairong multi-lending ("借贷意向验证") features and join them onto `sample`.
br_duotou = pd.read_csv("/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/样本量10000/详细匹配数据及字典/借贷意向验证-V2.0.csv", header=1, parse_dates=['申请日期'])
customer = pd.read_csv("/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/百融样本.csv", parse_dates=['申请日期'])
# Phone numbers are attached by index alignment with the sample file.
# NOTE(review): this assumes both files list customers in the same row order - confirm.
br_duotou['customer_phone'] = customer['手机号']

# Columns used only to join back onto `sample`; they must never be imputed.
key_columns = ['customer_phone', '申请日期']

use_features = pd.read_csv("/sda/huweipeng/project/长安银行/data/third_party_data/feature_iv/百融多头iv.csv")
use_features = use_features['feature'].tolist() + ['customer_phone', '申请日期']

# Keep rows whose 产品输出标识 equals 1 and only the selected columns.
# The explicit .copy() makes the frame independent of the original, so the
# column assignments below do not raise SettingWithCopyWarning.
br_duotou = br_duotou.loc[br_duotou['产品输出标识'] == 1, use_features].copy()

# These features get the sentinel -9999 for missing values instead of 0.
# NOTE(review): presumably because 0 is a legitimate value for them - confirm.
special_features = ['按身份证号查询，近3个月在非银机构申请最小间隔天数',
                     '按身份证号查询，近12个月在非银机构申请最小间隔天数',
                     '按身份证号查询，近12个月在非银机构周末申请机构数',
                     '按身份证号查询，近12个月申请最小间隔天数',
                     '按身份证号查询，近1个月在非银机构周末申请机构数',
                     '按身份证号查询，近6个月在银行机构申请最小间隔天数',
                     '按身份证号查询，近7天在非银机构周末申请机构数',
                     '按身份证号查询，近12个月在银行机构申请最小间隔天数']
# Exclude the join keys as well: filling a missing phone/date with 0 would
# corrupt the key (and can turn the datetime column into object dtype).
normal_features = br_duotou.columns.difference(special_features + key_columns)
br_duotou[special_features] = br_duotou[special_features].fillna(-9999)
br_duotou[normal_features] = br_duotou[normal_features].fillna(0)

# Translate the Chinese feature names to their parameter names.
feature_id = pd.read_excel("/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/样本量10000/详细匹配数据及字典/多头字段名称映射.xlsx")
feature_map = dict(zip(feature_id['中文名称'], feature_id['参数名']))
br_duotou = br_duotou.rename(columns=feature_map)

# Left-join so every sample row is kept; unmatched rows get NaN features.
sample = sample.merge(br_duotou, left_on=['customer_phone', 'apply_date_key'], right_on=['customer_phone', '申请日期'], how='left')
sample = sample.drop(columns=['申请日期']).drop_duplicates()
peek(sample, 2)

(10006, 19)
                     customer_phone apply_date_key  order_status_key  \
0  cc92befb0e0b9f7ef4736f0fabc86821     2023-02-22                 3   
1  057331edd64b9002ddfc084e2d09b441     2023-03-25                 2   

   als_m12_id_nbank_orgnum  als_m3_id_nbank_min_inteday  \
0                      7.0                          0.0   
1                      2.0                      -9999.0   

   als_m12_id_nbank_min_inteday  als_m12_id_nbank_week_orgnum  \
0                           0.0                           3.0   
1                          60.0                           0.0   

   als_m12_id_min_inteday  als_d15_id_nbank_orgnum  \
0                     0.0                      0.0   
1                    25.0                      0.0   

   als_m1_id_nbank_week_orgnum  als_m12_id_rel_orgnum  \
0                      -9999.0                    2.0   
1                      -9999.0                    2.0   

   als_m12_id_cooff_orgnum  als_m12_id_bank_ret_orgnum  \
0   

## 加载百融黑名单

In [4]:
# Load the Bairong special-list ("特殊名单验证") results and flag matching samples.
br_black_list = pd.read_csv("/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/样本量10000/详细匹配数据及字典/特殊名单验证-V2.1.csv", header = 1, parse_dates=['申请日期'])
# Phone numbers are attached by index alignment with `customer`, which an
# earlier cell loads from the sample file.
# NOTE(review): assumes both files share the same row order - confirm.
br_black_list['customer_phone'] = customer['手机号']
# Keep rows whose 产品输出标识 equals 1 and mark them. `.assign` returns a new
# frame, avoiding the SettingWithCopyWarning raised by assigning a column onto
# the result of `.query()`.
br_black_list = br_black_list.query("产品输出标识 == 1").assign(in_black_list=True)
# Left join: samples without a matching row end up with NaN (not False) in
# `in_black_list`.
sample = sample.merge(br_black_list[['customer_phone', '申请日期', 'in_black_list']], left_on=['customer_phone', 'apply_date_key'], right_on=['customer_phone', '申请日期'], how='left')
sample = sample.drop(columns=['申请日期']).drop_duplicates()

## 加载百融偿债压力指数

In [5]:
# Load the Bairong debt-pressure index ("偿债压力指数") file.
br_debt_pressure = pd.read_csv(
    "/sda/huweipeng/project/长安银行/data/third_party_data/百融数据/样本量10000/详细匹配数据及字典/偿债压力指数-V1.0.csv",
    header=1,
    parse_dates=['申请日期'],
)
# Phones come from `customer` (loaded in an earlier cell) via index alignment.
br_debt_pressure['customer_phone'] = customer['手机号']
# Keep rows whose 偿债压力指数产品输出标识 equals 1.
br_debt_pressure = br_debt_pressure.query("偿债压力指数产品输出标识 == 1")

# Left-join the index onto the samples, give it an English column name, and
# drop the redundant right-hand date key plus any exact duplicate rows.
debt_pressure_columns = ['customer_phone', '申请日期', '偿债压力指数']
sample = (
    pd.merge(
        sample,
        br_debt_pressure[debt_pressure_columns],
        how='left',
        left_on=['customer_phone', 'apply_date_key'],
        right_on=['customer_phone', '申请日期'],
    )
    .rename(columns={'偿债压力指数': 'debt_pressure_index'})
    .drop(columns=['申请日期'])
    .drop_duplicates()
)

## 加载腾讯反欺诈

In [6]:
# Load the Tencent anti-fraud sheet ("反欺诈") from the vendor workbook.
tencent_fraud_risk = pd.read_excel(
    "/sda/huweipeng/project/长安银行/data/third_party_data/腾讯数据/数字魔方_反欺诈v5v6v7-zx&灵鲲v6&行业风险3.0_20230406.xlsx",
    sheet_name="反欺诈",
    header=1,
    parse_dates=['回溯时间'],
)

# Join keys followed by the features listed in the feature file.
use_feature = ['手机号', '回溯时间'] + pd.read_csv(
    "/sda/huweipeng/project/长安银行/data/third_party_data/feature_iv/腾讯反欺诈.csv"
)['feature'].tolist()

# Discard rows with no highirr_v6_20220425_score, then keep only the needed columns.
tencent_fraud_risk = tencent_fraud_risk.dropna(subset=['highirr_v6_20220425_score'])[use_feature]

# Left-join on (phone, date), then remove the right-hand keys and duplicates.
sample = (
    pd.merge(
        sample,
        tencent_fraud_risk,
        how='left',
        left_on=['customer_phone', 'apply_date_key'],
        right_on=['手机号', '回溯时间'],
    )
    .drop(columns=['手机号', '回溯时间'])
    .drop_duplicates()
)

## 加载尚为信用评估

In [7]:
# Load the Shangwei credit-evaluation scores (tab-separated text file).
shangwei_credit = pd.read_csv(
    "/sda/huweipeng/project/长安银行/data/third_party_data/尚为数据/尚为立信模型-综合信用评估1+0407swrj.txt",
    sep="\t",
    parse_dates=['back_date'],
)
# Keep rows where column `y` is the string "y", then reduce to the join keys
# and the single score column.
shangwei_credit = shangwei_credit.query('y == "y"').drop(columns=['y'])
shangwei_credit = shangwei_credit[['mobile', 'back_date', 'omriskscoregeneral']]

# Left-join on (phone, date), then remove the right-hand keys and duplicates.
sample = (
    pd.merge(
        sample,
        shangwei_credit,
        how='left',
        left_on=['customer_phone', 'apply_date_key'],
        right_on=['mobile', 'back_date'],
    )
    .drop(columns=['mobile', 'back_date'])
    .drop_duplicates()
)
# Optional debugging aid: peek(shangwei_credit, 2)

## 加载人口统计学信息

In [8]:
# Load the demographic attributes, keyed by `lower_phone_md5`.
demographic = pd.read_csv("/sda/huweipeng/project/长安银行/data/样本人口学信息.csv")

# Left-join on the phone key only (no date key here), then drop the
# right-hand key column.
sample = pd.merge(
    sample,
    demographic,
    how='left',
    left_on='customer_phone',
    right_on='lower_phone_md5',
).drop(columns=['lower_phone_md5'])
peek(sample, 2)

(10006, 27)
                     customer_phone apply_date_key  order_status_key  \
0  cc92befb0e0b9f7ef4736f0fabc86821     2023-02-22                 3   
1  057331edd64b9002ddfc084e2d09b441     2023-03-25                 2   

   als_m12_id_nbank_orgnum  als_m3_id_nbank_min_inteday  \
0                      7.0                          0.0   
1                      2.0                      -9999.0   

   als_m12_id_nbank_min_inteday  als_m12_id_nbank_week_orgnum  \
0                           0.0                           3.0   
1                          60.0                           0.0   

   als_m12_id_min_inteday  als_d15_id_nbank_orgnum  \
0                     0.0                      0.0   
1                    25.0                      0.0   

   als_m1_id_nbank_week_orgnum  als_m12_id_rel_orgnum  \
0                      -9999.0                    2.0   
1                      -9999.0                    2.0   

   als_m12_id_cooff_orgnum  als_m12_id_bank_ret_orgnum  \
0   

## 加载问卷数据

In [9]:
# Load the questionnaire answers and reshape them from long to wide form:
# one row per `lower_phone_md5`, one column per `question_subject`.
quest = (
    pd.read_csv("/sda/huweipeng/project/长安银行/data/调查问卷数据.csv")
    .pivot(index='lower_phone_md5', columns='question_subject', values='answer_content')
    .reset_index()
)
# Positional relabel: the first column is the phone key, the other two are the
# pivoted question subjects.
# NOTE(review): assumes exactly two question subjects, and that the pivot
# column order matches (degree, card_num) - confirm.
quest.columns = ['customer_phone', 'degree', 'card_num']

In [10]:
# Attach the questionnaire answers to every sample row by phone; samples with
# no questionnaire row get NaN in the new columns.
sample = pd.merge(sample, quest, how='left', on='customer_phone')

In [11]:
# Inspect the merged frame via the project helper `peek` (imported from utils);
# the recorded output below shows the frame's shape followed by its first two rows.
peek(sample, 2)

(10006, 29)
                     customer_phone apply_date_key  order_status_key  \
0  cc92befb0e0b9f7ef4736f0fabc86821     2023-02-22                 3   
1  057331edd64b9002ddfc084e2d09b441     2023-03-25                 2   

   als_m12_id_nbank_orgnum  als_m3_id_nbank_min_inteday  \
0                      7.0                          0.0   
1                      2.0                      -9999.0   

   als_m12_id_nbank_min_inteday  als_m12_id_nbank_week_orgnum  \
0                           0.0                           3.0   
1                          60.0                           0.0   

   als_m12_id_min_inteday  als_d15_id_nbank_orgnum  \
0                     0.0                      0.0   
1                    25.0                      0.0   

   als_m1_id_nbank_week_orgnum  als_m12_id_rel_orgnum  \
0                      -9999.0                    2.0   
1                      -9999.0                    2.0   

   als_m12_id_cooff_orgnum  als_m12_id_bank_ret_orgnum  \
0   

## 输出

In [12]:
# Write the assembled sample table to the path held in config.DATA, leaving
# out the row index.
sample.to_csv(path_or_buf=config.DATA, index=False)