In [35]:
import pandas as pd
pd.set_option("display.max_column", 100)
import numpy as np
import config
from utils import peek
import joblib
import pickle
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer

## 前言
_Last update: 2023-5-25（补充之前的记录）_  
_author: 胡伟澎_  

经EDA分析，选择以下这些特征入模：  
百融多头(dataframe index: br_duotou)：  
- 'als_m12_id_nbank_orgnum',  
- 'als_m12_id_nbank_min_inteday',
- 'als_m12_id_min_inteday',
- 'als_m3_id_nbank_min_inteday',
- 'als_lst_id_nbank_inteday',
- 'als_lst_id_nbank_consnum',
- 'als_m12_id_rel_orgnum',
- 'als_m6_id_bank_min_inteday',
- 'als_m12_id_cooff_orgnum',
- 'als_m12_id_bank_ret_orgnum',
- 'als_m12_id_bank_week_orgnum',
- 'als_m12_id_af_allnum',
- 'als_d7_id_rel_allnum',
- 'als_m12_id_nbank_night_orgnum',
- 'als_d7_id_nbank_week_orgnum'  

腾讯反欺诈 (dataframe index: tc_fraud_risk)：  
- model_risk_v7_stongyong_score
- risk_score:
- v6_seq_online  

尚为综合信用评估 (dataframe index: sw_credit_eval)：
- omriskscoregeneral

人口学数据：
- gender
- actual_age

调查问卷数据:
- degree
- card_num
---
_Last update: 2023-5-25_  
_author: 胡伟澎_  

业务要求把单次调用成本控制在1元以内。经测试，第三方产品只使用百融多头和腾讯反欺诈的 model_risk_v7_stongyong_score，即可达到与原有模型相近的效果，成本约为0.6元；因此下文代码中剔除了 risk_score:、v6_seq_online 和 omriskscoregeneral（见 discard_features）。

In [2]:
# Load the raw dataset. The CSV has a two-row header, so pandas builds
# MultiIndex columns from header rows 0 and 1.
data = pd.read_csv(config.DATA, header=[0, 1], low_memory=False)
# Flatten the columns to the second header level only
# (presumably level 0 names the data source, e.g. br_duotou — TODO confirm).
data.columns = data.columns.get_level_values(1)

In [3]:
# Feature names persisted earlier as a joblib artifact.
selected_features = joblib.load(config.SELECTED_FEATURES)
# Third-party scores that are deliberately left out of the model input.
discard_features = ['risk_score:', 'v6_seq_online', 'omriskscoregeneral']
# De-duplicate and filter while keeping the stored order. The previous
# `list(set(...) - set(...))` yielded an order that depends on string hashing,
# so the column order of every CSV written below could change between kernel
# restarts.
selected_features = [
    feature
    for feature in dict.fromkeys(selected_features)
    if feature not in discard_features
]
# Bookkeeping columns: applicant key, application date and raw order status.
sample_info = ['customer_phone', 'apply_date_key', 'order_status_key']
use_cols = sample_info + selected_features

# NOTE(review): the following cells query flag_applyloanstr and
# flag_specialList_c, so those columns presumably are part of the stored
# feature list — confirm.
data = data[use_cols]

In [4]:
# Keep only rows whose flag_applyloanstr equals 1, then drop the now-constant flag column.
data = data.query('flag_applyloanstr == 1').drop('flag_applyloanstr', axis=1)

In [5]:
# Remove rows flagged with flag_specialList_c == 1 and drop the flag column.
# Rows where the flag is missing are kept, because NaN != 1 evaluates to True.
data = data.query('flag_specialList_c != 1').drop('flag_specialList_c', axis=1)

In [6]:
# Phones that own at least one repeated (customer_phone, apply_date_key) pair.
duplicates = data[data.duplicated(['customer_phone', 'apply_date_key'])].customer_phone.unique()
# Among ALL rows of those phones, keep the ones whose order_status_key is not 6.
# NOTE(review): this does not guarantee a single row per phone/date afterwards — confirm intent.
remove_duplicates = data[data.customer_phone.isin(duplicates)].query('order_status_key != 6')
# Rows of phones that never had a repeated application ...
data = data[~data['customer_phone'].isin(duplicates)]
# ... followed by the surviving rows of the affected phones (appended at the end,
# which fixes the row order seen by the seeded train/test split below).
data = pd.concat([data, remove_duplicates])
# apply_date_key is no longer needed once duplicates have been handled.
data = data.drop('apply_date_key', axis=1)

In [7]:
# Binary target: order status 2 becomes 1; every other status (including
# missing values) becomes 0. The resulting column is float because the
# intermediate mapping produces NaN for unmapped statuses.
order_status_map = {2:1}
data = data.assign(
    order_status_key=data['order_status_key'].map(order_status_map).fillna(0)
)

In [8]:
# Inspect dtypes and non-null counts before missing values are filled.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7656 entries, 1 to 9720
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   customer_phone                 7656 non-null   object 
 1   order_status_key               7656 non-null   float64
 2   als_m12_id_min_inteday         6379 non-null   float64
 3   als_lst_id_nbank_consnum       6650 non-null   float64
 4   als_m6_id_bank_min_inteday     2480 non-null   float64
 5   als_m12_id_nbank_orgnum        6650 non-null   float64
 6   card_num                       7594 non-null   object 
 7   als_m12_id_cooff_orgnum        1011 non-null   float64
 8   degree                         7597 non-null   object 
 9   als_m12_id_af_allnum           715 non-null    float64
 10  als_lst_id_nbank_inteday       6650 non-null   float64
 11  als_m12_id_rel_orgnum          6260 non-null   float64
 12  actual_age                     7656 non-null   f

In [9]:
# Columns whose name contains "inte" (the *_inteday interval features).
br_duotou_inte_feats = data.filter(regex="inte").columns
# Columns containing "als" that is NOT followed anywhere by "inte",
# i.e. the remaining als_* features (the *num counters in this dataset).
br_duotou_apply_num_feats = data.filter(regex="als(?!.*inte)").columns
# Missing interval -> sentinel 9999; Binner.fit later isolates this value in its own bin.
data[br_duotou_inte_feats] = data[br_duotou_inte_feats].fillna(9999)
# Missing count -> 0.
data[br_duotou_apply_num_feats] = data[br_duotou_apply_num_feats].fillna(0)

In [10]:
# Keep rows with als_m12_id_nbank_orgnum below 9
# (presumably an outlier cut-off chosen during EDA — TODO confirm).
# Rows whose value was filled with 0 in the previous cell pass this filter.
data = data.query('als_m12_id_nbank_orgnum < 9')

In [11]:
# Drop rows that have no model_risk_v7_stongyong_score value.
data = data.dropna(subset='model_risk_v7_stongyong_score')

In [12]:
# Re-check non-null counts after filling and row filtering.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6231 entries, 1 to 8461
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   customer_phone                 6231 non-null   object 
 1   order_status_key               6231 non-null   float64
 2   als_m12_id_min_inteday         6231 non-null   float64
 3   als_lst_id_nbank_consnum       6231 non-null   float64
 4   als_m6_id_bank_min_inteday     6231 non-null   float64
 5   als_m12_id_nbank_orgnum        6231 non-null   float64
 6   card_num                       6182 non-null   object 
 7   als_m12_id_cooff_orgnum        6231 non-null   float64
 8   degree                         6184 non-null   object 
 9   als_m12_id_af_allnum           6231 non-null   float64
 10  als_lst_id_nbank_inteday       6231 non-null   float64
 11  als_m12_id_rel_orgnum          6231 non-null   float64
 12  actual_age                     6231 non-null   f

In [13]:
# Encode gender numerically: male -> 0, female -> 1.
# Any value that is not a key of the mapping becomes NaN.
gender_map = {'male':0, 'female':1}
data = data.assign(gender=data['gender'].map(gender_map))

In [14]:
# Ordinal code for the education level reported in the questionnaire:
# 1 = high school or below, 2 = junior college, 3 = bachelor,
# 4 = master or above, 5 = answer missing (filled with "NONE" first).
degree_map = {'高中及以下':1, '大学专科':2, '大学本科':3, '硕士（含）及以上':4, 'NONE':5}
data = data.assign(degree=data['degree'].fillna("NONE").map(degree_map))

In [15]:
# Code for the number of credit cards reported in the questionnaire:
# 0 = no credit card, 1-3 = that many cards, 4 = four or more,
# 5 = answer missing (filled with "NONE" first).
card_num_map = {'无信用卡':0, '1张':1, '2张':2, '3张':3, '4张及以上':4, 'NONE':5}
data = data.assign(card_num=data['card_num'].fillna("NONE").map(card_num_map))

In [16]:
# 70/30 hold-out split, stratified on the binary target so both parts keep the
# same positive rate; random_state pins the shuffle for reproducibility.
train, test = train_test_split(data, test_size=0.3, random_state=42, stratify=data['order_status_key'])

In [17]:
# Replace the inherited row labels with a fresh 0..n-1 index on both splits,
# discarding the old labels instead of keeping them as a column.
train, test = (split.reset_index(drop=True) for split in (train, test))

In [18]:
# Tag every training row with the id of the stratified fold in which it is
# used as validation data.
splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
# Create the column up front as an integer. Building it only through partial
# `.loc` writes starts from NaN and leaves the fold ids as floats (0.0, 1.0, ...).
# This also makes the cell safe to re-run.
train['fold'] = -1
# `v` holds positional validation indices; using them with `.loc` relies on
# `train` having a 0..n-1 index (it is reset in the previous cell).
for f, (t, v) in enumerate(splitter.split(train, train['order_status_key'])):
    train.loc[v, 'fold'] = f
train['fold'].value_counts()

0.0    873
2.0    872
4.0    872
3.0    872
1.0    872
Name: fold, dtype: int64

In [36]:
# Summary statistics of the numeric training columns (before binning/scaling).
train.describe()

Unnamed: 0,order_status_key,als_m12_id_min_inteday,als_lst_id_nbank_consnum,als_m6_id_bank_min_inteday,als_m12_id_nbank_orgnum,card_num,als_m12_id_cooff_orgnum,degree,als_m12_id_af_allnum,als_lst_id_nbank_inteday,als_m12_id_rel_orgnum,actual_age,als_m12_id_nbank_night_orgnum,als_m12_id_bank_week_orgnum,gender,als_d7_id_nbank_week_orgnum,als_m12_id_nbank_min_inteday,als_m12_id_bank_ret_orgnum,als_m3_id_nbank_min_inteday,model_risk_v7_stongyong_score,fold
count,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0,4361.0
mean,0.327448,2001.380188,0.945884,7491.307498,2.318276,1.750287,0.04953,2.007796,0.075441,1669.641367,1.547581,36.214859,0.357945,0.390048,0.355882,0.027746,3796.405182,0.098601,6990.061912,39.383169,1.999541
std,0.469336,3973.199353,0.607527,4326.135515,2.061455,1.313922,0.22933,0.623643,0.350026,3628.067629,1.387392,8.39432,0.656346,0.693933,0.478835,0.197253,4832.635584,0.335103,4582.499969,17.649003,1.414538
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
25%,0.0,0.0,1.0,153.0,1.0,1.0,0.0,2.0,0.0,22.0,1.0,30.0,0.0,0.0,0.0,0.0,1.0,0.0,29.0,25.0,1.0
50%,0.0,9.0,1.0,9999.0,2.0,2.0,0.0,2.0,0.0,83.0,1.0,35.0,0.0,0.0,0.0,0.0,53.0,0.0,9999.0,38.0,2.0
75%,1.0,132.0,1.0,9999.0,3.0,3.0,0.0,2.0,0.0,236.0,2.0,42.0,1.0,1.0,1.0,0.0,9999.0,0.0,9999.0,52.0,3.0
max,1.0,9999.0,9.0,9999.0,8.0,5.0,2.0,5.0,5.0,9999.0,11.0,61.0,4.0,6.0,1.0,4.0,9999.0,4.0,9999.0,89.0,4.0


In [62]:
class Binner:
    """Percentile-based discretiser for numeric feature columns.

    ``fit`` learns, for every column, the decile edges (0th, 10th, ..., 100th
    percentile) of the fitting data. ``transform`` replaces each value with the
    integer index of the right-closed bin it falls into.
    """

    def __init__(self):
        # Maps column name -> sorted list of unique bin edges learned by fit().
        self.feature_bins = {}

    def fit(self, df):
        """Learn the bin edges of every column in ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Numeric columns without missing values.

        Returns
        -------
        Binner
            ``self``, so that calls can be chained.
        """
        percentiles = list(range(0, 101, 10))

        for f in df.columns:
            val = np.percentile(df[f].to_numpy(), percentiles).tolist()
            # -1 is the left edge so that a value of 0 lands in the first
            # right-closed bin (-1, ...].
            edges = [-1] + val
            if max(val) == 9999:
                # 9999 is used as the fill value for missing interval
                # features; the extra 9998 edge isolates it in (9998, 9999].
                edges += [9998, 9999]
            # A set removes repeated percentiles; pd.cut needs unique,
            # increasing edges.
            self.feature_bins[f] = sorted(set(edges))
        return self

    def transform(self, df):
        """Replace every value in ``df`` with the index of its fitted bin.

        Values outside the fitted range are assigned to the nearest edge bin
        instead of becoming NaN, so data not seen during ``fit`` can be
        transformed.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose columns were all present during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``df`` with integer bin indices.

        Raises
        ------
        RuntimeError
            If the binner has not been fitted.
        KeyError
            If ``df`` contains a column that was not seen during ``fit``.
        ValueError
            If a column contains missing values.
        """
        if not self.feature_bins:
            # Fail loudly: returning None here used to surface later as an
            # unrelated error in the caller.
            raise RuntimeError("Binner is not fitted yet")

        df_bin = df.copy()
        for f in df_bin.columns:
            bins = self.feature_bins[f]
            # Pull out-of-range values onto the outermost edges;
            # include_lowest makes the left-most edge itself part of bin 0.
            clipped = df_bin[f].clip(lower=bins[0], upper=bins[-1])
            codes = pd.cut(
                clipped,
                bins,
                labels=list(range(len(bins) - 1)),
                include_lowest=True,
            )
            if codes.isna().any():
                raise ValueError(f"Cannot bin feature {f!r}: it contains missing values")
            df_bin[f] = codes.astype("int")
        return df_bin
        
        
def preprocess1(data, is_train=True):
    """Bin, scale and one-hot encode a modelling frame.

    Columns whose name matches the regex ``als`` are discretised with
    ``Binner``; ``model_risk_v7_stongyong_score`` and ``actual_age`` are
    standardised; ``gender``, ``degree`` and ``card_num`` are replaced by
    one-hot columns placed in front of the remaining columns. All other
    columns pass through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above. It is not modified.
    is_train : bool, default True
        When True, the binner, scaler and encoder are fitted on ``data`` and
        pickled to ``config.BINNER``, ``config.SCALER`` and ``config.ENCODER``
        (existing files are overwritten). When False, those artifacts are
        loaded and only applied.

    Returns
    -------
    pandas.DataFrame
        The processed frame, carrying the same index as ``data``.

    Raises
    ------
    FileNotFoundError
        If ``is_train`` is False and an artifact file does not exist.
    """
    processed = data.copy()
    binning_features = processed.filter(regex="als").columns
    scale_features = ['model_risk_v7_stongyong_score', 'actual_age']
    cat_features = ['gender', 'degree', 'card_num']

    if is_train:
        # The three feature groups are disjoint, so each transformer can be
        # fitted on the untouched copy before anything is transformed.
        binner = Binner().fit(processed[binning_features])
        scaler = StandardScaler().fit(processed[scale_features])
        encoder = OneHotEncoder(handle_unknown='ignore').fit(processed[cat_features])
        artifacts = (
            (binner, config.BINNER),
            (scaler, config.SCALER),
            (encoder, config.ENCODER),
        )
        for artifact, path in artifacts:
            # `with` closes the handle (and flushes the pickle) deterministically;
            # a bare open(...) left that to the garbage collector.
            with open(path, 'wb') as fh:
                pickle.dump(artifact, fh)
    else:
        try:
            # SECURITY: unpickling executes arbitrary code — only load artifact
            # files that were produced by this notebook.
            scaler = joblib.load(config.SCALER)
            encoder = joblib.load(config.ENCODER)
            binner = joblib.load(config.BINNER)
        except FileNotFoundError as exc:
            # Raise instead of printing and returning None, which only moved
            # the failure to the caller's next line.
            raise FileNotFoundError(
                "scaler, encoder or binner does not exist; "
                "run preprocess1 with is_train=True first"
            ) from exc

    processed[binning_features] = binner.transform(processed[binning_features])
    processed[scale_features] = scaler.transform(processed[scale_features])
    encoded = pd.DataFrame(
        encoder.transform(processed[cat_features]).toarray(),
        columns=encoder.get_feature_names_out(cat_features),
        # Share the input's index: with a default RangeIndex the concat below
        # would misalign rows whenever `data` is not indexed 0..n-1.
        index=processed.index,
    )

    processed = processed.drop(columns=cat_features)
    return pd.concat([encoded, processed], axis=1)


def preprocess2(data, is_train=True):
    """One-hot encode the categorical columns and leave everything else as is.

    ``gender``, ``degree`` and ``card_num`` are replaced by one-hot columns
    placed in front of the remaining, untouched columns. No binning or
    scaling is applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the three categorical columns. It is not modified.
    is_train : bool, default True
        When True, the encoder is fitted on ``data`` and pickled to
        ``config.ENCODER`` (the same path ``preprocess1`` writes to, so an
        existing file is overwritten). When False, the stored encoder is
        loaded and only applied.

    Returns
    -------
    pandas.DataFrame
        The processed frame, carrying the same index as ``data``.

    Raises
    ------
    FileNotFoundError
        If ``is_train`` is False and the encoder file does not exist.
    """
    processed = data.copy()
    cat_features = ['gender', 'degree', 'card_num']

    if is_train:
        encoder = OneHotEncoder(handle_unknown='ignore').fit(processed[cat_features])
        # `with` closes the handle deterministically; a bare open(...) leaked it.
        with open(config.ENCODER, 'wb') as fh:
            pickle.dump(encoder, fh)
    else:
        try:
            # SECURITY: unpickling executes arbitrary code — only load artifact
            # files that were produced by this notebook.
            encoder = joblib.load(config.ENCODER)
        except FileNotFoundError as exc:
            # Only the encoder is needed here; the old message also blamed a
            # scaler and binner that this function never loads.
            raise FileNotFoundError(
                "encoder does not exist; run preprocess2 with is_train=True first"
            ) from exc

    encoded = pd.DataFrame(
        encoder.transform(processed[cat_features]).toarray(),
        columns=encoder.get_feature_names_out(cat_features),
        # Share the input's index so the concat below aligns row by row even
        # when `data` is not indexed 0..n-1.
        index=processed.index,
    )

    processed = processed.drop(columns=cat_features)
    return pd.concat([encoded, processed], axis=1)


In [63]:
# Fit the binner, scaler and encoder on the training split and persist them.
train_processed = preprocess1(train)
# Apply the training-time artifacts to the hold-out split. With the default
# is_train=True the transformers were re-fitted on the test rows (leaking test
# statistics and producing bins/scales inconsistent with train) and the
# pickled training artifacts were overwritten.
test_processed = preprocess1(test, is_train=False)

In [64]:
# Persist the binned / scaled / one-hot encoded splits without the row index.
train_processed.to_csv(config.TRAIN, index=False)
test_processed.to_csv(config.TEST, index=False)

In [71]:
# Second variant: one-hot encoding only, numeric features left raw.
# The training call re-fits the encoder and overwrites config.ENCODER;
# the test call then loads that stored encoder.
train_processed2 = preprocess2(train, is_train=True)
test_processed2 = preprocess2(test, is_train=False)

In [72]:
# Persist the one-hot-only variant of both splits without the row index.
train_processed2.to_csv(config.TRAIN2, index=False)
test_processed2.to_csv(config.TEST2, index=False)

In [73]:
# Persist the splits as they are before preprocess1/preprocess2
# (train still carries the fold column added above).
train.to_csv(config.TRAIN3, index=False)
test.to_csv(config.TEST3, index=False)