In [1]:
import pandas as pd
import numpy as np
import config
from utils import peek
import joblib
import pickle
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer

In [2]:
# Read the raw dataset from the location configured in config.DATA.
raw_path = config.DATA
data = pd.read_csv(raw_path)

In [3]:
# Phones that occur more than once for the same (customer_phone, apply_date_key) pair.
dup_key_mask = data.duplicated(['customer_phone', 'apply_date_key'])
duplicates = data.loc[dup_key_mask, 'customer_phone'].unique()

# For those phones, keep only the rows whose order_status_key is not 6;
# rows of every other phone are kept unchanged.
is_dup_phone = data['customer_phone'].isin(duplicates)
remove_duplicates = data[is_dup_phone].query('order_status_key != 6')
data = pd.concat([data[~is_dup_phone], remove_duplicates])

In [4]:
# Restrict the frame to the persisted feature list plus the label column.
label = ['order_status_key']
use_features = joblib.load(config.USE_FEATURES)
use_cols = use_features + label
data = data.loc[:, use_cols]
# `peek` comes from the project's utils module; presumably prints the shape and first rows.
peek(data, 2)

(9973, 18)
   model_risk_v7_stongyong_score  omriskscoregeneral  als_m12_id_nbank_orgnum  \
0                           66.0               515.0                      7.0   
1                           20.0               684.0                      2.0   

   als_m12_id_nbank_min_inteday  als_m12_id_min_inteday  \
0                           0.0                     0.0   
1                          60.0                    25.0   

   als_m3_id_nbank_min_inteday  debt_pressure_index  v6_seq_online  \
0                          0.0                 31.0           38.0   
1                      -9999.0                 17.0           36.0   

   als_m12_id_nbank_week_orgnum  als_lst_id_nbank_consnum  \
0                           3.0                       1.0   
1                           0.0                       1.0   

   als_d15_id_nbank_orgnum  als_m1_id_nbank_week_orgnum in_black_list  \
0                      0.0                      -9999.0          True   
1                      0.0

In [5]:
# Binary label: status 2 becomes 1, every other status (or missing value) becomes 0.
status_is_two = data['order_status_key'].map({2: 1})
data['order_status_key'] = status_is_two.fillna(0)

In [6]:
# Keep rows with als_m12_id_nbank_orgnum below 9 and no in_black_list value,
# then drop the in_black_list column, which is entirely null after the filter.
data = (
    data.loc[data['als_m12_id_nbank_orgnum'] < 9]
    .loc[lambda frame: frame['in_black_list'].isna()]
    .drop(columns='in_black_list')
)
data.shape

(6236, 17)

In [7]:
# Rows missing any of these columns are discarded.
required_cols = [
    'als_m12_id_nbank_orgnum',
    'model_risk_v7_stongyong_score',
    'v6_seq_online',
    'omriskscoregeneral',
]
data = data.dropna(subset=required_cols)

In [8]:
# Encode gender as 0/1; any value that is not a key of the map becomes NaN.
gender_map = dict(male=0, female=1)
data['gender'] = data['gender'].map(gender_map)

In [9]:
# Ordinal code for the degree column; missing values get their own code (5)
# through the "NONE" placeholder.
degree_map = {'高中及以下':1, '大学专科':2, '大学本科':3, '硕士（含）及以上':4, 'NONE':5}
data['degree'] = data['degree'].fillna("NONE").map(degree_map)

In [10]:
# Ordinal code for the card_num column; missing values get their own code (5)
# through the "NONE" placeholder.
card_num_map = {'无信用卡':0, '1张':1, '2张':2, '3张':3, '4张及以上':4, 'NONE':5}
data['card_num'] = data['card_num'].fillna("NONE").map(card_num_map)

In [11]:
# 70/30 hold-out split, stratified on the label and seeded for reproducibility.
train, test = train_test_split(
    data,
    stratify=data['order_status_key'],
    test_size=0.3,
    random_state=42,
)

In [12]:
# Give both partitions a fresh 0..n-1 index; the positional indices yielded by
# the fold splitter are later used as labels in train.loc.
train, test = (part.reset_index(drop=True) for part in (train, test))

In [13]:
kfold = -1
# Tag every training row with the id of the stratified fold in which it is
# held out for validation. The column is created by partial assignment, so it
# is stored as float (0.0 .. 4.0).
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = splitter.split(train, train['order_status_key'])
for fold_id, (_, valid_idx) in enumerate(fold_splits):
    train.loc[valid_idx, 'fold'] = fold_id
train['fold'].value_counts()

0.0    870
1.0    870
3.0    870
2.0    870
4.0    869
Name: fold, dtype: int64

In [17]:
class Binner:
    """Percentile-based discretiser that turns numeric columns into bin codes.

    ``fit`` learns one sorted list of unique bin edges per column: the
    0th, 10th, ..., 100th percentiles of the column, padded with the outer
    edges -999999 and 999999. When the column minimum is negative the edges
    -10000 and -9999 are added as well, so the value -9999 lands in the
    interval ``(-10000, -9999]``.

    ``transform`` replaces every value with the zero-based position of the
    right-closed interval it falls into.
    """

    def __init__(self):
        # column name -> sorted list of unique bin edges, filled by ``fit``
        self.feature_bins = {}

    def fit(self, df):
        """Learn the bin edges for every column of ``df``.

        Args:
            df: DataFrame whose columns are all numeric.

        Returns:
            The fitted instance, so calls can be chained.
        """
        percentiles = list(range(0, 101, 10))
        outer_edges = [-999999, 999999]

        for f in df.columns:
            val = np.percentile(df[[f]], percentiles).tolist()
            # min(val) is the 0th percentile, i.e. the column minimum.
            extra_edges = [] if min(val) >= 0 else [-10000, -9999]
            # The set removes repeated edges (common when many percentiles
            # coincide); pd.cut requires strictly increasing, unique edges.
            self.feature_bins[f] = sorted(set(outer_edges + extra_edges + val))
        return self

    def transform(self, df):
        """Return a copy of ``df`` with every column replaced by its bin code.

        Args:
            df: DataFrame containing only columns that were seen by ``fit``.

        Returns:
            A new DataFrame of integer bin codes; ``df`` itself is not modified.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
            KeyError: if ``df`` has a column that was not present during ``fit``.

        Note:
            Values outside every interval (including NaN) get no bin from
            ``pd.cut``, and the integer cast then fails.
        """
        if not self.feature_bins:
            # Fail loudly: returning None here would let a caller assign None
            # to its columns and silently wipe them.
            raise RuntimeError("Error: Binner is not fitted yet")

        df_bin = df.copy()
        for f in df_bin.columns:
            bins = self.feature_bins[f]
            codes = list(range(len(bins) - 1))
            df_bin[f] = pd.cut(df_bin[f], bins, labels=codes).astype("int")
        return df_bin
        
        
def preprocess1(data, is_train=True):
    """Bin, standardise and one-hot encode the modelling features.

    Three disjoint column groups are processed:

    * ``binning_features`` are replaced by integer bin codes from ``Binner``.
    * ``scale_features`` are standardised with ``StandardScaler``.
    * ``cat_features`` are one-hot encoded and the original columns dropped.

    Args:
        data: DataFrame containing all columns named in the three groups.
            It is not modified; a copy is processed.
        is_train: When True the transformers are fitted on ``data`` and
            pickled to ``config.BINNER``, ``config.SCALER`` and
            ``config.ENCODER``. When False the previously saved transformers
            are loaded from those paths and only applied.

    Returns:
        A DataFrame with the one-hot columns first, followed by the remaining
        (processed) columns, indexed like ``data``.

    Raises:
        FileNotFoundError: if ``is_train`` is False and one of the saved
            transformers does not exist.
    """
    binning_features = ['als_m12_id_nbank_orgnum', 'als_m12_id_nbank_min_inteday',
                        'als_m12_id_min_inteday', 'als_m3_id_nbank_min_inteday',
                        'als_m12_id_nbank_week_orgnum', 'als_lst_id_nbank_consnum',
                        'als_d15_id_nbank_orgnum', 'als_m1_id_nbank_week_orgnum']
    scale_features = ['model_risk_v7_stongyong_score', 'omriskscoregeneral',
                      'debt_pressure_index', 'v6_seq_online', 'actual_age']
    cat_features = ['gender', 'degree', 'card_num']

    processed = data.copy()

    if is_train:
        # The column groups do not overlap, so each transformer can be fitted
        # on the untouched copy before any transform is applied.
        binner = Binner().fit(processed[binning_features])
        scaler = StandardScaler().fit(processed[scale_features])
        encoder = OneHotEncoder(handle_unknown='ignore').fit(processed[cat_features])

        artefacts = ((binner, config.BINNER), (scaler, config.SCALER), (encoder, config.ENCODER))
        for artefact, path in artefacts:
            # Context manager guarantees the file is flushed and closed.
            with open(path, 'wb') as fh:
                pickle.dump(artefact, fh)
    else:
        try:
            # NOTE: unpickling executes arbitrary code; only load artefacts
            # that were produced by this notebook.
            scaler = joblib.load(config.SCALER)
            encoder = joblib.load(config.ENCODER)
            binner = joblib.load(config.BINNER)
        except FileNotFoundError as err:
            # Propagate instead of returning None, which would only fail
            # later at the call site with a less helpful error.
            raise FileNotFoundError("Error: scaler, encoder or binner does not exist") from err

    processed[binning_features] = binner.transform(processed[binning_features])
    processed[scale_features] = scaler.transform(processed[scale_features])

    # Reuse the input index so that concat(axis=1) lines rows up one-to-one
    # even when ``data`` does not carry a default 0..n-1 index.
    encoded = pd.DataFrame(
        encoder.transform(processed[cat_features]).toarray(),
        columns=encoder.get_feature_names_out(cat_features),
        index=processed.index,
    )

    processed = processed.drop(columns=cat_features)
    return pd.concat([encoded, processed], axis=1)


def preprocess2(data, is_train=True):
    """One-hot encode the categorical features, leaving other columns as they are.

    Args:
        data: DataFrame containing the columns ``gender``, ``degree`` and
            ``card_num``. It is not modified; a copy is processed.
        is_train: When True a ``OneHotEncoder`` is fitted on ``data`` and
            pickled to ``config.ENCODER``. When False the previously saved
            encoder is loaded from that path and only applied.

    Returns:
        A DataFrame with the one-hot columns first, followed by every
        non-categorical column of ``data``, indexed like ``data``.

    Raises:
        FileNotFoundError: if ``is_train`` is False and the saved encoder
            does not exist.
    """
    cat_features = ['gender', 'degree', 'card_num']
    processed = data.copy()

    if is_train:
        encoder = OneHotEncoder(handle_unknown='ignore').fit(processed[cat_features])
        # Context manager guarantees the file is flushed and closed.
        with open(config.ENCODER, 'wb') as fh:
            pickle.dump(encoder, fh)
    else:
        try:
            # NOTE: unpickling executes arbitrary code; only load artefacts
            # that were produced by this notebook.
            encoder = joblib.load(config.ENCODER)
        except FileNotFoundError as err:
            # Propagate instead of returning None, which would only fail
            # later at the call site with a less helpful error.
            raise FileNotFoundError("Error: encoder does not exist") from err

    # Reuse the input index so that concat(axis=1) lines rows up one-to-one
    # even when ``data`` does not carry a default 0..n-1 index.
    encoded = pd.DataFrame(
        encoder.transform(processed[cat_features]).toarray(),
        columns=encoder.get_feature_names_out(cat_features),
        index=processed.index,
    )

    processed = processed.drop(columns=cat_features)
    return pd.concat([encoded, processed], axis=1)


In [24]:
# Fit the encoder on the training partition only, then reuse that saved encoder
# for the test partition. Calling preprocess2(test) with the default
# is_train=True would refit on test data and overwrite the persisted encoder.
train_processed = preprocess2(train, is_train=True)
test_processed = preprocess2(test, is_train=False)

In [23]:
# train_processed.to_csv(config.TRAIN, index=False)
# test_processed.to_csv(config.TEST, index=False)

In [28]:
# Persist the encoded partitions without the index column.
for frame, path in ((train_processed, config.TRAIN2), (test_processed, config.TEST2)):
    frame.to_csv(path, index=False)

In [29]:
# Persist the un-encoded partitions (train still carries the fold column).
for frame, path in ((train, config.TRAIN3), (test, config.TEST3)):
    frame.to_csv(path, index=False)