In [1]:
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from glob2 import glob
import warnings

warnings.filterwarnings("ignore", module="lightgbm")

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pickle
import re
import gc

import config # custom configuration module that stores the paths of the individual datasets

In [2]:
import os

In [3]:
def peek(df, line=None):
    """Print the shape of ``df`` followed by its leading rows.

    Args:
        df: Object exposing ``shape`` and ``head`` (e.g. a pandas DataFrame).
        line: Number of rows to print. When ``None``, ``head()`` is called
            without an argument so its own default row count applies.
    """
    print(df.shape)
    preview = df.head() if line is None else df.head(line)
    print(preview)

In [4]:
def preprocess(file, purpose):
    """Load one CSV file, tag its rows with the bank parsed from the path, and drop rows without ``mob``.

    The path string is scanned for three kinds of tokens: digit runs directly
    followed by ``-``, decimal numbers directly followed by ``-``, and word
    runs directly preceded by ``-``. The third token found (index 2) is used
    as the bank name.

    Args:
        file: Path of the CSV file; it is both read and parsed for tokens.
        purpose: Accepted for call compatibility; not used by the body.

    Returns:
        DataFrame with an added ``bank`` column, restricted to rows whose
        ``mob`` value is not null.

    Raises:
        IndexError: If fewer than three tokens are found in ``file``.
    """
    frame = pd.read_csv(file, low_memory=False)
    tokens = re.findall(r"\d+(?=-)|\d+\.\d+(?=-)|(?<=-)\w+", file)
    # Token 0 is read but never used afterwards; only token 2 (the bank) matters.
    _date, bank_name = tokens[0], tokens[2]
    tagged = frame.assign(bank=bank_name)
    return tagged[tagged['mob'].notna()]

In [41]:
import re

# The train_set / dev_set data are large and inconvenient to upload to git,
# so the directory read here is a symbolic link.
train_dir = config.TRAINING_DIR
train = pd.concat([preprocess(csv_path, "train") for csv_path in glob(f"{train_dir}/*.csv")])

In [6]:
# Read every CSV in the dev directory through preprocess() and stack the results.
dev_dir = config.DEV_DIR
dev = pd.concat([preprocess(csv_path, "dev") for csv_path in glob(f"{dev_dir}/*.csv")])

In [7]:
# Label each split in place so the rows can be told apart after stacking them into one frame.
for split_frame, split_label in ((train, 'train'), (dev, 'dev')):
    split_frame['type'] = split_label
matrix = pd.concat([train, dev])

In [8]:
def reduce_memory_usage(df, allow_categorical, float_type="float32"):
    """Downcast a Series, or every column of a DataFrame, to smaller dtypes.

    Memory usage in MiB is printed before and after the conversion.

    Per-column rules:
      * sparse and datetime columns are returned unchanged;
      * other non-numeric columns are returned unchanged when
        ``allow_categorical`` is true, otherwise they are replaced by their
        ``factorize()`` integer codes, which are then downcast as well
        (pandas documents that missing values receive the code -1);
      * numeric columns go through ``pd.to_numeric(..., downcast="integer")``
        and any column that is still floating point is cast to ``float_type``.

    Args:
        df: pandas Series or DataFrame. A DataFrame has its columns
            reassigned in place and is also returned.
        allow_categorical: If true, non-numeric columns are kept as they are
            (no conversion to a categorical dtype takes place).
        float_type: Target dtype for floating-point columns.

    Returns:
        The converted Series (a new object) or the same DataFrame passed in.
    """
    def _downcast_numeric(series, allow_categorical, float_type):
        # ``pd.api.types.is_sparse`` is deprecated; the isinstance check is the
        # replacement recommended by pandas and behaves the same for dtypes.
        if isinstance(series.dtype, pd.SparseDtype):
            return series
        if not pd.api.types.is_numeric_dtype(series):
            if pd.api.types.is_datetime64_any_dtype(series.dtype) or allow_categorical:
                return series
            # Replace labels by integer codes, then shrink the codes themselves.
            codes, _ = series.factorize()
            return _downcast_numeric(pd.Series(codes, index=series.index), allow_categorical, float_type)
        series = pd.to_numeric(series, downcast="integer")
        if pd.api.types.is_float_dtype(series.dtype):
            series = series.astype(float_type)
        return series

    print(f"before reducing memory: {np.sum(df.memory_usage()) / (1024*1024)}")
    if df.ndim == 1:
        df = _downcast_numeric(df, allow_categorical, float_type)
    else:
        for col in df.columns:
            df[col] = _downcast_numeric(df[col], allow_categorical, float_type)
    print(f"after reducing memory: {np.sum(df.memory_usage()) / (1024*1024)}")

    return df

def reduce_new_col_mem(df, allow_categorical, old_cols=None):
    """Downcast only the columns of ``df`` that are not listed in ``old_cols``.

    Args:
        df: DataFrame whose new columns are downcast; modified in place.
        allow_categorical: Forwarded to ``reduce_memory_usage``.
        old_cols: Column labels that were already processed. When ``None``,
            every column is treated as new.

    Returns:
        Tuple ``(df, df.columns)``; the second item is meant to be passed back
        as ``old_cols`` on the next call.
    """
    if old_cols is not None:
        new_cols = df.columns.difference(old_cols)
    else:
        new_cols = df.columns

    # Selecting a subset of columns yields a frame that pandas flags as a copy
    # of ``df``; reduce_memory_usage assigns columns on whatever it receives,
    # which raised SettingWithCopyWarning. A shallow copy clears that flag
    # without duplicating the underlying data.
    subset = df[new_cols].copy(deep=False)
    df[new_cols] = reduce_memory_usage(subset, allow_categorical)
    old_cols = df.columns
    return df, old_cols

In [9]:
# First pass: no old_cols is given, so every column is downcast. allow_categorical=True
# leaves non-numeric columns unchanged. old_cols records the columns processed so far.
matrix, old_cols = reduce_new_col_mem(matrix, allow_categorical=True)

before reducing memory: 2011.9474411010742
after reducing memory: 1116.6308298110962


In [10]:
# Keep only customers strictly older than 18 and strictly younger than 65.
matrix = matrix[(matrix['age'] > 18) & (matrix['age'] < 65)]

In [11]:
# Compute the average number of applications per month over the customer's lifetime
def add_average_apply_feature(matrix):
    """Return a copy of ``matrix`` with per-month application averages added.

    The six count columns are filled with 0 where missing. A month count is
    derived as ``int(mob / 30 + 1)`` (truncating) and used as the divisor for
    the four ``*_average_month`` columns; ``new_customer`` is true where that
    month count equals 1. The month count itself is not kept in the result.

    Unlike the earlier version, the input frame is left untouched: previously
    the fills, the new features and a stray ``mob_month`` column were written
    onto the caller's frame while a different object was returned.

    Args:
        matrix: DataFrame containing ``mob`` and the six count columns.
            ``mob`` must not contain missing values, because the month count
            is cast to ``int``.

    Returns:
        A new DataFrame with the filled count columns and five added columns.
    """
    count_cols = ['信用卡申请数','信贷申请数', '核卡数', '被拒数', '线上申请信用卡次数', '线上申请信贷次数']

    result = matrix.copy()
    result[count_cols] = result[count_cols].fillna(0)

    # Kept as a local Series so no temporary column has to be added and dropped.
    mob_month = ((result['mob'] / 30) + 1).astype("int")
    result['apply_credit_card_average_month'] = result['信用卡申请数'] / mob_month
    result['apply_loan_average_month'] = result['信贷申请数'] / mob_month
    result['apply_credit_card_online_average_month'] = result['线上申请信用卡次数'] / mob_month
    result['apply_loan_online_average_month'] = result['线上申请信贷次数'] / mob_month
    result['new_customer'] = mob_month == 1
    return result

# Rebind matrix to the returned frame: the function returns a new DataFrame rather than its argument.
matrix = add_average_apply_feature(matrix)

In [12]:
# Downcast only the columns added since old_cols was last recorded
# (the four *_average_month columns and new_customer).
matrix, old_cols = reduce_new_col_mem(matrix,True,old_cols)

before reducing memory: 246.91128730773926


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = _downcast_numeric(df[col], allow_categorical, float_type)


after reducing memory: 246.91128730773926


In [13]:
# Distribution summary of the raw application counts and their per-month averages,
# with extra upper percentiles to expose the long tail.
summary_cols = [
    '信用卡申请数', '信贷申请数', '核卡数', '被拒数', '线上申请信用卡次数', '线上申请信贷次数',
    'apply_credit_card_average_month', 'apply_loan_average_month',
    'apply_credit_card_online_average_month', 'apply_loan_online_average_month',
]
matrix[summary_cols].describe(percentiles=[0.75, 0.9, 0.95, 0.98])

Unnamed: 0,信用卡申请数,信贷申请数,核卡数,被拒数,线上申请信用卡次数,线上申请信贷次数,apply_credit_card_average_month,apply_loan_average_month,apply_credit_card_online_average_month,apply_loan_online_average_month
count,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0,10356210.0
mean,3.096205,0.8317689,0.2619604,0.3735804,1.384775,0.7653195,0.8328851,0.2663787,0.3317609,0.2484861
std,2.937862,1.162456,0.5006847,0.6449426,2.02545,1.082022,0.8923416,0.4886368,0.5265778,0.4507945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1111111,0.0
75%,4.0,1.0,0.0,1.0,2.0,1.0,1.0,0.3333333,0.5,0.3333333
90%,6.0,2.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,1.0
95%,8.0,3.0,1.0,1.0,5.0,3.0,3.0,1.0,1.333333,1.0
98%,11.0,4.0,2.0,2.0,7.0,4.0,4.0,2.0,2.0,2.0
max,246.0,52.0,8.0,14.0,128.0,52.0,39.0,11.66667,15.0,10.66667


In [14]:
# Drop the long tail: keep rows with fewer than 20 credit-card applications.
matrix = matrix[matrix['信用卡申请数'] < 20]

In [15]:
# Encode gender as 0 ('male') / 1 ('female'), then downcast the encoded column.
gender_codes = matrix['gender'].replace({'male': 0, 'female': 1})
matrix['gender'] = reduce_memory_usage(gender_codes, True)

before reducing memory: 157.57760620117188
after reducing memory: 88.63740348815918


In [16]:
# Earlier variant (disabled): map the product codes to labels and factorize them.
# matrix['最近一次申请产品'] = matrix['最近一次申请产品'].map({1:'信用卡', 2:'信贷'}).fillna('无')
# matrix['最近一次申请产品'] = reduce_memory_usage(matrix['最近一次申请产品'], allow_categorical=False)
# Current variant: keep the numeric code and use 0 for a missing "most recently applied product".
matrix['最近一次申请产品'] = matrix['最近一次申请产品'].fillna(0)

In [17]:
# Missing "has applied to the same bank before" values are filled with 0, then stored as int8.
matrix['是否曾经申请过相同银行'] = matrix['是否曾经申请过相同银行'].fillna(0).astype("int8")

In [20]:
# For banks listed in `feituo` (非脱 banks), or when the field is missing, the flag is set to 2.
# Applies to both "has an approved card at this bank" and "was rejected by this bank before".
feituo = ['招商银行', '交通银行', '浦发银行']
is_feituo_bank = matrix['bank'].isin(feituo)
for flag_col in ('是否有该银行的核卡', '是否曾被该银行拒绝过'):
    # Nullable Int8 first, so the flag can be blanked out before the final fill.
    matrix[flag_col] = matrix[flag_col].astype("Int8")
    matrix.loc[is_feituo_bank, flag_col] = np.nan
    matrix[flag_col] = matrix[flag_col].fillna(2).astype("int8")

In [21]:
# Missing "same bank as the previous application" values get their own code, 2, then the column is stored as int8.
matrix['是否和上一次申请的银行一致'] = matrix['是否和上一次申请的银行一致'].fillna(2).astype("int8")

In [22]:
# Keep the major banks by name and pool every other bank into a single "rare" bucket,
# then label-encode the result and persist the fitted encoder.
major_banks = ['招商银行', '中信银行', '交通银行', '光大银行', '工商银行', '平安银行', '渤海银行', '广发银行',
                   '民生银行', '华夏银行', '浦发银行', '建设银行']
# astype already returns a new Series, so no explicit copy is needed first.
matrix['bank_processed'] = matrix['bank'].astype('str')
matrix.loc[~matrix['bank'].isin(major_banks), 'bank_processed'] = "rare"
encoder = LabelEncoder()
matrix['bank_processed'] = encoder.fit_transform(matrix['bank_processed']).astype("int8")
matrix = matrix.drop(columns=['bank'])
utils = config.MODEL_UTILS
# Context manager so the file handle is flushed and closed once the encoder is written.
with open(f"{utils}/bank_encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [23]:
# Attach the customer-source table; rows without a match keep NaN in the joined columns.
# NOTE(review): a left join repeats matrix rows if customer_source holds several rows per customer_key — confirm the key is unique there.
auxiliary = config.AUXILIARY_DIR
customer_source = pd.read_csv(f"{auxiliary}/customer_source.csv")
matrix = matrix.merge(customer_source, on='customer_key', how='left')
# Customers without a match get line_key 4 (presumably an "unknown source" code — TODO confirm).
matrix['line_key'] = matrix['line_key'].fillna(4).astype("int8")

In [24]:
# Cluster customers into three groups on their application counts and per-month averages.
# NOTE(review): the features are clustered on their raw scale (no scaler is applied here) — confirm that is intended.
cluster_feature = ['信用卡申请数','信贷申请数','线上申请信用卡次数','线上申请信贷次数',
                        'apply_credit_card_average_month','apply_loan_average_month','apply_credit_card_online_average_month',
                        'apply_loan_online_average_month']
# Fixed seed: without it the centroid initialisation, and therefore the cluster ids
# written to cluster_n_3, can differ from one run to the next.
kmeans = KMeans(n_clusters=3, random_state=42)
matrix['cluster_n_3'] = kmeans.fit_predict(matrix[cluster_feature])

In [25]:
# Persist the fitted clustering model; the context manager closes the file handle after writing.
with open(f'{utils}/kmeans.pkl', 'wb') as kmeans_file:
    pickle.dump(kmeans, kmeans_file)

In [None]:
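# NOTE: disabled experiment, kept for reference only. `pca` and `model_reload` are not defined in any
# cell shown in this notebook, so un-commenting these lines as they stand would raise NameError.
# components = pd.DataFrame(pca.transform(matrix[['信用卡申请数','信贷申请数','线上申请信用卡次数','线上申请信贷次数',
#                         'apply_credit_card_average_month','apply_loan_average_month','apply_credit_card_online_average_month',
#                         'apply_loan_online_average_month']])).apply(lambda x:x.astype("float32"))
# matrix['customer_group'] = model_reload.predict(components)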

In [28]:
# Downcast the whole frame. With allow_categorical=False every remaining non-numeric column
# (including `type`, set to 'train'/'dev' earlier) is replaced by its factorize() integer codes.
# NOTE(review): if customer_key is non-numeric it is turned into codes here as well, which would
# break the later isin/merge on customer_key against the raw auxiliary files — confirm its dtype.
matrix = reduce_memory_usage(matrix, allow_categorical=False)

before reducing memory: 1053.800241470337
after reducing memory: 718.9478282928467


In [29]:
# Flag customers that appear in the POS-merchant list: 1 if listed, 0 otherwise.
# Computed directly from the membership test instead of writing True into a new object
# column and back-filling the gaps, which gives the same int8 result without the
# object-dtype intermediate.
# NOTE(review): this compares matrix['customer_key'] as it is at this point with the raw keys
# in the CSV — confirm the column was not re-coded by an earlier memory-reduction step.
pos_merchant = pd.read_csv(f"{auxiliary}/pos_merchant.csv")
matrix['pos_merchant'] = matrix['customer_key'].isin(pos_merchant['customer_key']).astype("int8")

# Attach the questionnaire answers to every customer (unmatched customers get NaN answers).
questionnaire = pd.read_csv(f"{auxiliary}/questionnaire.csv")
matrix = matrix.merge(questionnaire, on = 'customer_key', how = 'left')

#     Loan-need answer: encode as 0/1/2; missing or unmapped answers are filled with 3
loan_intention_map = {'没有': 0, '有': 1, '不确定，但我想了解': 2}
matrix['是否需要贷款'] = matrix['是否需要贷款'].map(loan_intention_map).fillna(3)

#     Education level: merge equivalent answers, encode as 1-4; missing or unmapped answers are filled with 0
degree_unification = {'高中/中专/技校': '高中及以下', '小学及以下': '高中及以下', '初中': '高中及以下', "硕士（含）及以上": "研究生及以上"}
degree_map = {'高中及以下': 1, '大学专科': 2, '大学本科': 3, '研究生及以上': 4}
matrix['学历'] = matrix['学历'].replace(degree_unification).map(degree_map).fillna(0)

#     Use "owns a credit card" and "number of credit cards" to fill in each other's missing values.
#     The four steps run in order; each one only writes cells that are still missing.
# Owns a card == "否" (no) and the count is missing -> count becomes "无信用卡" (no credit card).
matrix.loc[(matrix['是否拥有信用卡'] == "否") &
           (matrix['有多少张信用卡'].isna()), "有多少张信用卡"] = "无信用卡"
# Ownership is missing and the count says "无信用卡" (no credit card) -> ownership becomes "否" (no).
matrix.loc[(matrix['是否拥有信用卡'].isna()) &
           (matrix['有多少张信用卡'] == "无信用卡"), '是否拥有信用卡'] = "否"
# Ownership is missing and the count is present with any other value -> ownership becomes '是' (yes).
matrix.loc[(matrix['是否拥有信用卡'].isna()) &
           (matrix['有多少张信用卡'] != "无信用卡") &
           (pd.notnull(matrix['有多少张信用卡'])), '是否拥有信用卡'] = '是'
# Owns a card == "是" (yes) and the count is missing -> count becomes "至少一张信用卡" (at least one credit card).
matrix.loc[(matrix['是否拥有信用卡'] == "是") &
           (matrix['有多少张信用卡'].isna()), "有多少张信用卡"] = "至少一张信用卡"

#     Ordinal encodings for the remaining questionnaire answers. Missing or unmapped answers are filled with:
#       credit-card ownership -> 2, number of credit cards -> 6, occupation / industry -> 9
own_credit_card_map = {'否': 0, '是': 1}
credit_card_num_map = {'无信用卡': 0, '至少一张信用卡': 1, '1张': 2, '2张': 3, '3张': 4, '4张及以上': 5}
career_map = {'制造业/商业贸易/批发零售': 1, '旅游/酒店/餐饮等服务行业': 2, '自由职业': 3,
              '金融/互联网/大众传媒': 4, '交通/运输/建筑': 5, '教育/科研/医疗卫生': 6, '政府机关/公共事业': 7,
              '农林牧矿鱼': 8}

# Each column is encoded independently of the others, so one loop covers all three.
for answer_col, code_map, missing_code in (
    ('是否拥有信用卡', own_credit_card_map, 2),
    ('有多少张信用卡', credit_card_num_map, 6),
    ('职业', career_map, 9),
):
    matrix[answer_col] = matrix[answer_col].map(code_map).fillna(missing_code)

  questionnaire = pd.read_csv(f"{auxiliary}/questionnaire.csv")


In [30]:
# Drop the join keys now that the auxiliary tables have been merged in.
# NOTE(review): 'customer_key.1' is not created in any visible cell; presumably it arrives with one of the merged CSV files — confirm.
matrix = matrix.drop(columns=['customer_key','customer_key.1'])

In [37]:
# `type` was set to 'train' / 'dev' when the two frames were stacked, and was later replaced by
# integer codes when reduce_memory_usage ran with allow_categorical=False (it factorizes non-numeric columns).
# NOTE(review): 0 == train and 1 == dev holds only if a 'train' row is the first value factorize encounters — confirm.
train = matrix[matrix.type == 0]
dev = matrix[matrix.type == 1]

In [38]:
# The split marker has served its purpose; remove it from the training frame.
train = train.drop('type', axis=1)

In [39]:
# The split marker has served its purpose; remove it from the dev frame.
dev = dev.drop('type', axis=1)

In [40]:
# Persist the processed splits as pickles at the paths configured in `config`.
train.to_pickle(config.TRAIN_BINARY_FILE)
dev.to_pickle(config.DEV_BINARY_FILE)