In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
from sklearn.linear_model import HuberRegressor
import sklearn.ensemble as tree_model
# from tqdm import tqdm
import datetime
pd.set_option('display.max_column',100)
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
from utils import make_dir, score, timer, kf_lgbm, kf_xgbm, kf_ctbm, kf_sklearn

In [2]:
import sklearn
# Display the installed scikit-learn version so it is recorded with the run
# (the captured output below shows '0.19.0').
# NOTE(review): the original remark was just "important" -- presumably the
# results depend on this version; confirm before upgrading.
sklearn.__version__

'0.19.0'

In [7]:
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. The returned frame has the same rows in
    the same order, re-indexed 0..n-1.

    Required input columns: the seven app-usage count columns listed in
    ``app_feature`` below, '用户近6个月平均消费值（元）',
    '用户账单当月总费用（元）', '缴费用户最近一次缴费金额（元）',
    '用户年龄' and '用户编码'.

    Added columns:
      * 'round_log1p' + <app column>  -- rounded log1p of each app-usage count
      * '前五个月消费总费用', '前五个月消费平均费用',
        '当月费用/前五个月消费平均费用', '当月费用-前五个月消费平均费用'
      * 'count_缴费', 'count_当月费用', 'count_总费用', 'count_费用差',
        'count_平均费用', 'count_当月费用_平均费用'  -- group sizes
      * '是否998折'      -- boolean flag on the last payment amount
      * '年龄_0_as_nan'  -- age with 0 replaced by NaN

    Raises:
        KeyError: if a required column is missing.
        ValueError: from ``astype(int)`` if an app-usage column contains NaN.
    """
    # Work on a copy: the previous version wrote new columns (and a temporary
    # 'idx' column) into the caller's frame as a side effect.
    df = df.copy()

    app_feature = [
        '当月网购类应用使用次数',
        '当月物流快递类应用使用次数',
        '当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数',
        '当月飞机类应用使用次数',
        '当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数',
    ]

    # Bucket every app-usage count by its rounded log1p value.
    for f in app_feature:
        df['round_log1p'+f] = np.round(np.log1p(df[f])).astype(int)

    # Split the six-month average into "current month" vs. "the other five".
    # NOTE(review): this assumes the six-month average includes the current
    # month's bill -- confirm against the data dictionary.
    df['前五个月消费总费用'] = 6*df['用户近6个月平均消费值（元）'] - df['用户账单当月总费用（元）']
    df['前五个月消费平均费用'] = df['前五个月消费总费用'] / 5
    # The +1 in the denominator avoids dividing by a zero average.
    df['当月费用/前五个月消费平均费用'] = (df['用户账单当月总费用（元）']) \
                        / (1+df['前五个月消费平均费用'])
    df['当月费用-前五个月消费平均费用'] = df['用户账单当月总费用（元）'] - df['前五个月消费平均费用']

    def make_count_feature(df, col, fea_name):
        """Append ``fea_name``: the number of rows sharing this row's ``col`` value.

        ``col`` is a column name or a list of column names. The count is the
        number of non-null '用户编码' values in the group. Rows whose key
        contains NaN are kept and receive NaN (the earlier inner merge dropped
        such rows silently). Row order is preserved and the index is reset.
        """
        if fea_name in df.columns:
            # Recompute instead of ending up with fea_name_x / fea_name_y.
            df = df.drop(fea_name, axis=1)
        counts = df.groupby(col)['用户编码'].count().rename(fea_name).reset_index()
        # The keys in `counts` are unique, so a left merge keeps every row of
        # `df` exactly once and in its original order.
        return df.merge(counts, on=col, how='left')

    df = make_count_feature(df, '缴费用户最近一次缴费金额（元）','count_缴费')
    df = make_count_feature(df, '用户账单当月总费用（元）','count_当月费用')
    df = make_count_feature(df, '前五个月消费总费用', 'count_总费用')
    df = make_count_feature(df, '当月费用-前五个月消费平均费用', 'count_费用差')
    df = make_count_feature(df, '用户近6个月平均消费值（元）', 'count_平均费用')
    df = make_count_feature(df, ['用户账单当月总费用（元）','用户近6个月平均消费值（元）'],
                            'count_当月费用_平均费用')

    # True when the last payment is non-zero and amount / 0.998 is a whole number.
    # NOTE(review): this is an exact floating-point comparison; it is kept
    # unchanged so the feature values stay the same, but it is sensitive to
    # rounding in the division.
    arr = df['缴费用户最近一次缴费金额（元）']
    df['是否998折'] = ((arr/0.998)%1==0)&(arr!=0)

    # Treat an age of 0 as missing.
    df['年龄_0_as_nan'] = np.where(df['用户年龄']==0, np.nan, df['用户年龄'])

    return df
    
def load_df_and_make_features(
        train_path='C:/Users/yue/yuekangwei/credit-competition/train_dataset/train_dataset.csv',
        test_path='C:/Users/yue/yuekangwei/credit-competition/test_dataset/test_dataset.csv'):
    """Load the train and test CSVs, stack them, and add engineered features.

    Args:
        train_path: path of the training CSV. Defaults to the location this
            notebook was originally run with, so existing zero-argument calls
            behave as before.
        test_path: path of the test CSV (same default convention).

    Returns:
        One DataFrame holding the training rows followed by the test rows,
        with a 'train' marker column (1 = training row, 0 = test row) and the
        columns added by ``make_features``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    # Mark the origin of each row so the frame can be split again later.
    train_df['train'] = 1
    test_df['train'] = 0
    # ignore_index gives the stacked frame unique 0..n-1 labels instead of
    # repeating each file's own row numbers.
    df = pd.concat([train_df, test_df], ignore_index=True)
    return make_features(df)

In [8]:
# Columns selected as model inputs (used to index train_df / test_df).
# The order of the list is the column order of the feature matrix.
feature_name1 = [
    # user profile and status flags
    '用户年龄',
    '用户网龄（月）',
    '用户实名制是否通过核实',
    '是否大学生客户',
    '是否4G不健康客户',
    # payment and billing amounts
    '用户最近一次缴费距今时长（月）',
    '缴费用户最近一次缴费金额（元）',
    '用户近6个月平均消费值（元）',
    '用户账单当月总费用（元）',
    '用户当月账户余额（元）',
    '用户话费敏感度',
    # columns produced by make_features
    '当月费用-前五个月消费平均费用',
    '前五个月消费总费用',
    'count_缴费',
    'count_当月费用',
    'count_费用差',
    'count_平均费用',
    'count_当月费用_平均费用',
    '是否998折',
    # social circle, mall visits and app-usage counts
    '当月通话交往圈人数',
    '近三个月月均商场出现次数',
    '当月网购类应用使用次数',
    '当月物流快递类应用使用次数',
    '当月金融理财类应用使用总次数',
    '当月视频播放类应用使用次数',
    '当月飞机类应用使用次数',
    '当月火车类应用使用次数',
    '当月旅游资讯类应用使用次数',
    # yes/no behaviour and account flags
    '当月是否逛过福州仓山万达',
    '当月是否到过福州山姆会员店',
    '当月是否看电影',
    '当月是否景点游览',
    '当月是否体育场馆消费',
    '是否经常逛商场的人',
    '是否黑名单客户',
    '缴费用户当前是否欠费缴费',
]

In [10]:
# Build the combined feature frame once, then split it back apart with the
# 'train' marker column: rows equal to 1 are training rows, all others test.
df = load_df_and_make_features()
is_train_row = df['train'] == 1
train_df = df[is_train_row]
test_df = df[~is_train_row]

In [11]:
# Preview the first rows of the combined frame, including the engineered columns.
df.head()

Unnamed: 0,train,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户编码,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数,round_log1p当月网购类应用使用次数,round_log1p当月物流快递类应用使用次数,round_log1p当月金融理财类应用使用总次数,round_log1p当月视频播放类应用使用次数,round_log1p当月飞机类应用使用次数,round_log1p当月火车类应用使用次数,round_log1p当月旅游资讯类应用使用次数,前五个月消费总费用,前五个月消费平均费用,当月费用/前五个月消费平均费用,当月费用-前五个月消费平均费用,count_缴费,count_当月费用,count_总费用,count_费用差,count_平均费用,count_当月费用_平均费用,是否998折,年龄_0_as_nan
0,1,664.0,30,1,0,1,0,0,0,0,713,7145,83,2740,0,0,0,1,0,1,44,180,1,a4651f98c82948b186bdcdc8108381b4,186,3,159.2,163.86,0,99.8,75,7,0,8,9,0,0,3,823.96,164.792,0.960239,-5.592,22284,18,1,4,2,1,True,44.0
1,1,530.0,0,0,0,0,0,0,0,0,414,44862,21,2731,0,1,0,1,0,1,18,110,1,aeb10247db4e4d67b2550bbc42ff9827,5,3,145.1,153.28,0,29.94,16,6,0,8,11,0,0,0,774.58,154.916,0.930629,-9.816,11740,38,1,7,5,1,True,18.0
2,1,643.0,1,0,0,0,0,0,0,0,3391,4804,59,0,0,0,0,0,0,1,47,70,1,5af23a1e0e77410abb25e9a7eee510aa,145,1,120.2,109.64,0,49.9,1,8,0,0,8,0,0,1,537.64,107.528,1.107548,12.672,21066,28,7,3,9,1,True,47.0
3,1,649.0,5,1,0,1,0,0,0,0,500,3141,78,1931,0,0,0,1,0,1,55,90,1,43c64379d3c24a15b8478851b22049e4,234,3,167.42,92.97,0,99.8,26,6,0,8,8,0,0,2,390.4,78.08,2.117097,89.34,22284,4,3,2,9,1,True,55.0
4,1,648.0,0,0,0,1,0,0,0,0,522,59,70,64,0,0,0,1,0,1,40,80,1,f1687f3b8a6f4910bd0b13eb634056e2,76,3,101.0,95.47,0,49.9,44,6,0,4,4,0,0,0,471.82,94.364,1.0591,6.636,21066,475,3,4,7,2,True,40.0


In [12]:
# Disabled alternative: derive a per-run directory name 'lgb<timestamp>' from
# the current time (spaces replaced by '_' and ':' by '-').
# now = str(datetime.datetime.now()).split('.')[0]
# now=now.replace(' ','_')
# now=now.replace(':','-')
# output_dir = 'lgb'+now
# Active setting: a fixed relative directory, handed to kf_sklearn as output_dir.
output_dir = './stacking_files/' 

In [13]:
# Feature matrices restricted to feature_name1, plus the '信用分' target array.
x = train_df[feature_name1]
y = train_df['信用分'].values
x_test = test_df[feature_name1]

# Keyword arguments forwarded unchanged through kf_sklearn.
# NOTE(review): presumably these end up as GradientBoostingRegressor
# constructor arguments -- confirm in utils.kf_sklearn.
gbdt_params = dict(
    n_estimators=250,
    subsample=0.8,
    min_samples_leaf=10,
    max_depth=7,
    loss='huber',
    random_state=2019,
)

model = kf_sklearn(
    x=x,
    y=y,
    x_test=x_test,
    output_dir=output_dir,
    name="gotcha_gbdt1",
    model_class=tree_model.GradientBoostingRegressor,
    **gbdt_params
)











0.0635676429637	0.0635195620108	0.0634272539799	0.0626815198709	0.0642561708594	0.0632119391352	0.0620407664009	0.0634675311244	0.0631024901823	0.0638429358789
min score: 0.062041
max score: 0.064256
median score: 0.063447
mean score: 0.063312
[ 602.61723627  537.6425845   667.13433286  677.14377941  656.57419819
  614.41691495  640.49103811  568.01745542  674.310979    588.30282033]
