In [163]:
import os

# Cap OpenMP worker threads. The variable OpenMP reads is OMP_NUM_THREADS
# (NUM_OMP_THREADS is not recognised and has no effect). It is set before the
# numeric libraries below are imported so their OpenMP runtime can pick it up.
os.environ['OMP_NUM_THREADS'] = "4"

import warnings
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
from sklearn.linear_model import HuberRegressor
import sklearn.ensemble as tree_model
from sklearn.metrics import mean_absolute_error
# from tqdm import tqdm
import datetime
pd.set_option('display.max_column',100)
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
from utils import make_dir, score, timer, kf_lgbm, kf_xgbm, kf_ctbm, kf_sklearn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [164]:
def make_features(df):
    """Derive the engineered model features from the raw customer table.

    Added columns, in order:

    * ``round_log1p<name>`` - ``round(log1p(x))`` as int for each of the seven
      app-usage counters.
    * ``前五个月消费总费用`` / ``前五个月消费平均费用`` - total and mean spend of the
      five months before the current one, derived from the six-month average
      and the current bill.
    * ``当月费用/前五个月消费平均费用`` and ``当月费用-前五个月消费平均费用`` - ratio
      (with +1 in the denominator) and difference between the current bill
      and that five-month mean.
    * ``count_*`` - for several amount columns, how many rows share the same
      value (see ``make_count_feature``).
    * ``是否998折`` - True when the last payment amount is non-zero and
      dividing it by 0.998 gives a whole number.
    * ``年龄_0_as_nan`` - ``用户年龄`` with 0 replaced by NaN.

    Args:
        df: DataFrame holding the raw columns referenced below (the app-usage
            counters, the billing/payment amounts, ``用户编码`` and
            ``用户年龄``).

    Returns:
        A new DataFrame with every input row, in the original order, a fresh
        ``RangeIndex`` and the derived columns appended. The frame passed in
        is not modified.
    """
    app_feature = [
        '当月网购类应用使用次数',
        '当月物流快递类应用使用次数',
        '当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数',
        '当月飞机类应用使用次数',
        '当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数',
    ]

    # Work on a re-indexed copy: the caller's frame stays untouched and the
    # column assignments below never have to align on duplicate index labels
    # (the train/test concat can produce them).
    df = df.reset_index(drop=True)

    for f in app_feature:
        df['round_log1p'+f] = np.round(np.log1p(df[f])).astype(int)

    current_bill = df['用户账单当月总费用（元）']
    df['前五个月消费总费用'] = 6*df['用户近6个月平均消费值（元）'] - current_bill
    df['前五个月消费平均费用'] = df['前五个月消费总费用'] / 5
    # +1 in the denominator avoids dividing by a zero average.
    df['当月费用/前五个月消费平均费用'] = current_bill / (1+df['前五个月消费平均费用'])
    df['当月费用-前五个月消费平均费用'] = current_bill - df['前五个月消费平均费用']

    def make_count_feature(df, col, fea_name):
        """Append ``fea_name``: non-null ``用户编码`` count per ``col`` value(s).

        ``col`` is a column name or a list of column names. A left join on
        the explicit key keeps every row in its original position; rows whose
        key is NaN get NaN instead of being dropped, which an inner join
        would do silently.
        """
        counts = (df.groupby(col)['用户编码']
                    .count()
                    .rename(fea_name)
                    .reset_index())
        # many_to_one: the grouped table has exactly one row per key, so the
        # join can never duplicate rows of ``df``.
        return df.merge(counts, on=col, how='left', validate='many_to_one')

    df = make_count_feature(df, '缴费用户最近一次缴费金额（元）','count_缴费')
    df = make_count_feature(df, '用户账单当月总费用（元）','count_当月费用')
    df = make_count_feature(df, '前五个月消费总费用', 'count_总费用')
    df = make_count_feature(df, '当月费用-前五个月消费平均费用', 'count_费用差')
    df = make_count_feature(df, '用户近6个月平均消费值（元）', 'count_平均费用')
    df = make_count_feature(df, ['用户账单当月总费用（元）','用户近6个月平均消费值（元）'],
                            'count_当月费用_平均费用')

    arr = df['缴费用户最近一次缴费金额（元）']
    # NOTE(review): this is an exact floating-point test (x / 0.998 must land
    # exactly on an integer); it is kept as-is so the feature values do not
    # change. Switch to a tolerance only together with re-validating the model.
    df['是否998折'] = ((arr/0.998)%1==0)&(arr!=0)

    age = df['用户年龄']
    df['年龄_0_as_nan'] = np.where(age==0, np.nan, age)

    return df
    
def load_df_and_make_features(
        train_path='C:/Users/yue/yuekangwei/credit-competition/train_dataset/train_dataset.csv',
        test_path='C:/Users/yue/yuekangwei/credit-competition/test_dataset/test_dataset.csv'):
    """Load the train and test CSVs, stack them and add the engineered features.

    The two files are tagged with a ``train`` column (1 for the training
    file, 0 for the test file) so the combined frame can be split again after
    feature engineering. Features are computed on the stacked frame, so the
    count features see both files.

    Args:
        train_path: Path of the training CSV. Defaults to the original
            author's local location; pass your own path on another machine.
        test_path: Path of the test CSV, same remark.

    Returns:
        The DataFrame returned by ``make_features`` for the stacked data.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    train_df['train'] = 1
    test_df['train'] = 0
    # ignore_index avoids the duplicate row labels that stacking two
    # independently numbered frames would otherwise produce.
    df = pd.concat([train_df, test_df], ignore_index=True)
    return make_features(df)

In [165]:
# Feature sets used by the model variants. The six lists differ in only three
# respects, so they are assembled from shared groups instead of being written
# out six times:
#   * age column:    raw '用户年龄'  vs  '年龄_0_as_nan'
#   * app usage:     raw counters   vs  their 'round_log1p' versions
#   * spend/count engineered features: included (1-4) or left out (5-6)

_AGE_RAW = ['用户年龄']
_AGE_ZERO_AS_NAN = ['年龄_0_as_nan']

_PROFILE_AND_BILLING = [
    '用户网龄（月）',
    '用户实名制是否通过核实',
    '是否大学生客户',
    '是否4G不健康客户',
    '用户最近一次缴费距今时长（月）',
    '缴费用户最近一次缴费金额（元）',
    '用户近6个月平均消费值（元）',
    '用户账单当月总费用（元）',
    '用户当月账户余额（元）',
    '用户话费敏感度',
    '当月费用-前五个月消费平均费用',
]

_SPEND_AND_COUNT = [
    '前五个月消费总费用',
    'count_缴费',
    'count_当月费用',
    'count_费用差',
    'count_平均费用',
    'count_当月费用_平均费用',
    '是否998折',
]

_SOCIAL = [
    '当月通话交往圈人数',
    '近三个月月均商场出现次数',
]

_APP_USAGE = [
    '当月网购类应用使用次数',
    '当月物流快递类应用使用次数',
    '当月金融理财类应用使用总次数',
    '当月视频播放类应用使用次数',
    '当月飞机类应用使用次数',
    '当月火车类应用使用次数',
    '当月旅游资讯类应用使用次数',
]

_APP_USAGE_ROUND_LOG1P = ['round_log1p' + name for name in _APP_USAGE]

_BEHAVIOUR_FLAGS = [
    '当月是否逛过福州仓山万达',
    '当月是否到过福州山姆会员店',
    '当月是否看电影',
    '当月是否景点游览',
    '当月是否体育场馆消费',
    '是否经常逛商场的人',
    '是否黑名单客户',
    '缴费用户当前是否欠费缴费',
]

# 1: raw age, engineered spend/count features, raw app counters.
feature_name1 = (_AGE_RAW + _PROFILE_AND_BILLING + _SPEND_AND_COUNT
                 + _SOCIAL + _APP_USAGE + _BEHAVIOUR_FLAGS)

# 2: as 1, with age 0 treated as missing.
feature_name2 = (_AGE_ZERO_AS_NAN + _PROFILE_AND_BILLING + _SPEND_AND_COUNT
                 + _SOCIAL + _APP_USAGE + _BEHAVIOUR_FLAGS)

# 3: as 1, with round(log1p) app counters.
feature_name3 = (_AGE_RAW + _PROFILE_AND_BILLING + _SPEND_AND_COUNT
                 + _SOCIAL + _APP_USAGE_ROUND_LOG1P + _BEHAVIOUR_FLAGS)

# 4: as 3, with age 0 treated as missing.
feature_name4 = (_AGE_ZERO_AS_NAN + _PROFILE_AND_BILLING + _SPEND_AND_COUNT
                 + _SOCIAL + _APP_USAGE_ROUND_LOG1P + _BEHAVIOUR_FLAGS)

# 5: raw age, no engineered spend/count features, raw app counters.
feature_name5 = (_AGE_RAW + _PROFILE_AND_BILLING
                 + _SOCIAL + _APP_USAGE + _BEHAVIOUR_FLAGS)

# 6: as 5, with age 0 treated as missing.
feature_name6 = (_AGE_ZERO_AS_NAN + _PROFILE_AND_BILLING
                 + _SOCIAL + _APP_USAGE + _BEHAVIOUR_FLAGS)

In [166]:
# Build the combined feature table, then split it back on the `train` marker:
# rows with train == 1 are the training set, all remaining rows the test set.
df = load_df_and_make_features()
is_train_row = df['train'] == 1
train_df = df[is_train_row]
test_df = df[~is_train_row]


In [167]:
# Directory handed as `output_dir` to the kf_lgbm calls (those calls are
# currently commented out in this notebook).
output_dir = './stacking_files/'

In [168]:
# Preview the first five rows of the combined frame, including derived columns.
df.head(5)

Unnamed: 0,train,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户编码,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数,round_log1p当月网购类应用使用次数,round_log1p当月物流快递类应用使用次数,round_log1p当月金融理财类应用使用总次数,round_log1p当月视频播放类应用使用次数,round_log1p当月飞机类应用使用次数,round_log1p当月火车类应用使用次数,round_log1p当月旅游资讯类应用使用次数,前五个月消费总费用,前五个月消费平均费用,当月费用/前五个月消费平均费用,当月费用-前五个月消费平均费用,count_缴费,count_当月费用,count_总费用,count_费用差,count_平均费用,count_当月费用_平均费用,是否998折,年龄_0_as_nan
0,1,664.0,30,1,0,1,0,0,0,0,713,7145,83,2740,0,0,0,1,0,1,44,180,1,a4651f98c82948b186bdcdc8108381b4,186,3,159.2,163.86,0,99.8,75,7,0,8,9,0,0,3,823.96,164.792,0.960239,-5.592,22284,18,1,4,2,1,True,44.0
1,1,530.0,0,0,0,0,0,0,0,0,414,44862,21,2731,0,1,0,1,0,1,18,110,1,aeb10247db4e4d67b2550bbc42ff9827,5,3,145.1,153.28,0,29.94,16,6,0,8,11,0,0,0,774.58,154.916,0.930629,-9.816,11740,38,1,7,5,1,True,18.0
2,1,643.0,1,0,0,0,0,0,0,0,3391,4804,59,0,0,0,0,0,0,1,47,70,1,5af23a1e0e77410abb25e9a7eee510aa,145,1,120.2,109.64,0,49.9,1,8,0,0,8,0,0,1,537.64,107.528,1.107548,12.672,21066,28,7,3,9,1,True,47.0
3,1,649.0,5,1,0,1,0,0,0,0,500,3141,78,1931,0,0,0,1,0,1,55,90,1,43c64379d3c24a15b8478851b22049e4,234,3,167.42,92.97,0,99.8,26,6,0,8,8,0,0,2,390.4,78.08,2.117097,89.34,22284,4,3,2,9,1,True,55.0
4,1,648.0,0,0,0,1,0,0,0,0,522,59,70,64,0,0,0,1,0,1,40,80,1,f1687f3b8a6f4910bd0b13eb634056e2,76,3,101.0,95.47,0,49.9,44,6,0,4,4,0,0,0,471.82,94.364,1.0591,6.636,21066,475,3,4,7,2,True,40.0


In [169]:
# x, y = train_df[feature_name1], train_df['信用分'].values
# x_test = test_df[feature_name1]

# model = kf_lgbm(x=x,y=y,x_test=x_test,learning_rate=0.01, 
#                 stratify=True,
#                 min_split_gain=1,
#                 categorical_feature=['用户话费敏感度'],
#                 boosting_type='gbdt',
#                 early_stopping_rounds=80,
#                 fair_c=25, 
#                 huber_delta=2,
#                 max_cat_to_onehot=4,
#                 objective="mae_fair",
#                 eval_metric="mae",
#                 subsample_freq=2,
#                 min_child_samples=20,
#                 num_leaves=31,
#                 bagging_fraction=0.8,
#                 feature_fraction=0.5,
#                 max_depth=5,
#                 output_dir=output_dir,
#                 name='gotcha_lgb1',
#                 n_estimators=8000)

In [170]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from xgboost import plot_importance
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler



def lightgbm(x_train,x_test,y_train,categorical_feats,features):
    """Train LightGBM with 5-fold cross-validation and average the test predictions.

    For every fold a booster is trained on four fifths of ``x_train`` with
    early stopping on the held-out fifth. The held-out predictions are
    collected into an out-of-fold vector, and each fold's prediction for
    ``x_test`` contributes one fifth of the returned average. The function
    prints ``1 / (1 + MAE)`` of the out-of-fold predictions and the averaged
    test predictions.

    Args:
        x_train: Training features; a DataFrame (``.values`` and ``.iloc``
            are used).
        x_test: Test features with the same columns as ``x_train``.
        y_train: Training target; a Series (``.values`` and ``.iloc`` are
            used).
        categorical_feats: Column names passed to LightGBM as categorical
            features.
        features: Unused; kept so existing calls keep working.

    Returns:
        numpy array of length ``len(x_test)`` with the fold-averaged
        predictions.
    """
    # Cross-validation splitter (fixed seed so folds are reproducible).
    folds = KFold(n_splits=5, shuffle=True, random_state=2333)

    oof_lgb = np.zeros(len(x_train))
    predictions_lgb = np.zeros(len(x_test))

    # `min_child_samples` is only an alias of `min_data_in_leaf`; the setting
    # is given once to avoid the duplicate-alias ambiguity.
    params = {
        'num_leaves': 31,
        'min_data_in_leaf': 20,
        'objective': 'regression',
        'learning_rate': 0.005,
        "boosting": "gbdt",
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.85,
        "bagging_seed": 23,
        "metric": 'mae',
        "lambda_l1": 0.2,
        "nthread": 4,
    }
    num_round = 10000
    early_stopping_rounds = 200
    log_period = 500

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(x_train.iloc[trn_idx], label=y_train.iloc[trn_idx],
                               categorical_feature=categorical_feats)
        val_data = lgb.Dataset(x_train.iloc[val_idx], label=y_train.iloc[val_idx],
                               categorical_feature=categorical_feats)

        # LightGBM 4 removed the `verbose_eval` / `early_stopping_rounds`
        # keyword arguments of `train` in favour of callbacks. Use the
        # callbacks when this installation provides them and fall back to the
        # keywords on older releases. Fresh callbacks are built per fold so
        # no early-stopping state carries over between folds.
        if hasattr(lgb, 'log_evaluation'):
            train_control = {
                'callbacks': [
                    lgb.early_stopping(early_stopping_rounds),
                    lgb.log_evaluation(log_period),
                ],
            }
        else:
            train_control = {
                'verbose_eval': log_period,
                'early_stopping_rounds': early_stopping_rounds,
            }

        clf = lgb.train(params,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        **train_control)

        oof_lgb[val_idx] = clf.predict(x_train.iloc[val_idx], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(x_test, num_iteration=clf.best_iteration) / folds.n_splits

    # Reported figure is 1 / (1 + MAE) on the out-of-fold predictions
    # (higher is better).
    oof_mae = mean_absolute_error(y_train, oof_lgb)
    oof_score = 1 / (1 + oof_mae)
    print("MAE  score: {:<8.8f}".format(oof_score))
    print(predictions_lgb)
    return predictions_lgb


In [171]:
# Run the cross-validated LightGBM model on feature set 1.
x, y = train_df[feature_name1], train_df['信用分']
x_test = test_df[feature_name1]
categorical_feats = ['用户话费敏感度']
features = train_df.columns

result = lightgbm(
    x_train=x,
    x_test=x_test,
    y_train=y,
    categorical_feats=categorical_feats,
    features=features,
)


fold 0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3768
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 36
[LightGBM] [Info] Start training from score 617.895275
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 15.3764	valid_1's l1: 15.8516
[1000]	training's l1: 14.4007	valid_1's l1: 15.042
[1500]	training's l1: 14.0755	valid_1's l1: 14.9102
[2000]	training's l1: 13.8455	valid_1's l1: 14.858
[2500]	training's l1: 13.6426	valid_1's l1: 14.8259
[3000]	training's l1: 13.4562	valid_1's l1: 14.8065
[3500]	training's l1: 13.2813	valid_1's l1: 14.7912
[4000]	training's l1: 13.1152	valid_1's l1: 14.7812
[4500]	training's l1: 12.9586	valid_1's l1: 14.7766
[5000]	training's l1: 12.8048	valid_1's l1: 14.7728
[5500]	training's l1: 12.6541	valid_1's l1: 14.768
[6000]	training's l1: 12.5092	valid_1's l1: 14.7646
Early s

In [173]:
# Round the averaged predictions and write them as one header-less,
# index-less column; a missing value would be written as a newline (na_rep).
submission_path = 'C:/Users/yue/yuekangwei/credit-competition/output/submit.csv'
submission_df = pd.DataFrame(data=result).apply(round)
submission_df.to_csv(
    submission_path,
    na_rep='\n',
    index=False,
    encoding='utf8',
    header=False,
)

In [174]:
# Sanity check: one row per test prediction, one column.
submission_df.shape

(50000, 1)

In [175]:
# Preview the first rounded predictions that were written to the submission file.
submission_df.head()

Unnamed: 0,0
0,601.0
1,531.0
2,668.0
3,676.0
4,658.0


In [None]:
# from sklearn.model_selection import StratifiedKFold, KFold
# x, y = train_df[feature_name1], train_df['信用分'].values
# x_test = test_df[feature_name1]

# model = kf_lgbm(x=x,y=y,x_test=x_test,learning_rate=0.01, 
#                 stratify=True,
#                 min_split_gain=1,
#                 categorical_feature=['用户话费敏感度'],
#                 boosting_type='gbdt',
#                 early_stopping_rounds=80,
#                 fair_c=25, 
#                 huber_delta=2,
#                 max_cat_to_onehot=4,
#                 objective="mae_fair",
#                 eval_metric="mae",
#                 subsample_freq=2,
#                 min_child_samples=20,
#                 num_leaves=31,
#                 bagging_fraction=0.8,
#                 feature_fraction=0.5,
#                 max_depth=5,
#                 output_dir=output_dir,
#                 name='gotcha_lgb1',
#                 n_estimators=8000)