In [143]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime,gc,math
import random
%matplotlib inline
import gc
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

In [144]:
def active_time_transform(df):
    df['role_created_active_time'].fillna('"0-8,0","8-12,0","12-14,0","14-18,0","18-24,0"', inplace=True)
    df['role_created_active_time'] = df['role_created_active_time'].apply(lambda x:str(x)[1:-1])
    temp = df['role_created_active_time'].str.split(',',expand=True).iloc[:,[1,3,5,7,9]].rename(columns={1:'active_0-8',
                                        3:'active_8-12',5:'active_12-14',7:'active_14-18',9:'active_18-24'})
    for i in range(len(temp.columns)):
        temp.iloc[:,i] = temp.iloc[:,i].apply(lambda x:str(x)[:-1])
    df = df.join(temp).drop(['role_created_active_time'], axis=1)
    return df

def pay_grade_transform(df):
    df['pay_grade'].fillna('[0,0,0,0,0,0,0]',inplace=True)
    df['pay_grade'] = df['pay_grade'].apply(lambda x:str(x)[1:-1])
    temp = df['pay_grade'].str.split(',',expand=True).rename(columns={0:'pay_grade_1',
                1:'pay_grade_2',2:'pay_grade_3',3:'pay_grade_4',4:'pay_grade_5',5:'pay_grade_6',6:'pay_grade_7'})
    df = df.join(temp).drop(['pay_grade'], axis=1)
    return df

In [145]:
%%time
role_info = pd.read_csv('./data/mr_role_1d.csv')
role_info.drop_duplicates(subset=['user_id','cp_server_no','cp_role_id'],inplace=True)
role_info = active_time_transform(role_info)
role_info = pay_grade_transform(role_info)

Wall time: 3min 40s


In [146]:
# 删除2020-7-19日前30天的数据
role_info['create_role_time'] = pd.to_datetime(role_info['create_role_time'], format='%Y-%m-%d %H:%M:%S')
role_info.drop(role_info[role_info['create_role_time'] > (role_info['create_role_time'].max() + 
                                                          datetime.timedelta(-30))].index,inplace=True)

In [147]:
# 删除没有心跳数据的样本(1、没有登录;2、没有拿到心跳)
role_info.dropna(subset=['role_created_login_num'], inplace=True)
role_info.dropna(subset=['role_created_online'], inplace=True)
role_info['pay_num'].fillna(0, inplace=True)
role_info['pay_sum'].fillna(0, inplace=True)
role_info['role_created_active'] = role_info['role_created_active'].clip(0,1)
role_info['pay_rate'] = role_info['pay_num'] / (role_info['role_created_active'] + 1e-4)
role_info['pay_avg'] = role_info['pay_sum'] / (role_info['pay_num'] + 1e-4)

In [148]:
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg','role_created_30_pay_sum']
role_info = role_info[select_features]
role_info['role_created_30_pay_sum'].fillna(0,inplace=True)
col_list = ['active_0-8','active_8-12','active_12-14','active_14-18','active_18-24','pay_grade_1','pay_grade_2',
            'pay_grade_3','pay_grade_4','pay_grade_5','pay_grade_6']
for col in col_list:
    role_info[col] = pd.to_numeric(role_info[col], errors='coerce')

In [180]:
role_info.to_pickle('./role_info_1d.pkl')

In [149]:
df_train, df_test = train_test_split(role_info,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]
df_train_nopay_online_n = df_train_nopay[(df_train_nopay['role_created_online']==300) | (df_train_nopay['role_created_login_num']<3)|(df_train_nopay['max_role_level']==1)]
df_train_nopay_online_y = df_train_nopay.loc[np.setdiff1d(df_train_nopay.index, df_train_nopay_online_n.index),:]

In [150]:
print(df_train_nopay_online_n.shape[0]/df_train.shape[0])
print(df_train_nopay_online_y.shape[0]/df_train.shape[0])
print(df_train_pay.shape[0]/df_train.shape[0])

0.8677791006706002
0.11665108369928995
0.015569815630109904


In [152]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [121]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('5')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
min_rmse = float('5')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
min_rmse = float('5')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')
min_rmse = float('5')
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
min_rmse = float('5')
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 1.19914 + 0.00641153
[100]	cv_agg's rmse: 1.17902 + 0.00718099
[50]	cv_agg's rmse: 1.19923 + 0.00641698
[100]	cv_agg's rmse: 1.17903 + 0.00718258
[50]	cv_agg's rmse: 1.19923 + 0.00641698
[100]	cv_agg's rmse: 1.17903 + 0.00718258
[50]	cv_agg's rmse: 1.19923 + 0.00641698
[100]	cv_agg's rmse: 1.17903 + 0.00718258
[50]	cv_agg's rmse: 1.19923 + 0.00641698
[100]	cv_agg's rmse: 1.17903 + 0.00718258
[50]	cv_agg's rmse: 1.19196 + 0.00714948
[100]	cv_agg's rmse: 1.17788 + 0.00796737
[50]	cv_agg's rmse: 1.18901 + 0.00699219
[100]	cv_agg's rmse: 1.17681 + 0.00811585
[50]	cv_agg's rmse: 1.18907 + 0.00701563
[100]	cv_agg's rmse: 1.17689 + 0.00814734
[50]	cv_agg's rmse: 1.18907 + 0.00701563
[100]	cv_agg's rmse: 1.17669 + 0.00807653
[50]	cv_agg's rmse: 1.18907 + 0.00701563
[100]	cv_agg's rmse: 1.17691 + 0.00810785
[50]	cv_agg's rmse: 1.19196 + 0.00714948
[100]	cv_agg's rmse: 1.17788 + 0.00796737
[50]	cv_agg's rmse: 1.1866 + 0.00727326
[100]	cv_agg's rmse: 1.1765

[50]	cv_agg's rmse: 1.18577 + 0.0075192
[100]	cv_agg's rmse: 1.17607 + 0.00853412
[50]	cv_agg's rmse: 1.18599 + 0.0077498
[100]	cv_agg's rmse: 1.17639 + 0.00875936
[50]	cv_agg's rmse: 1.18533 + 0.00815706
[100]	cv_agg's rmse: 1.17646 + 0.00937142
[50]	cv_agg's rmse: 1.18498 + 0.00763043
[100]	cv_agg's rmse: 1.17594 + 0.00880041
[50]	cv_agg's rmse: 1.18504 + 0.00742058
[100]	cv_agg's rmse: 1.17607 + 0.00864094
[50]	cv_agg's rmse: 1.18525 + 0.00773068
[100]	cv_agg's rmse: 1.17606 + 0.00866661
[50]	cv_agg's rmse: 1.18536 + 0.00794117
[100]	cv_agg's rmse: 1.1762 + 0.00894648
[50]	cv_agg's rmse: 1.18543 + 0.0078215
[100]	cv_agg's rmse: 1.17614 + 0.00888141
[50]	cv_agg's rmse: 1.1856 + 0.00771044
[100]	cv_agg's rmse: 1.176 + 0.00888765
[50]	cv_agg's rmse: 1.18569 + 0.00762447
[100]	cv_agg's rmse: 1.17634 + 0.00843042
[50]	cv_agg's rmse: 1.18566 + 0.00765076
[100]	cv_agg's rmse: 1.17591 + 0.00857053
[50]	cv_agg's rmse: 1.18577 + 0.0075192
[100]	cv_agg's rmse: 1.17607 + 0.00853412
[50]	cv_agg'

[100]	cv_agg's rmse: 1.17607 + 0.00853412
[50]	cv_agg's rmse: 1.18599 + 0.0077498
[100]	cv_agg's rmse: 1.17639 + 0.00875936
[50]	cv_agg's rmse: 1.18533 + 0.00815706
[100]	cv_agg's rmse: 1.17646 + 0.00937142
[50]	cv_agg's rmse: 1.18498 + 0.00763043
[100]	cv_agg's rmse: 1.17594 + 0.00880041
[50]	cv_agg's rmse: 1.18504 + 0.00742058
[100]	cv_agg's rmse: 1.17607 + 0.00864094
[50]	cv_agg's rmse: 1.18525 + 0.00773068
[100]	cv_agg's rmse: 1.17606 + 0.00866661
[50]	cv_agg's rmse: 1.18536 + 0.00794117
[100]	cv_agg's rmse: 1.1762 + 0.00894648
[50]	cv_agg's rmse: 1.18543 + 0.0078215
[100]	cv_agg's rmse: 1.17614 + 0.00888141
[50]	cv_agg's rmse: 1.1856 + 0.00771044
[100]	cv_agg's rmse: 1.176 + 0.00888765
[50]	cv_agg's rmse: 1.18569 + 0.00762447
[100]	cv_agg's rmse: 1.17634 + 0.00843042
[50]	cv_agg's rmse: 1.18566 + 0.00765076
[100]	cv_agg's rmse: 1.17591 + 0.00857053
[50]	cv_agg's rmse: 1.18577 + 0.0075192
[100]	cv_agg's rmse: 1.17607 + 0.00853412
[50]	cv_agg's rmse: 1.18599 + 0.0077498
[100]	cv_agg

[50]	cv_agg's rmse: 1.1864 + 0.00752792
[100]	cv_agg's rmse: 1.17603 + 0.00791166
[50]	cv_agg's rmse: 1.1869 + 0.0074231
[100]	cv_agg's rmse: 1.17651 + 0.00782596
[50]	cv_agg's rmse: 1.1871 + 0.00754945
[100]	cv_agg's rmse: 1.17694 + 0.0081122
[50]	cv_agg's rmse: 1.18718 + 0.00716799
[100]	cv_agg's rmse: 1.17679 + 0.0073651
[50]	cv_agg's rmse: 1.18739 + 0.00737779
[100]	cv_agg's rmse: 1.17732 + 0.00771643
[50]	cv_agg's rmse: 1.18771 + 0.00717196
[100]	cv_agg's rmse: 1.17728 + 0.00727753
[50]	cv_agg's rmse: 1.18698 + 0.00710386
[100]	cv_agg's rmse: 1.17708 + 0.00774214
[50]	cv_agg's rmse: 1.18745 + 0.00717845
[100]	cv_agg's rmse: 1.177 + 0.00863072
[50]	cv_agg's rmse: 1.18724 + 0.00707102
[100]	cv_agg's rmse: 1.17723 + 0.00727129
[50]	cv_agg's rmse: 1.18659 + 0.00736943
[100]	cv_agg's rmse: 1.1755 + 0.00836174
[50]	cv_agg's rmse: 1.18665 + 0.00719882
[100]	cv_agg's rmse: 1.17621 + 0.00778206
[50]	cv_agg's rmse: 1.18686 + 0.00767518
[100]	cv_agg's rmse: 1.17627 + 0.00822026
[50]	cv_agg's

[50]	cv_agg's rmse: 1.18518 + 0.00771251
[100]	cv_agg's rmse: 1.17658 + 0.00779691
[50]	cv_agg's rmse: 1.18614 + 0.00761291
[100]	cv_agg's rmse: 1.17725 + 0.00775022
[50]	cv_agg's rmse: 1.18599 + 0.00778086
[100]	cv_agg's rmse: 1.1767 + 0.00782461
[50]	cv_agg's rmse: 1.18655 + 0.00723953
[100]	cv_agg's rmse: 1.17792 + 0.00741728
[50]	cv_agg's rmse: 1.18653 + 0.00787965
[100]	cv_agg's rmse: 1.17884 + 0.00781526
[50]	cv_agg's rmse: 1.18688 + 0.00849133
[100]	cv_agg's rmse: 1.17761 + 0.00846574
[50]	cv_agg's rmse: 1.1866 + 0.00749711
[100]	cv_agg's rmse: 1.17754 + 0.0076988
[50]	cv_agg's rmse: 1.1866 + 0.00734591
[100]	cv_agg's rmse: 1.17786 + 0.00794226
[50]	cv_agg's rmse: 1.1863 + 0.00741313
[100]	cv_agg's rmse: 1.17786 + 0.00834925
[50]	cv_agg's rmse: 1.18599 + 0.00752054
[100]	cv_agg's rmse: 1.17602 + 0.00839025
[50]	cv_agg's rmse: 1.18569 + 0.00792395
[100]	cv_agg's rmse: 1.177 + 0.00880687
[50]	cv_agg's rmse: 1.18646 + 0.00778386
[100]	cv_agg's rmse: 1.17735 + 0.00869479
[50]	cv_agg

[50]	cv_agg's rmse: 1.18551 + 0.00754792
[100]	cv_agg's rmse: 1.17577 + 0.00811044
[50]	cv_agg's rmse: 1.18584 + 0.00757865
[100]	cv_agg's rmse: 1.17653 + 0.00856888
[50]	cv_agg's rmse: 1.18623 + 0.00740562
[100]	cv_agg's rmse: 1.17695 + 0.00803647
[50]	cv_agg's rmse: 1.18621 + 0.00740738
[100]	cv_agg's rmse: 1.17679 + 0.00850074
[50]	cv_agg's rmse: 1.186 + 0.00753722
[100]	cv_agg's rmse: 1.17676 + 0.00819182
[50]	cv_agg's rmse: 1.18644 + 0.00764282
[100]	cv_agg's rmse: 1.17729 + 0.0082812
[50]	cv_agg's rmse: 1.18605 + 0.00744537
[100]	cv_agg's rmse: 1.17681 + 0.00809081
[50]	cv_agg's rmse: 1.18623 + 0.0077988
[100]	cv_agg's rmse: 1.17664 + 0.00879425
[50]	cv_agg's rmse: 1.18614 + 0.00734995
[100]	cv_agg's rmse: 1.17654 + 0.00823435
[50]	cv_agg's rmse: 1.18606 + 0.00770178
[100]	cv_agg's rmse: 1.1764 + 0.00864327
[50]	cv_agg's rmse: 1.18606 + 0.00770178
[100]	cv_agg's rmse: 1.1764 + 0.00864327
[50]	cv_agg's rmse: 1.18606 + 0.00770178
[100]	cv_agg's rmse: 1.1764 + 0.00864327
[50]	cv_agg

[50]	cv_agg's rmse: 1.18569 + 0.00729974
[100]	cv_agg's rmse: 1.17689 + 0.00759236
[50]	cv_agg's rmse: 1.18627 + 0.00757026
[100]	cv_agg's rmse: 1.17742 + 0.00770492
[50]	cv_agg's rmse: 1.18597 + 0.00773865
[100]	cv_agg's rmse: 1.17721 + 0.00793157
[50]	cv_agg's rmse: 1.18661 + 0.00775766
[100]	cv_agg's rmse: 1.17779 + 0.0080088
[50]	cv_agg's rmse: 1.18659 + 0.00749338
[100]	cv_agg's rmse: 1.17745 + 0.00784824
[50]	cv_agg's rmse: 1.18645 + 0.00785359
[100]	cv_agg's rmse: 1.17719 + 0.00773374
[50]	cv_agg's rmse: 1.18614 + 0.00769415
[100]	cv_agg's rmse: 1.17751 + 0.00831775
[50]	cv_agg's rmse: 1.18641 + 0.00754568
[100]	cv_agg's rmse: 1.17759 + 0.00883413
[50]	cv_agg's rmse: 1.18622 + 0.00715416
[100]	cv_agg's rmse: 1.17749 + 0.00776501
[50]	cv_agg's rmse: 1.18599 + 0.0077498
[100]	cv_agg's rmse: 1.17639 + 0.00875936
[50]	cv_agg's rmse: 1.18568 + 0.00736222
[100]	cv_agg's rmse: 1.17658 + 0.00761097
[50]	cv_agg's rmse: 1.1858 + 0.00782162
[100]	cv_agg's rmse: 1.17654 + 0.00845876
[50]	cv

[50]	cv_agg's rmse: 1.18661 + 0.00735722
[100]	cv_agg's rmse: 1.17555 + 0.00848222
[50]	cv_agg's rmse: 1.1868 + 0.00734866
[100]	cv_agg's rmse: 1.17559 + 0.00837945
[50]	cv_agg's rmse: 1.18661 + 0.00735723
[100]	cv_agg's rmse: 1.17555 + 0.00848222
[50]	cv_agg's rmse: 1.1868 + 0.00734866
[100]	cv_agg's rmse: 1.17559 + 0.00837945
[50]	cv_agg's rmse: 1.18682 + 0.0074408
[100]	cv_agg's rmse: 1.17574 + 0.00823373
[50]	cv_agg's rmse: 1.18669 + 0.00725899
[100]	cv_agg's rmse: 1.17558 + 0.00830654
[50]	cv_agg's rmse: 1.18664 + 0.00741503
[100]	cv_agg's rmse: 1.17549 + 0.00838717
[50]	cv_agg's rmse: 1.18659 + 0.00740704
[100]	cv_agg's rmse: 1.17556 + 0.00823321
[50]	cv_agg's rmse: 1.18675 + 0.00736832
[100]	cv_agg's rmse: 1.17562 + 0.0085783
[50]	cv_agg's rmse: 1.18669 + 0.00726933
[100]	cv_agg's rmse: 1.17564 + 0.00813444
[50]	cv_agg's rmse: 1.18669 + 0.00726933
[100]	cv_agg's rmse: 1.17565 + 0.00813443
[50]	cv_agg's rmse: 1.18676 + 0.0073077
[100]	cv_agg's rmse: 1.17579 + 0.0083223
[50]	cv_ag

In [153]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)
params = {

    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.005,
    "max_depth":7,
    "num_leaves":20,
    "max_bin":255,
    "min_data_in_leaf":61,
    "min_split_gain":1.0,
    "feature_fraction": 0.5,
    "bagging_fraction":1,
    "bagging_freq":45,
    "lambda_l1":0.0,
    "lambda_l2":0.4,
    "metric": "rmse",
    }
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.82708	valid_1's rmse: 1.8221
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.82182	valid_1's rmse: 1.81688
[3]	training's rmse: 1.81665	valid_1's rmse: 1.81178
[4]	training's rmse: 1.8116	valid_1's rmse: 1.80678
[5]	training's rmse: 1.80748	valid_1's rmse: 1.80268
[6]	training's rmse: 1.80242	valid_1's rmse: 1.79767
[7]	training's rmse: 1.79763	valid_1's rmse: 1.79293
[8]	training's rmse: 1.79263	valid_1's rmse: 1.78798
[9]	training's rmse: 1.78791	valid_1's rmse: 1.7833
[10]	training's rmse: 1.78298	valid_1's rmse: 1.77843
[11]	training's rmse: 1.77803	valid_1's rmse: 1.77352
[12]	training's rmse: 1.77346	valid_1's rmse: 1.76899
[13]	training's rmse: 1.76883	valid_1's rmse: 1.76439
[14]	training's rmse: 1.76411	valid_1's rmse: 1.75973
[15]	training's rmse: 1.75943	valid_1's rmse: 1.7551
[16]	training's rmse: 1.7547	valid_1's rmse: 1.75044
[17]	training's rmse: 1.75008	valid_1's rmse: 1.74588
[18]	training's rmse: 1.74572	vali

[224]	training's rmse: 1.27113	valid_1's rmse: 1.27876
[225]	training's rmse: 1.27021	valid_1's rmse: 1.27789
[226]	training's rmse: 1.2693	valid_1's rmse: 1.27703
[227]	training's rmse: 1.26843	valid_1's rmse: 1.27623
[228]	training's rmse: 1.26754	valid_1's rmse: 1.27538
[229]	training's rmse: 1.26674	valid_1's rmse: 1.27463
[230]	training's rmse: 1.26588	valid_1's rmse: 1.27383
[231]	training's rmse: 1.26502	valid_1's rmse: 1.27303
[232]	training's rmse: 1.26417	valid_1's rmse: 1.27223
[233]	training's rmse: 1.2634	valid_1's rmse: 1.2715
[234]	training's rmse: 1.26254	valid_1's rmse: 1.2707
[235]	training's rmse: 1.26169	valid_1's rmse: 1.2699
[236]	training's rmse: 1.26088	valid_1's rmse: 1.26914
[237]	training's rmse: 1.26005	valid_1's rmse: 1.26837
[238]	training's rmse: 1.25924	valid_1's rmse: 1.26761
[239]	training's rmse: 1.25844	valid_1's rmse: 1.26687
[240]	training's rmse: 1.25763	valid_1's rmse: 1.26611
[241]	training's rmse: 1.25684	valid_1's rmse: 1.26537
[242]	training'

[443]	training's rmse: 1.18302	valid_1's rmse: 1.19918
[444]	training's rmse: 1.18287	valid_1's rmse: 1.19905
[445]	training's rmse: 1.18273	valid_1's rmse: 1.19894
[446]	training's rmse: 1.18258	valid_1's rmse: 1.19881
[447]	training's rmse: 1.18244	valid_1's rmse: 1.19869
[448]	training's rmse: 1.18231	valid_1's rmse: 1.1986
[449]	training's rmse: 1.18219	valid_1's rmse: 1.19851
[450]	training's rmse: 1.18205	valid_1's rmse: 1.19839
[451]	training's rmse: 1.18189	valid_1's rmse: 1.19826
[452]	training's rmse: 1.18177	valid_1's rmse: 1.19817
[453]	training's rmse: 1.18163	valid_1's rmse: 1.19806
[454]	training's rmse: 1.18148	valid_1's rmse: 1.19794
[455]	training's rmse: 1.18134	valid_1's rmse: 1.19781
[456]	training's rmse: 1.1812	valid_1's rmse: 1.1977
[457]	training's rmse: 1.18106	valid_1's rmse: 1.19758
[458]	training's rmse: 1.18092	valid_1's rmse: 1.19747
[459]	training's rmse: 1.18081	valid_1's rmse: 1.19738
[460]	training's rmse: 1.18068	valid_1's rmse: 1.19728
[461]	trainin

[668]	training's rmse: 1.16606	valid_1's rmse: 1.18706
[669]	training's rmse: 1.16603	valid_1's rmse: 1.18704
[670]	training's rmse: 1.16599	valid_1's rmse: 1.18702
[671]	training's rmse: 1.16595	valid_1's rmse: 1.187
[672]	training's rmse: 1.16591	valid_1's rmse: 1.18698
[673]	training's rmse: 1.16587	valid_1's rmse: 1.18695
[674]	training's rmse: 1.16584	valid_1's rmse: 1.18693
[675]	training's rmse: 1.1658	valid_1's rmse: 1.18691
[676]	training's rmse: 1.16576	valid_1's rmse: 1.18689
[677]	training's rmse: 1.16572	valid_1's rmse: 1.18687
[678]	training's rmse: 1.16568	valid_1's rmse: 1.18686
[679]	training's rmse: 1.16564	valid_1's rmse: 1.18683
[680]	training's rmse: 1.16561	valid_1's rmse: 1.18681
[681]	training's rmse: 1.16558	valid_1's rmse: 1.1868
[682]	training's rmse: 1.16554	valid_1's rmse: 1.18679
[683]	training's rmse: 1.1655	valid_1's rmse: 1.18675
[684]	training's rmse: 1.16546	valid_1's rmse: 1.18673
[685]	training's rmse: 1.16542	valid_1's rmse: 1.18671
[686]	training'

[892]	training's rmse: 1.15982	valid_1's rmse: 1.18408
[893]	training's rmse: 1.1598	valid_1's rmse: 1.18407
[894]	training's rmse: 1.15978	valid_1's rmse: 1.18406
[895]	training's rmse: 1.15976	valid_1's rmse: 1.18406
[896]	training's rmse: 1.15974	valid_1's rmse: 1.18405
[897]	training's rmse: 1.15973	valid_1's rmse: 1.18404
[898]	training's rmse: 1.15971	valid_1's rmse: 1.18403
[899]	training's rmse: 1.15969	valid_1's rmse: 1.18402
[900]	training's rmse: 1.15966	valid_1's rmse: 1.18401
[901]	training's rmse: 1.15964	valid_1's rmse: 1.184
[902]	training's rmse: 1.15962	valid_1's rmse: 1.18399
[903]	training's rmse: 1.1596	valid_1's rmse: 1.18399
[904]	training's rmse: 1.15957	valid_1's rmse: 1.18398
[905]	training's rmse: 1.15955	valid_1's rmse: 1.18397
[906]	training's rmse: 1.15953	valid_1's rmse: 1.18396
[907]	training's rmse: 1.15951	valid_1's rmse: 1.18396
[908]	training's rmse: 1.15949	valid_1's rmse: 1.18395
[909]	training's rmse: 1.15947	valid_1's rmse: 1.18394
[910]	training

[1133]	training's rmse: 1.15529	valid_1's rmse: 1.18251
[1134]	training's rmse: 1.15527	valid_1's rmse: 1.18251
[1135]	training's rmse: 1.15526	valid_1's rmse: 1.18251
[1136]	training's rmse: 1.15524	valid_1's rmse: 1.18251
[1137]	training's rmse: 1.15523	valid_1's rmse: 1.18251
[1138]	training's rmse: 1.15521	valid_1's rmse: 1.1825
[1139]	training's rmse: 1.1552	valid_1's rmse: 1.18249
[1140]	training's rmse: 1.15518	valid_1's rmse: 1.18249
[1141]	training's rmse: 1.15518	valid_1's rmse: 1.18248
[1142]	training's rmse: 1.15516	valid_1's rmse: 1.18247
[1143]	training's rmse: 1.15514	valid_1's rmse: 1.18247
[1144]	training's rmse: 1.15513	valid_1's rmse: 1.18247
[1145]	training's rmse: 1.15511	valid_1's rmse: 1.18246
[1146]	training's rmse: 1.1551	valid_1's rmse: 1.18246
[1147]	training's rmse: 1.15509	valid_1's rmse: 1.18246
[1148]	training's rmse: 1.15507	valid_1's rmse: 1.18245
[1149]	training's rmse: 1.15505	valid_1's rmse: 1.18245
[1150]	training's rmse: 1.15504	valid_1's rmse: 1.1

[1396]	training's rmse: 1.15179	valid_1's rmse: 1.18181
[1397]	training's rmse: 1.15178	valid_1's rmse: 1.18181
[1398]	training's rmse: 1.15177	valid_1's rmse: 1.18182
[1399]	training's rmse: 1.15176	valid_1's rmse: 1.18181
[1400]	training's rmse: 1.15175	valid_1's rmse: 1.18181
[1401]	training's rmse: 1.15173	valid_1's rmse: 1.18181
[1402]	training's rmse: 1.15172	valid_1's rmse: 1.1818
[1403]	training's rmse: 1.1517	valid_1's rmse: 1.1818
[1404]	training's rmse: 1.15168	valid_1's rmse: 1.18179
[1405]	training's rmse: 1.15167	valid_1's rmse: 1.18179
[1406]	training's rmse: 1.15165	valid_1's rmse: 1.18179
[1407]	training's rmse: 1.15164	valid_1's rmse: 1.18179
[1408]	training's rmse: 1.15162	valid_1's rmse: 1.18179
[1409]	training's rmse: 1.15161	valid_1's rmse: 1.18179
[1410]	training's rmse: 1.1516	valid_1's rmse: 1.18178
[1411]	training's rmse: 1.15159	valid_1's rmse: 1.18178
[1412]	training's rmse: 1.15157	valid_1's rmse: 1.18178
[1413]	training's rmse: 1.15156	valid_1's rmse: 1.18

[1683]	training's rmse: 1.14824	valid_1's rmse: 1.18155
[1684]	training's rmse: 1.14823	valid_1's rmse: 1.18155
[1685]	training's rmse: 1.14821	valid_1's rmse: 1.18155
[1686]	training's rmse: 1.1482	valid_1's rmse: 1.18155
[1687]	training's rmse: 1.14819	valid_1's rmse: 1.18155
[1688]	training's rmse: 1.14818	valid_1's rmse: 1.18154
[1689]	training's rmse: 1.14817	valid_1's rmse: 1.18154
[1690]	training's rmse: 1.14816	valid_1's rmse: 1.18154
[1691]	training's rmse: 1.14815	valid_1's rmse: 1.18154
[1692]	training's rmse: 1.14814	valid_1's rmse: 1.18154
[1693]	training's rmse: 1.14814	valid_1's rmse: 1.18154
[1694]	training's rmse: 1.14812	valid_1's rmse: 1.18154
[1695]	training's rmse: 1.14811	valid_1's rmse: 1.18154
[1696]	training's rmse: 1.14809	valid_1's rmse: 1.18154
[1697]	training's rmse: 1.14808	valid_1's rmse: 1.18154
[1698]	training's rmse: 1.14807	valid_1's rmse: 1.18154
[1699]	training's rmse: 1.14805	valid_1's rmse: 1.18153
[1700]	training's rmse: 1.14803	valid_1's rmse: 1

[1833]	training's rmse: 1.14645	valid_1's rmse: 1.18142
[1834]	training's rmse: 1.14644	valid_1's rmse: 1.18142
[1835]	training's rmse: 1.14643	valid_1's rmse: 1.18142
[1836]	training's rmse: 1.14643	valid_1's rmse: 1.18141
[1837]	training's rmse: 1.14641	valid_1's rmse: 1.18142
[1838]	training's rmse: 1.1464	valid_1's rmse: 1.18142
[1839]	training's rmse: 1.14639	valid_1's rmse: 1.18142
[1840]	training's rmse: 1.14638	valid_1's rmse: 1.18141
[1841]	training's rmse: 1.14636	valid_1's rmse: 1.18141
[1842]	training's rmse: 1.14635	valid_1's rmse: 1.18141
[1843]	training's rmse: 1.14634	valid_1's rmse: 1.18141
[1844]	training's rmse: 1.14633	valid_1's rmse: 1.18141
[1845]	training's rmse: 1.14632	valid_1's rmse: 1.18141
[1846]	training's rmse: 1.1463	valid_1's rmse: 1.18141
[1847]	training's rmse: 1.14629	valid_1's rmse: 1.18141
[1848]	training's rmse: 1.14627	valid_1's rmse: 1.18141
[1849]	training's rmse: 1.14627	valid_1's rmse: 1.18141
[1850]	training's rmse: 1.14626	valid_1's rmse: 1.

[1981]	training's rmse: 1.14467	valid_1's rmse: 1.18132
[1982]	training's rmse: 1.14466	valid_1's rmse: 1.18131
[1983]	training's rmse: 1.14464	valid_1's rmse: 1.18132
[1984]	training's rmse: 1.14464	valid_1's rmse: 1.18132
[1985]	training's rmse: 1.14462	valid_1's rmse: 1.18132
[1986]	training's rmse: 1.14461	valid_1's rmse: 1.18132
[1987]	training's rmse: 1.14461	valid_1's rmse: 1.18132
[1988]	training's rmse: 1.14459	valid_1's rmse: 1.18132
[1989]	training's rmse: 1.14457	valid_1's rmse: 1.18132
[1990]	training's rmse: 1.14456	valid_1's rmse: 1.18132
[1991]	training's rmse: 1.14455	valid_1's rmse: 1.18132
[1992]	training's rmse: 1.14453	valid_1's rmse: 1.18132
[1993]	training's rmse: 1.14452	valid_1's rmse: 1.18131
[1994]	training's rmse: 1.14452	valid_1's rmse: 1.18132
[1995]	training's rmse: 1.14451	valid_1's rmse: 1.18131
[1996]	training's rmse: 1.14449	valid_1's rmse: 1.18131
[1997]	training's rmse: 1.14448	valid_1's rmse: 1.18131
[1998]	training's rmse: 1.14447	valid_1's rmse: 

[2129]	training's rmse: 1.14296	valid_1's rmse: 1.18128
[2130]	training's rmse: 1.14295	valid_1's rmse: 1.18128
[2131]	training's rmse: 1.14294	valid_1's rmse: 1.18128
[2132]	training's rmse: 1.14293	valid_1's rmse: 1.18128
[2133]	training's rmse: 1.14291	valid_1's rmse: 1.18129
[2134]	training's rmse: 1.1429	valid_1's rmse: 1.18129
[2135]	training's rmse: 1.14289	valid_1's rmse: 1.18129
[2136]	training's rmse: 1.14288	valid_1's rmse: 1.18129
[2137]	training's rmse: 1.14287	valid_1's rmse: 1.18129
[2138]	training's rmse: 1.14286	valid_1's rmse: 1.18128
[2139]	training's rmse: 1.14285	valid_1's rmse: 1.18128
[2140]	training's rmse: 1.14285	valid_1's rmse: 1.18128
[2141]	training's rmse: 1.14284	valid_1's rmse: 1.18128
[2142]	training's rmse: 1.14283	valid_1's rmse: 1.18129
[2143]	training's rmse: 1.14281	valid_1's rmse: 1.18129
[2144]	training's rmse: 1.1428	valid_1's rmse: 1.18129
[2145]	training's rmse: 1.14278	valid_1's rmse: 1.18129
[2146]	training's rmse: 1.14277	valid_1's rmse: 1.

## 用df_test数据集进行测试

In [170]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [171]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [172]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

3739.030281589994
13980347.44664695
807.210464378816


In [173]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

2.0979114485223267

In [174]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *2.1

In [176]:
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=
                        ['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [178]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
print('测试集上的平均绝对误差:%.2f元'% mae)
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集上的均方根误差:498.79元
测试集上的平均绝对误差:18.97元
测试集前30天实际总的付费金额:29711072.00元
测试集前30天预测总的付费金额:24591012.95元
预测总金额准确率: 0.8276716824853632


In [179]:
import joblib
joblib.dump(lgb_r_cv,'./lgb_r_1d.pkl')

['./lgb_r_1d.pkl']