In [112]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime,gc,math
import random
%matplotlib inline
import gc
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

In [113]:
def active_time_transform(df):
    df['role_created_active_time'].fillna('"0-8,0","8-12,0","12-14,0","14-18,0","18-24,0"', inplace=True)
    df['role_created_active_time'] = df['role_created_active_time'].apply(lambda x:str(x)[1:-1])
    temp = df['role_created_active_time'].str.split(',',expand=True).iloc[:,[1,3,5,7,9]].rename(columns={1:'active_0-8',
                                        3:'active_8-12',5:'active_12-14',7:'active_14-18',9:'active_18-24'})
    for i in range(len(temp.columns)):
        temp.iloc[:,i] = temp.iloc[:,i].apply(lambda x:str(x)[:-1])
    df = df.join(temp).drop(['role_created_active_time'], axis=1)
    return df

def pay_grade_transform(df):
    df['pay_grade'].fillna('[0,0,0,0,0,0,0]',inplace=True)
    df['pay_grade'] = df['pay_grade'].apply(lambda x:str(x)[1:-1])
    temp = df['pay_grade'].str.split(',',expand=True).rename(columns={0:'pay_grade_1',
                1:'pay_grade_2',2:'pay_grade_3',3:'pay_grade_4',4:'pay_grade_5',5:'pay_grade_6',6:'pay_grade_7'})
    df = df.join(temp).drop(['pay_grade'], axis=1)
    return df

In [131]:
%%time
role_info = pd.read_csv('./data/mr_role_2d.csv')
role_info.drop_duplicates(subset=['user_id','cp_server_no','cp_role_id'],inplace=True)
role_info = active_time_transform(role_info)
role_info = pay_grade_transform(role_info)


Wall time: 1min 39s


In [132]:
# 删除2020-7-19日前30天的数据
role_info['create_role_time'] = pd.to_datetime(role_info['create_role_time'], format='%Y-%m-%d %H:%M:%S')
role_info.drop(role_info[role_info['create_role_time'] > (role_info['create_role_time'].max() + 
                                                          datetime.timedelta(-30))].index,inplace=True)

In [133]:
# 删除没有心跳数据的样本(1、没有登录;2、没有拿到心跳)
role_info.dropna(subset=['role_created_login_num'], inplace=True)
role_info.dropna(subset=['role_created_online'], inplace=True)
role_info['pay_num'].fillna(0, inplace=True)
role_info['pay_sum'].fillna(0, inplace=True)
role_info['role_created_active'] = role_info['role_created_active'].clip(0,2)
role_info['pay_rate'] = role_info['pay_num'] / (role_info['role_created_active'] + 1e-4)
role_info['pay_avg'] = role_info['pay_sum'] / (role_info['pay_num'] + 1e-4)

In [134]:
role_created_30_pay_sum = pd.read_csv('./data/role_created_30_pay_sum.csv',index_col=0)
# 对连接的关键字进行字符化或者整型化的调整
role_created_30_pay_sum['user_id'] = role_created_30_pay_sum['user_id'].astype(str)
role_created_30_pay_sum['mgame_id'] = role_created_30_pay_sum['mgame_id'].astype(str)
role_created_30_pay_sum['cp_server_no'] = role_created_30_pay_sum['cp_server_no'].astype(str)
role_created_30_pay_sum['cp_role_id'] = role_created_30_pay_sum['cp_role_id'].astype(str)
role_info['user_id'] = role_info['user_id'].astype(str)
role_info['mgame_id'] = role_info['mgame_id'].astype(str)
role_info['cp_server_no'] = role_info['cp_server_no'].astype(str)
role_info['cp_role_id'] = role_info['cp_role_id'].astype(str)
role_created_30_pay_sum.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info = pd.merge(role_info,role_created_30_pay_sum,on=['user_id','mgame_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [135]:
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg','role_created_30_pay_sum']
role_info = role_info[select_features]
role_info['role_created_30_pay_sum'].fillna(0,inplace=True)
col_list = ['active_0-8','active_8-12','active_12-14','active_14-18','active_18-24','pay_grade_1','pay_grade_2',
            'pay_grade_3','pay_grade_4','pay_grade_5','pay_grade_6']
for col in col_list:
    role_info[col] = pd.to_numeric(role_info[col], errors='coerce')

In [186]:
role_info.to_pickle('./role_info_2d.pkl')

In [169]:
df_train, df_test = train_test_split(role_info,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [170]:
df_train_nopay_online_n = df_train_nopay[(df_train_nopay['role_created_online']<400) | (df_train_nopay['role_created_login_num']<3)|(df_train_nopay['max_role_level']<2)]

In [171]:
df_train_nopay_online_n.shape[0]/df_train.shape[0]

0.895412493286518

In [172]:
df_train_nopay_online_y.shape[0]/df_train.shape[0]

0.08706118555717109

In [173]:
df_train_pay.shape[0]/df_train.shape[0]

0.017539102125780855

In [174]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [139]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('5')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
min_rmse = float('5')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
min_rmse = float('5')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')
min_rmse = float('5')
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
min_rmse = float('5')
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 1.0183 + 0.0102108
[100]	cv_agg's rmse: 0.996543 + 0.0118377
[50]	cv_agg's rmse: 1.0183 + 0.0102108
[100]	cv_agg's rmse: 0.996409 + 0.0118562
[50]	cv_agg's rmse: 1.0183 + 0.0102108
[100]	cv_agg's rmse: 0.996409 + 0.0118562
[50]	cv_agg's rmse: 1.0183 + 0.0102108
[100]	cv_agg's rmse: 0.996409 + 0.0118562
[50]	cv_agg's rmse: 1.0183 + 0.0102108
[100]	cv_agg's rmse: 0.996409 + 0.0118562
[50]	cv_agg's rmse: 1.011 + 0.0105939
[100]	cv_agg's rmse: 0.995418 + 0.0119457
[50]	cv_agg's rmse: 1.00836 + 0.0107086
[100]	cv_agg's rmse: 0.994766 + 0.0121361
[50]	cv_agg's rmse: 1.00831 + 0.0107181
[100]	cv_agg's rmse: 0.994653 + 0.0120981
[50]	cv_agg's rmse: 1.00831 + 0.0107181
[100]	cv_agg's rmse: 0.994711 + 0.011977
[50]	cv_agg's rmse: 1.00831 + 0.0107181
[100]	cv_agg's rmse: 0.994719 + 0.0119527
[50]	cv_agg's rmse: 1.011 + 0.0105939
[100]	cv_agg's rmse: 0.995418 + 0.0119457
[50]	cv_agg's rmse: 1.00566 + 0.0105363
[100]	cv_agg's rmse: 0.994559 + 0.0121521
[50]	c

[100]	cv_agg's rmse: 0.994083 + 0.011921
[50]	cv_agg's rmse: 1.00375 + 0.010147
[100]	cv_agg's rmse: 0.994033 + 0.0118777
[50]	cv_agg's rmse: 1.00357 + 0.010133
[100]	cv_agg's rmse: 0.993847 + 0.0118715
[50]	cv_agg's rmse: 1.00374 + 0.0101297
[100]	cv_agg's rmse: 0.993964 + 0.011979
[50]	cv_agg's rmse: 1.00389 + 0.0101883
[100]	cv_agg's rmse: 0.99406 + 0.0119454
[50]	cv_agg's rmse: 1.00377 + 0.0101497
[100]	cv_agg's rmse: 0.994016 + 0.0119861
[50]	cv_agg's rmse: 1.00378 + 0.0101142
[100]	cv_agg's rmse: 0.994715 + 0.0116443
[50]	cv_agg's rmse: 1.00347 + 0.0102365
[100]	cv_agg's rmse: 0.994304 + 0.0118406
[50]	cv_agg's rmse: 1.00349 + 0.0101751
[100]	cv_agg's rmse: 0.994126 + 0.0119593
[50]	cv_agg's rmse: 1.00359 + 0.0101458
[100]	cv_agg's rmse: 0.994319 + 0.0116927
[50]	cv_agg's rmse: 1.00363 + 0.0101183
[100]	cv_agg's rmse: 0.994025 + 0.0117781
[50]	cv_agg's rmse: 1.00366 + 0.0102414
[100]	cv_agg's rmse: 0.994083 + 0.011921
[50]	cv_agg's rmse: 1.00375 + 0.010147
[100]	cv_agg's rmse: 0.

[50]	cv_agg's rmse: 1.00357 + 0.010133
[100]	cv_agg's rmse: 0.993847 + 0.0118715
[50]	cv_agg's rmse: 1.00374 + 0.0101297
[100]	cv_agg's rmse: 0.993964 + 0.011979
[50]	cv_agg's rmse: 1.00389 + 0.0101883
[100]	cv_agg's rmse: 0.99406 + 0.0119454
[50]	cv_agg's rmse: 1.00377 + 0.0101497
[100]	cv_agg's rmse: 0.994016 + 0.0119861
[50]	cv_agg's rmse: 1.00378 + 0.0101142
[100]	cv_agg's rmse: 0.994715 + 0.0116443
[50]	cv_agg's rmse: 1.00347 + 0.0102365
[100]	cv_agg's rmse: 0.994304 + 0.0118406
[50]	cv_agg's rmse: 1.00349 + 0.0101751
[100]	cv_agg's rmse: 0.994126 + 0.0119593
[50]	cv_agg's rmse: 1.00359 + 0.0101458
[100]	cv_agg's rmse: 0.994319 + 0.0116927
[50]	cv_agg's rmse: 1.00363 + 0.0101183
[100]	cv_agg's rmse: 0.994025 + 0.0117781
[50]	cv_agg's rmse: 1.00366 + 0.0102414
[100]	cv_agg's rmse: 0.994083 + 0.011921
[50]	cv_agg's rmse: 1.00375 + 0.010147
[100]	cv_agg's rmse: 0.994033 + 0.0118777
[50]	cv_agg's rmse: 1.00357 + 0.010133
[100]	cv_agg's rmse: 0.993847 + 0.0118715
[50]	cv_agg's rmse: 1.

[50]	cv_agg's rmse: 1.00389 + 0.0101883
[100]	cv_agg's rmse: 0.99406 + 0.0119454
[50]	cv_agg's rmse: 1.00377 + 0.0101497
[100]	cv_agg's rmse: 0.994016 + 0.0119861
[50]	cv_agg's rmse: 1.00378 + 0.0101142
[100]	cv_agg's rmse: 0.994715 + 0.0116443
[50]	cv_agg's rmse: 1.00347 + 0.0102365
[100]	cv_agg's rmse: 0.994304 + 0.0118406
[50]	cv_agg's rmse: 1.00349 + 0.0101751
[100]	cv_agg's rmse: 0.994126 + 0.0119593
[50]	cv_agg's rmse: 1.00359 + 0.0101458
[100]	cv_agg's rmse: 0.994319 + 0.0116927
[50]	cv_agg's rmse: 1.00363 + 0.0101183
[100]	cv_agg's rmse: 0.994025 + 0.0117781
[50]	cv_agg's rmse: 1.00366 + 0.0102414
[100]	cv_agg's rmse: 0.994083 + 0.011921
[50]	cv_agg's rmse: 1.00375 + 0.010147
[100]	cv_agg's rmse: 0.994033 + 0.0118777
[50]	cv_agg's rmse: 1.00357 + 0.010133
[100]	cv_agg's rmse: 0.993847 + 0.0118715
[50]	cv_agg's rmse: 1.00374 + 0.0101297
[100]	cv_agg's rmse: 0.993964 + 0.011979
[50]	cv_agg's rmse: 1.00389 + 0.0101883
[100]	cv_agg's rmse: 0.99406 + 0.0119454
[50]	cv_agg's rmse: 1.

[50]	cv_agg's rmse: 1.00538 + 0.010098
[100]	cv_agg's rmse: 0.994385 + 0.0117209
[50]	cv_agg's rmse: 1.00559 + 0.0102679
[100]	cv_agg's rmse: 0.994594 + 0.0118306
[50]	cv_agg's rmse: 1.0058 + 0.0101309
[100]	cv_agg's rmse: 0.994461 + 0.0120428
[50]	cv_agg's rmse: 1.00573 + 0.010556
[100]	cv_agg's rmse: 0.994459 + 0.0119846
[50]	cv_agg's rmse: 1.00598 + 0.0104692
[100]	cv_agg's rmse: 0.99518 + 0.0118106
[50]	cv_agg's rmse: 1.00559 + 0.0104612
[100]	cv_agg's rmse: 0.994716 + 0.0119091
[50]	cv_agg's rmse: 1.00596 + 0.0100834
[100]	cv_agg's rmse: 0.995124 + 0.0120961
[50]	cv_agg's rmse: 1.00618 + 0.0101704
[100]	cv_agg's rmse: 0.994817 + 0.0118199
[50]	cv_agg's rmse: 1.00532 + 0.0104851
[100]	cv_agg's rmse: 0.993849 + 0.0122052
[50]	cv_agg's rmse: 1.00544 + 0.0100469
[100]	cv_agg's rmse: 0.994031 + 0.0116948
[50]	cv_agg's rmse: 1.00555 + 0.0100643
[100]	cv_agg's rmse: 0.994158 + 0.0116383
[50]	cv_agg's rmse: 1.00569 + 0.0103666
[100]	cv_agg's rmse: 0.994379 + 0.01174
[50]	cv_agg's rmse: 1.

[50]	cv_agg's rmse: 1.00471 + 0.0104997
[100]	cv_agg's rmse: 0.995086 + 0.0121803
[50]	cv_agg's rmse: 1.00544 + 0.00985168
[100]	cv_agg's rmse: 0.995653 + 0.0120221
[50]	cv_agg's rmse: 1.00556 + 0.0106872
[100]	cv_agg's rmse: 0.99554 + 0.0122916
[50]	cv_agg's rmse: 1.0053 + 0.0105313
[100]	cv_agg's rmse: 0.995755 + 0.0123647
[50]	cv_agg's rmse: 1.00517 + 0.00996344
[100]	cv_agg's rmse: 0.995455 + 0.0115217
[50]	cv_agg's rmse: 1.00539 + 0.0102792
[100]	cv_agg's rmse: 0.995663 + 0.0122505
[50]	cv_agg's rmse: 1.00588 + 0.0106683
[100]	cv_agg's rmse: 0.995824 + 0.0121586
[50]	cv_agg's rmse: 1.00411 + 0.0105446
[100]	cv_agg's rmse: 0.993935 + 0.0121136
[50]	cv_agg's rmse: 1.00419 + 0.0104485
[100]	cv_agg's rmse: 0.993932 + 0.0119902
[50]	cv_agg's rmse: 1.00469 + 0.0101415
[100]	cv_agg's rmse: 0.994543 + 0.011722
[50]	cv_agg's rmse: 1.00475 + 0.0103567
[100]	cv_agg's rmse: 0.994805 + 0.0117119
[50]	cv_agg's rmse: 1.00497 + 0.0099358
[100]	cv_agg's rmse: 0.994733 + 0.0122056
[50]	cv_agg's rms

[100]	cv_agg's rmse: 0.993995 + 0.0123141
[50]	cv_agg's rmse: 1.00382 + 0.0101421
[100]	cv_agg's rmse: 0.993789 + 0.0119331
[50]	cv_agg's rmse: 1.00398 + 0.010365
[100]	cv_agg's rmse: 0.993983 + 0.0118838
[50]	cv_agg's rmse: 1.00388 + 0.0103131
[100]	cv_agg's rmse: 0.994051 + 0.0118402
[50]	cv_agg's rmse: 1.00404 + 0.0100829
[100]	cv_agg's rmse: 0.993722 + 0.0113283
[50]	cv_agg's rmse: 1.00376 + 0.010209
[100]	cv_agg's rmse: 0.993688 + 0.0118375
[50]	cv_agg's rmse: 1.00389 + 0.0101582
[100]	cv_agg's rmse: 0.993855 + 0.0121275
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 + 0.0118518
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 + 0.0118518
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 + 0.0118518
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 + 0.0118518
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 + 0.0118518
[50]	cv_agg's rmse: 1.0038 + 0.0103518
[100]	cv_agg's rmse: 0.99389 

[50]	cv_agg's rmse: 1.00457 + 0.0103605
[100]	cv_agg's rmse: 0.994677 + 0.0113122
[50]	cv_agg's rmse: 1.00456 + 0.0104474
[100]	cv_agg's rmse: 0.99481 + 0.0116637
[50]	cv_agg's rmse: 1.00408 + 0.0102818
[100]	cv_agg's rmse: 0.9944 + 0.0118084
[50]	cv_agg's rmse: 1.00444 + 0.0100548
[100]	cv_agg's rmse: 0.995014 + 0.0119555
[50]	cv_agg's rmse: 1.00461 + 0.0101996
[100]	cv_agg's rmse: 0.995096 + 0.0117093
[50]	cv_agg's rmse: 1.00357 + 0.010133
[100]	cv_agg's rmse: 0.993847 + 0.0118715
[50]	cv_agg's rmse: 1.00368 + 0.0102343
[100]	cv_agg's rmse: 0.993716 + 0.0116963
[50]	cv_agg's rmse: 1.00362 + 0.0100252
[100]	cv_agg's rmse: 0.994249 + 0.0117305
[50]	cv_agg's rmse: 1.0042 + 0.0105755
[100]	cv_agg's rmse: 0.994393 + 0.0118956
[50]	cv_agg's rmse: 1.00397 + 0.0102058
[100]	cv_agg's rmse: 0.99401 + 0.0118812
[50]	cv_agg's rmse: 1.00434 + 0.0104596
[100]	cv_agg's rmse: 0.994465 + 0.011734
[50]	cv_agg's rmse: 1.00424 + 0.0107371
[100]	cv_agg's rmse: 0.994608 + 0.0122816
[50]	cv_agg's rmse: 1.0

[100]	cv_agg's rmse: 0.993462 + 0.0117748
[50]	cv_agg's rmse: 1.00466 + 0.01021
[100]	cv_agg's rmse: 0.993559 + 0.0117962
[50]	cv_agg's rmse: 1.00468 + 0.0102175
[100]	cv_agg's rmse: 0.993487 + 0.0117151
[50]	cv_agg's rmse: 1.00465 + 0.0101972
[100]	cv_agg's rmse: 0.993662 + 0.0117268
[50]	cv_agg's rmse: 1.00491 + 0.0101888
[100]	cv_agg's rmse: 0.993753 + 0.0117122
[50]	cv_agg's rmse: 1.00461 + 0.0101339
[100]	cv_agg's rmse: 0.993633 + 0.011813
[50]	cv_agg's rmse: 1.00461 + 0.0101339
[100]	cv_agg's rmse: 0.993633 + 0.011813
[50]	cv_agg's rmse: 1.00475 + 0.0100822
[100]	cv_agg's rmse: 0.993702 + 0.0116364
[50]	cv_agg's rmse: 1.00461 + 0.0101339
[100]	cv_agg's rmse: 0.993633 + 0.011813
[50]	cv_agg's rmse: 1.00475 + 0.0100822
[100]	cv_agg's rmse: 0.993702 + 0.0116364
[50]	cv_agg's rmse: 1.0046 + 0.0102925
[100]	cv_agg's rmse: 0.993517 + 0.0118382
[50]	cv_agg's rmse: 1.00465 + 0.0102348
[100]	cv_agg's rmse: 0.99357 + 0.0116576
[50]	cv_agg's rmse: 1.00479 + 0.0101849
[100]	cv_agg's rmse: 0.

In [175]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)
params = {

    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.005,
    "max_depth":5,
    "num_leaves":25,
    "max_bin":255,
    "min_data_in_leaf":71,
    "min_split_gain":1.0,
    "feature_fraction": 0.6,
    "bagging_fraction":0.9,
    "bagging_freq":35,
    "lambda_l1":0.0,
    "lambda_l2":0.1,
    "metric": "rmse",
    }
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.80572	valid_1's rmse: 1.79847
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.7995	valid_1's rmse: 1.79235
[3]	training's rmse: 1.79331	valid_1's rmse: 1.78627
[4]	training's rmse: 1.78741	valid_1's rmse: 1.78046
[5]	training's rmse: 1.7813	valid_1's rmse: 1.77446
[6]	training's rmse: 1.77523	valid_1's rmse: 1.7685
[7]	training's rmse: 1.76921	valid_1's rmse: 1.76258
[8]	training's rmse: 1.76326	valid_1's rmse: 1.75672
[9]	training's rmse: 1.75734	valid_1's rmse: 1.75091
[10]	training's rmse: 1.75154	valid_1's rmse: 1.74522
[11]	training's rmse: 1.74572	valid_1's rmse: 1.73949
[12]	training's rmse: 1.7401	valid_1's rmse: 1.73397
[13]	training's rmse: 1.73434	valid_1's rmse: 1.7283
[14]	training's rmse: 1.72875	valid_1's rmse: 1.7228
[15]	training's rmse: 1.72347	valid_1's rmse: 1.71762
[16]	training's rmse: 1.71791	valid_1's rmse: 1.71216
[17]	training's rmse: 1.71264	valid_1's rmse: 1.70698
[18]	training's rmse: 1.7074	valid_

[165]	training's rmse: 1.20312	valid_1's rmse: 1.2109
[166]	training's rmse: 1.2012	valid_1's rmse: 1.20906
[167]	training's rmse: 1.19943	valid_1's rmse: 1.20736
[168]	training's rmse: 1.19761	valid_1's rmse: 1.20562
[169]	training's rmse: 1.1958	valid_1's rmse: 1.20389
[170]	training's rmse: 1.19394	valid_1's rmse: 1.20209
[171]	training's rmse: 1.19209	valid_1's rmse: 1.20032
[172]	training's rmse: 1.19026	valid_1's rmse: 1.19856
[173]	training's rmse: 1.18864	valid_1's rmse: 1.19703
[174]	training's rmse: 1.18685	valid_1's rmse: 1.19531
[175]	training's rmse: 1.18518	valid_1's rmse: 1.19371
[176]	training's rmse: 1.18346	valid_1's rmse: 1.19207
[177]	training's rmse: 1.18172	valid_1's rmse: 1.19039
[178]	training's rmse: 1.17997	valid_1's rmse: 1.18872
[179]	training's rmse: 1.17824	valid_1's rmse: 1.18705
[180]	training's rmse: 1.17653	valid_1's rmse: 1.18541
[181]	training's rmse: 1.17486	valid_1's rmse: 1.18381
[182]	training's rmse: 1.17317	valid_1's rmse: 1.18219
[183]	trainin

[332]	training's rmse: 1.03381	valid_1's rmse: 1.05102
[333]	training's rmse: 1.03338	valid_1's rmse: 1.05063
[334]	training's rmse: 1.03296	valid_1's rmse: 1.05024
[335]	training's rmse: 1.0325	valid_1's rmse: 1.04983
[336]	training's rmse: 1.03206	valid_1's rmse: 1.04944
[337]	training's rmse: 1.03162	valid_1's rmse: 1.04903
[338]	training's rmse: 1.03119	valid_1's rmse: 1.04864
[339]	training's rmse: 1.03078	valid_1's rmse: 1.04827
[340]	training's rmse: 1.03035	valid_1's rmse: 1.04788
[341]	training's rmse: 1.02992	valid_1's rmse: 1.04749
[342]	training's rmse: 1.02949	valid_1's rmse: 1.0471
[343]	training's rmse: 1.0291	valid_1's rmse: 1.04675
[344]	training's rmse: 1.02871	valid_1's rmse: 1.0464
[345]	training's rmse: 1.02829	valid_1's rmse: 1.04602
[346]	training's rmse: 1.02787	valid_1's rmse: 1.04565
[347]	training's rmse: 1.02747	valid_1's rmse: 1.04527
[348]	training's rmse: 1.02706	valid_1's rmse: 1.0449
[349]	training's rmse: 1.02668	valid_1's rmse: 1.04457
[350]	training'

[506]	training's rmse: 0.992643	valid_1's rmse: 1.01513
[507]	training's rmse: 0.992543	valid_1's rmse: 1.01505
[508]	training's rmse: 0.992433	valid_1's rmse: 1.01496
[509]	training's rmse: 0.992332	valid_1's rmse: 1.01489
[510]	training's rmse: 0.992231	valid_1's rmse: 1.01482
[511]	training's rmse: 0.992135	valid_1's rmse: 1.01474
[512]	training's rmse: 0.992028	valid_1's rmse: 1.01466
[513]	training's rmse: 0.991924	valid_1's rmse: 1.01458
[514]	training's rmse: 0.991819	valid_1's rmse: 1.0145
[515]	training's rmse: 0.991725	valid_1's rmse: 1.01443
[516]	training's rmse: 0.991621	valid_1's rmse: 1.01435
[517]	training's rmse: 0.991521	valid_1's rmse: 1.01427
[518]	training's rmse: 0.99142	valid_1's rmse: 1.01419
[519]	training's rmse: 0.991317	valid_1's rmse: 1.0141
[520]	training's rmse: 0.991218	valid_1's rmse: 1.01403
[521]	training's rmse: 0.991119	valid_1's rmse: 1.01395
[522]	training's rmse: 0.991018	valid_1's rmse: 1.01387
[523]	training's rmse: 0.990929	valid_1's rmse: 1.0

[679]	training's rmse: 0.981644	valid_1's rmse: 1.00742
[680]	training's rmse: 0.98161	valid_1's rmse: 1.0074
[681]	training's rmse: 0.981577	valid_1's rmse: 1.00738
[682]	training's rmse: 0.981543	valid_1's rmse: 1.00737
[683]	training's rmse: 0.981507	valid_1's rmse: 1.00735
[684]	training's rmse: 0.981475	valid_1's rmse: 1.00733
[685]	training's rmse: 0.98144	valid_1's rmse: 1.00731
[686]	training's rmse: 0.981408	valid_1's rmse: 1.00729
[687]	training's rmse: 0.981368	valid_1's rmse: 1.00727
[688]	training's rmse: 0.981333	valid_1's rmse: 1.00725
[689]	training's rmse: 0.9813	valid_1's rmse: 1.00723
[690]	training's rmse: 0.981268	valid_1's rmse: 1.00721
[691]	training's rmse: 0.981237	valid_1's rmse: 1.00719
[692]	training's rmse: 0.981199	valid_1's rmse: 1.00717
[693]	training's rmse: 0.981167	valid_1's rmse: 1.00715
[694]	training's rmse: 0.981136	valid_1's rmse: 1.00714
[695]	training's rmse: 0.981095	valid_1's rmse: 1.00712
[696]	training's rmse: 0.981063	valid_1's rmse: 1.007

[861]	training's rmse: 0.977187	valid_1's rmse: 1.00517
[862]	training's rmse: 0.977173	valid_1's rmse: 1.00517
[863]	training's rmse: 0.977156	valid_1's rmse: 1.00516
[864]	training's rmse: 0.977141	valid_1's rmse: 1.00516
[865]	training's rmse: 0.977125	valid_1's rmse: 1.00515
[866]	training's rmse: 0.977113	valid_1's rmse: 1.00514
[867]	training's rmse: 0.977099	valid_1's rmse: 1.00514
[868]	training's rmse: 0.977086	valid_1's rmse: 1.00513
[869]	training's rmse: 0.977071	valid_1's rmse: 1.00512
[870]	training's rmse: 0.977056	valid_1's rmse: 1.00511
[871]	training's rmse: 0.977042	valid_1's rmse: 1.0051
[872]	training's rmse: 0.977027	valid_1's rmse: 1.0051
[873]	training's rmse: 0.977013	valid_1's rmse: 1.00509
[874]	training's rmse: 0.976998	valid_1's rmse: 1.00508
[875]	training's rmse: 0.976979	valid_1's rmse: 1.00507
[876]	training's rmse: 0.976965	valid_1's rmse: 1.00506
[877]	training's rmse: 0.976948	valid_1's rmse: 1.00505
[878]	training's rmse: 0.976931	valid_1's rmse: 1.

[1053]	training's rmse: 0.974534	valid_1's rmse: 1.00437
[1054]	training's rmse: 0.974524	valid_1's rmse: 1.00437
[1055]	training's rmse: 0.974512	valid_1's rmse: 1.00437
[1056]	training's rmse: 0.974503	valid_1's rmse: 1.00437
[1057]	training's rmse: 0.974483	valid_1's rmse: 1.00436
[1058]	training's rmse: 0.974461	valid_1's rmse: 1.00436
[1059]	training's rmse: 0.974444	valid_1's rmse: 1.00436
[1060]	training's rmse: 0.974435	valid_1's rmse: 1.00435
[1061]	training's rmse: 0.974421	valid_1's rmse: 1.00435
[1062]	training's rmse: 0.97441	valid_1's rmse: 1.00435
[1063]	training's rmse: 0.974402	valid_1's rmse: 1.00435
[1064]	training's rmse: 0.974388	valid_1's rmse: 1.00434
[1065]	training's rmse: 0.974368	valid_1's rmse: 1.00434
[1066]	training's rmse: 0.974355	valid_1's rmse: 1.00433
[1067]	training's rmse: 0.974346	valid_1's rmse: 1.00433
[1068]	training's rmse: 0.974337	valid_1's rmse: 1.00432
[1069]	training's rmse: 0.974316	valid_1's rmse: 1.00432
[1070]	training's rmse: 0.974308

[1267]	training's rmse: 0.972233	valid_1's rmse: 1.00395
[1268]	training's rmse: 0.972216	valid_1's rmse: 1.00395
[1269]	training's rmse: 0.972197	valid_1's rmse: 1.00394
[1270]	training's rmse: 0.972184	valid_1's rmse: 1.00394
[1271]	training's rmse: 0.972169	valid_1's rmse: 1.00394
[1272]	training's rmse: 0.972156	valid_1's rmse: 1.00394
[1273]	training's rmse: 0.972145	valid_1's rmse: 1.00393
[1274]	training's rmse: 0.972137	valid_1's rmse: 1.00393
[1275]	training's rmse: 0.97212	valid_1's rmse: 1.00393
[1276]	training's rmse: 0.972107	valid_1's rmse: 1.00392
[1277]	training's rmse: 0.972095	valid_1's rmse: 1.00392
[1278]	training's rmse: 0.97208	valid_1's rmse: 1.00392
[1279]	training's rmse: 0.972061	valid_1's rmse: 1.00391
[1280]	training's rmse: 0.972048	valid_1's rmse: 1.00391
[1281]	training's rmse: 0.972031	valid_1's rmse: 1.00391
[1282]	training's rmse: 0.972022	valid_1's rmse: 1.00391
[1283]	training's rmse: 0.972008	valid_1's rmse: 1.0039
[1284]	training's rmse: 0.971995	v

[1486]	training's rmse: 0.969897	valid_1's rmse: 1.00357
[1487]	training's rmse: 0.96989	valid_1's rmse: 1.00357
[1488]	training's rmse: 0.969885	valid_1's rmse: 1.00357
[1489]	training's rmse: 0.969873	valid_1's rmse: 1.00357
[1490]	training's rmse: 0.969863	valid_1's rmse: 1.00357
[1491]	training's rmse: 0.969859	valid_1's rmse: 1.00357
[1492]	training's rmse: 0.96985	valid_1's rmse: 1.00357
[1493]	training's rmse: 0.969844	valid_1's rmse: 1.00357
[1494]	training's rmse: 0.969831	valid_1's rmse: 1.00357
[1495]	training's rmse: 0.969822	valid_1's rmse: 1.00357
[1496]	training's rmse: 0.969811	valid_1's rmse: 1.00357
[1497]	training's rmse: 0.969796	valid_1's rmse: 1.00357
[1498]	training's rmse: 0.969784	valid_1's rmse: 1.00357
[1499]	training's rmse: 0.969772	valid_1's rmse: 1.00356
[1500]	training's rmse: 0.969762	valid_1's rmse: 1.00357
[1501]	training's rmse: 0.969748	valid_1's rmse: 1.00356
[1502]	training's rmse: 0.969741	valid_1's rmse: 1.00357
[1503]	training's rmse: 0.969729	

[1708]	training's rmse: 0.967778	valid_1's rmse: 1.00347
[1709]	training's rmse: 0.967775	valid_1's rmse: 1.00347
[1710]	training's rmse: 0.967769	valid_1's rmse: 1.00347
[1711]	training's rmse: 0.967757	valid_1's rmse: 1.00347
[1712]	training's rmse: 0.967752	valid_1's rmse: 1.00347
[1713]	training's rmse: 0.967748	valid_1's rmse: 1.00347
[1714]	training's rmse: 0.96774	valid_1's rmse: 1.00347
[1715]	training's rmse: 0.967732	valid_1's rmse: 1.00347
[1716]	training's rmse: 0.967719	valid_1's rmse: 1.00346
[1717]	training's rmse: 0.967713	valid_1's rmse: 1.00346
[1718]	training's rmse: 0.967706	valid_1's rmse: 1.00346
[1719]	training's rmse: 0.967693	valid_1's rmse: 1.00346
[1720]	training's rmse: 0.967678	valid_1's rmse: 1.00345
[1721]	training's rmse: 0.967668	valid_1's rmse: 1.00345
[1722]	training's rmse: 0.967657	valid_1's rmse: 1.00345
[1723]	training's rmse: 0.967651	valid_1's rmse: 1.00345
[1724]	training's rmse: 0.967636	valid_1's rmse: 1.00344
[1725]	training's rmse: 0.967624

[1939]	training's rmse: 0.966132	valid_1's rmse: 1.00333
[1940]	training's rmse: 0.96613	valid_1's rmse: 1.00333
[1941]	training's rmse: 0.966127	valid_1's rmse: 1.00333
[1942]	training's rmse: 0.966125	valid_1's rmse: 1.00333
[1943]	training's rmse: 0.96612	valid_1's rmse: 1.00333
[1944]	training's rmse: 0.966119	valid_1's rmse: 1.00333
[1945]	training's rmse: 0.966116	valid_1's rmse: 1.00332
[1946]	training's rmse: 0.966114	valid_1's rmse: 1.00332
[1947]	training's rmse: 0.966108	valid_1's rmse: 1.00332
[1948]	training's rmse: 0.966101	valid_1's rmse: 1.00332
[1949]	training's rmse: 0.966095	valid_1's rmse: 1.00332
[1950]	training's rmse: 0.966088	valid_1's rmse: 1.00332
[1951]	training's rmse: 0.966086	valid_1's rmse: 1.00332
[1952]	training's rmse: 0.966081	valid_1's rmse: 1.00332
[1953]	training's rmse: 0.966079	valid_1's rmse: 1.00332
[1954]	training's rmse: 0.966076	valid_1's rmse: 1.00332
[1955]	training's rmse: 0.966071	valid_1's rmse: 1.00332
[1956]	training's rmse: 0.96607	v

[2177]	training's rmse: 0.964588	valid_1's rmse: 1.00319
[2178]	training's rmse: 0.964579	valid_1's rmse: 1.00319
[2179]	training's rmse: 0.96457	valid_1's rmse: 1.00319
[2180]	training's rmse: 0.964559	valid_1's rmse: 1.00319
[2181]	training's rmse: 0.964556	valid_1's rmse: 1.00319
[2182]	training's rmse: 0.964548	valid_1's rmse: 1.00319
[2183]	training's rmse: 0.964538	valid_1's rmse: 1.00319
[2184]	training's rmse: 0.964529	valid_1's rmse: 1.00319
[2185]	training's rmse: 0.964519	valid_1's rmse: 1.00319
[2186]	training's rmse: 0.964509	valid_1's rmse: 1.00319
[2187]	training's rmse: 0.964501	valid_1's rmse: 1.00319
[2188]	training's rmse: 0.964492	valid_1's rmse: 1.00319
[2189]	training's rmse: 0.964483	valid_1's rmse: 1.00319
[2190]	training's rmse: 0.96448	valid_1's rmse: 1.00318
[2191]	training's rmse: 0.964471	valid_1's rmse: 1.00319
[2192]	training's rmse: 0.964462	valid_1's rmse: 1.00318
[2193]	training's rmse: 0.96445	valid_1's rmse: 1.00318
[2194]	training's rmse: 0.964441	v

[2428]	training's rmse: 0.963182	valid_1's rmse: 1.00311
[2429]	training's rmse: 0.963181	valid_1's rmse: 1.00311
[2430]	training's rmse: 0.96318	valid_1's rmse: 1.00311
[2431]	training's rmse: 0.963176	valid_1's rmse: 1.00311
[2432]	training's rmse: 0.963174	valid_1's rmse: 1.00311
[2433]	training's rmse: 0.963171	valid_1's rmse: 1.00311
[2434]	training's rmse: 0.96317	valid_1's rmse: 1.00311
[2435]	training's rmse: 0.963169	valid_1's rmse: 1.00311
[2436]	training's rmse: 0.963166	valid_1's rmse: 1.00311
[2437]	training's rmse: 0.963156	valid_1's rmse: 1.00311
[2438]	training's rmse: 0.96315	valid_1's rmse: 1.00311
[2439]	training's rmse: 0.963149	valid_1's rmse: 1.00311
[2440]	training's rmse: 0.963148	valid_1's rmse: 1.00311
[2441]	training's rmse: 0.963147	valid_1's rmse: 1.00311
[2442]	training's rmse: 0.963143	valid_1's rmse: 1.00311
[2443]	training's rmse: 0.963142	valid_1's rmse: 1.00311
[2444]	training's rmse: 0.963141	valid_1's rmse: 1.00311
[2445]	training's rmse: 0.96314	va

## 用df_test数据集进行测试

In [177]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [178]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [179]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

3252.3895203086468
10578037.59181351
674.2463062522809


In [180]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.6432305029217638

In [182]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.64

In [183]:
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')


In [184]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
print('测试集上的平均绝对误差:%.2f元'% mae)
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集上的均方根误差:447.18元
测试集上的平均绝对误差:15.98元
测试集前30天实际总的付费金额:29087286.00元
测试集前30天预测总的付费金额:26206086.76元
预测总金额准确率: 0.9009464394924627


In [185]:
import joblib
joblib.dump(lgb_r_cv,'./lgb_r_2d.pkl')

['./lgb_r_2d.pkl']