In [43]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime,gc,math
import random
%matplotlib inline
import gc
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

In [44]:
def active_time_transform(df):
    df['role_created_active_time'].fillna('"0-8,0","8-12,0","12-14,0","14-18,0","18-24,0"', inplace=True)
    df['role_created_active_time'] = df['role_created_active_time'].apply(lambda x:str(x)[1:-1])
    temp = df['role_created_active_time'].str.split(',',expand=True).iloc[:,[1,3,5,7,9]].rename(columns={1:'active_0-8',
                                        3:'active_8-12',5:'active_12-14',7:'active_14-18',9:'active_18-24'})
    for i in range(len(temp.columns)):
        temp.iloc[:,i] = temp.iloc[:,i].apply(lambda x:str(x)[:-1])
    df = df.join(temp).drop(['role_created_active_time'], axis=1)
    return df

def pay_grade_transform(df):
    df['pay_grade'].fillna('[0,0,0,0,0,0,0]',inplace=True)
    df['pay_grade'] = df['pay_grade'].apply(lambda x:str(x)[1:-1])
    temp = df['pay_grade'].str.split(',',expand=True).rename(columns={0:'pay_grade_1',
                1:'pay_grade_2',2:'pay_grade_3',3:'pay_grade_4',4:'pay_grade_5',5:'pay_grade_6',6:'pay_grade_7'})
    df = df.join(temp).drop(['pay_grade'], axis=1)
    return df

In [45]:
%%time
role_info = pd.read_csv('./data/mr_role_3d.csv')
role_info.drop_duplicates(subset=['user_id','cp_server_no','cp_role_id'],inplace=True)
role_info = active_time_transform(role_info)
role_info = pay_grade_transform(role_info)

Wall time: 1min 33s


In [46]:
# 删除2020-7-19日前30天的数据
role_info['create_role_time'] = pd.to_datetime(role_info['create_role_time'], format='%Y-%m-%d %H:%M:%S')
role_info.drop(role_info[role_info['create_role_time'] > (role_info['create_role_time'].max() + 
                                                          datetime.timedelta(-30))].index,inplace=True)

In [47]:
# 删除没有心跳数据的样本(1、没有登录;2、没有拿到心跳)
role_info.dropna(subset=['role_created_login_num'], inplace=True)
role_info.dropna(subset=['role_created_online'], inplace=True)
role_info['pay_num'].fillna(0, inplace=True)
role_info['pay_sum'].fillna(0, inplace=True)
role_info['role_created_active'] = role_info['role_created_active'].clip(0,3)
role_info['pay_rate'] = role_info['pay_num'] / (role_info['role_created_active'] + 1e-4)
role_info['pay_avg'] = role_info['pay_sum'] / (role_info['pay_num'] + 1e-4)

In [48]:
role_created_30_pay_sum = pd.read_csv('./data/role_created_30_pay_sum.csv',index_col=0)
# 对连接的关键字进行字符化或者整型化的调整
role_created_30_pay_sum['user_id'] = role_created_30_pay_sum['user_id'].astype(str)
role_created_30_pay_sum['mgame_id'] = role_created_30_pay_sum['mgame_id'].astype(str)
role_created_30_pay_sum['cp_server_no'] = role_created_30_pay_sum['cp_server_no'].astype(str)
role_created_30_pay_sum['cp_role_id'] = role_created_30_pay_sum['cp_role_id'].astype(str)
role_info['user_id'] = role_info['user_id'].astype(str)
role_info['mgame_id'] = role_info['mgame_id'].astype(str)
role_info['cp_server_no'] = role_info['cp_server_no'].astype(str)
role_info['cp_role_id'] = role_info['cp_role_id'].astype(str)
role_created_30_pay_sum.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info = pd.merge(role_info,role_created_30_pay_sum,on=['user_id','mgame_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [49]:
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg','role_created_30_pay_sum']
role_info = role_info[select_features]
role_info['role_created_30_pay_sum'].fillna(0,inplace=True)
col_list = ['active_0-8','active_8-12','active_12-14','active_14-18','active_18-24','pay_grade_1','pay_grade_2',
            'pay_grade_3','pay_grade_4','pay_grade_5','pay_grade_6']
for col in col_list:
    role_info[col] = pd.to_numeric(role_info[col], errors='coerce')

In [60]:
role_info.to_pickle('./role_info_3d.pkl')

In [50]:
df_train, df_test = train_test_split(role_info,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [51]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [52]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('5')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
min_rmse = float('5')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
min_rmse = float('5')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')
min_rmse = float('5')
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
min_rmse = float('5')
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.975392 + 0.00937335
[100]	cv_agg's rmse: 0.952175 + 0.0102235
[50]	cv_agg's rmse: 0.975392 + 0.00937335
[100]	cv_agg's rmse: 0.952032 + 0.0101564
[50]	cv_agg's rmse: 0.975392 + 0.00937335
[100]	cv_agg's rmse: 0.952032 + 0.0101564
[50]	cv_agg's rmse: 0.975392 + 0.00937335
[100]	cv_agg's rmse: 0.952032 + 0.0101564
[50]	cv_agg's rmse: 0.975392 + 0.00937335
[100]	cv_agg's rmse: 0.952032 + 0.0101564
[50]	cv_agg's rmse: 0.966715 + 0.00953944
[100]	cv_agg's rmse: 0.950467 + 0.0102787
[50]	cv_agg's rmse: 0.964022 + 0.00974429
[100]	cv_agg's rmse: 0.949976 + 0.0105817
[50]	cv_agg's rmse: 0.964051 + 0.00971296
[100]	cv_agg's rmse: 0.949667 + 0.0105519
[50]	cv_agg's rmse: 0.964051 + 0.00971296
[100]	cv_agg's rmse: 0.949786 + 0.0105413
[50]	cv_agg's rmse: 0.964051 + 0.00971296
[100]	cv_agg's rmse: 0.949786 + 0.0105742
[50]	cv_agg's rmse: 0.966715 + 0.00953944
[100]	cv_agg's rmse: 0.950467 + 0.0102787
[50]	cv_agg's rmse: 0.961787 + 0.00973654
[100]	cv_agg's

[50]	cv_agg's rmse: 0.959381 + 0.00961974
[100]	cv_agg's rmse: 0.949417 + 0.0105327
[50]	cv_agg's rmse: 0.959356 + 0.00963518
[100]	cv_agg's rmse: 0.949389 + 0.0104836
[50]	cv_agg's rmse: 0.959517 + 0.00974239
[100]	cv_agg's rmse: 0.949456 + 0.0103907
[50]	cv_agg's rmse: 0.959602 + 0.00976346
[100]	cv_agg's rmse: 0.94952 + 0.0104379
[50]	cv_agg's rmse: 0.959605 + 0.00975807
[100]	cv_agg's rmse: 0.949491 + 0.0106166
[50]	cv_agg's rmse: 0.959585 + 0.00974996
[100]	cv_agg's rmse: 0.949526 + 0.0104563
[50]	cv_agg's rmse: 0.959695 + 0.00974962
[100]	cv_agg's rmse: 0.949563 + 0.0105371
[50]	cv_agg's rmse: 0.959586 + 0.00976318
[100]	cv_agg's rmse: 0.94943 + 0.0106807
[50]	cv_agg's rmse: 0.959244 + 0.00944951
[100]	cv_agg's rmse: 0.949701 + 0.0101285
[50]	cv_agg's rmse: 0.959312 + 0.00960645
[100]	cv_agg's rmse: 0.949225 + 0.0104132
[50]	cv_agg's rmse: 0.959298 + 0.00967282
[100]	cv_agg's rmse: 0.949274 + 0.0104048
[50]	cv_agg's rmse: 0.959381 + 0.00961974
[100]	cv_agg's rmse: 0.949417 + 0.01

[50]	cv_agg's rmse: 0.959298 + 0.00967282
[100]	cv_agg's rmse: 0.949274 + 0.0104048
[50]	cv_agg's rmse: 0.959381 + 0.00961974
[100]	cv_agg's rmse: 0.949417 + 0.0105327
[50]	cv_agg's rmse: 0.959356 + 0.00963518
[100]	cv_agg's rmse: 0.949389 + 0.0104836
[50]	cv_agg's rmse: 0.959517 + 0.00974239
[100]	cv_agg's rmse: 0.949456 + 0.0103907
[50]	cv_agg's rmse: 0.959602 + 0.00976346
[100]	cv_agg's rmse: 0.94952 + 0.0104379
[50]	cv_agg's rmse: 0.959605 + 0.00975807
[100]	cv_agg's rmse: 0.949491 + 0.0106166
[50]	cv_agg's rmse: 0.959585 + 0.00974996
[100]	cv_agg's rmse: 0.949526 + 0.0104563
[50]	cv_agg's rmse: 0.959695 + 0.00974962
[100]	cv_agg's rmse: 0.949563 + 0.0105371
[50]	cv_agg's rmse: 0.959586 + 0.00976318
[100]	cv_agg's rmse: 0.94943 + 0.0106807
[50]	cv_agg's rmse: 0.959244 + 0.00944951
[100]	cv_agg's rmse: 0.949701 + 0.0101285
[50]	cv_agg's rmse: 0.959312 + 0.00960645
[100]	cv_agg's rmse: 0.949225 + 0.0104132
[50]	cv_agg's rmse: 0.959298 + 0.00967282
[100]	cv_agg's rmse: 0.949274 + 0.01

[50]	cv_agg's rmse: 0.959312 + 0.00960645
[100]	cv_agg's rmse: 0.949225 + 0.0104132
[50]	cv_agg's rmse: 0.959298 + 0.00967282
[100]	cv_agg's rmse: 0.949274 + 0.0104048
[50]	cv_agg's rmse: 0.959381 + 0.00961974
[100]	cv_agg's rmse: 0.949417 + 0.0105327
[50]	cv_agg's rmse: 0.959356 + 0.00963518
[100]	cv_agg's rmse: 0.949389 + 0.0104836
[50]	cv_agg's rmse: 0.959517 + 0.00974239
[100]	cv_agg's rmse: 0.949456 + 0.0103907
[50]	cv_agg's rmse: 0.959602 + 0.00976346
[100]	cv_agg's rmse: 0.94952 + 0.0104379
[50]	cv_agg's rmse: 0.959605 + 0.00975807
[100]	cv_agg's rmse: 0.949491 + 0.0106166
[50]	cv_agg's rmse: 0.959585 + 0.00974996
[100]	cv_agg's rmse: 0.949526 + 0.0104563
[50]	cv_agg's rmse: 0.959695 + 0.00974962
[100]	cv_agg's rmse: 0.949563 + 0.0105371
[50]	cv_agg's rmse: 0.959586 + 0.00976318
[100]	cv_agg's rmse: 0.94943 + 0.0106807
[50]	cv_agg's rmse: 0.959244 + 0.00944951
[100]	cv_agg's rmse: 0.949701 + 0.0101285
[50]	cv_agg's rmse: 0.959312 + 0.00960645
[100]	cv_agg's rmse: 0.949225 + 0.01

[100]	cv_agg's rmse: 0.94895 + 0.0106167
[50]	cv_agg's rmse: 0.96059 + 0.0095284
[100]	cv_agg's rmse: 0.949613 + 0.0106948
[50]	cv_agg's rmse: 0.960215 + 0.00968064
[100]	cv_agg's rmse: 0.949354 + 0.0107325
[50]	cv_agg's rmse: 0.960625 + 0.00935605
[100]	cv_agg's rmse: 0.949415 + 0.0103233
[50]	cv_agg's rmse: 0.96025 + 0.00940654
[100]	cv_agg's rmse: 0.949053 + 0.0105394
[50]	cv_agg's rmse: 0.961254 + 0.00946921
[100]	cv_agg's rmse: 0.949774 + 0.0104542
[50]	cv_agg's rmse: 0.960812 + 0.00940669
[100]	cv_agg's rmse: 0.949736 + 0.0106725
[50]	cv_agg's rmse: 0.960506 + 0.00908779
[100]	cv_agg's rmse: 0.949694 + 0.0106376
[50]	cv_agg's rmse: 0.960633 + 0.00937764
[100]	cv_agg's rmse: 0.949545 + 0.0106937
[50]	cv_agg's rmse: 0.960888 + 0.00909249
[100]	cv_agg's rmse: 0.950212 + 0.0101733
[50]	cv_agg's rmse: 0.960265 + 0.009743
[100]	cv_agg's rmse: 0.94895 + 0.0106167
[50]	cv_agg's rmse: 0.960541 + 0.00977011
[100]	cv_agg's rmse: 0.949296 + 0.010704
[50]	cv_agg's rmse: 0.960189 + 0.00975841


[50]	cv_agg's rmse: 0.959837 + 0.00963941
[100]	cv_agg's rmse: 0.949014 + 0.0106555
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.95962 + 0.00966237
[100]	cv_agg's rmse: 0.948898 + 0.0105544
[50]	cv_agg's rmse: 0.959538 + 0.00949609
[100]	cv_agg's rmse: 0.94901 + 0.0103541
[50

[100]	cv_agg's rmse: 0.949366 + 0.0105662
[50]	cv_agg's rmse: 0.959539 + 0.00985338
[100]	cv_agg's rmse: 0.949646 + 0.0107784
[50]	cv_agg's rmse: 0.95979 + 0.00962768
[100]	cv_agg's rmse: 0.949743 + 0.010288
[50]	cv_agg's rmse: 0.959468 + 0.00957097
[100]	cv_agg's rmse: 0.949468 + 0.0105087
[50]	cv_agg's rmse: 0.959291 + 0.0098696
[100]	cv_agg's rmse: 0.949009 + 0.0107556
[50]	cv_agg's rmse: 0.959431 + 0.00992503
[100]	cv_agg's rmse: 0.948983 + 0.0105869
[50]	cv_agg's rmse: 0.959243 + 0.00982651
[100]	cv_agg's rmse: 0.949079 + 0.0105106
[50]	cv_agg's rmse: 0.959672 + 0.00967088
[100]	cv_agg's rmse: 0.949287 + 0.0103435
[50]	cv_agg's rmse: 0.959621 + 0.00979533
[100]	cv_agg's rmse: 0.949173 + 0.0105976
[50]	cv_agg's rmse: 0.959439 + 0.00998223
[100]	cv_agg's rmse: 0.949133 + 0.0107203
[50]	cv_agg's rmse: 0.959446 + 0.00990695
[100]	cv_agg's rmse: 0.949286 + 0.0106902
[50]	cv_agg's rmse: 0.959634 + 0.0100679
[100]	cv_agg's rmse: 0.949577 + 0.0110148
[50]	cv_agg's rmse: 0.959673 + 0.01004

[50]	cv_agg's rmse: 0.960406 + 0.00892314
[100]	cv_agg's rmse: 0.9508 + 0.0103451
[50]	cv_agg's rmse: 0.96026 + 0.00927976
[100]	cv_agg's rmse: 0.951064 + 0.0106754
[50]	cv_agg's rmse: 0.960181 + 0.00950491
[100]	cv_agg's rmse: 0.950714 + 0.0110854
[50]	cv_agg's rmse: 0.960802 + 0.00930678
[100]	cv_agg's rmse: 0.95197 + 0.0104501
[50]	cv_agg's rmse: 0.959312 + 0.00960645
[100]	cv_agg's rmse: 0.949225 + 0.0104132
[50]	cv_agg's rmse: 0.959558 + 0.0100256
[100]	cv_agg's rmse: 0.949936 + 0.0110203
[50]	cv_agg's rmse: 0.959322 + 0.00983836
[100]	cv_agg's rmse: 0.949512 + 0.0108088
[50]	cv_agg's rmse: 0.95965 + 0.00966266
[100]	cv_agg's rmse: 0.949742 + 0.0104028
[50]	cv_agg's rmse: 0.959496 + 0.00979747
[100]	cv_agg's rmse: 0.949479 + 0.0109019
[50]	cv_agg's rmse: 0.960501 + 0.00961024
[100]	cv_agg's rmse: 0.950395 + 0.0104545
[50]	cv_agg's rmse: 0.959736 + 0.00962509
[100]	cv_agg's rmse: 0.949861 + 0.0103346
[50]	cv_agg's rmse: 0.959622 + 0.00951327
[100]	cv_agg's rmse: 0.950046 + 0.010707

[50]	cv_agg's rmse: 0.959521 + 0.00962184
[100]	cv_agg's rmse: 0.948633 + 0.0104004
[50]	cv_agg's rmse: 0.959458 + 0.0096349
[100]	cv_agg's rmse: 0.94852 + 0.0104536
[50]	cv_agg's rmse: 0.959618 + 0.00965405
[100]	cv_agg's rmse: 0.948789 + 0.0104196
[50]	cv_agg's rmse: 0.959644 + 0.0096213
[100]	cv_agg's rmse: 0.948543 + 0.0103396
[50]	cv_agg's rmse: 0.959635 + 0.00964938
[100]	cv_agg's rmse: 0.948615 + 0.0103644
[50]	cv_agg's rmse: 0.959622 + 0.00960731
[100]	cv_agg's rmse: 0.948605 + 0.0102971
[50]	cv_agg's rmse: 0.959462 + 0.009674
[100]	cv_agg's rmse: 0.948623 + 0.0103965
[50]	cv_agg's rmse: 0.959462 + 0.00967334
[100]	cv_agg's rmse: 0.948698 + 0.0104503
[50]	cv_agg's rmse: 0.959431 + 0.00962647
[100]	cv_agg's rmse: 0.948533 + 0.0104832
[50]	cv_agg's rmse: 0.959462 + 0.009674
[100]	cv_agg's rmse: 0.948623 + 0.0103965
[50]	cv_agg's rmse: 0.959431 + 0.00962647
[100]	cv_agg's rmse: 0.948533 + 0.0104832
[50]	cv_agg's rmse: 0.959621 + 0.00967115
[100]	cv_agg's rmse: 0.948585 + 0.0104744

In [53]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":5,
"num_leaves":95,
"max_bin":255,
"min_data_in_leaf":11,
"min_split_gain":1.0,
"feature_fraction": 0.6,
"bagging_fraction":0.9,
"bagging_freq":30,
"lambda_l1":1.0,
"lambda_l2":0.6,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.74962	valid_1's rmse: 1.73501
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.68903	valid_1's rmse: 1.67545
[3]	training's rmse: 1.63234	valid_1's rmse: 1.61968
[4]	training's rmse: 1.5813	valid_1's rmse: 1.5696
[5]	training's rmse: 1.53175	valid_1's rmse: 1.5209
[6]	training's rmse: 1.48559	valid_1's rmse: 1.47552
[7]	training's rmse: 1.44264	valid_1's rmse: 1.43336
[8]	training's rmse: 1.40274	valid_1's rmse: 1.3942
[9]	training's rmse: 1.36573	valid_1's rmse: 1.3578
[10]	training's rmse: 1.33204	valid_1's rmse: 1.32479
[11]	training's rmse: 1.30025	valid_1's rmse: 1.29363
[12]	training's rmse: 1.27181	valid_1's rmse: 1.26569
[13]	training's rmse: 1.24439	valid_1's rmse: 1.23875
[14]	training's rmse: 1.21961	valid_1's rmse: 1.21469
[15]	training's rmse: 1.19749	valid_1's rmse: 1.19318
[16]	training's rmse: 1.17636	valid_1's rmse: 1.17249
[17]	training's rmse: 1.15746	valid_1's rmse: 1.15405
[18]	training's rmse: 1.14004	vali

[151]	training's rmse: 0.931088	valid_1's rmse: 0.946307
[152]	training's rmse: 0.930958	valid_1's rmse: 0.946318
[153]	training's rmse: 0.930888	valid_1's rmse: 0.946303
[154]	training's rmse: 0.930818	valid_1's rmse: 0.94632
[155]	training's rmse: 0.930693	valid_1's rmse: 0.946349
[156]	training's rmse: 0.930532	valid_1's rmse: 0.946371
[157]	training's rmse: 0.930373	valid_1's rmse: 0.94636
[158]	training's rmse: 0.930288	valid_1's rmse: 0.946384
[159]	training's rmse: 0.930173	valid_1's rmse: 0.946396
[160]	training's rmse: 0.930109	valid_1's rmse: 0.946376
[161]	training's rmse: 0.929971	valid_1's rmse: 0.946382
[162]	training's rmse: 0.92989	valid_1's rmse: 0.946385
[163]	training's rmse: 0.929832	valid_1's rmse: 0.946391
[164]	training's rmse: 0.929722	valid_1's rmse: 0.946376
[165]	training's rmse: 0.929669	valid_1's rmse: 0.94638
[166]	training's rmse: 0.929501	valid_1's rmse: 0.946387
[167]	training's rmse: 0.929434	valid_1's rmse: 0.946373
[168]	training's rmse: 0.929319	val

In [54]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [55]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [56]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2716.002163087235
7376667.749894541
541.548766599547


In [57]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.54
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [58]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
print('测试集上的平均绝对误差:%.2f元'% mae)
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集上的均方根误差:388.34元
测试集上的平均绝对误差:14.13元
测试集前30天实际总的付费金额:28815998.00元
测试集前30天预测总的付费金额:27051970.63元
预测总金额准确率: 0.9387830548870398


In [59]:
import joblib
joblib.dump(lgb_r_cv,'./lgb_r_3d.pkl')

['./lgb_r_3d.pkl']