In [30]:
import pandas as pd
import numpy as np
import random
import gc
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
# import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
# import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [31]:
data = pd.read_pickle('./data_4d.pickle')
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'device_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg',
       'model_money_level', 'hour', 'weekend', 'is_holidays','mobile','platform',
       'user_creates_3_server_num','user_creates_3_role_num', 'time_interval']
label = ['role_created_30_pay_sum']
df = data[select_features].join(data[label])
df['role_created_30_pay_sum'].fillna(0,inplace=True)
df['platform'].fillna(0,inplace=True)
df['user_creates_3_server_num'].fillna(1,inplace=True)
df['user_creates_3_role_num'].fillna(1,inplace=True)
df['time_interval'].fillna(0,inplace=True)
df['model_money_level'].fillna(4,inplace=True)

In [32]:
df['pay_sum'].sum()

31133926.0

In [33]:
df_train, df_test = train_test_split(df,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [34]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)

In [36]:
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [37]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('2')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')

for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.898571 + 0.00642053
[100]	cv_agg's rmse: 0.871305 + 0.00728686
[50]	cv_agg's rmse: 0.898571 + 0.00642053
[100]	cv_agg's rmse: 0.871157 + 0.00727738
[50]	cv_agg's rmse: 0.898571 + 0.00642053
[100]	cv_agg's rmse: 0.871157 + 0.00727738
[50]	cv_agg's rmse: 0.898571 + 0.00642053
[100]	cv_agg's rmse: 0.871157 + 0.00727738
[50]	cv_agg's rmse: 0.898571 + 0.00642053
[100]	cv_agg's rmse: 0.871157 + 0.00727738
[50]	cv_agg's rmse: 0.888844 + 0.00674508
[100]	cv_agg's rmse: 0.869181 + 0.0075918
[50]	cv_agg's rmse: 0.885553 + 0.00686756
[100]	cv_agg's rmse: 0.868241 + 0.00751995
[50]	cv_agg's rmse: 0.885711 + 0.0067323
[100]	cv_agg's rmse: 0.868012 + 0.00767013
[50]	cv_agg's rmse: 0.885728 + 0.00672915
[100]	cv_agg's rmse: 0.868089 + 0.00760806
[50]	cv_agg's rmse: 0.885728 + 0.00672915
[100]	cv_agg's rmse: 0.868095 + 0.00762208
[50]	cv_agg's rmse: 0.888844 + 0.00674508
[100]	cv_agg's rmse: 0.869181 + 0.0075918
[50]	cv_agg's rmse: 0.882748 + 0.00709768
[100]	

[50]	cv_agg's rmse: 0.878634 + 0.00743768
[100]	cv_agg's rmse: 0.867069 + 0.0081756
[50]	cv_agg's rmse: 0.878537 + 0.00750183
[100]	cv_agg's rmse: 0.866997 + 0.00784619
[50]	cv_agg's rmse: 0.878391 + 0.00751493
[100]	cv_agg's rmse: 0.866917 + 0.00797184
[50]	cv_agg's rmse: 0.878427 + 0.00747132
[100]	cv_agg's rmse: 0.866868 + 0.00784534
[50]	cv_agg's rmse: 0.878447 + 0.00738821
[100]	cv_agg's rmse: 0.866822 + 0.007728
[50]	cv_agg's rmse: 0.87846 + 0.0074485
[100]	cv_agg's rmse: 0.867032 + 0.00776468
[50]	cv_agg's rmse: 0.878431 + 0.00753121
[100]	cv_agg's rmse: 0.867007 + 0.00786526
[50]	cv_agg's rmse: 0.878463 + 0.00740705
[100]	cv_agg's rmse: 0.866898 + 0.00777658
[50]	cv_agg's rmse: 0.878514 + 0.00741694
[100]	cv_agg's rmse: 0.866978 + 0.00784263
[50]	cv_agg's rmse: 0.878858 + 0.00775074
[100]	cv_agg's rmse: 0.867737 + 0.00843227
[50]	cv_agg's rmse: 0.87872 + 0.00753216
[100]	cv_agg's rmse: 0.867128 + 0.00825152
[50]	cv_agg's rmse: 0.878634 + 0.00743768
[100]	cv_agg's rmse: 0.867069

[100]	cv_agg's rmse: 0.867737 + 0.00843227
[50]	cv_agg's rmse: 0.87872 + 0.00753216
[100]	cv_agg's rmse: 0.867128 + 0.00825152
[50]	cv_agg's rmse: 0.878634 + 0.00743768
[100]	cv_agg's rmse: 0.867069 + 0.0081756
[50]	cv_agg's rmse: 0.878537 + 0.00750183
[100]	cv_agg's rmse: 0.866997 + 0.00784619
[50]	cv_agg's rmse: 0.878391 + 0.00751493
[100]	cv_agg's rmse: 0.866917 + 0.00797184
[50]	cv_agg's rmse: 0.878427 + 0.00747132
[100]	cv_agg's rmse: 0.866868 + 0.00784534
[50]	cv_agg's rmse: 0.878447 + 0.00738821
[100]	cv_agg's rmse: 0.866822 + 0.007728
[50]	cv_agg's rmse: 0.87846 + 0.0074485
[100]	cv_agg's rmse: 0.867032 + 0.00776468
[50]	cv_agg's rmse: 0.878431 + 0.00753121
[100]	cv_agg's rmse: 0.867007 + 0.00786526
[50]	cv_agg's rmse: 0.878463 + 0.00740705
[100]	cv_agg's rmse: 0.866898 + 0.00777658
[50]	cv_agg's rmse: 0.878514 + 0.00741694
[100]	cv_agg's rmse: 0.866978 + 0.00784263
[50]	cv_agg's rmse: 0.878858 + 0.00775074
[100]	cv_agg's rmse: 0.867737 + 0.00843227
[50]	cv_agg's rmse: 0.87872 

[50]	cv_agg's rmse: 0.878514 + 0.00741694
[100]	cv_agg's rmse: 0.866978 + 0.00784263
[50]	cv_agg's rmse: 0.878858 + 0.00775074
[100]	cv_agg's rmse: 0.867737 + 0.00843227
[50]	cv_agg's rmse: 0.87872 + 0.00753216
[100]	cv_agg's rmse: 0.867128 + 0.00825152
[50]	cv_agg's rmse: 0.878634 + 0.00743768
[100]	cv_agg's rmse: 0.867069 + 0.0081756
[50]	cv_agg's rmse: 0.878537 + 0.00750183
[100]	cv_agg's rmse: 0.866997 + 0.00784619
[50]	cv_agg's rmse: 0.878391 + 0.00751493
[100]	cv_agg's rmse: 0.866917 + 0.00797184
[50]	cv_agg's rmse: 0.878427 + 0.00747132
[100]	cv_agg's rmse: 0.866868 + 0.00784534
[50]	cv_agg's rmse: 0.878447 + 0.00738821
[100]	cv_agg's rmse: 0.866822 + 0.007728
[50]	cv_agg's rmse: 0.87846 + 0.0074485
[100]	cv_agg's rmse: 0.867032 + 0.00776468
[50]	cv_agg's rmse: 0.878431 + 0.00753121
[100]	cv_agg's rmse: 0.867007 + 0.00786526
[50]	cv_agg's rmse: 0.878463 + 0.00740705
[100]	cv_agg's rmse: 0.866898 + 0.00777658
[50]	cv_agg's rmse: 0.878514 + 0.00741694
[100]	cv_agg's rmse: 0.866978

[50]	cv_agg's rmse: 0.881696 + 0.00744323
[100]	cv_agg's rmse: 0.868955 + 0.00753146
[50]	cv_agg's rmse: 0.881992 + 0.0072215
[100]	cv_agg's rmse: 0.869463 + 0.00752762
[50]	cv_agg's rmse: 0.880643 + 0.00714647
[100]	cv_agg's rmse: 0.867312 + 0.00753271
[50]	cv_agg's rmse: 0.880575 + 0.00717013
[100]	cv_agg's rmse: 0.867755 + 0.00738173
[50]	cv_agg's rmse: 0.880463 + 0.00770147
[100]	cv_agg's rmse: 0.867719 + 0.00807706
[50]	cv_agg's rmse: 0.880935 + 0.0074477
[100]	cv_agg's rmse: 0.868089 + 0.00766312
[50]	cv_agg's rmse: 0.880561 + 0.00744481
[100]	cv_agg's rmse: 0.868193 + 0.00785029
[50]	cv_agg's rmse: 0.880836 + 0.00749647
[100]	cv_agg's rmse: 0.867988 + 0.00779283
[50]	cv_agg's rmse: 0.88142 + 0.00734084
[100]	cv_agg's rmse: 0.868521 + 0.00761756
[50]	cv_agg's rmse: 0.881034 + 0.00712191
[100]	cv_agg's rmse: 0.868315 + 0.00756835
[50]	cv_agg's rmse: 0.881309 + 0.00745097
[100]	cv_agg's rmse: 0.868685 + 0.00784824
[50]	cv_agg's rmse: 0.881446 + 0.00734351
[100]	cv_agg's rmse: 0.868

[50]	cv_agg's rmse: 0.879855 + 0.00721216
[100]	cv_agg's rmse: 0.867318 + 0.00774429
[50]	cv_agg's rmse: 0.879837 + 0.00728167
[100]	cv_agg's rmse: 0.866984 + 0.00768427
[50]	cv_agg's rmse: 0.879934 + 0.00712693
[100]	cv_agg's rmse: 0.867384 + 0.00753798
[50]	cv_agg's rmse: 0.879798 + 0.00729465
[100]	cv_agg's rmse: 0.867334 + 0.00755268
[50]	cv_agg's rmse: 0.879824 + 0.00733212
[100]	cv_agg's rmse: 0.866889 + 0.00729374
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.866849 + 0.00756502
[50]	cv_agg's rmse: 0.879609 + 0.00731134
[100]	cv_agg's rmse: 0.

[50]	cv_agg's rmse: 0.87899 + 0.00736078
[100]	cv_agg's rmse: 0.86706 + 0.00729477
[50]	cv_agg's rmse: 0.879107 + 0.00722052
[100]	cv_agg's rmse: 0.867286 + 0.00769424
[50]	cv_agg's rmse: 0.878984 + 0.0074653
[100]	cv_agg's rmse: 0.867445 + 0.00753477
[50]	cv_agg's rmse: 0.879506 + 0.00703769
[100]	cv_agg's rmse: 0.867359 + 0.00758587
[50]	cv_agg's rmse: 0.879699 + 0.00681994
[100]	cv_agg's rmse: 0.867564 + 0.00713943
[50]	cv_agg's rmse: 0.8794 + 0.00687913
[100]	cv_agg's rmse: 0.867391 + 0.0073831
[50]	cv_agg's rmse: 0.879484 + 0.00716669
[100]	cv_agg's rmse: 0.867782 + 0.00758837
[50]	cv_agg's rmse: 0.879593 + 0.00717758
[100]	cv_agg's rmse: 0.867751 + 0.00725416
[50]	cv_agg's rmse: 0.878925 + 0.00731947
[100]	cv_agg's rmse: 0.866871 + 0.00765237
[50]	cv_agg's rmse: 0.878942 + 0.00737588
[100]	cv_agg's rmse: 0.866761 + 0.00783479
[50]	cv_agg's rmse: 0.879071 + 0.00714442
[100]	cv_agg's rmse: 0.866789 + 0.00780114
[50]	cv_agg's rmse: 0.879117 + 0.0069084
[100]	cv_agg's rmse: 0.867064 

[50]	cv_agg's rmse: 0.878828 + 0.0072765
[100]	cv_agg's rmse: 0.866997 + 0.00759878
[50]	cv_agg's rmse: 0.878447 + 0.00738821
[100]	cv_agg's rmse: 0.866822 + 0.007728
[50]	cv_agg's rmse: 0.879059 + 0.00737831
[100]	cv_agg's rmse: 0.867844 + 0.00777902
[50]	cv_agg's rmse: 0.879073 + 0.00812589
[100]	cv_agg's rmse: 0.867804 + 0.00775123
[50]	cv_agg's rmse: 0.878881 + 0.00734994
[100]	cv_agg's rmse: 0.868253 + 0.00817608
[50]	cv_agg's rmse: 0.879303 + 0.00792638
[100]	cv_agg's rmse: 0.867999 + 0.00826399
[50]	cv_agg's rmse: 0.879629 + 0.0076935
[100]	cv_agg's rmse: 0.868632 + 0.00780034
[50]	cv_agg's rmse: 0.879654 + 0.00745971
[100]	cv_agg's rmse: 0.868573 + 0.0074947
[50]	cv_agg's rmse: 0.879588 + 0.00729919
[100]	cv_agg's rmse: 0.868886 + 0.00776172
[50]	cv_agg's rmse: 0.879787 + 0.00744897
[100]	cv_agg's rmse: 0.86912 + 0.00739129
[50]	cv_agg's rmse: 0.879824 + 0.00754745
[100]	cv_agg's rmse: 0.869203 + 0.00776055
[50]	cv_agg's rmse: 0.878447 + 0.00738821
[100]	cv_agg's rmse: 0.866822

[50]	cv_agg's rmse: 0.879534 + 0.00748483
[100]	cv_agg's rmse: 0.866788 + 0.00781418
[50]	cv_agg's rmse: 0.879562 + 0.00742614
[100]	cv_agg's rmse: 0.866689 + 0.00762037
[50]	cv_agg's rmse: 0.879551 + 0.00751071
[100]	cv_agg's rmse: 0.866708 + 0.00771217
[50]	cv_agg's rmse: 0.879658 + 0.00741058
[100]	cv_agg's rmse: 0.866792 + 0.00782523
[50]	cv_agg's rmse: 0.879429 + 0.00745448
[100]	cv_agg's rmse: 0.866792 + 0.00767243
[50]	cv_agg's rmse: 0.87943 + 0.00745448
[100]	cv_agg's rmse: 0.866792 + 0.00767243
[50]	cv_agg's rmse: 0.879415 + 0.0074427
[100]	cv_agg's rmse: 0.866512 + 0.00772457
[50]	cv_agg's rmse: 0.879429 + 0.00745448
[100]	cv_agg's rmse: 0.866792 + 0.00767243
[50]	cv_agg's rmse: 0.879415 + 0.0074427
[100]	cv_agg's rmse: 0.866512 + 0.00772457
[50]	cv_agg's rmse: 0.879476 + 0.0074308
[100]	cv_agg's rmse: 0.866705 + 0.00770399
[50]	cv_agg's rmse: 0.87953 + 0.00736009
[100]	cv_agg's rmse: 0.866615 + 0.00764902
[50]	cv_agg's rmse: 0.879549 + 0.00736617
[100]	cv_agg's rmse: 0.86669

In [44]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":7,
"num_leaves":30,
"max_bin":255,
"min_data_in_leaf":61,
"min_split_gain":1.0,
"feature_fraction": 0.6,
"bagging_fraction":0.7,
"bagging_freq":5,
"lambda_l1":0.0,
"lambda_l2":0.0,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.74067	valid_1's rmse: 1.73119
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.67634	valid_1's rmse: 1.66764
[3]	training's rmse: 1.61735	valid_1's rmse: 1.60945
[4]	training's rmse: 1.56254	valid_1's rmse: 1.5553
[5]	training's rmse: 1.51069	valid_1's rmse: 1.50426
[6]	training's rmse: 1.46297	valid_1's rmse: 1.45718
[7]	training's rmse: 1.41649	valid_1's rmse: 1.41151
[8]	training's rmse: 1.37537	valid_1's rmse: 1.37108
[9]	training's rmse: 1.33705	valid_1's rmse: 1.33336
[10]	training's rmse: 1.29934	valid_1's rmse: 1.29642
[11]	training's rmse: 1.26428	valid_1's rmse: 1.26214
[12]	training's rmse: 1.23179	valid_1's rmse: 1.23047
[13]	training's rmse: 1.20214	valid_1's rmse: 1.20146
[14]	training's rmse: 1.17488	valid_1's rmse: 1.17491
[15]	training's rmse: 1.14944	valid_1's rmse: 1.15012
[16]	training's rmse: 1.12555	valid_1's rmse: 1.12689
[17]	training's rmse: 1.10343	valid_1's rmse: 1.10536
[18]	training's rmse: 1.08314	

[187]	training's rmse: 0.840249	valid_1's rmse: 0.8754
[188]	training's rmse: 0.840146	valid_1's rmse: 0.875428
[189]	training's rmse: 0.84003	valid_1's rmse: 0.875462
[190]	training's rmse: 0.83998	valid_1's rmse: 0.875458
[191]	training's rmse: 0.839949	valid_1's rmse: 0.875485
[192]	training's rmse: 0.839893	valid_1's rmse: 0.875508
[193]	training's rmse: 0.839843	valid_1's rmse: 0.875518
[194]	training's rmse: 0.839768	valid_1's rmse: 0.875541
[195]	training's rmse: 0.839752	valid_1's rmse: 0.875558
[196]	training's rmse: 0.839642	valid_1's rmse: 0.875563
[197]	training's rmse: 0.839561	valid_1's rmse: 0.875553
[198]	training's rmse: 0.839435	valid_1's rmse: 0.875554
[199]	training's rmse: 0.839323	valid_1's rmse: 0.875588
[200]	training's rmse: 0.839177	valid_1's rmse: 0.875554
[201]	training's rmse: 0.839076	valid_1's rmse: 0.875558
[202]	training's rmse: 0.838941	valid_1's rmse: 0.875545
[203]	training's rmse: 0.838819	valid_1's rmse: 0.875552
[204]	training's rmse: 0.838711	val

In [45]:
y_predict_lgb = lgb_r_cv.predict(x_test)
mse = mean_squared_error(np.expm1(y_predict_lgb), np.expm1(y_test))
mae = mean_absolute_error(np.expm1(y_predict_lgb), np.expm1(y_test))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2719.652018507354
7396507.101771125
504.3984608501788


In [11]:
# 用df_test数据集进行测试

In [46]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [47]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [48]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2599.509518080262
6757449.734589876
497.4591481824169


In [49]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.4323091448802285

In [50]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.42
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [51]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
# print('测试集上的均方误差:%f元' % mse)
print('测试集上的平均绝对误差:%.2f元'% mae)

测试集上的均方根误差:379.55元
测试集上的平均绝对误差:13.12元


In [52]:
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集前30天实际总的付费金额:29399464.00元
测试集前30天预测总的付费金额:28016521.25元
预测总金额准确率: 0.9529602734973173


# 分计划

In [54]:
data['create_role_time'] = data['create_role_time'].dt.date

In [55]:
df_source = data[['user_id','cp_server_no','cp_role_id','create_role_time','channel_id','source_id','pay_sum']]
df_source.drop_duplicates(inplace=True)
df_source_predict = pd.merge(predict_data,df_source,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [56]:
df_source_predict['is_pay'] = (df_source_predict['pay_sum'] != 0).astype(int)
groups = df_source_predict.groupby(['channel_id','source_id','create_role_time'])
print(groups.ngroups)
temp1 = groups['role_created_30_pay_sum'].agg([('30_pay_sum','sum')]).reset_index()
temp2 = groups['predict_30_pay'].agg([('predict_30_pay','sum')]).reset_index()
temp3 = groups['pay_sum'].agg([('n_pay_sum','sum')]).reset_index()
temp4 = groups['user_id'].agg([('n_user_sum','count')]).reset_index()
temp5 = groups['is_pay'].agg([('pay_user_sum','sum')]).reset_index()

125399


In [57]:
df_source_predict = pd.merge(temp1,temp2,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp3,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp4,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp5,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')

In [58]:
def weight_error(df):
    df['error'] = np.abs((df['predict_30_pay']-df['30_pay_sum'])/df['30_pay_sum'])
    df['weight'] = df['30_pay_sum'] / df['30_pay_sum'].sum()
    df['weight_error'] = df_source_predict['weight'] * df_source_predict['error']
    return df['weight_error'].sum()

In [59]:
weight_error(df_source_predict)

0.5796987122301628

In [60]:
print(weight_error(df_source_predict[df_source_predict['n_user_sum']>50]))
print(df_source_predict[df_source_predict['n_user_sum']>4].shape[0])

0.30694011085495354
32894


In [61]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>5]))
print(df_source_predict[df_source_predict['pay_user_sum']>5].shape[0])

0.1301848199692556
995


In [62]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>3]))
print(df_source_predict[df_source_predict['pay_user_sum']>3].shape[0])

0.19609433382615982
1890


In [63]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>2]))
print(df_source_predict[df_source_predict['pay_user_sum']>2].shape[0])

0.2604567513201781
3026
