In [1]:
import pandas as pd
import numpy as np
import random
import gc
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
# import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
# import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_pickle('./data_3d.pickle')
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'device_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg',
       'model_money_level', 'hour', 'weekend', 'is_holidays','mobile','platform',
       'user_creates_3_server_num','user_creates_3_role_num', 'time_interval']
label = ['role_created_30_pay_sum']
df = data[select_features].join(data[label])
df['role_created_30_pay_sum'].fillna(0,inplace=True)
df['platform'].fillna(0,inplace=True)
df['user_creates_3_server_num'].fillna(1,inplace=True)
df['user_creates_3_role_num'].fillna(1,inplace=True)
df['time_interval'].fillna(0,inplace=True)
df['model_money_level'].fillna(4,inplace=True)

In [57]:
df['pay_sum'].sum()

26215212.0

In [6]:
df_train, df_test = train_test_split(df,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [7]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)

In [8]:
target_ln = np.log1p(target)

In [9]:
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [10]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('2')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')

for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.975291 + 0.00767061
[100]	cv_agg's rmse: 0.949127 + 0.00802917
[50]	cv_agg's rmse: 0.975291 + 0.00767061
[100]	cv_agg's rmse: 0.948919 + 0.00794585
[50]	cv_agg's rmse: 0.975291 + 0.00767061
[100]	cv_agg's rmse: 0.948919 + 0.00794585
[50]	cv_agg's rmse: 0.975291 + 0.00767061
[100]	cv_agg's rmse: 0.948919 + 0.00794585
[50]	cv_agg's rmse: 0.975291 + 0.00767061
[100]	cv_agg's rmse: 0.948919 + 0.00794585
[50]	cv_agg's rmse: 0.96574 + 0.00794416
[100]	cv_agg's rmse: 0.94705 + 0.00814008
[50]	cv_agg's rmse: 0.962325 + 0.00808889
[100]	cv_agg's rmse: 0.945828 + 0.0081274
[50]	cv_agg's rmse: 0.96229 + 0.00805714
[100]	cv_agg's rmse: 0.945746 + 0.0080149
[50]	cv_agg's rmse: 0.962335 + 0.0080158
[100]	cv_agg's rmse: 0.945847 + 0.00792912
[50]	cv_agg's rmse: 0.962335 + 0.0080158
[100]	cv_agg's rmse: 0.94582 + 0.00809954
[50]	cv_agg's rmse: 0.96574 + 0.00794416
[100]	cv_agg's rmse: 0.94705 + 0.00814008
[50]	cv_agg's rmse: 0.959383 + 0.00826106
[100]	cv_agg'

[100]	cv_agg's rmse: 0.944944 + 0.00794154
[50]	cv_agg's rmse: 0.956479 + 0.00827608
[100]	cv_agg's rmse: 0.944978 + 0.00807512
[50]	cv_agg's rmse: 0.956563 + 0.00823499
[100]	cv_agg's rmse: 0.944986 + 0.00801439
[50]	cv_agg's rmse: 0.956556 + 0.00828527
[100]	cv_agg's rmse: 0.94498 + 0.00793143
[50]	cv_agg's rmse: 0.956558 + 0.00828006
[100]	cv_agg's rmse: 0.945027 + 0.00787788
[50]	cv_agg's rmse: 0.956544 + 0.00826191
[100]	cv_agg's rmse: 0.944906 + 0.00798271
[50]	cv_agg's rmse: 0.956534 + 0.00823506
[100]	cv_agg's rmse: 0.944895 + 0.00787425
[50]	cv_agg's rmse: 0.956465 + 0.0082174
[100]	cv_agg's rmse: 0.945088 + 0.00788435
[50]	cv_agg's rmse: 0.956554 + 0.00819242
[100]	cv_agg's rmse: 0.94496 + 0.00776146
[50]	cv_agg's rmse: 0.956479 + 0.0083319
[100]	cv_agg's rmse: 0.945021 + 0.00802708
[50]	cv_agg's rmse: 0.956556 + 0.00817708
[100]	cv_agg's rmse: 0.944999 + 0.00796319
[50]	cv_agg's rmse: 0.95647 + 0.00822812
[100]	cv_agg's rmse: 0.944944 + 0.00794154
[50]	cv_agg's rmse: 0.95647

[100]	cv_agg's rmse: 0.945021 + 0.00802708
[50]	cv_agg's rmse: 0.956556 + 0.00817708
[100]	cv_agg's rmse: 0.944999 + 0.00796319
[50]	cv_agg's rmse: 0.95647 + 0.00822812
[100]	cv_agg's rmse: 0.944944 + 0.00794154
[50]	cv_agg's rmse: 0.956479 + 0.00827608
[100]	cv_agg's rmse: 0.944978 + 0.00807512
[50]	cv_agg's rmse: 0.956563 + 0.00823499
[100]	cv_agg's rmse: 0.944986 + 0.00801439
[50]	cv_agg's rmse: 0.956556 + 0.00828527
[100]	cv_agg's rmse: 0.94498 + 0.00793143
[50]	cv_agg's rmse: 0.956558 + 0.00828006
[100]	cv_agg's rmse: 0.945027 + 0.00787788
[50]	cv_agg's rmse: 0.956544 + 0.00826191
[100]	cv_agg's rmse: 0.944906 + 0.00798271
[50]	cv_agg's rmse: 0.956534 + 0.00823506
[100]	cv_agg's rmse: 0.944895 + 0.00787425
[50]	cv_agg's rmse: 0.956465 + 0.0082174
[100]	cv_agg's rmse: 0.945088 + 0.00788435
[50]	cv_agg's rmse: 0.956554 + 0.00819242
[100]	cv_agg's rmse: 0.94496 + 0.00776146
[50]	cv_agg's rmse: 0.956479 + 0.0083319
[100]	cv_agg's rmse: 0.945021 + 0.00802708
[50]	cv_agg's rmse: 0.95655

[100]	cv_agg's rmse: 0.945088 + 0.00788435
[50]	cv_agg's rmse: 0.956554 + 0.00819242
[100]	cv_agg's rmse: 0.94496 + 0.00776146
[50]	cv_agg's rmse: 0.956479 + 0.0083319
[100]	cv_agg's rmse: 0.945021 + 0.00802708
[50]	cv_agg's rmse: 0.956556 + 0.00817708
[100]	cv_agg's rmse: 0.944999 + 0.00796319
[50]	cv_agg's rmse: 0.95647 + 0.00822812
[100]	cv_agg's rmse: 0.944944 + 0.00794154
[50]	cv_agg's rmse: 0.956479 + 0.00827608
[100]	cv_agg's rmse: 0.944978 + 0.00807512
[50]	cv_agg's rmse: 0.956563 + 0.00823499
[100]	cv_agg's rmse: 0.944986 + 0.00801439
[50]	cv_agg's rmse: 0.956556 + 0.00828527
[100]	cv_agg's rmse: 0.94498 + 0.00793143
[50]	cv_agg's rmse: 0.956558 + 0.00828006
[100]	cv_agg's rmse: 0.945027 + 0.00787788
[50]	cv_agg's rmse: 0.956544 + 0.00826191
[100]	cv_agg's rmse: 0.944906 + 0.00798271
[50]	cv_agg's rmse: 0.956534 + 0.00823506
[100]	cv_agg's rmse: 0.944895 + 0.00787425
[50]	cv_agg's rmse: 0.956465 + 0.0082174
[100]	cv_agg's rmse: 0.945088 + 0.00788435
[50]	cv_agg's rmse: 0.95655

[50]	cv_agg's rmse: 0.959212 + 0.00812986
[100]	cv_agg's rmse: 0.946783 + 0.00771628
[50]	cv_agg's rmse: 0.959308 + 0.00790154
[100]	cv_agg's rmse: 0.946443 + 0.00785102
[50]	cv_agg's rmse: 0.958415 + 0.00834641
[100]	cv_agg's rmse: 0.945368 + 0.00821854
[50]	cv_agg's rmse: 0.958495 + 0.00830609
[100]	cv_agg's rmse: 0.945623 + 0.00852814
[50]	cv_agg's rmse: 0.958884 + 0.00836921
[100]	cv_agg's rmse: 0.945729 + 0.00839773
[50]	cv_agg's rmse: 0.958607 + 0.00789889
[100]	cv_agg's rmse: 0.945697 + 0.00827353
[50]	cv_agg's rmse: 0.95906 + 0.00813543
[100]	cv_agg's rmse: 0.946004 + 0.00815877
[50]	cv_agg's rmse: 0.958689 + 0.00813407
[100]	cv_agg's rmse: 0.946355 + 0.00819751
[50]	cv_agg's rmse: 0.959106 + 0.0078576
[100]	cv_agg's rmse: 0.945968 + 0.0080937
[50]	cv_agg's rmse: 0.958885 + 0.00823531
[100]	cv_agg's rmse: 0.946132 + 0.00806119
[50]	cv_agg's rmse: 0.959219 + 0.00815029
[100]	cv_agg's rmse: 0.946477 + 0.008212
[50]	cv_agg's rmse: 0.959223 + 0.0079916
[100]	cv_agg's rmse: 0.946366

[50]	cv_agg's rmse: 0.957446 + 0.00796374
[100]	cv_agg's rmse: 0.945016 + 0.00792297
[50]	cv_agg's rmse: 0.957784 + 0.00824265
[100]	cv_agg's rmse: 0.945406 + 0.00787314
[50]	cv_agg's rmse: 0.957971 + 0.00802172
[100]	cv_agg's rmse: 0.945432 + 0.00792203
[50]	cv_agg's rmse: 0.95798 + 0.00801332
[100]	cv_agg's rmse: 0.945584 + 0.00813151
[50]	cv_agg's rmse: 0.957854 + 0.0079887
[100]	cv_agg's rmse: 0.945326 + 0.00799538
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.945337 + 0.00813622
[50]	cv_agg's rmse: 0.957775 + 0.00830645
[100]	cv_agg's rmse: 0.94

[50]	cv_agg's rmse: 0.957601 + 0.00780322
[100]	cv_agg's rmse: 0.945781 + 0.00802101
[50]	cv_agg's rmse: 0.95742 + 0.00760011
[100]	cv_agg's rmse: 0.945687 + 0.00773751
[50]	cv_agg's rmse: 0.957581 + 0.00748852
[100]	cv_agg's rmse: 0.946239 + 0.0075296
[50]	cv_agg's rmse: 0.957011 + 0.00752952
[100]	cv_agg's rmse: 0.945636 + 0.00772043
[50]	cv_agg's rmse: 0.95788 + 0.0074941
[100]	cv_agg's rmse: 0.946079 + 0.00750925
[50]	cv_agg's rmse: 0.957756 + 0.00791314
[100]	cv_agg's rmse: 0.946115 + 0.00754337
[50]	cv_agg's rmse: 0.957669 + 0.00771629
[100]	cv_agg's rmse: 0.946168 + 0.00767397
[50]	cv_agg's rmse: 0.957696 + 0.00770887
[100]	cv_agg's rmse: 0.945844 + 0.00768912
[50]	cv_agg's rmse: 0.957272 + 0.00815893
[100]	cv_agg's rmse: 0.945483 + 0.00801808
[50]	cv_agg's rmse: 0.957161 + 0.00792047
[100]	cv_agg's rmse: 0.945438 + 0.00798756
[50]	cv_agg's rmse: 0.957255 + 0.00788738
[100]	cv_agg's rmse: 0.945435 + 0.00804133
[50]	cv_agg's rmse: 0.95731 + 0.00781463
[100]	cv_agg's rmse: 0.94551

[50]	cv_agg's rmse: 0.956969 + 0.00808258
[100]	cv_agg's rmse: 0.945207 + 0.00768274
[50]	cv_agg's rmse: 0.956554 + 0.00819242
[100]	cv_agg's rmse: 0.94496 + 0.00776146
[50]	cv_agg's rmse: 0.956278 + 0.0080913
[100]	cv_agg's rmse: 0.945289 + 0.00818919
[50]	cv_agg's rmse: 0.95717 + 0.00815682
[100]	cv_agg's rmse: 0.945975 + 0.00832273
[50]	cv_agg's rmse: 0.95676 + 0.00748689
[100]	cv_agg's rmse: 0.945543 + 0.00807974
[50]	cv_agg's rmse: 0.95731 + 0.0080047
[100]	cv_agg's rmse: 0.946432 + 0.00771768
[50]	cv_agg's rmse: 0.956825 + 0.00809708
[100]	cv_agg's rmse: 0.946319 + 0.00776645
[50]	cv_agg's rmse: 0.957432 + 0.00729633
[100]	cv_agg's rmse: 0.946303 + 0.00784742
[50]	cv_agg's rmse: 0.957092 + 0.00817822
[100]	cv_agg's rmse: 0.946253 + 0.00783001
[50]	cv_agg's rmse: 0.957441 + 0.00815529
[100]	cv_agg's rmse: 0.947084 + 0.00738187
[50]	cv_agg's rmse: 0.957362 + 0.00795528
[100]	cv_agg's rmse: 0.946241 + 0.00772941
[50]	cv_agg's rmse: 0.956554 + 0.00819242
[100]	cv_agg's rmse: 0.94496 

[50]	cv_agg's rmse: 0.956398 + 0.00802865
[100]	cv_agg's rmse: 0.945015 + 0.00790296
[50]	cv_agg's rmse: 0.956452 + 0.00802509
[100]	cv_agg's rmse: 0.944838 + 0.00801176
[50]	cv_agg's rmse: 0.956406 + 0.00803762
[100]	cv_agg's rmse: 0.94488 + 0.00790451
[50]	cv_agg's rmse: 0.956382 + 0.00800672
[100]	cv_agg's rmse: 0.944941 + 0.00795552
[50]	cv_agg's rmse: 0.956124 + 0.00806066
[100]	cv_agg's rmse: 0.944755 + 0.008058
[50]	cv_agg's rmse: 0.956124 + 0.00806066
[100]	cv_agg's rmse: 0.944755 + 0.00805799
[50]	cv_agg's rmse: 0.9563 + 0.0079933
[100]	cv_agg's rmse: 0.944835 + 0.00789392
[50]	cv_agg's rmse: 0.956124 + 0.00806066
[100]	cv_agg's rmse: 0.944755 + 0.008058
[50]	cv_agg's rmse: 0.9563 + 0.0079933
[100]	cv_agg's rmse: 0.944835 + 0.00789392
[50]	cv_agg's rmse: 0.956266 + 0.00797781
[100]	cv_agg's rmse: 0.944841 + 0.00783637
[50]	cv_agg's rmse: 0.956392 + 0.00808053
[100]	cv_agg's rmse: 0.94481 + 0.00809654
[50]	cv_agg's rmse: 0.956449 + 0.00803679
[100]	cv_agg's rmse: 0.944961 + 0.0

In [12]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":5,
"num_leaves":25,
"max_bin":255,
"min_data_in_leaf":61,
"min_split_gain":1.0,
"feature_fraction": 1.0,
"bagging_fraction":0.9,
"bagging_freq":25,
"lambda_l1":0.1,
"lambda_l2":1.0,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.74436	valid_1's rmse: 1.73576
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.68421	valid_1's rmse: 1.67584
[3]	training's rmse: 1.62799	valid_1's rmse: 1.61985
[4]	training's rmse: 1.5755	valid_1's rmse: 1.56762
[5]	training's rmse: 1.52651	valid_1's rmse: 1.51881
[6]	training's rmse: 1.48084	valid_1's rmse: 1.47333
[7]	training's rmse: 1.43837	valid_1's rmse: 1.43105
[8]	training's rmse: 1.39885	valid_1's rmse: 1.39167
[9]	training's rmse: 1.36212	valid_1's rmse: 1.35519
[10]	training's rmse: 1.32801	valid_1's rmse: 1.3213
[11]	training's rmse: 1.29643	valid_1's rmse: 1.28991
[12]	training's rmse: 1.26725	valid_1's rmse: 1.26091
[13]	training's rmse: 1.24027	valid_1's rmse: 1.23418
[14]	training's rmse: 1.21533	valid_1's rmse: 1.20949
[15]	training's rmse: 1.19234	valid_1's rmse: 1.1867
[16]	training's rmse: 1.17118	valid_1's rmse: 1.16579
[17]	training's rmse: 1.15171	valid_1's rmse: 1.14646
[18]	training's rmse: 1.13382	va

[191]	training's rmse: 0.921179	valid_1's rmse: 0.937771
[192]	training's rmse: 0.921144	valid_1's rmse: 0.937769
[193]	training's rmse: 0.921016	valid_1's rmse: 0.937724
[194]	training's rmse: 0.920989	valid_1's rmse: 0.937717
[195]	training's rmse: 0.920939	valid_1's rmse: 0.937725
[196]	training's rmse: 0.92086	valid_1's rmse: 0.937723
[197]	training's rmse: 0.920758	valid_1's rmse: 0.937704
[198]	training's rmse: 0.920727	valid_1's rmse: 0.937698
[199]	training's rmse: 0.920675	valid_1's rmse: 0.9377
[200]	training's rmse: 0.920596	valid_1's rmse: 0.937721
[201]	training's rmse: 0.920496	valid_1's rmse: 0.937714
[202]	training's rmse: 0.920433	valid_1's rmse: 0.937728
[203]	training's rmse: 0.920376	valid_1's rmse: 0.937717
[204]	training's rmse: 0.920327	valid_1's rmse: 0.937722
[205]	training's rmse: 0.920221	valid_1's rmse: 0.937716
[206]	training's rmse: 0.92018	valid_1's rmse: 0.937701
[207]	training's rmse: 0.92009	valid_1's rmse: 0.937703
[208]	training's rmse: 0.920068	vali

In [13]:
y_predict_lgb = lgb_r_cv.predict(x_test)
mse = mean_squared_error(np.expm1(y_predict_lgb), np.expm1(y_test))
mae = mean_absolute_error(np.expm1(y_predict_lgb), np.expm1(y_test))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2655.816257578629
7053359.994018956
545.8582311830041


In [11]:
# 用df_test数据集进行测试

In [14]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [16]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [17]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2878.5295722532132
8285932.498336266
564.4621144318143


In [18]:
sum(np.expm1(y_predict))

18126497.53777649

In [19]:
sum(np.expm1(target_test_ln))

28092823.0

In [20]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.5498208046785218

In [43]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.54
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [44]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
# print('测试集上的均方误差:%f元' % mse)
print('测试集上的平均绝对误差:%.2f元'% mae)

测试集上的均方根误差:423.23元
测试集上的平均绝对误差:14.78元


In [45]:
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集前30天实际总的付费金额:29597555.00元
测试集前30天预测总的付费金额:27914806.21元
预测总金额准确率: 0.9431456824111218


## 分计划

In [None]:
data['create_role_time'] = data['create_role_time'].dt.date

In [48]:

df_source = data[['user_id','cp_server_no','cp_role_id','create_role_time','channel_id','source_id','pay_sum']]
df_source.drop_duplicates(inplace=True)
df_source_predict = pd.merge(predict_data,df_source,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [49]:
df_source_predict['is_pay'] = (df_source_predict['pay_sum'] != 0).astype(int)
groups = df_source_predict.groupby(['channel_id','source_id','create_role_time'])
print(groups.ngroups)
temp1 = groups['role_created_30_pay_sum'].agg([('30_pay_sum','sum')]).reset_index()
temp2 = groups['predict_30_pay'].agg([('predict_30_pay','sum')]).reset_index()
temp3 = groups['pay_sum'].agg([('n_pay_sum','sum')]).reset_index()
temp4 = groups['user_id'].agg([('n_user_sum','count')]).reset_index()
temp5 = groups['is_pay'].agg([('pay_user_sum','sum')]).reset_index()

125479


In [50]:
df_source_predict = pd.merge(temp1,temp2,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp3,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp4,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp5,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')

In [51]:
def weight_error(df):
    df['error'] = np.abs((df['predict_30_pay']-df['30_pay_sum'])/df['30_pay_sum'])
    df['weight'] = df['30_pay_sum'] / df['30_pay_sum'].sum()
    df['weight_error'] = df_source_predict['weight'] * df_source_predict['error']
    return df['weight_error'].sum()

In [52]:
weight_error(df_source_predict)

0.64753637195696

In [53]:
print(weight_error(df_source_predict[df_source_predict['n_user_sum']>50]))
print(df_source_predict[df_source_predict['n_user_sum']>4].shape[0])

0.3489486353237339
32922


In [54]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>5]))
print(df_source_predict[df_source_predict['pay_user_sum']>5].shape[0])

0.14699997476340754
942


In [55]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>3]))
print(df_source_predict[df_source_predict['pay_user_sum']>3].shape[0])

0.2336595076824201
1838


In [56]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>2]))
print(df_source_predict[df_source_predict['pay_user_sum']>2].shape[0])

0.2950920895464055
2831
