In [1]:
import pandas as pd
import numpy as np
import random
import gc
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
# import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
# import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_pickle('./data_5d.pickle')
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'device_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg',
       'model_money_level', 'hour', 'weekend', 'is_holidays','mobile','platform',
       'user_creates_3_server_num','user_creates_3_role_num', 'time_interval']
label = ['role_created_30_pay_sum']
df = data[select_features].join(data[label])
df['role_created_30_pay_sum'].fillna(0,inplace=True)
df['platform'].fillna(0,inplace=True)
df['user_creates_3_server_num'].fillna(1,inplace=True)
df['user_creates_3_role_num'].fillna(1,inplace=True)
df['time_interval'].fillna(0,inplace=True)
df['model_money_level'].fillna(4,inplace=True)

In [3]:
df['pay_sum'].sum()

35384896.0

In [4]:
df_train, df_test = train_test_split(df,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [5]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)

In [6]:
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [7]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('2')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')

for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.844177 + 0.0025904
[100]	cv_agg's rmse: 0.81604 + 0.00186027
[50]	cv_agg's rmse: 0.844177 + 0.0025904
[100]	cv_agg's rmse: 0.815938 + 0.00183587
[50]	cv_agg's rmse: 0.844177 + 0.0025904
[100]	cv_agg's rmse: 0.815938 + 0.00183587
[50]	cv_agg's rmse: 0.844177 + 0.0025904
[100]	cv_agg's rmse: 0.815938 + 0.00183587
[50]	cv_agg's rmse: 0.844177 + 0.0025904
[100]	cv_agg's rmse: 0.815938 + 0.00183587
[50]	cv_agg's rmse: 0.834431 + 0.00226769
[100]	cv_agg's rmse: 0.81445 + 0.00168682
[50]	cv_agg's rmse: 0.831176 + 0.00233931
[100]	cv_agg's rmse: 0.813531 + 0.00205629
[50]	cv_agg's rmse: 0.831145 + 0.00246565
[100]	cv_agg's rmse: 0.813421 + 0.00199547
[50]	cv_agg's rmse: 0.831076 + 0.00239478
[100]	cv_agg's rmse: 0.81334 + 0.00194796
[50]	cv_agg's rmse: 0.831076 + 0.00239478
[100]	cv_agg's rmse: 0.813456 + 0.00200861
[50]	cv_agg's rmse: 0.834431 + 0.00226769
[100]	cv_agg's rmse: 0.81445 + 0.00168682
[50]	cv_agg's rmse: 0.82816 + 0.00244805
[100]	cv_agg'

[100]	cv_agg's rmse: 0.812969 + 0.00206448
[50]	cv_agg's rmse: 0.826045 + 0.00256287
[100]	cv_agg's rmse: 0.812841 + 0.00191786
[50]	cv_agg's rmse: 0.826066 + 0.00256347
[100]	cv_agg's rmse: 0.812916 + 0.00199313
[50]	cv_agg's rmse: 0.826083 + 0.00255861
[100]	cv_agg's rmse: 0.812915 + 0.00203779
[50]	cv_agg's rmse: 0.82601 + 0.00249975
[100]	cv_agg's rmse: 0.812772 + 0.00193729
[50]	cv_agg's rmse: 0.826015 + 0.00244962
[100]	cv_agg's rmse: 0.812679 + 0.00195325
[50]	cv_agg's rmse: 0.825934 + 0.00246392
[100]	cv_agg's rmse: 0.812623 + 0.00187278
[50]	cv_agg's rmse: 0.825982 + 0.00249844
[100]	cv_agg's rmse: 0.812605 + 0.00185969
[50]	cv_agg's rmse: 0.825938 + 0.00246613
[100]	cv_agg's rmse: 0.812666 + 0.00181331
[50]	cv_agg's rmse: 0.825964 + 0.00251218
[100]	cv_agg's rmse: 0.812675 + 0.00183483
[50]	cv_agg's rmse: 0.826404 + 0.00246582
[100]	cv_agg's rmse: 0.813564 + 0.00178307
[50]	cv_agg's rmse: 0.826099 + 0.00255656
[100]	cv_agg's rmse: 0.812969 + 0.00206448
[50]	cv_agg's rmse: 0.8

[50]	cv_agg's rmse: 0.825964 + 0.00251218
[100]	cv_agg's rmse: 0.812675 + 0.00183483
[50]	cv_agg's rmse: 0.826404 + 0.00246582
[100]	cv_agg's rmse: 0.813564 + 0.00178307
[50]	cv_agg's rmse: 0.826099 + 0.00255656
[100]	cv_agg's rmse: 0.812969 + 0.00206448
[50]	cv_agg's rmse: 0.826045 + 0.00256287
[100]	cv_agg's rmse: 0.812841 + 0.00191786
[50]	cv_agg's rmse: 0.826066 + 0.00256347
[100]	cv_agg's rmse: 0.812916 + 0.00199313
[50]	cv_agg's rmse: 0.826083 + 0.00255861
[100]	cv_agg's rmse: 0.812915 + 0.00203779
[50]	cv_agg's rmse: 0.82601 + 0.00249975
[100]	cv_agg's rmse: 0.812772 + 0.00193729
[50]	cv_agg's rmse: 0.826015 + 0.00244962
[100]	cv_agg's rmse: 0.812679 + 0.00195325
[50]	cv_agg's rmse: 0.825934 + 0.00246392
[100]	cv_agg's rmse: 0.812623 + 0.00187278
[50]	cv_agg's rmse: 0.825982 + 0.00249844
[100]	cv_agg's rmse: 0.812605 + 0.00185969
[50]	cv_agg's rmse: 0.825938 + 0.00246613
[100]	cv_agg's rmse: 0.812666 + 0.00181331
[50]	cv_agg's rmse: 0.825964 + 0.00251218
[100]	cv_agg's rmse: 0.8

[100]	cv_agg's rmse: 0.812623 + 0.00187278
[50]	cv_agg's rmse: 0.825982 + 0.00249844
[100]	cv_agg's rmse: 0.812605 + 0.00185969
[50]	cv_agg's rmse: 0.825938 + 0.00246613
[100]	cv_agg's rmse: 0.812666 + 0.00181331
[50]	cv_agg's rmse: 0.825964 + 0.00251218
[100]	cv_agg's rmse: 0.812675 + 0.00183483
[50]	cv_agg's rmse: 0.826404 + 0.00246582
[100]	cv_agg's rmse: 0.813564 + 0.00178307
[50]	cv_agg's rmse: 0.826099 + 0.00255656
[100]	cv_agg's rmse: 0.812969 + 0.00206448
[50]	cv_agg's rmse: 0.826045 + 0.00256287
[100]	cv_agg's rmse: 0.812841 + 0.00191786
[50]	cv_agg's rmse: 0.826066 + 0.00256347
[100]	cv_agg's rmse: 0.812916 + 0.00199313
[50]	cv_agg's rmse: 0.826083 + 0.00255861
[100]	cv_agg's rmse: 0.812915 + 0.00203779
[50]	cv_agg's rmse: 0.82601 + 0.00249975
[100]	cv_agg's rmse: 0.812772 + 0.00193729
[50]	cv_agg's rmse: 0.826015 + 0.00244962
[100]	cv_agg's rmse: 0.812679 + 0.00195325
[50]	cv_agg's rmse: 0.825934 + 0.00246392
[100]	cv_agg's rmse: 0.812623 + 0.00187278
[50]	cv_agg's rmse: 0.8

[50]	cv_agg's rmse: 0.829248 + 0.00341603
[100]	cv_agg's rmse: 0.814507 + 0.00237132
[50]	cv_agg's rmse: 0.829369 + 0.00295943
[100]	cv_agg's rmse: 0.815102 + 0.00216273
[50]	cv_agg's rmse: 0.829533 + 0.00313231
[100]	cv_agg's rmse: 0.815183 + 0.0021489
[50]	cv_agg's rmse: 0.829812 + 0.0028569
[100]	cv_agg's rmse: 0.81494 + 0.00226063
[50]	cv_agg's rmse: 0.829812 + 0.00301234
[100]	cv_agg's rmse: 0.815439 + 0.00224925
[50]	cv_agg's rmse: 0.828655 + 0.00321202
[100]	cv_agg's rmse: 0.813369 + 0.00234195
[50]	cv_agg's rmse: 0.828784 + 0.00329829
[100]	cv_agg's rmse: 0.813926 + 0.0023093
[50]	cv_agg's rmse: 0.828814 + 0.00322109
[100]	cv_agg's rmse: 0.813766 + 0.00231563
[50]	cv_agg's rmse: 0.829122 + 0.00323296
[100]	cv_agg's rmse: 0.814146 + 0.00240194
[50]	cv_agg's rmse: 0.828898 + 0.00292189
[100]	cv_agg's rmse: 0.813935 + 0.00204953
[50]	cv_agg's rmse: 0.828609 + 0.00337394
[100]	cv_agg's rmse: 0.814033 + 0.00244736
[50]	cv_agg's rmse: 0.829063 + 0.00310023
[100]	cv_agg's rmse: 0.8142

[50]	cv_agg's rmse: 0.828086 + 0.00312872
[100]	cv_agg's rmse: 0.81344 + 0.00220845
[50]	cv_agg's rmse: 0.82793 + 0.00305791
[100]	cv_agg's rmse: 0.81349 + 0.00228696
[50]	cv_agg's rmse: 0.828137 + 0.00302433
[100]	cv_agg's rmse: 0.813674 + 0.00227005
[50]	cv_agg's rmse: 0.827837 + 0.00299549
[100]	cv_agg's rmse: 0.813454 + 0.0021691
[50]	cv_agg's rmse: 0.828044 + 0.0030539
[100]	cv_agg's rmse: 0.81336 + 0.00230802
[50]	cv_agg's rmse: 0.827742 + 0.00301501
[100]	cv_agg's rmse: 0.813265 + 0.00225224
[50]	cv_agg's rmse: 0.827961 + 0.00302704
[100]	cv_agg's rmse: 0.813621 + 0.00232744
[50]	cv_agg's rmse: 0.827891 + 0.00298608
[100]	cv_agg's rmse: 0.813466 + 0.00231038
[50]	cv_agg's rmse: 0.82801 + 0.00308229
[100]	cv_agg's rmse: 0.813441 + 0.00235241
[50]	cv_agg's rmse: 0.82801 + 0.00308229
[100]	cv_agg's rmse: 0.813441 + 0.00235241
[50]	cv_agg's rmse: 0.82801 + 0.00308229
[100]	cv_agg's rmse: 0.813441 + 0.00235241
[50]	cv_agg's rmse: 0.82801 + 0.00308229
[100]	cv_agg's rmse: 0.813441 + 0

[50]	cv_agg's rmse: 0.82704 + 0.00275965
[100]	cv_agg's rmse: 0.814063 + 0.0021248
[50]	cv_agg's rmse: 0.826722 + 0.00277833
[100]	cv_agg's rmse: 0.812874 + 0.00195803
[50]	cv_agg's rmse: 0.826534 + 0.00288053
[100]	cv_agg's rmse: 0.812961 + 0.00225811
[50]	cv_agg's rmse: 0.826905 + 0.00278304
[100]	cv_agg's rmse: 0.813007 + 0.00221344
[50]	cv_agg's rmse: 0.826873 + 0.00322172
[100]	cv_agg's rmse: 0.812989 + 0.00207744
[50]	cv_agg's rmse: 0.826938 + 0.00271403
[100]	cv_agg's rmse: 0.813161 + 0.00207577
[50]	cv_agg's rmse: 0.826765 + 0.00313713
[100]	cv_agg's rmse: 0.81315 + 0.00226979
[50]	cv_agg's rmse: 0.827105 + 0.00322767
[100]	cv_agg's rmse: 0.813399 + 0.00210885
[50]	cv_agg's rmse: 0.826605 + 0.00292807
[100]	cv_agg's rmse: 0.813261 + 0.00215889
[50]	cv_agg's rmse: 0.82702 + 0.00295482
[100]	cv_agg's rmse: 0.813576 + 0.00233905
[50]	cv_agg's rmse: 0.826754 + 0.00300981
[100]	cv_agg's rmse: 0.813493 + 0.00241811
[50]	cv_agg's rmse: 0.826722 + 0.00277833
[100]	cv_agg's rmse: 0.8128

[100]	cv_agg's rmse: 0.81289 + 0.00204883
[50]	cv_agg's rmse: 0.826535 + 0.0026924
[100]	cv_agg's rmse: 0.81289 + 0.00204883
[50]	cv_agg's rmse: 0.826535 + 0.0026924
[100]	cv_agg's rmse: 0.81289 + 0.00204883
[50]	cv_agg's rmse: 0.826535 + 0.0026924
[100]	cv_agg's rmse: 0.81289 + 0.00204883
[50]	cv_agg's rmse: 0.825982 + 0.00249844
[100]	cv_agg's rmse: 0.812605 + 0.00185969
[50]	cv_agg's rmse: 0.825884 + 0.00297077
[100]	cv_agg's rmse: 0.813261 + 0.0023669
[50]	cv_agg's rmse: 0.826542 + 0.00247393
[100]	cv_agg's rmse: 0.814065 + 0.00151697
[50]	cv_agg's rmse: 0.826505 + 0.00274828
[100]	cv_agg's rmse: 0.813984 + 0.00201935
[50]	cv_agg's rmse: 0.826818 + 0.00255379
[100]	cv_agg's rmse: 0.814167 + 0.00140408
[50]	cv_agg's rmse: 0.826457 + 0.00315708
[100]	cv_agg's rmse: 0.814158 + 0.00196497
[50]	cv_agg's rmse: 0.82649 + 0.00248519
[100]	cv_agg's rmse: 0.814565 + 0.0016298
[50]	cv_agg's rmse: 0.826546 + 0.00264859
[100]	cv_agg's rmse: 0.814192 + 0.00158432
[50]	cv_agg's rmse: 0.826882 + 0

[100]	cv_agg's rmse: 0.812555 + 0.00195498
[50]	cv_agg's rmse: 0.825915 + 0.00277134
[100]	cv_agg's rmse: 0.812723 + 0.00175447
[50]	cv_agg's rmse: 0.825869 + 0.00275653
[100]	cv_agg's rmse: 0.812523 + 0.00200787
[50]	cv_agg's rmse: 0.825897 + 0.00272631
[100]	cv_agg's rmse: 0.81281 + 0.0021187
[50]	cv_agg's rmse: 0.825818 + 0.00285181
[100]	cv_agg's rmse: 0.812496 + 0.00197119
[50]	cv_agg's rmse: 0.825811 + 0.00280063
[100]	cv_agg's rmse: 0.812594 + 0.00199986
[50]	cv_agg's rmse: 0.825824 + 0.00288273
[100]	cv_agg's rmse: 0.812711 + 0.00196022
[50]	cv_agg's rmse: 0.825779 + 0.00271895
[100]	cv_agg's rmse: 0.812637 + 0.00193062
[50]	cv_agg's rmse: 0.825779 + 0.00271895
[100]	cv_agg's rmse: 0.812637 + 0.00193062
[50]	cv_agg's rmse: 0.825863 + 0.00284725
[100]	cv_agg's rmse: 0.812766 + 0.0020275
[50]	cv_agg's rmse: 0.825779 + 0.00271895
[100]	cv_agg's rmse: 0.812637 + 0.00193062
[50]	cv_agg's rmse: 0.825863 + 0.00284725
[100]	cv_agg's rmse: 0.812766 + 0.0020275
[50]	cv_agg's rmse: 0.8258

In [8]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":7,
"num_leaves":20,
"max_bin":255,
"min_data_in_leaf":81,
"min_split_gain":1.0,
"feature_fraction": 1.0,
"bagging_fraction":0.9,
"bagging_freq":5,
"lambda_l1":0.0,
"lambda_l2":0.7,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.72685	valid_1's rmse: 1.72943
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.66102	valid_1's rmse: 1.66274
[3]	training's rmse: 1.59922	valid_1's rmse: 1.60016
[4]	training's rmse: 1.54126	valid_1's rmse: 1.5413
[5]	training's rmse: 1.48699	valid_1's rmse: 1.48625
[6]	training's rmse: 1.4363	valid_1's rmse: 1.43472
[7]	training's rmse: 1.38889	valid_1's rmse: 1.3865
[8]	training's rmse: 1.34465	valid_1's rmse: 1.34138
[9]	training's rmse: 1.3034	valid_1's rmse: 1.29937
[10]	training's rmse: 1.2649	valid_1's rmse: 1.26003
[11]	training's rmse: 1.22902	valid_1's rmse: 1.22344
[12]	training's rmse: 1.19566	valid_1's rmse: 1.18943
[13]	training's rmse: 1.16466	valid_1's rmse: 1.15772
[14]	training's rmse: 1.13595	valid_1's rmse: 1.12844
[15]	training's rmse: 1.10934	valid_1's rmse: 1.10117
[16]	training's rmse: 1.08467	valid_1's rmse: 1.07582
[17]	training's rmse: 1.06184	valid_1's rmse: 1.05238
[18]	training's rmse: 1.0408	valid

[171]	training's rmse: 0.795085	valid_1's rmse: 0.789534
[172]	training's rmse: 0.794976	valid_1's rmse: 0.789535
[173]	training's rmse: 0.794886	valid_1's rmse: 0.789516
[174]	training's rmse: 0.794826	valid_1's rmse: 0.789478
[175]	training's rmse: 0.794785	valid_1's rmse: 0.789479
[176]	training's rmse: 0.794722	valid_1's rmse: 0.789473
[177]	training's rmse: 0.794647	valid_1's rmse: 0.789453
[178]	training's rmse: 0.79456	valid_1's rmse: 0.789444
[179]	training's rmse: 0.79448	valid_1's rmse: 0.789414
[180]	training's rmse: 0.794409	valid_1's rmse: 0.789387
[181]	training's rmse: 0.794302	valid_1's rmse: 0.789392
[182]	training's rmse: 0.794198	valid_1's rmse: 0.78941
[183]	training's rmse: 0.79408	valid_1's rmse: 0.789413
[184]	training's rmse: 0.793994	valid_1's rmse: 0.789446
[185]	training's rmse: 0.793937	valid_1's rmse: 0.789445
[186]	training's rmse: 0.793851	valid_1's rmse: 0.789457
[187]	training's rmse: 0.793762	valid_1's rmse: 0.789445
[188]	training's rmse: 0.793686	val

In [9]:
y_predict_lgb = lgb_r_cv.predict(x_test)
mse = mean_squared_error(np.expm1(y_predict_lgb), np.expm1(y_test))
mae = mean_absolute_error(np.expm1(y_predict_lgb), np.expm1(y_test))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2510.90460076651
6304641.914150427
458.89354553552863


In [10]:
# 用df_test数据集进行测试

In [11]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [12]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2544.5231005346186
6474597.809154309
472.4689115995092


In [13]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.3831185293475072

In [14]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.38
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [15]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
# print('测试集上的均方误差:%f元' % mse)
print('测试集上的平均绝对误差:%.2f元'% mae)

测试集上的均方根误差:372.45元
测试集上的平均绝对误差:12.70元


In [16]:
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集前30天实际总的付费金额:31165548.00元
测试集前30天预测总的付费金额:30038895.99元
预测总金额准确率: 0.9638494399793256


# 分计划

In [17]:
data['create_role_time'] = data['create_role_time'].dt.date

In [18]:
df_source = data[['user_id','cp_server_no','cp_role_id','create_role_time','channel_id','source_id','pay_sum']]
df_source.drop_duplicates(inplace=True)
df_source_predict = pd.merge(predict_data,df_source,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [19]:
df_source_predict['is_pay'] = (df_source_predict['pay_sum'] != 0).astype(int)
groups = df_source_predict.groupby(['channel_id','source_id','create_role_time'])
print(groups.ngroups)
temp1 = groups['role_created_30_pay_sum'].agg([('30_pay_sum','sum')]).reset_index()
temp2 = groups['predict_30_pay'].agg([('predict_30_pay','sum')]).reset_index()
temp3 = groups['pay_sum'].agg([('n_pay_sum','sum')]).reset_index()
temp4 = groups['user_id'].agg([('n_user_sum','count')]).reset_index()
temp5 = groups['is_pay'].agg([('pay_user_sum','sum')]).reset_index()

125454


In [20]:
df_source_predict = pd.merge(temp1,temp2,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp3,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp4,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp5,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')

In [21]:
def weight_error(df):
    df['error'] = np.abs((df['predict_30_pay']-df['30_pay_sum'])/df['30_pay_sum'])
    df['weight'] = df['30_pay_sum'] / df['30_pay_sum'].sum()
    df['weight_error'] = df_source_predict['weight'] * df_source_predict['error']
    return df['weight_error'].sum()

In [22]:
weight_error(df_source_predict)

0.5284456791458036

In [23]:
print(weight_error(df_source_predict[df_source_predict['n_user_sum']>50]))
print(df_source_predict[df_source_predict['n_user_sum']>4].shape[0])

0.2781459469183969
32843


In [24]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>5]))
print(df_source_predict[df_source_predict['pay_user_sum']>5].shape[0])

0.12476722591086399
1012


In [25]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>3]))
print(df_source_predict[df_source_predict['pay_user_sum']>3].shape[0])

0.18962731840656202
1957


In [26]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>2]))
print(df_source_predict[df_source_predict['pay_user_sum']>2].shape[0])

0.2436918278484385
3011
