In [1]:
import pandas as pd
import numpy as np
import random
import gc
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
# import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
# import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_pickle('./data_6d.pickle')
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'device_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg',
       'model_money_level', 'hour', 'weekend', 'is_holidays','mobile','platform',
       'user_creates_3_server_num','user_creates_3_role_num', 'time_interval']
label = ['role_created_30_pay_sum']
df = data[select_features].join(data[label])
df['role_created_30_pay_sum'].fillna(0,inplace=True)
df['platform'].fillna(0,inplace=True)
df['user_creates_3_server_num'].fillna(1,inplace=True)
df['user_creates_3_role_num'].fillna(1,inplace=True)
df['time_interval'].fillna(0,inplace=True)
df['model_money_level'].fillna(4,inplace=True)

In [3]:
df['pay_sum'].sum()

39348154.0

In [4]:
df_train, df_test = train_test_split(df,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [5]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)

In [6]:
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [7]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('2')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')

for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.772422 + 0.00668548
[100]	cv_agg's rmse: 0.743667 + 0.00613378
[50]	cv_agg's rmse: 0.772422 + 0.00668548
[100]	cv_agg's rmse: 0.743603 + 0.00620599
[50]	cv_agg's rmse: 0.772422 + 0.00668548
[100]	cv_agg's rmse: 0.743603 + 0.00620599
[50]	cv_agg's rmse: 0.772422 + 0.00668548
[100]	cv_agg's rmse: 0.743603 + 0.00620599
[50]	cv_agg's rmse: 0.772422 + 0.00668548
[100]	cv_agg's rmse: 0.743603 + 0.00620599
[50]	cv_agg's rmse: 0.762997 + 0.00618426
[100]	cv_agg's rmse: 0.742155 + 0.00573031
[50]	cv_agg's rmse: 0.76025 + 0.00600335
[100]	cv_agg's rmse: 0.741525 + 0.0055922
[50]	cv_agg's rmse: 0.760222 + 0.00601677
[100]	cv_agg's rmse: 0.741457 + 0.00557567
[50]	cv_agg's rmse: 0.760222 + 0.00601677
[100]	cv_agg's rmse: 0.741383 + 0.00558653
[50]	cv_agg's rmse: 0.760222 + 0.00601677
[100]	cv_agg's rmse: 0.741409 + 0.00559749
[50]	cv_agg's rmse: 0.762997 + 0.00618426
[100]	cv_agg's rmse: 0.742155 + 0.00573031
[50]	cv_agg's rmse: 0.756949 + 0.00575458
[100]

[100]	cv_agg's rmse: 0.740938 + 0.00544664
[50]	cv_agg's rmse: 0.754136 + 0.00543738
[100]	cv_agg's rmse: 0.740813 + 0.00550657
[50]	cv_agg's rmse: 0.754055 + 0.00537259
[100]	cv_agg's rmse: 0.740837 + 0.0054176
[50]	cv_agg's rmse: 0.753941 + 0.00544223
[100]	cv_agg's rmse: 0.740746 + 0.0053536
[50]	cv_agg's rmse: 0.753935 + 0.00541877
[100]	cv_agg's rmse: 0.740672 + 0.0054354
[50]	cv_agg's rmse: 0.753941 + 0.00542559
[100]	cv_agg's rmse: 0.740677 + 0.00535457
[50]	cv_agg's rmse: 0.75391 + 0.00542754
[100]	cv_agg's rmse: 0.74054 + 0.00546451
[50]	cv_agg's rmse: 0.753933 + 0.00543864
[100]	cv_agg's rmse: 0.740603 + 0.00538949
[50]	cv_agg's rmse: 0.754015 + 0.00541928
[100]	cv_agg's rmse: 0.740571 + 0.00537808
[50]	cv_agg's rmse: 0.754018 + 0.00537384
[100]	cv_agg's rmse: 0.74052 + 0.00533209
[50]	cv_agg's rmse: 0.75418 + 0.00553275
[100]	cv_agg's rmse: 0.741336 + 0.00557575
[50]	cv_agg's rmse: 0.754127 + 0.00545324
[100]	cv_agg's rmse: 0.740938 + 0.00544664
[50]	cv_agg's rmse: 0.754136 

[50]	cv_agg's rmse: 0.75418 + 0.00553275
[100]	cv_agg's rmse: 0.741336 + 0.00557575
[50]	cv_agg's rmse: 0.754127 + 0.00545324
[100]	cv_agg's rmse: 0.740938 + 0.00544664
[50]	cv_agg's rmse: 0.754136 + 0.00543738
[100]	cv_agg's rmse: 0.740813 + 0.00550657
[50]	cv_agg's rmse: 0.754055 + 0.00537259
[100]	cv_agg's rmse: 0.740837 + 0.0054176
[50]	cv_agg's rmse: 0.753941 + 0.00544223
[100]	cv_agg's rmse: 0.740746 + 0.0053536
[50]	cv_agg's rmse: 0.753935 + 0.00541877
[100]	cv_agg's rmse: 0.740672 + 0.0054354
[50]	cv_agg's rmse: 0.753941 + 0.00542559
[100]	cv_agg's rmse: 0.740677 + 0.00535457
[50]	cv_agg's rmse: 0.75391 + 0.00542754
[100]	cv_agg's rmse: 0.74054 + 0.00546451
[50]	cv_agg's rmse: 0.753933 + 0.00543864
[100]	cv_agg's rmse: 0.740603 + 0.00538949
[50]	cv_agg's rmse: 0.754015 + 0.00541928
[100]	cv_agg's rmse: 0.740571 + 0.00537808
[50]	cv_agg's rmse: 0.754018 + 0.00537384
[100]	cv_agg's rmse: 0.74052 + 0.00533209
[50]	cv_agg's rmse: 0.75418 + 0.00553275
[100]	cv_agg's rmse: 0.741336 +

[100]	cv_agg's rmse: 0.740571 + 0.00537808
[50]	cv_agg's rmse: 0.754018 + 0.00537384
[100]	cv_agg's rmse: 0.74052 + 0.00533209
[50]	cv_agg's rmse: 0.75418 + 0.00553275
[100]	cv_agg's rmse: 0.741336 + 0.00557575
[50]	cv_agg's rmse: 0.754127 + 0.00545324
[100]	cv_agg's rmse: 0.740938 + 0.00544664
[50]	cv_agg's rmse: 0.754136 + 0.00543738
[100]	cv_agg's rmse: 0.740813 + 0.00550657
[50]	cv_agg's rmse: 0.754055 + 0.00537259
[100]	cv_agg's rmse: 0.740837 + 0.0054176
[50]	cv_agg's rmse: 0.753941 + 0.00544223
[100]	cv_agg's rmse: 0.740746 + 0.0053536
[50]	cv_agg's rmse: 0.753935 + 0.00541877
[100]	cv_agg's rmse: 0.740672 + 0.0054354
[50]	cv_agg's rmse: 0.753941 + 0.00542559
[100]	cv_agg's rmse: 0.740677 + 0.00535457
[50]	cv_agg's rmse: 0.75391 + 0.00542754
[100]	cv_agg's rmse: 0.74054 + 0.00546451
[50]	cv_agg's rmse: 0.753933 + 0.00543864
[100]	cv_agg's rmse: 0.740603 + 0.00538949
[50]	cv_agg's rmse: 0.754015 + 0.00541928
[100]	cv_agg's rmse: 0.740571 + 0.00537808
[50]	cv_agg's rmse: 0.754018 

[100]	cv_agg's rmse: 0.74199 + 0.0051868
[50]	cv_agg's rmse: 0.757967 + 0.0052327
[100]	cv_agg's rmse: 0.742007 + 0.00564344
[50]	cv_agg's rmse: 0.758021 + 0.00504721
[100]	cv_agg's rmse: 0.742272 + 0.00551037
[50]	cv_agg's rmse: 0.757089 + 0.00536084
[100]	cv_agg's rmse: 0.741159 + 0.00549566
[50]	cv_agg's rmse: 0.757314 + 0.00531181
[100]	cv_agg's rmse: 0.74139 + 0.00555014
[50]	cv_agg's rmse: 0.757285 + 0.00556335
[100]	cv_agg's rmse: 0.741311 + 0.00555232
[50]	cv_agg's rmse: 0.757213 + 0.005516
[100]	cv_agg's rmse: 0.741526 + 0.00566391
[50]	cv_agg's rmse: 0.757441 + 0.00548351
[100]	cv_agg's rmse: 0.741745 + 0.00572878
[50]	cv_agg's rmse: 0.757857 + 0.00552968
[100]	cv_agg's rmse: 0.741862 + 0.00551002
[50]	cv_agg's rmse: 0.757505 + 0.0054972
[100]	cv_agg's rmse: 0.741621 + 0.00560668
[50]	cv_agg's rmse: 0.757811 + 0.00556232
[100]	cv_agg's rmse: 0.741823 + 0.00555545
[50]	cv_agg's rmse: 0.757778 + 0.00536121
[100]	cv_agg's rmse: 0.741681 + 0.00580522
[50]	cv_agg's rmse: 0.757955 

[100]	cv_agg's rmse: 0.741183 + 0.00541856
[50]	cv_agg's rmse: 0.756278 + 0.00510902
[100]	cv_agg's rmse: 0.74125 + 0.00550858
[50]	cv_agg's rmse: 0.756052 + 0.00516244
[100]	cv_agg's rmse: 0.741172 + 0.00533051
[50]	cv_agg's rmse: 0.756216 + 0.00530089
[100]	cv_agg's rmse: 0.741267 + 0.00541162
[50]	cv_agg's rmse: 0.756031 + 0.00524413
[100]	cv_agg's rmse: 0.741263 + 0.00536228
[50]	cv_agg's rmse: 0.756082 + 0.00515593
[100]	cv_agg's rmse: 0.741249 + 0.00533443
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.756037 + 0.00532177
[100]	cv_agg's rmse: 0.741026 + 0.00559936
[50]	cv_agg's rmse: 0.7

[100]	cv_agg's rmse: 0.741022 + 0.00556713
[50]	cv_agg's rmse: 0.755131 + 0.00555856
[100]	cv_agg's rmse: 0.74124 + 0.00551853
[50]	cv_agg's rmse: 0.755128 + 0.00539879
[100]	cv_agg's rmse: 0.741406 + 0.00546047
[50]	cv_agg's rmse: 0.755104 + 0.0054985
[100]	cv_agg's rmse: 0.741319 + 0.00546751
[50]	cv_agg's rmse: 0.755314 + 0.00524135
[100]	cv_agg's rmse: 0.741291 + 0.00521175
[50]	cv_agg's rmse: 0.755474 + 0.0053898
[100]	cv_agg's rmse: 0.741475 + 0.00549327
[50]	cv_agg's rmse: 0.755524 + 0.00536469
[100]	cv_agg's rmse: 0.741432 + 0.0053139
[50]	cv_agg's rmse: 0.755263 + 0.00543286
[100]	cv_agg's rmse: 0.741372 + 0.00564593
[50]	cv_agg's rmse: 0.755527 + 0.00531033
[100]	cv_agg's rmse: 0.741685 + 0.00526887
[50]	cv_agg's rmse: 0.754735 + 0.00533648
[100]	cv_agg's rmse: 0.740733 + 0.00540979
[50]	cv_agg's rmse: 0.754779 + 0.00542764
[100]	cv_agg's rmse: 0.740687 + 0.00553483
[50]	cv_agg's rmse: 0.754858 + 0.00549569
[100]	cv_agg's rmse: 0.741013 + 0.00535359
[50]	cv_agg's rmse: 0.7550

[50]	cv_agg's rmse: 0.75457 + 0.00534798
[100]	cv_agg's rmse: 0.74086 + 0.00547838
[50]	cv_agg's rmse: 0.754018 + 0.00537384
[100]	cv_agg's rmse: 0.74052 + 0.00533209
[50]	cv_agg's rmse: 0.753841 + 0.00553037
[100]	cv_agg's rmse: 0.740964 + 0.00585192
[50]	cv_agg's rmse: 0.753655 + 0.00600357
[100]	cv_agg's rmse: 0.740869 + 0.00587723
[50]	cv_agg's rmse: 0.754193 + 0.0055461
[100]	cv_agg's rmse: 0.741176 + 0.00594075
[50]	cv_agg's rmse: 0.753835 + 0.00602753
[100]	cv_agg's rmse: 0.741041 + 0.00605139
[50]	cv_agg's rmse: 0.754478 + 0.00573924
[100]	cv_agg's rmse: 0.741561 + 0.00540121
[50]	cv_agg's rmse: 0.754455 + 0.00544893
[100]	cv_agg's rmse: 0.741569 + 0.00583234
[50]	cv_agg's rmse: 0.754982 + 0.00551358
[100]	cv_agg's rmse: 0.741916 + 0.00554788
[50]	cv_agg's rmse: 0.754696 + 0.00582393
[100]	cv_agg's rmse: 0.741421 + 0.00594235
[50]	cv_agg's rmse: 0.755045 + 0.00539698
[100]	cv_agg's rmse: 0.741918 + 0.00548974
[50]	cv_agg's rmse: 0.754018 + 0.00537384
[100]	cv_agg's rmse: 0.7405

[50]	cv_agg's rmse: 0.753915 + 0.00545517
[100]	cv_agg's rmse: 0.740543 + 0.00547556
[50]	cv_agg's rmse: 0.753951 + 0.00537743
[100]	cv_agg's rmse: 0.740477 + 0.00543917
[50]	cv_agg's rmse: 0.75404 + 0.00543109
[100]	cv_agg's rmse: 0.740572 + 0.00549672
[50]	cv_agg's rmse: 0.753988 + 0.00542293
[100]	cv_agg's rmse: 0.740534 + 0.00554653
[50]	cv_agg's rmse: 0.75385 + 0.00544991
[100]	cv_agg's rmse: 0.740505 + 0.00557685
[50]	cv_agg's rmse: 0.75385 + 0.00544993
[100]	cv_agg's rmse: 0.740505 + 0.00557685
[50]	cv_agg's rmse: 0.753865 + 0.00540123
[100]	cv_agg's rmse: 0.740614 + 0.00550402
[50]	cv_agg's rmse: 0.75385 + 0.00544991
[100]	cv_agg's rmse: 0.740505 + 0.00557685
[50]	cv_agg's rmse: 0.753865 + 0.00540123
[100]	cv_agg's rmse: 0.740614 + 0.00550402
[50]	cv_agg's rmse: 0.753924 + 0.00540398
[100]	cv_agg's rmse: 0.740657 + 0.00544271
[50]	cv_agg's rmse: 0.753925 + 0.00547181
[100]	cv_agg's rmse: 0.740552 + 0.00551039
[50]	cv_agg's rmse: 0.754034 + 0.00544789
[100]	cv_agg's rmse: 0.7406

In [8]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":5,
"num_leaves":95,
"max_bin":255,
"min_data_in_leaf":101,
"min_split_gain":1.0,
"feature_fraction": 0.8,
"bagging_fraction":0.9,
"bagging_freq":5,
"lambda_l1":1.0,
"lambda_l2":1.0,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.72505	valid_1's rmse: 1.73099
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.65578	valid_1's rmse: 1.66152
[3]	training's rmse: 1.59062	valid_1's rmse: 1.59619
[4]	training's rmse: 1.53099	valid_1's rmse: 1.53646
[5]	training's rmse: 1.47417	valid_1's rmse: 1.47954
[6]	training's rmse: 1.42	valid_1's rmse: 1.42523
[7]	training's rmse: 1.36925	valid_1's rmse: 1.37439
[8]	training's rmse: 1.32246	valid_1's rmse: 1.32756
[9]	training's rmse: 1.27888	valid_1's rmse: 1.28401
[10]	training's rmse: 1.2373	valid_1's rmse: 1.24235
[11]	training's rmse: 1.19849	valid_1's rmse: 1.20351
[12]	training's rmse: 1.16232	valid_1's rmse: 1.16736
[13]	training's rmse: 1.12932	valid_1's rmse: 1.13438
[14]	training's rmse: 1.09882	valid_1's rmse: 1.10389
[15]	training's rmse: 1.0696	valid_1's rmse: 1.07472
[16]	training's rmse: 1.04251	valid_1's rmse: 1.04771
[17]	training's rmse: 1.0174	valid_1's rmse: 1.02264
[18]	training's rmse: 0.994136	vali

[173]	training's rmse: 0.725399	valid_1's rmse: 0.743116
[174]	training's rmse: 0.725386	valid_1's rmse: 0.743115
[175]	training's rmse: 0.725381	valid_1's rmse: 0.743114
[176]	training's rmse: 0.725352	valid_1's rmse: 0.743111
[177]	training's rmse: 0.725287	valid_1's rmse: 0.743107
[178]	training's rmse: 0.72522	valid_1's rmse: 0.743091
[179]	training's rmse: 0.725174	valid_1's rmse: 0.743094
[180]	training's rmse: 0.725122	valid_1's rmse: 0.743091
[181]	training's rmse: 0.725091	valid_1's rmse: 0.74307
[182]	training's rmse: 0.725059	valid_1's rmse: 0.743063
[183]	training's rmse: 0.725038	valid_1's rmse: 0.743054
[184]	training's rmse: 0.725031	valid_1's rmse: 0.74305
[185]	training's rmse: 0.725027	valid_1's rmse: 0.743052
[186]	training's rmse: 0.725007	valid_1's rmse: 0.743047
[187]	training's rmse: 0.725006	valid_1's rmse: 0.743047
[188]	training's rmse: 0.724954	valid_1's rmse: 0.743051
[189]	training's rmse: 0.724953	valid_1's rmse: 0.743051
[190]	training's rmse: 0.72494	val

In [9]:
y_predict_lgb = lgb_r_cv.predict(x_test)
mse = mean_squared_error(np.expm1(y_predict_lgb), np.expm1(y_test))
mae = mean_absolute_error(np.expm1(y_predict_lgb), np.expm1(y_test))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2416.823417938452
5841035.433495701
437.9847197263064


In [10]:
# 用df_test数据集进行测试

In [10]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [11]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2235.2306543750196
4996256.078257779
415.59675856767825


In [12]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.2801465111881667

In [13]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.28
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [14]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
# print('测试集上的均方误差:%f元' % mse)
print('测试集上的平均绝对误差:%.2f元'% mae)

测试集上的均方根误差:334.29元
测试集上的平均绝对误差:11.11元


In [15]:
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集前30天实际总的付费金额:28922141.00元
测试集前30天预测总的付费金额:28054995.77元
预测总金额准确率: 0.9700179448278182


# 分计划

In [16]:
data['create_role_time'] = data['create_role_time'].dt.date

In [17]:
df_source = data[['user_id','cp_server_no','cp_role_id','create_role_time','channel_id','source_id','pay_sum']]
df_source.drop_duplicates(inplace=True)
df_source_predict = pd.merge(predict_data,df_source,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [18]:
df_source_predict['is_pay'] = (df_source_predict['pay_sum'] != 0).astype(int)
groups = df_source_predict.groupby(['channel_id','source_id','create_role_time'])
print(groups.ngroups)
temp1 = groups['role_created_30_pay_sum'].agg([('30_pay_sum','sum')]).reset_index()
temp2 = groups['predict_30_pay'].agg([('predict_30_pay','sum')]).reset_index()
temp3 = groups['pay_sum'].agg([('n_pay_sum','sum')]).reset_index()
temp4 = groups['user_id'].agg([('n_user_sum','count')]).reset_index()
temp5 = groups['is_pay'].agg([('pay_user_sum','sum')]).reset_index()

125633


In [19]:
df_source_predict = pd.merge(temp1,temp2,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp3,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp4,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')
df_source_predict = pd.merge(df_source_predict,temp5,on=['channel_id','source_id','create_role_time'],how='inner',validate='one_to_one')

In [20]:
def weight_error(df):
    df['error'] = np.abs((df['predict_30_pay']-df['30_pay_sum'])/df['30_pay_sum'])
    df['weight'] = df['30_pay_sum'] / df['30_pay_sum'].sum()
    df['weight_error'] = df_source_predict['weight'] * df_source_predict['error']
    return df['weight_error'].sum()

In [21]:
weight_error(df_source_predict)

0.49980821137269693

In [22]:
print(weight_error(df_source_predict[df_source_predict['n_user_sum']>50]))
print(df_source_predict[df_source_predict['n_user_sum']>4].shape[0])

0.26690495649587576
32834


In [23]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>5]))
print(df_source_predict[df_source_predict['pay_user_sum']>5].shape[0])

0.11934556022723175
1049


In [24]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>3]))
print(df_source_predict[df_source_predict['pay_user_sum']>3].shape[0])

0.1823393528340594
1980


In [25]:
print(weight_error(df_source_predict[df_source_predict['pay_user_sum']>2]))
print(df_source_predict[df_source_predict['pay_user_sum']>2].shape[0])

0.23116567945159133
3107
