In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime,gc,math
import random
%matplotlib inline
import gc
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.metrics import recall_score, auc, accuracy_score, f1_score, precision_score, classification_report, roc_auc_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

In [2]:
def active_time_transform(df):
    df['role_created_active_time'].fillna('"0-8,0","8-12,0","12-14,0","14-18,0","18-24,0"', inplace=True)
    df['role_created_active_time'] = df['role_created_active_time'].apply(lambda x:str(x)[1:-1])
    temp = df['role_created_active_time'].str.split(',',expand=True).iloc[:,[1,3,5,7,9]].rename(columns={1:'active_0-8',
                                        3:'active_8-12',5:'active_12-14',7:'active_14-18',9:'active_18-24'})
    for i in range(len(temp.columns)):
        temp.iloc[:,i] = temp.iloc[:,i].apply(lambda x:str(x)[:-1])
    df = df.join(temp).drop(['role_created_active_time'], axis=1)
    return df

def pay_grade_transform(df):
    df['pay_grade'].fillna('[0,0,0,0,0,0,0]',inplace=True)
    df['pay_grade'] = df['pay_grade'].apply(lambda x:str(x)[1:-1])
    temp = df['pay_grade'].str.split(',',expand=True).rename(columns={0:'pay_grade_1',
                1:'pay_grade_2',2:'pay_grade_3',3:'pay_grade_4',4:'pay_grade_5',5:'pay_grade_6',6:'pay_grade_7'})
    df = df.join(temp).drop(['pay_grade'], axis=1)
    return df

In [3]:
%%time
role_info = pd.read_csv('./data/mr_role_5d.csv')
role_info.drop_duplicates(subset=['user_id','cp_server_no','cp_role_id'],inplace=True)
role_info = active_time_transform(role_info)
role_info = pay_grade_transform(role_info)

Wall time: 1min 55s


In [4]:
# 删除2020-7-19日前30天的数据
role_info['create_role_time'] = pd.to_datetime(role_info['create_role_time'], format='%Y-%m-%d %H:%M:%S')
role_info.drop(role_info[role_info['create_role_time'] > (role_info['create_role_time'].max() + 
                                                          datetime.timedelta(-30))].index,inplace=True)

In [5]:
# 删除没有心跳数据的样本(1、没有登录;2、没有拿到心跳)
role_info.dropna(subset=['role_created_login_num'], inplace=True)
role_info.dropna(subset=['role_created_online'], inplace=True)
role_info['pay_num'].fillna(0, inplace=True)
role_info['pay_sum'].fillna(0, inplace=True)
role_info['role_created_active'] = role_info['role_created_active'].clip(0,5)
role_info['pay_rate'] = role_info['pay_num'] / (role_info['role_created_active'] + 1e-4)
role_info['pay_avg'] = role_info['pay_sum'] / (role_info['pay_num'] + 1e-4)

In [6]:
role_created_30_pay_sum = pd.read_csv('./data/role_created_30_pay_sum.csv',index_col=0)
# 对连接的关键字进行字符化或者整型化的调整
role_created_30_pay_sum['user_id'] = role_created_30_pay_sum['user_id'].astype(str)
role_created_30_pay_sum['mgame_id'] = role_created_30_pay_sum['mgame_id'].astype(str)
role_created_30_pay_sum['cp_server_no'] = role_created_30_pay_sum['cp_server_no'].astype(str)
role_created_30_pay_sum['cp_role_id'] = role_created_30_pay_sum['cp_role_id'].astype(str)
role_info['user_id'] = role_info['user_id'].astype(str)
role_info['mgame_id'] = role_info['mgame_id'].astype(str)
role_info['cp_server_no'] = role_info['cp_server_no'].astype(str)
role_info['cp_role_id'] = role_info['cp_role_id'].astype(str)
role_created_30_pay_sum.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info.drop_duplicates(subset=['user_id','cp_server_no','mgame_id','cp_role_id'],inplace=True)
role_info = pd.merge(role_info,role_created_30_pay_sum,on=['user_id','mgame_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [7]:
select_features = ['user_id','cp_server_no','cp_role_id','role_created_login_num',
       'role_created_active', 'role_created_online',
       'max_role_level', 'ip_num',
       'pay_num', 'pay_sum', 'active_0-8', 'active_8-12', 'active_12-14',
       'active_14-18', 'active_18-24', 'pay_grade_1', 'pay_grade_2',
       'pay_grade_3', 'pay_grade_4', 'pay_grade_5', 'pay_grade_6',
       'pay_rate', 'pay_avg','role_created_30_pay_sum']
role_info = role_info[select_features]
role_info['role_created_30_pay_sum'].fillna(0,inplace=True)
col_list = ['active_0-8','active_8-12','active_12-14','active_14-18','active_18-24','pay_grade_1','pay_grade_2',
            'pay_grade_3','pay_grade_4','pay_grade_5','pay_grade_6']
for col in col_list:
    role_info[col] = pd.to_numeric(role_info[col], errors='coerce')

In [8]:
role_info.to_pickle('./role_info_5d.pkl')

In [9]:
df_train, df_test = train_test_split(role_info,test_size=0.3)
df_train_pay = df_train[df_train['pay_sum']>0]
df_train_nopay = df_train[df_train['pay_sum']==0]

In [10]:
target = df_train_pay['role_created_30_pay_sum']
features = df_train_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
target_ln = np.log1p(target)
X_val, x_test, Y_val, y_test = train_test_split(features,target_ln, test_size=0.3)

In [11]:
# lgb的调参与交叉验证
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 设置参数初始值，不含交叉验证参数
print('设置参数')
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'nthread':4,
    'learning_rate':0.05
}

# 交叉验证（调参）
print('交叉验证')
min_rmse = float('5')
best_params = {}

# 准确率
print('调参1：提高准确率')
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50     
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

if 'num_leaves' and 'max_depth' in best_params.keys():
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# 过拟合
print('调参2：降低过拟合')
min_rmse = float('5')
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50 
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['min_data_in_leaf'] = min_data_in_leaf
            best_params['max_bin'] = max_bin
            
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['max_bin'] = best_params['max_bin']
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']

print('调参3：降低过拟合')
min_rmse = float('5')
for feature_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq

            cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
            mean_rmse = pd.Series(cv_results['rmse-mean']).min()
            boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
            if mean_rmse <= min_rmse:
                min_rmse = mean_rmse
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print('调参4：降低过拟合')
min_rmse = float('5')
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50
                            )
        mean_rmse = pd.Series(cv_results['rmse-mean']).min()
        boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
        if mean_rmse <= min_rmse:
            min_rmse = mean_rmse
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("调参5：降低过拟合2")
min_rmse = float('5')
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                            params,
                            train_data,
                            nfold=3,
                            stratified=False,
                            early_stopping_rounds=10,
                            verbose_eval=50    
                            )
    mean_rmse = pd.Series(cv_results['rmse-mean']).min()
    boost_rounds = pd.Series(cv_results['rmse-mean']).idxmin()
        
    if mean_rmse >= min_rmse:
        min_rmse = mean_rmse
        
    best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print(best_params)

设置参数
交叉验证
调参1：提高准确率
[50]	cv_agg's rmse: 0.839788 + 0.0011524
[100]	cv_agg's rmse: 0.814298 + 0.00113194
[50]	cv_agg's rmse: 0.839788 + 0.0011524
[100]	cv_agg's rmse: 0.814222 + 0.00106968
[50]	cv_agg's rmse: 0.839788 + 0.0011524
[100]	cv_agg's rmse: 0.814222 + 0.00106968
[50]	cv_agg's rmse: 0.839788 + 0.0011524
[100]	cv_agg's rmse: 0.814222 + 0.00106968
[50]	cv_agg's rmse: 0.839788 + 0.0011524
[100]	cv_agg's rmse: 0.814222 + 0.00106968
[50]	cv_agg's rmse: 0.830599 + 0.00109784
[100]	cv_agg's rmse: 0.81311 + 0.00117279
[50]	cv_agg's rmse: 0.828108 + 0.00102136
[100]	cv_agg's rmse: 0.812707 + 0.000984279
[50]	cv_agg's rmse: 0.828128 + 0.0010889
[100]	cv_agg's rmse: 0.812547 + 0.000996237
[50]	cv_agg's rmse: 0.828128 + 0.0010889
[100]	cv_agg's rmse: 0.812426 + 0.000855748
[50]	cv_agg's rmse: 0.828128 + 0.0010889
[100]	cv_agg's rmse: 0.812451 + 0.000904616
[50]	cv_agg's rmse: 0.830599 + 0.00109784
[100]	cv_agg's rmse: 0.81311 + 0.00117279
[50]	cv_agg's rmse: 0.825548 + 0.00115646
[100]	cv_

[100]	cv_agg's rmse: 0.812119 + 0.0010796
[50]	cv_agg's rmse: 0.823177 + 0.00127356
[100]	cv_agg's rmse: 0.811946 + 0.000951119
[50]	cv_agg's rmse: 0.823157 + 0.00136382
[100]	cv_agg's rmse: 0.811972 + 0.0010277
[50]	cv_agg's rmse: 0.823284 + 0.00137999
[100]	cv_agg's rmse: 0.811721 + 0.00112016
[50]	cv_agg's rmse: 0.823365 + 0.00135617
[100]	cv_agg's rmse: 0.811812 + 0.00115399
[50]	cv_agg's rmse: 0.823386 + 0.00132377
[100]	cv_agg's rmse: 0.81176 + 0.00108226
[50]	cv_agg's rmse: 0.823386 + 0.001335
[100]	cv_agg's rmse: 0.811861 + 0.00107153
[50]	cv_agg's rmse: 0.823273 + 0.00133362
[100]	cv_agg's rmse: 0.811762 + 0.000882697
[50]	cv_agg's rmse: 0.823208 + 0.00141287
[100]	cv_agg's rmse: 0.811759 + 0.000829295
[50]	cv_agg's rmse: 0.823256 + 0.00136634
[100]	cv_agg's rmse: 0.811929 + 0.000942436
[50]	cv_agg's rmse: 0.823459 + 0.00120484
[100]	cv_agg's rmse: 0.81265 + 0.00101207
[50]	cv_agg's rmse: 0.82323 + 0.00124555
[100]	cv_agg's rmse: 0.812119 + 0.0010796
[50]	cv_agg's rmse: 0.8231

[100]	cv_agg's rmse: 0.811929 + 0.000942436
[50]	cv_agg's rmse: 0.823459 + 0.00120484
[100]	cv_agg's rmse: 0.81265 + 0.00101207
[50]	cv_agg's rmse: 0.82323 + 0.00124555
[100]	cv_agg's rmse: 0.812119 + 0.0010796
[50]	cv_agg's rmse: 0.823177 + 0.00127356
[100]	cv_agg's rmse: 0.811946 + 0.000951119
[50]	cv_agg's rmse: 0.823157 + 0.00136382
[100]	cv_agg's rmse: 0.811972 + 0.0010277
[50]	cv_agg's rmse: 0.823284 + 0.00137999
[100]	cv_agg's rmse: 0.811721 + 0.00112016
[50]	cv_agg's rmse: 0.823365 + 0.00135617
[100]	cv_agg's rmse: 0.811812 + 0.00115399
[50]	cv_agg's rmse: 0.823386 + 0.00132377
[100]	cv_agg's rmse: 0.81176 + 0.00108226
[50]	cv_agg's rmse: 0.823386 + 0.001335
[100]	cv_agg's rmse: 0.811861 + 0.00107153
[50]	cv_agg's rmse: 0.823273 + 0.00133362
[100]	cv_agg's rmse: 0.811762 + 0.000882697
[50]	cv_agg's rmse: 0.823208 + 0.00141287
[100]	cv_agg's rmse: 0.811759 + 0.000829295
[50]	cv_agg's rmse: 0.823256 + 0.00136634
[100]	cv_agg's rmse: 0.811929 + 0.000942436
[50]	cv_agg's rmse: 0.82

[100]	cv_agg's rmse: 0.811762 + 0.000882697
[50]	cv_agg's rmse: 0.823208 + 0.00141287
[100]	cv_agg's rmse: 0.811759 + 0.000829295
[50]	cv_agg's rmse: 0.823256 + 0.00136634
[100]	cv_agg's rmse: 0.811929 + 0.000942436
[50]	cv_agg's rmse: 0.823459 + 0.00120484
[100]	cv_agg's rmse: 0.81265 + 0.00101207
[50]	cv_agg's rmse: 0.82323 + 0.00124555
[100]	cv_agg's rmse: 0.812119 + 0.0010796
[50]	cv_agg's rmse: 0.823177 + 0.00127356
[100]	cv_agg's rmse: 0.811946 + 0.000951119
[50]	cv_agg's rmse: 0.823157 + 0.00136382
[100]	cv_agg's rmse: 0.811972 + 0.0010277
[50]	cv_agg's rmse: 0.823284 + 0.00137999
[100]	cv_agg's rmse: 0.811721 + 0.00112016
[50]	cv_agg's rmse: 0.823365 + 0.00135617
[100]	cv_agg's rmse: 0.811812 + 0.00115399
[50]	cv_agg's rmse: 0.823386 + 0.00132377
[100]	cv_agg's rmse: 0.81176 + 0.00108226
[50]	cv_agg's rmse: 0.823386 + 0.001335
[100]	cv_agg's rmse: 0.811861 + 0.00107153
[50]	cv_agg's rmse: 0.823273 + 0.00133362
[100]	cv_agg's rmse: 0.811762 + 0.000882697
[50]	cv_agg's rmse: 0.82

[100]	cv_agg's rmse: 0.812913 + 0.00132918
[50]	cv_agg's rmse: 0.825306 + 0.00168999
[100]	cv_agg's rmse: 0.812523 + 0.00160469
[50]	cv_agg's rmse: 0.825481 + 0.00200123
[100]	cv_agg's rmse: 0.813045 + 0.00136307
[50]	cv_agg's rmse: 0.825525 + 0.00176815
[100]	cv_agg's rmse: 0.812768 + 0.001588
[50]	cv_agg's rmse: 0.82517 + 0.00196295
[100]	cv_agg's rmse: 0.812247 + 0.00138968
[50]	cv_agg's rmse: 0.824948 + 0.00199717
[100]	cv_agg's rmse: 0.812075 + 0.00143452
[50]	cv_agg's rmse: 0.824995 + 0.00201233
[100]	cv_agg's rmse: 0.812264 + 0.0012758
[50]	cv_agg's rmse: 0.825436 + 0.00202649
[100]	cv_agg's rmse: 0.812664 + 0.00139071
[50]	cv_agg's rmse: 0.825552 + 0.00193316
[100]	cv_agg's rmse: 0.812507 + 0.00133683
[50]	cv_agg's rmse: 0.825292 + 0.0020608
[100]	cv_agg's rmse: 0.813019 + 0.00109738
[50]	cv_agg's rmse: 0.825379 + 0.00214066
[100]	cv_agg's rmse: 0.812617 + 0.00120916
[50]	cv_agg's rmse: 0.825037 + 0.00147726
[100]	cv_agg's rmse: 0.812276 + 0.00136398
[50]	cv_agg's rmse: 0.82537

[100]	cv_agg's rmse: 0.81173 + 0.00128151
[50]	cv_agg's rmse: 0.824098 + 0.00167777
[100]	cv_agg's rmse: 0.811876 + 0.00154584
[50]	cv_agg's rmse: 0.824125 + 0.00169188
[100]	cv_agg's rmse: 0.81187 + 0.00150876
[50]	cv_agg's rmse: 0.824044 + 0.00173068
[100]	cv_agg's rmse: 0.811643 + 0.00137986
[50]	cv_agg's rmse: 0.824033 + 0.00160314
[100]	cv_agg's rmse: 0.811707 + 0.00122292
[50]	cv_agg's rmse: 0.824212 + 0.00143655
[100]	cv_agg's rmse: 0.811872 + 0.00133706
[50]	cv_agg's rmse: 0.8243 + 0.00150149
[100]	cv_agg's rmse: 0.811843 + 0.00145372
[50]	cv_agg's rmse: 0.824028 + 0.00173712
[100]	cv_agg's rmse: 0.811682 + 0.00134433
[50]	cv_agg's rmse: 0.824028 + 0.00173712
[100]	cv_agg's rmse: 0.811682 + 0.00134433
[50]	cv_agg's rmse: 0.824028 + 0.00173712
[100]	cv_agg's rmse: 0.811682 + 0.00134433
[50]	cv_agg's rmse: 0.824028 + 0.00173712
[100]	cv_agg's rmse: 0.811682 + 0.00134433
[50]	cv_agg's rmse: 0.824028 + 0.00173712
[100]	cv_agg's rmse: 0.811682 + 0.00134433
[50]	cv_agg's rmse: 0.8240

[100]	cv_agg's rmse: 0.811633 + 0.00114641
[50]	cv_agg's rmse: 0.823622 + 0.00185668
[100]	cv_agg's rmse: 0.81192 + 0.00156353
[50]	cv_agg's rmse: 0.823678 + 0.00178815
[100]	cv_agg's rmse: 0.811921 + 0.00135699
[50]	cv_agg's rmse: 0.823706 + 0.00168922
[100]	cv_agg's rmse: 0.812195 + 0.00115199
[50]	cv_agg's rmse: 0.823812 + 0.00155828
[100]	cv_agg's rmse: 0.812381 + 0.00128866
[50]	cv_agg's rmse: 0.823396 + 0.00146031
[100]	cv_agg's rmse: 0.812315 + 0.00117087
[50]	cv_agg's rmse: 0.823674 + 0.00183362
[100]	cv_agg's rmse: 0.811972 + 0.00119289
[50]	cv_agg's rmse: 0.823249 + 0.00127792
[100]	cv_agg's rmse: 0.812006 + 0.00170089
[50]	cv_agg's rmse: 0.823563 + 0.00139435
[100]	cv_agg's rmse: 0.812312 + 0.00100526
[50]	cv_agg's rmse: 0.823572 + 0.00111153
[100]	cv_agg's rmse: 0.811945 + 0.00122178
[50]	cv_agg's rmse: 0.823381 + 0.00147975
[100]	cv_agg's rmse: 0.811633 + 0.00114641
[50]	cv_agg's rmse: 0.823254 + 0.0016427
[100]	cv_agg's rmse: 0.811694 + 0.00139774
[50]	cv_agg's rmse: 0.82

[100]	cv_agg's rmse: 0.811702 + 0.00123934
[50]	cv_agg's rmse: 0.823356 + 0.00139331
[100]	cv_agg's rmse: 0.811702 + 0.00123934
[50]	cv_agg's rmse: 0.823356 + 0.00139331
[100]	cv_agg's rmse: 0.811702 + 0.00123934
[50]	cv_agg's rmse: 0.823284 + 0.00137999
[100]	cv_agg's rmse: 0.811721 + 0.00112016
[50]	cv_agg's rmse: 0.823767 + 0.00205852
[100]	cv_agg's rmse: 0.812647 + 0.00129589
[50]	cv_agg's rmse: 0.823886 + 0.00147042
[100]	cv_agg's rmse: 0.812596 + 0.00136169
[50]	cv_agg's rmse: 0.82385 + 0.00156655
[100]	cv_agg's rmse: 0.812642 + 0.00129993
[50]	cv_agg's rmse: 0.824399 + 0.00130107
[100]	cv_agg's rmse: 0.813218 + 0.00158847
[50]	cv_agg's rmse: 0.823864 + 0.00174313
[100]	cv_agg's rmse: 0.813394 + 0.00093733
[50]	cv_agg's rmse: 0.823919 + 0.00175939
[100]	cv_agg's rmse: 0.812788 + 0.00123758
[50]	cv_agg's rmse: 0.823709 + 0.00136431
[100]	cv_agg's rmse: 0.812399 + 0.00113048
[50]	cv_agg's rmse: 0.823947 + 0.00145888
[100]	cv_agg's rmse: 0.812904 + 0.00142867
[50]	cv_agg's rmse: 0.8

[100]	cv_agg's rmse: 0.811504 + 0.00148875
[50]	cv_agg's rmse: 0.823519 + 0.00172799
[100]	cv_agg's rmse: 0.8115 + 0.00138756
[50]	cv_agg's rmse: 0.823539 + 0.00165175
[100]	cv_agg's rmse: 0.811475 + 0.0013682
[50]	cv_agg's rmse: 0.823586 + 0.00175133
[100]	cv_agg's rmse: 0.811603 + 0.00150241
[50]	cv_agg's rmse: 0.823615 + 0.00173102
[100]	cv_agg's rmse: 0.811496 + 0.00144927
[50]	cv_agg's rmse: 0.823622 + 0.00175941
[100]	cv_agg's rmse: 0.811449 + 0.00159358
[50]	cv_agg's rmse: 0.823484 + 0.00174634
[100]	cv_agg's rmse: 0.811564 + 0.00144683
[50]	cv_agg's rmse: 0.823487 + 0.00174507
[100]	cv_agg's rmse: 0.811572 + 0.00145293
[50]	cv_agg's rmse: 0.82354 + 0.00173893
[100]	cv_agg's rmse: 0.811576 + 0.00143418
[50]	cv_agg's rmse: 0.823484 + 0.00174634
[100]	cv_agg's rmse: 0.811564 + 0.00144683
[50]	cv_agg's rmse: 0.82354 + 0.00173893
[100]	cv_agg's rmse: 0.811576 + 0.00143418
[50]	cv_agg's rmse: 0.823566 + 0.00173765
[100]	cv_agg's rmse: 0.811486 + 0.00139186
[50]	cv_agg's rmse: 0.82354

In [12]:
params = {

'boosting_type':'gbdt',
'objective':'regression',
'metric':'rmse',
'nthread':4,
'learning_rate':0.05,
"max_depth":5,
"num_leaves":30,
"max_bin":255,
"min_data_in_leaf":41,
"min_split_gain":1.0,
"feature_fraction": 0.7,
"bagging_fraction":0.9,
"bagging_freq":5,
"lambda_l1":0.1,
"lambda_l2":0.6,
"metric": "rmse",
}
train_data = lgb.Dataset(X_val, label=Y_val)
val_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
lgb_r_cv = lgb.train(params, train_data, num_boost_round=8000, early_stopping_rounds=100, valid_sets=[train_data, val_data])

[1]	training's rmse: 1.73411	valid_1's rmse: 1.72321
Training until validation scores don't improve for 100 rounds
[2]	training's rmse: 1.66747	valid_1's rmse: 1.65678
[3]	training's rmse: 1.60471	valid_1's rmse: 1.59417
[4]	training's rmse: 1.54664	valid_1's rmse: 1.53632
[5]	training's rmse: 1.49141	valid_1's rmse: 1.48124
[6]	training's rmse: 1.43979	valid_1's rmse: 1.42975
[7]	training's rmse: 1.39155	valid_1's rmse: 1.38168
[8]	training's rmse: 1.34652	valid_1's rmse: 1.33679
[9]	training's rmse: 1.30454	valid_1's rmse: 1.29503
[10]	training's rmse: 1.2654	valid_1's rmse: 1.25612
[11]	training's rmse: 1.22886	valid_1's rmse: 1.21971
[12]	training's rmse: 1.19494	valid_1's rmse: 1.18595
[13]	training's rmse: 1.16345	valid_1's rmse: 1.1546
[14]	training's rmse: 1.1349	valid_1's rmse: 1.1263
[15]	training's rmse: 1.10821	valid_1's rmse: 1.0998
[16]	training's rmse: 1.0835	valid_1's rmse: 1.07522
[17]	training's rmse: 1.06119	valid_1's rmse: 1.05308
[18]	training's rmse: 1.04001	valid

In [13]:
df_test_pay = df_test[df_test['pay_sum']>0]
df_test_nopay = df_test[df_test['pay_sum']==0]

In [14]:
df_test_part1 = df_test_nopay[['user_id','cp_server_no','cp_role_id','pay_num']].rename(columns=
                                                                                {'pay_num':'predict_30_pay'})

In [15]:
target_test = df_test_pay['role_created_30_pay_sum']
target_test_ln = np.log1p(target_test)
features_test = df_test_pay.drop(['role_created_30_pay_sum','user_id','cp_server_no','cp_role_id'], axis=1)
y_predict = lgb_r_cv.predict(features_test)
y_predict[y_predict<0] = 0
mse = mean_squared_error(np.expm1(y_predict), np.expm1(target_test_ln))
mae = mean_absolute_error(np.expm1(y_predict), np.expm1(target_test_ln))
rmse = mse ** 0.5
print(rmse)
print(mse)
print(mae)

2567.3537522067286
6591305.288969969
467.64990958223257


In [16]:
sum(np.expm1(target_test_ln))/sum(np.expm1(y_predict))

1.324403401529047

In [17]:
df_test_part2 = df_test_pay[['user_id','cp_server_no','cp_role_id']]
df_test_part2['predict_30_pay'] = np.expm1(y_predict) *1.32
pred = df_test_part1.append(df_test_part2)
predict_data = pd.merge(df_test[['user_id','cp_server_no','cp_role_id', 'role_created_30_pay_sum']],pred,on=['user_id','cp_server_no','cp_role_id'],how='left',validate='one_to_one')

In [18]:
mse = mean_squared_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
mae = mean_absolute_error(predict_data['predict_30_pay'], predict_data['role_created_30_pay_sum'])
rmse = mse ** 0.5
print('测试集上的均方根误差:%.2f元'% rmse)
print('测试集上的平均绝对误差:%.2f元'% mae)
print('测试集前30天实际总的付费金额:%.2f元' % predict_data['role_created_30_pay_sum'].sum())
print('测试集前30天预测总的付费金额:%.2f元'% predict_data['predict_30_pay'].sum())
print('预测总金额准确率:',predict_data['predict_30_pay'].sum()/predict_data['role_created_30_pay_sum'].sum())

测试集上的均方根误差:377.88元
测试集上的平均绝对误差:12.35元
测试集前30天实际总的付费金额:29874322.00元
测试集前30天预测总的付费金额:28767334.75元
预测总金额准确率: 0.9629451926357577


In [19]:
import joblib
joblib.dump(lgb_r_cv,'./lgb_r_5d.pkl')

['./lgb_r_5d.pkl']