In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import missingno
import time
from tqdm import tqdm,tqdm_notebook,tqdm_pandas
%matplotlib inline

## 读取历史曝光数据

In [2]:
totalExposureLog_data = pd.read_csv('./totalExposureLog.out', sep='\t',header=None,
                                    names=['广告请求id', '广告请求时间','广告位id','用户id','曝光广告id',
                                                 '曝光广告素材尺寸','曝光广告出价bid','pctr','equality_ecpm','total_ecpm'])

In [4]:
df_chunk = totalExposureLog_data

In [6]:
df_chunk['counts_day'] = 1
df_chunk['广告请求时间']  = df_chunk['广告请求时间'].apply(lambda x: time.strftime("%Y-%m-%d",time.localtime(x))).values
count_day = df_chunk.groupby(['曝光广告id', '广告请求时间']).agg({'counts_day':np.sum})
df_chunk = df_chunk.merge(count_day.reset_index(),how='inner', on=['曝光广告id','广告请求时间']).drop('counts_day_x',axis=1)
df_chunk.to_csv('./totalExposureLog_pro.csv', index=None)

In [7]:
df_chunk.head()

Unnamed: 0,广告请求id,广告请求时间,广告位id,用户id,曝光广告id,曝光广告素材尺寸,曝光广告出价bid,pctr,equality_ecpm,total_ecpm,counts_day_y
0,53991770,2019-02-17,94,1160618,451525,50,46,47.217,944.34,3122.34,11
1,17313214,2019-02-17,94,644259,451525,50,56,39.129,782.58,2960.58,11
2,6684446,2019-02-17,94,477892,451525,50,18,119.905,2398.1,4576.1,11
3,52666713,2019-02-17,94,861490,451525,50,100,21.763,435.26,2613.26,11
4,20655168,2019-02-17,94,570580,451525,50,20,107.212,2144.24,4322.24,11


In [14]:
import xgboost
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [15]:
# 选择几个特征，试一试
train_data, val_data, train_tar, val_tar = train_test_split(df_chunk[['广告请求时间','曝光广告id','pctr','total_ecpm']], 
                                                            df_chunk['counts_day_y'],test_size=0.3, random_state=10)

## LGBM 

In [20]:
lgb_train = lgb.Dataset(train_data.drop('广告请求时间',axis=1), train_tar) 
lgb_eval = lgb.Dataset(val_data.drop('广告请求时间',axis=1), val_tar, reference=lgb_train)  

In [None]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}
start = time.time()


lgb_origi = lgb.train(train_set=lgb_train,
                      early_stopping_rounds=20,
                      num_boost_round=10000,
                      params=param,
                      valid_sets=lgb_eval)
end = time.time()
print('运行时间为{}秒'.format(round(end-start,0)))

[1]	valid_0's rmse: 2824.65
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's rmse: 2823.28
[3]	valid_0's rmse: 2818.93
[4]	valid_0's rmse: 2817.53
[5]	valid_0's rmse: 2813.28
[6]	valid_0's rmse: 2811.96
[7]	valid_0's rmse: 2807.81
[8]	valid_0's rmse: 2806.53
[9]	valid_0's rmse: 2802.39
[10]	valid_0's rmse: 2798.56
[11]	valid_0's rmse: 2797.34
[12]	valid_0's rmse: 2793.15
[13]	valid_0's rmse: 2789.05
[14]	valid_0's rmse: 2787.85
[15]	valid_0's rmse: 2783.83
[16]	valid_0's rmse: 2779.89
[17]	valid_0's rmse: 2776.02
[18]	valid_0's rmse: 2774.86
[19]	valid_0's rmse: 2773.72
[20]	valid_0's rmse: 2770.26
[21]	valid_0's rmse: 2766.87
[22]	valid_0's rmse: 2763.55
[23]	valid_0's rmse: 2762.47
[24]	valid_0's rmse: 2761.41
[25]	valid_0's rmse: 2760.36
[26]	valid_0's rmse: 2756.75
[27]	valid_0's rmse: 2753.2
[28]	valid_0's rmse: 2752.2
[29]	valid_0's rmse: 2751.22
[30]	valid_0's rmse: 2747.79
[31]	valid_0's rmse: 2744.58
[32]	valid_0's rmse: 2741.37
[33]	valid_0's rmse: 

[277]	valid_0's rmse: 2524.04
[278]	valid_0's rmse: 2523.51
[279]	valid_0's rmse: 2523.1
[280]	valid_0's rmse: 2522.75
[281]	valid_0's rmse: 2522.28
[282]	valid_0's rmse: 2522.05
[283]	valid_0's rmse: 2521.64
[284]	valid_0's rmse: 2521.3
[285]	valid_0's rmse: 2521.14
[286]	valid_0's rmse: 2520.99
[287]	valid_0's rmse: 2520.83
[288]	valid_0's rmse: 2520.39
[289]	valid_0's rmse: 2520.24
[290]	valid_0's rmse: 2519.84
[291]	valid_0's rmse: 2519.61
[292]	valid_0's rmse: 2518.96
[293]	valid_0's rmse: 2518.66
[294]	valid_0's rmse: 2518.08
[295]	valid_0's rmse: 2517.7
[296]	valid_0's rmse: 2517.54
[297]	valid_0's rmse: 2517.31
[298]	valid_0's rmse: 2517.17
[299]	valid_0's rmse: 2517.02
[300]	valid_0's rmse: 2516.88
[301]	valid_0's rmse: 2516.38
[302]	valid_0's rmse: 2516.25
[303]	valid_0's rmse: 2515.83
[304]	valid_0's rmse: 2515.55
[305]	valid_0's rmse: 2514.99
[306]	valid_0's rmse: 2514.54
[307]	valid_0's rmse: 2514.18
[308]	valid_0's rmse: 2513.71
[309]	valid_0's rmse: 2513.29
[310]	valid_0

In [None]:
print('Save model...')
# save model to file
lgb_origi.save_model('./baseline_model/lightgbm_model_lph_42120.txt')

## XGBoost

In [10]:
clf = xgboost.XGBRegressor(colsample_bytree=0.3,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=1668,                                                                  
                 reg_alpha=1,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

In [None]:
clf.fit()