In [None]:
import numpy as np
import pandas as pd
import feather
import os
import tensorflow as tf
from training_utils import *
from scoring_func import get_scores
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from base_models import Model

  from ._conv import register_converters as _register_converters


In [2]:
'''
Config
'''
# Data and result paths.
env = dict(
    train_csv="../data/Avazu/input/train.csv",
    train_feather="../data/Cache/Avazu_train.feather",
    result_dir="../data/Result",
)

# Hyper-parameters and run options; unpacked into constants further down.
args = dict(
    batchsize=10000,
    lr=1e-3,
    emb_size=256,
    dropout=0.,
    model="MLP_concat",
    log="",
    split_mode="date",         # or shuffle
    n_bins=100,
    neg_sample_rate=1,
    n_epochs=1,
    n_hidden=2,
    dropout_type='embedding',  # or 'none', 'field', 'embedding'
    gpu='0',
)

# Optimisation settings.
BATCHSIZE = args['batchsize']
N_EPOCHS = args['n_epochs']
LR = args['lr']  # learning rate
NEG_SAMPLE_RATE = args['neg_sample_rate']

# Model settings.
BASE_MODEL = args['model']
EMB_SIZE = args['emb_size']
N_HIDDEN = args['n_hidden']
N_BINS = args['n_bins']
DROPOUT_RATE = args['dropout']
DROPOUT_TYPE = args['dropout_type']

# Log file and checkpoint path, both derived from the model name.
LOG = "logs/" + BASE_MODEL + '.json'
saver_path = "saver/model" + BASE_MODEL

# Dataset split mode.
split_mode = args['split_mode']

# Enumerate GPUs in PCI bus order and expose only the configured device(s).
os.environ.update({
    'CUDA_DEVICE_ORDER': "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': args['gpu'],
})

In [3]:
# Read the cached data frame and split it into train/valid/test.
data = feather.read_dataframe(env['train_feather'])
# Only rows with day <= 10 are used.
data = data[data.day <= 10]
if split_mode == 'date':
    # Split by day: day < 6 -> train, days 6-7 -> valid, day >= 8 -> test.
    train = data[data.day < 6]
    valid = data[(data.day >= 6) & (data.day <= 7)]
    test = data[data.day >= 8]
else:
    # Random 60/20/20 split. The original shuffled an undefined `df`, which
    # raised NameError; shuffle `data` itself. A fixed seed keeps the split
    # identical across kernel restarts.
    data = data.sample(frac=1, random_state=0)
    n_rows = len(data)
    train_end = int(0.6 * n_rows)
    valid_end = int(0.8 * n_rows)
    train = data[:train_end]
    valid = data[train_end:valid_end]
    test = data[valid_end:]
train.head()

Unnamed: 0,label,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,day
0,0,0,2,0,582,7339,2,7884,254,0,...,1,285,3,2,56,0,2,0,22,0
1,0,0,2,0,582,7339,2,7884,254,0,...,0,283,3,2,56,0,2,65,22,0
2,0,0,2,0,582,7339,2,7884,254,0,...,0,283,3,2,56,0,2,65,22,0
3,0,0,2,0,582,7339,2,7884,254,0,...,0,285,3,2,56,0,2,65,22,0
4,0,0,2,1,4695,4456,0,7884,254,0,...,0,590,3,2,107,0,2,0,42,0


In [None]:
# Feature columns fed to the model, each paired with the size registered for
# it (presumably the number of distinct ids in that field -- confirm against
# the preprocessing step). Insertion order matters: it fixes the order of
# `feat_cols`.
num_words_dict = dict([
    ('C1', 7),
    ('banner_pos', 7),
    # site fields
    ('site_id', 4737),
    ('site_domain', 7745),
    ('site_category', 26),
    # app fields
    ('app_id', 8552),
    ('app_domain', 559),
    ('app_category', 36),
    # device fields
    ('device_model', 8251),
    ('device_type', 5),
    ('device_conn_type', 4),
    # anonymised fields
    ('C14', 2626),
    ('C15', 8),
    ('C16', 9),
    ('C17', 435),
    ('C18', 4),
    ('C19', 68),
    ('C20', 172),
    ('C21', 60),
    # time fields
    ('day', 10),
    ('hour', 24),
])
feat_cols = list(num_words_dict)

In [5]:
# Labels of each split, cast to float32.
train_y, valid_y, test_y = (
    frame['label'].astype(np.float32) for frame in (train, valid, test)
)

# Model inputs of each split: only the selected feature columns.
train_x, valid_x, test_x = (
    frame[feat_cols] for frame in (train, valid, test)
)

In [6]:
# Instantiate the base model from the settings defined in the config cell.
model = Model(
    BASE_MODEL,
    feat_cols,
    num_words_dict,
    emb_size=EMB_SIZE,
    lr=LR,
    dropout_rate=DROPOUT_RATE,
    n_hidden=N_HIDDEN,
    embedding_dropout_type=DROPOUT_TYPE,
)

INFO:tensorflow:Scale of 0 disables regularizer.
INFO:tensorflow:Scale of 0 disables regularizer.


In [7]:
# Result containers, keyed by model name.
scores = {}
valid_predicts, test_predicts = {}, {}
# Validation metrics; previously the return value of get_scores was dropped.
valid_scores = {}

# Train the model.
# Scale the configured batch size by the negative-sampling rate.
batchsize = int(NEG_SAMPLE_RATE*BATCHSIZE)
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation as needed instead of
# reserving it all up front.
config.gpu_options.allow_growth = True
# The session is intentionally left open: later cells reuse `sess`.
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
for _ in range(N_EPOCHS):
    # Draw a fresh sample of the training set for every epoch.
    train_x_sampled, train_y_sampled = neg_sample(train_x, train_y, NEG_SAMPLE_RATE)
    train_on_batch(sess, model.train, train_x_sampled, train_y_sampled, batchsize, lr=LR, shuffle=False, verbose=True)
    # End of epoch: score the validation split and keep the latest result.
    valid_pred, valid_logit = predict_on_batch(sess, model.predict, valid_x)
    valid_scores['base'] = get_scores(valid_y, valid_pred, name='valid')
    valid_predicts['base'] = (valid_pred, valid_logit)

100%|██████████| 2386/2386 [01:42<00:00, 23.29it/s]


valid	{'loss': 0.38420269, 'AUC': 0.75707754, 'MSE': 0.11980446}


In [8]:
# Score the trained base model on the held-out test split.
test_pred, test_logit = predict_on_batch(sess, model.predict, test_x)
test_score = get_scores(test_y, test_pred, name='test')

# Keep the raw predictions and start the score table with the 'base' row.
test_predicts['base'] = (test_pred, test_logit)
scores = pd.DataFrame([test_score], index=['base'])

test	{'loss': 0.39194346, 'AUC': 0.74409745, 'MSE': 0.12155335}
