In [2]:
from pyspark.sql import SparkSession
import os
# Set PYSPARK_PYTHON before the session is requested so the value is already
# in the environment when getOrCreate() runs.
os.environ.update({"PYSPARK_PYTHON": "/usr/bin/python3.6"})

# Hive-enabled Spark session for this notebook; getOrCreate() reuses an
# existing session if one is already active.
spark = (
    SparkSession.builder
    .appName("tkt_kwd_model")
    .enableHiveSupport()
    .getOrCreate()
)



In [17]:
import sys
# Make the parent directory and the current directory importable, in that
# order, so project-local packages resolve from either location.
sys.path.extend(["..", "."])
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from sklearn.model_selection import train_test_split

In [21]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC, Accuracy, BinaryAccuracy

In [3]:
from WDL.model import WideDeep
from WDL.modules import sparseFeature, denseFeature

In [4]:
# Number of leading rows to load from the training CSV; -1 loads the whole file.
sample = -1
file = '../dataset/tkt_kwd_train.csv'

# `nrows` reads only the first `sample` rows and closes the file when done.
# The previous iterator=True / get_chunk() approach returned the same rows but
# left the underlying reader (and its file handle) open for the kernel's lifetime.
data = pd.read_csv(file, nrows=None if sample == -1 else sample)

In [29]:
# Dense feature names for the wide side, parsed from a comma-separated string;
# surrounding whitespace is stripped from every name.
wide_dense_features = list(map(str.strip, 'around_tour, isholiday, is_unexposed_product, hot_p1m, ctr_p1m, order_p1m, gmv_p1m, unitgmv_p1m, comment_p, comment_pl2m, hot_pla1m, ctr_pla1m, order_pla1m, gmv_pla1m, unitgmv_pla1m, queryctr_pldk1m, new_p1m, season_p, basic_p, kwdcoeff_p, text_score, local_poi_distance, local_district_distance, poi_district_distance, businessstatus, dat, date_type, holiday_type, day_of_week, week_of_year, month, isheat, imquality_score, ishighqualitypoi, iscoverimage, hasticket, inchina, isfree'.split(',')))

# Dense feature names for the deep side. Every name has the form
# "<metric>_<window>_<suffix>" and the list is ordered suffix -> metric -> window.
# The names are generated instead of being typed out as one 330-name string:
# that literal was single-quoted yet spanned several physical lines, which
# Python rejects as an unterminated string.
_DEEP_DENSE_WINDOWS = ('3m', '1m', '15d', '7d', '3d')
_DEEP_DENSE_METRICS = ('order_cnt', 'gmv', 'expos_uv', 'click_uv', 'ctr',
                       'unit_decay_gmv', 'ctcvr', 'cvr')
# Suffix groups in their original column order.
_DEEP_DENSE_SUFFIXES = ('p', 'pl', 'pdk', 'u', 'uk', 'puk', 'k', 'pu')
# Only these suffix groups carry the additional trailing 'detail_uv' metric.
_DEEP_DENSE_SUFFIXES_WITH_DETAIL_UV = ('p', 'pl')

deep_dense_features = [
    '_'.join((metric, window, suffix))
    for suffix in _DEEP_DENSE_SUFFIXES
    for metric in _DEEP_DENSE_METRICS + (
        ('detail_uv',) if suffix in _DEEP_DENSE_SUFFIXES_WITH_DETAIL_UV else ()
    )
    for window in _DEEP_DENSE_WINDOWS
]

# Sparse feature names for the deep side, parsed from a comma-separated string;
# surrounding whitespace is stripped from every name.
deep_sparse_features = list(map(str.strip, 'districtid_enc_index, user_residgscityid_enc_index, localdistrictid_enc_index, userlocaltype_enc_index, district_userlocaltype_enc_index, age_enc_index, gender_enc_index, star_enc_index, member_level_enc_index, crown_enc_index, user_value_enc_index, themeid_enc_index'.split(',')))

In [30]:
# Quick smoke test: work with a small slice of each feature group.
# The slices are bound to new names instead of being written back onto
# wide_dense_features / deep_dense_features / deep_sparse_features. Overwriting
# the inputs made this cell non-idempotent: running it a second time re-sliced
# the already shortened sparse list with [3:6] and produced an empty list.
wide_dense_subset = wide_dense_features[:5]
deep_dense_subset = deep_dense_features[:5]
deep_sparse_subset = deep_sparse_features[3:6]

dense_features = wide_dense_subset + deep_dense_subset
sparse_features = deep_sparse_subset

# Map each sparse column name to the maximum value observed in that column.
# NOTE(review): if this value sizes an embedding table and the encoded indices
# are 0-based, the table needs max + 1 rows -- confirm how WDL treats feat_num.
sparse_features_num = dict(data[sparse_features].max())

In [32]:
# Feature specs: one denseFeature per continuous column and one sparseFeature
# (8-dimensional embedding) per discrete column.
dense_feature_list = list(map(denseFeature, dense_features))  # all continuous features
sparse_feature_list = [
    sparseFeature(name, feat_num=sparse_features_num[name], embed_dim=8)
    for name in sparse_features
]  # all discrete features
all_feature_list = dense_feature_list + sparse_feature_list

# Wide-side and deep-side features can be specified separately; per the
# original author's note, leaving a list unspecified defaults to all features.
wide_feature_list = []
deep_feature_list = []

model = WideDeep(
    dense_feature_list,
    sparse_feature_list,
    wide_feature_list,
    deep_feature_list,
    hidden_units=[64, 32],
    activation='relu',
    dnn_dropout=0.2,
)

In [None]:
# Print the summary of the WideDeep model instance created earlier in this notebook.
model.summary()

In [34]:
# Display the combined feature specs: dense entries first, then sparse entries.
all_feature_list

[{'feat_name': 'around_tour', 'type': 'dense'},
 {'feat_name': 'isholiday', 'type': 'dense'},
 {'feat_name': 'is_unexposed_product', 'type': 'dense'},
 {'feat_name': 'hot_p1m', 'type': 'dense'},
 {'feat_name': 'ctr_p1m', 'type': 'dense'},
 {'feat_name': 'order_cnt_3m_p', 'type': 'dense'},
 {'feat_name': 'order_cnt_1m_p', 'type': 'dense'},
 {'feat_name': 'order_cnt_15d_p', 'type': 'dense'},
 {'feat_name': 'order_cnt_7d_p', 'type': 'dense'},
 {'feat_name': 'order_cnt_3d_p', 'type': 'dense'},
 {'feat_name': 'userlocaltype_enc_index',
  'feat_num': 2,
  'embed_dim': 8,
  'type': 'sparse'},
 {'feat_name': 'district_userlocaltype_enc_index',
  'feat_num': 2,
  'embed_dim': 8,
  'type': 'sparse'},
 {'feat_name': 'age_enc_index',
  'feat_num': 84,
  'embed_dim': 8,
  'type': 'sparse'}]

In [36]:
# Fixed seed so the train/test partition, and therefore the reported test
# metrics, are the same on every run; without it each execution drew a
# different random split.
SPLIT_SEED = 42

# Select the model inputs in the same order as all_feature_list, so column i of
# the feature matrix corresponds to all_feature_list[i].
feature_columns = [feat['feat_name'] for feat in all_feature_list]
data_df = data[feature_columns]
target = data['isorder']  # label column

# 80/20 split; features are cast to float32 and labels to int32.
train, test, train_y, test_y = train_test_split(
    data_df.values.astype('float32'),
    target.values.astype('int32'),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [37]:
# Turn each 2-D feature matrix into a list of 1-D column arrays, one per
# feature. Iterating over the transpose yields the columns in order.
train_X = list(train.T)
test_X = list(test.T)

In [38]:
# Sanity check: shape of the first feature column in the train and test inputs.
print(train_X[0].shape, test_X[0].shape, sep='\n')

(101710,)
(25428,)


In [39]:
# Training hyperparameters used by the compile / fit / evaluate cells.
learning_rate = 1e-3  # optimizer step size
batch_size = 128      # samples per batch for fit and evaluate
epochs = 8            # upper bound on training epochs

In [40]:
# Binary cross-entropy objective optimised with Adam. The metrics are listed as
# AUC first, then binary accuracy at a 0.5 threshold; the evaluation cell
# indexes its results in this same order.
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss=binary_crossentropy,
    metrics=[
        AUC(),
        BinaryAccuracy(threshold=0.5),
    ],
)

In [41]:
# Early stopping watches val_loss with a patience of 2 epochs and restores the
# best weights when it triggers.
# NOTE(review): the original marked this spot "checkpoint" -- presumably a
# checkpoint callback was planned here; confirm with the author.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# validation_split=0.1 provides the validation data that val_loss is computed on.
model.fit(
    train_X,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f33c8468710>

In [42]:
# Evaluate on the held-out test inputs. The result is indexed as
# [loss, AUC, accuracy], matching the metrics order given to compile().
loss = model.evaluate(test_X, test_y, batch_size=batch_size)
test_loss, test_auc, test_accuracy = loss[:3]
print('test loss: %f' % test_loss)
print('test AUC: %f' % test_auc)
print('test accuracy: %f' % test_accuracy)

test loss: 0.281551
test AUC: 0.899704
test accuracy: 0.894447


In [6]:
# # example
# # two dense features
# dense_f1 = denseFeature('dense_feature_1')
# dense_f2 = denseFeature('dense_feature_2')
# # two sparse features
# sparse_f1 = sparseFeature('sparse_feature_1', feat_num=2, embed_dim=8) 
# sparse_f2 = sparseFeature('sparse_feature_2', feat_num=5, embed_dim=8)
# # 
# dense_feature_list = [dense_f1, dense_f2] # all continuous features
# sparse_feature_list = [sparse_f1, sparse_f2] # all discrete features
# # specify the wide-side and deep-side features separately
# wide_feature_list = [dense_f1, dense_f2] # features for the wide side; defaults to all when unspecified
# deep_feature_list = [dense_f2, sparse_f1] # features for the deep side; defaults to all when unspecified


# model = WideDeep(dense_feature_list, sparse_feature_list, wide_feature_list, deep_feature_list) 
# model.summary()

# batch = 128
# d_f1 = tf.constant(np.random.rand(batch, 1))
# d_f2 = tf.constant(np.random.rand(batch, 1))
# s_f1 = tf.constant(np.random.randint(2, size=(batch, 1)), dtype=tf.float32)
# s_f2 = tf.constant(np.random.randint(5, size=(batch, 1)), dtype=tf.float32)
# inputs = [d_f1, d_f2, s_f1, s_f2]
# outputs = model(inputs)
# print(outputs.shape)