In [None]:
!pip install deepctr[gpu]

### DeepFM_LSTM
在 DeepFM 的基础上增加 LSTM 分支来抽取标签序列特征（代码中仍保留 Transformer 模块，用于生成 deep / FM 部分的序列表示），并使用 Word2Vec 对标签向量进行预训练。

In [None]:
cd /content/drive/MyDrive/DeepLearningRec/XunFei

In [None]:
import tensorflow as tf
from tensorflow.python.keras.layers import Input, Embedding, Flatten
from tensorflow.python.keras.initializers import RandomNormal, Zeros
from tensorflow.python.keras.regularizers import l2
from deepctr.layers.sequence import SequencePoolingLayer,Transformer
from deepctr.layers.utils import concat_func, Linear, add_func
from deepctr.layers.core import DNN, PredictionLayer
from deepctr.layers.interaction import FM
from tensorflow.keras.models import *
from tensorflow.keras.layers import *


def deepFM(nums_dict, embedding_size=int(64), tag_history_max_len=50, dnn_hidden_units=(128, 128),
           l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
           dnn_activation='relu', dnn_use_bn=False, task='binary', tag_pretrained_embedding=None, num_transformers=1):
    """Build a DeepFM model with an additional LSTM branch over the tag-id history.

    The final logit is the sum of four terms: a linear (LR) term, an FM term,
    a DNN term and an LSTM term. The tag-id sequence embedding is fed to the
    LSTM directly, and is passed through a stack of Transformer layers before
    it is used by the DNN and FM parts.

    Args:
        nums_dict: mapping with the keys 'gender', 'age', 'province', 'city'
            and 'tagid_history'; each value is used as the `input_dim`
            (vocabulary size) of the corresponding Embedding layers.
        embedding_size: dimension of the deep/FM embeddings. It is split over
            8 attention heads, so it is expected to be divisible by 8.
        tag_history_max_len: padded length of the `tagid_history` input.
        dnn_hidden_units: hidden layer sizes forwarded to deepctr's `DNN`.
        l2_reg_linear: L2 factor applied to the 1-d (linear part) embeddings.
        l2_reg_embedding: L2 factor applied to the deep/FM embeddings.
        l2_reg_dnn: L2 factor forwarded to `DNN`.
        seed: seed forwarded to `Linear`, `DNN` and the final Dense initializer.
        dnn_dropout: dropout rate forwarded to `DNN`.
        dnn_activation: activation forwarded to `DNN`.
        dnn_use_bn: whether `DNN` uses batch normalisation.
        task: task name forwarded to `PredictionLayer`.
        tag_pretrained_embedding: optional weight matrix used to initialise a
            frozen (non-trainable) tag embedding. When None, a trainable,
            randomly initialised and L2-regularised tag embedding is used.
        num_transformers: total number of Transformer layers applied to the
            tag sequence (the last one uses output_type='mean').

    Returns:
        A `tf.keras.models.Model` whose inputs are named 'gender', 'age',
        'province', 'city', 'tagid_history' and 'tagid_history_len'.
    """
    head_num = 8
    att_embedding_size = int(embedding_size / head_num)

    def _id_embedding(feature, dim, l2_reg, name, mask_zero=False):
        # One randomly initialised, L2-regularised embedding table for `feature`.
        return Embedding(nums_dict[feature], dim,
                         embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
                         embeddings_regularizer=l2(l2_reg),
                         name=name,
                         mask_zero=mask_zero)

    # Input layer.
    gender = Input(shape=(1,), name="gender", dtype="int32")
    age = Input(shape=(1,), name="age", dtype="int32")
    province = Input(shape=(1,), name="province", dtype="int32")
    city = Input(shape=(1,), name="city", dtype="int32")
    # Padded tag-id sequence and its true (unpadded) length.
    # The 'model' and 'make' features are currently not fed to the network.
    tagid_history = Input(shape=(tag_history_max_len,), name="tagid_history", dtype="int32")
    tagid_hist_len = Input(shape=(1,), name='tagid_history_len', dtype="int32")

    # Embedding layer, linear (LR) part: one scalar weight per id.
    gender_lr_emb = _id_embedding('gender', 1, l2_reg_linear, "gender_lr_emb")(gender)  # (B, 1, 1)
    age_lr_emb = _id_embedding('age', 1, l2_reg_linear, "age_lr_emb")(age)  # (B, 1, 1)
    province_lr_emb = _id_embedding('province', 1, l2_reg_linear, "province_lr_emb")(province)  # (B, 1, 1)
    city_lr_emb = _id_embedding('city', 1, l2_reg_linear, "city_lr_emb")(city)  # (B, 1, 1)
    tagid_lr_emb = _id_embedding('tagid_history', 1, l2_reg_linear, "tagid_lr_emb",
                                 mask_zero=True)(tagid_history)  # (B, max_len, 1)

    # Embedding layer, deep/FM part.
    gender_emb = _id_embedding('gender', embedding_size, l2_reg_embedding, "gender_emb")(gender)  # (B, 1, d)
    age_emb = _id_embedding('age', embedding_size, l2_reg_embedding, "age_emb")(age)  # (B, 1, d)
    province_emb = _id_embedding('province', embedding_size, l2_reg_embedding, "province_emb")(province)  # (B, 1, d)
    city_emb = _id_embedding('city', embedding_size, l2_reg_embedding, "city_emb")(city)  # (B, 1, d)
    if tag_pretrained_embedding is None:
        # No pretrained vectors supplied: learn the tag embedding from scratch.
        tagid_emb_layer = _id_embedding('tagid_history', embedding_size, l2_reg_embedding, "tagid_emb",
                                        mask_zero=True)
    else:
        # Frozen embedding initialised from the pretrained matrix. The sequence
        # length comes from the function argument, not from a notebook global.
        tagid_emb_layer = Embedding(nums_dict['tagid_history'], embedding_size,
                                    input_length=tag_history_max_len,
                                    weights=[tag_pretrained_embedding],
                                    trainable=False,
                                    name="tagid_emb",
                                    mask_zero=True)
    tagid_emb = tagid_emb_layer(tagid_history)  # (B, max_len, d)

    # Sum-pool the per-tag linear weights over the valid part of the sequence.
    tagid_lr_emb = SequencePoolingLayer(mode='sum')((tagid_lr_emb, tagid_hist_len))  # (B, 1, 1)

    # LSTM branch: extract a sequence feature and map it straight to a logit.
    l = LSTM(128)(tagid_emb)
    flat = BatchNormalization()(l)
    dropout = Dropout(0.4)(flat)
    lstm_output = Dense(1)(dropout)

    # Transformer stack: the same length-derived mask serves as query and key mask.
    seq_mask = tf.squeeze(tf.sequence_mask(tagid_hist_len, maxlen=tag_history_max_len, dtype=tf.float32), axis=1)
    # A plain Python range is used: the loop builds layers, it is not a tensor op.
    for _ in range(num_transformers - 1):
        tagid_emb = Transformer(att_embedding_size=att_embedding_size, head_num=head_num,
                                use_positional_encoding=False, supports_masking=True, output_type=None)\
            (inputs=[tagid_emb, tagid_emb], mask=[seq_mask, seq_mask])
    # The last layer averages over the sequence (output_type='mean').
    tagid_emb = Transformer(att_embedding_size=att_embedding_size, head_num=head_num,
                            use_positional_encoding=False, supports_masking=True, output_type='mean')\
        (inputs=[tagid_emb, tagid_emb], mask=[seq_mask, seq_mask])

    # Linear part.
    lr_emb_concat = Flatten()(concat_func([gender_lr_emb, age_lr_emb, province_lr_emb, city_lr_emb, tagid_lr_emb]))
    lr_output = Linear(mode=0, seed=seed)(lr_emb_concat)
    # Deep part.
    dnn_emb_concat = Flatten()(concat_func([gender_emb, age_emb, province_emb, city_emb, tagid_emb]))
    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=seed)(dnn_emb_concat)
    deep_output = tf.keras.layers.Dense(
        1, use_bias=False, kernel_initializer=tf.keras.initializers.glorot_normal(seed=seed))(dnn_output)
    # FM part: fields are stacked along axis 1.
    fm_output = FM()(concat_func([gender_emb, age_emb, province_emb, city_emb, tagid_emb], axis=1))

    # Sum the four logits and apply the task-specific output layer.
    all_output = add_func([lr_output, fm_output, deep_output, lstm_output])
    output = PredictionLayer(task)(all_output)
    model = tf.keras.models.Model(
        inputs=[gender, age, province, city, tagid_history, tagid_hist_len], outputs=output)
    return model

In [None]:
import math
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import deepctr
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, Adagrad
from gensim.models import Word2Vec
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [None]:
import os

# Locations of the raw competition files (relative to the working directory).
train_path = './Dataset/train'
test_path = './Dataset/test'

# The files carry no header row; the test file has the same columns as the
# training file except for the target column 'label'.
TRAIN_COLUMNS = ['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
TEST_COLUMNS = [column for column in TRAIN_COLUMNS if column != 'label']

train = pd.read_csv(os.path.join(train_path, 'train.txt'), names=TRAIN_COLUMNS)
test = pd.read_csv(os.path.join(test_path, 'apply_new.txt'), names=TEST_COLUMNS)

# Stack train and test into one frame with a fresh 0..n-1 index; test rows end
# up with a missing 'label'.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# Convert the 'tagid' and 'time' columns from their textual list form into
# real Python lists. Tag ids are additionally turned into strings.
import ast


def _parse_list_literal(value):
    """Parse a list literal read from the data file.

    `ast.literal_eval` is used instead of `eval` so that file contents are
    never executed as code. Values that are not strings (e.g. already parsed
    lists when this cell is re-run) are returned unchanged.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


data['tagid'] = data['tagid'].apply(lambda raw: [str(tag) for tag in _parse_list_literal(raw)])
data['time'] = data['time'].apply(_parse_list_literal)

In [None]:
embed_size = 64             # dimension of the pretrained tag vectors
MAX_NB_WORDS = 230637       # `num_words` passed to the Keras tokenizer
MAX_SEQUENCE_LENGTH = 128   # padded / truncated length of a tag sequence


def _sort_tags_by_time(tags, times):
    """Order one user's tags by timestamp, newest first.

    If the two lists differ in length, the timestamps are cut to the number of
    tags before sorting. Returns the reordered tags and the timestamps in the
    same order, both as plain Python lists.
    """
    tag_arr = np.array(tags)
    time_arr = np.array(times)
    if len(tag_arr) != len(time_arr):
        time_arr = time_arr[:len(tag_arr)]
    order = np.argsort(time_arr)[::-1]  # descending timestamps
    return tag_arr[order].tolist(), time_arr[order].tolist()


# Arrange every user's tag ids as a time-ordered sequence.
all_tag_id = []
all_time = []
all_tag_len = []

# Iterate over the two columns directly instead of doing a `.loc` lookup per row.
for tags, times in tqdm(zip(data['tagid'], data['time']), total=len(data)):
    sorted_tags, sorted_times = _sort_tags_by_time(tags, times)
    all_tag_id.append(sorted_tags)
    all_time.append(sorted_times)
    # Sequence length as seen by the model after truncation.
    all_tag_len.append(min(len(sorted_tags), MAX_SEQUENCE_LENGTH))

data['tagid'] = all_tag_id
# Keep 'time' aligned with the reordered tags; otherwise the two columns no
# longer correspond and re-running this cell would scramble the tag order.
data['time'] = all_time
data['tagid_history_len'] = all_tag_len

In [None]:
# Pretrain tag vectors with word2vec.
# The pretraining runs over the tag sequences of all rows (train and test).
import gensim

# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
# keyword names that match the installed version so the cell runs on both.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_kwargs = {'vector_size': embed_size, 'epochs': 10}
else:
    w2v_kwargs = {'size': embed_size, 'iter': 10}

w2v_model = Word2Vec(sentences=data['tagid'].tolist(),
                     window=10, min_count=4, **w2v_kwargs)

In [None]:
# Turn every tag sequence into a fixed-length sequence of integer ids.
tag_sentences = data['tagid']

# The tokenizer assigns each distinct tag a new integer key.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(tag_sentences)
word_index = tokenizer.word_index

# Pad / truncate at the end so every row has MAX_SEQUENCE_LENGTH ids.
All_Tags = sequence.pad_sequences(
    tokenizer.texts_to_sequences(tag_sentences),
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

# One extra row because tokenizer ids start at 1 (0 is the padding id).
nb_words = len(word_index) + 1
print('Total %s word vectors.' % nb_words)

In [None]:
# Build the initial weight matrix of the Embedding layer from the pretrained
# word2vec vectors. Row `k` belongs to the tag whose tokenizer id is `k`.
embedding_matrix = np.zeros((nb_words, embed_size))

for token, row in word_index.items():
    try:
        vector = w2v_model.wv.get_vector(token)
    except KeyError:
        # Tag is not in the word2vec vocabulary: its row stays all zeros.
        pass
    else:
        embedding_matrix[row] = vector

In [None]:
# Attach the padded id sequences and work on a copy from here on.
data['tagid_history'] = All_Tags.tolist()
new_data = data.copy()

sparse_features = ["gender", "age", "province", "city"]
target = ['label']

# Missing gender / age values get their own category (-1).
new_data = new_data.fillna({"gender": -1, "age": -1})

# Label-encode every sparse feature and record its cardinality.
num_dict = {}            # number of distinct values per feature
embedding_dim_dict = {}  # embedding dimension per feature
for feat in sparse_features:
    lbe = LabelEncoder()
    new_data[feat] = lbe.fit_transform(new_data[feat])
    num_dict[feat] = len(lbe.classes_)
    embedding_dim_dict[feat] = 64
num_dict.update(tagid_history=nb_words)
embedding_dim_dict.update(tagid_history=64)

# Rows with a label form the training set, rows without one the test set.
is_unlabeled = new_data['label'].isna()
X_train = new_data[~is_unlabeled]
X_test = new_data[is_unlabeled]
y = X_train['label']

In [None]:
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
oof_nn = np.zeros(len(X_train))           # out-of-fold predictions on the training rows
predictions_nn = np.zeros((len(X_test)))  # test predictions, averaged over the folds

# Feature groups are the same for every fold, so they are defined once.
sparse_features = ["gender", "age", "province", "city"]
dense_features = ["tagid_history_len"]
varlen_features = ["tagid_history"]
target = ['label']


def _build_model_input(frame):
    """Return the dict of named model inputs for the rows of `frame`."""
    model_input = {name: frame[name] for name in sparse_features + dense_features}
    for name in varlen_features:
        # List-valued column -> 2-D integer array (rows x sequence length).
        model_input[name] = np.array(frame[name].values.tolist())
    return model_input


# The test inputs do not depend on the fold.
test_model_input = _build_model_input(X_test)

# Five-fold cross-validation.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:', trn_idx)
    print('val_idx:', val_idx)

    # Drop the previous fold's Keras state before building a new model.
    tf.keras.backend.clear_session()

    trn_data = X_train.iloc[trn_idx]
    val_data = X_train.iloc[val_idx]
    trn_model_input = _build_model_input(trn_data)
    val_model_input = _build_model_input(val_data)

    model = deepFM(num_dict, 64, tag_history_max_len=MAX_SEQUENCE_LENGTH, dnn_hidden_units=(256, 128),
                   l2_reg_linear=0.00001, l2_reg_embedding=1e-2, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
                   dnn_activation='relu', dnn_use_bn=True, task='binary', tag_pretrained_embedding=embedding_matrix, num_transformers=4)
    # The AUC metric is named explicitly: with the plain string "AUC" Keras
    # auto-numbers the metric on later models ("auc_1", "auc_2", ...), so the
    # 'val_auc' key monitored by EarlyStopping would not exist after fold 0.
    model.compile(Adam(), "binary_crossentropy",
                  metrics=["binary_crossentropy", tf.keras.metrics.AUC(name="auc")])
    es = EarlyStopping(monitor='val_auc', patience=5, restore_best_weights=True, mode="max")
    history = model.fit(trn_model_input, trn_data[target].values, batch_size=1024, epochs=100,
                        verbose=1, validation_data=(val_model_input, val_data[target].values),
                        callbacks=[es])
    oof_nn[val_idx] = model.predict(val_model_input, batch_size=128).reshape(-1)
    # Average (rather than sum) the fold predictions so the result stays a probability.
    predictions_nn += model.predict(test_model_input, batch_size=128).reshape(-1) / KF.get_n_splits()

# Threshold the out-of-fold probabilities once and reuse the labels below.
oof_labels = (oof_nn >= 0.5).astype(int)
print("AUC score: {}".format(roc_auc_score(y, oof_nn)))
print("F1 score: {}".format(f1_score(y, oof_labels)))
print("Precision score: {}".format(precision_score(y, oof_labels)))
print("Recall score: {}".format(recall_score(y, oof_labels)))

In [None]:
# Use the median test prediction as the decision threshold: rows strictly
# above it are labelled 1, the rest 0.
t = np.median(predictions_nn)

# `X_test` is a slice of `new_data`; build a new frame with `assign` instead of
# writing columns into the slice (chained assignment).
X_test = X_test.assign(
    category_id=(predictions_nn > t).astype(int),
    user_id=X_test['pid'],
)
X_test[['user_id', 'category_id']].to_csv('deepfm_741.csv', index=False)