In [1]:
import ast
import math
import warnings

import deepctr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam, Adagrad
from tensorflow.python.keras.callbacks import EarlyStopping
from tqdm import tqdm

from myModel import deepFM

# Silence every Python warning to keep cell output compact.
# NOTE(review): this blanket filter also hides pandas / TensorFlow warnings
# (e.g. chained-assignment or deprecation notices) that can point at real problems.
warnings.filterwarnings('ignore')
# Display all columns when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)


In [2]:
# Read the header-less training and test files and stack them into a single frame
# so that the feature engineering below is applied to every row at once.
test_columns = ['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'make', 'model']
# The training file has the same fields plus 'label' directly after 'pid'.
train_columns = test_columns[:1] + ['label'] + test_columns[1:]

train = pd.read_csv('../data/train.txt', names=train_columns, header=None)
test = pd.read_csv('../data/test.txt', names=test_columns, header=None)

# Test rows carry no 'label', so that column is NaN for them in the combined frame.
data = pd.concat([train, test], ignore_index=True)

In [3]:
# Sort each row's tag ids by timestamp (newest first), keep the MAX_TAG_LEN most recent
# ones and zero-pad shorter histories so that every row has the same length.
MAX_TAG_LEN = 50


def _parse_list(raw):
    """Parse a stringified Python list; values that are not strings are returned unchanged.

    ``ast.literal_eval`` only accepts Python literals, whereas the previous ``eval`` would
    execute any code embedded in the data files. Passing non-strings through makes this
    cell safe to re-run after the columns have already been parsed.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


data['tagid'] = data['tagid'].apply(_parse_list)
data['time'] = data['time'].apply(_parse_list)

all_tag_id = []      # flat list of tag ids, MAX_TAG_LEN entries per row
all_tag_weight = []  # flat list of recency weights, aligned with all_tag_id
all_tag_len = []     # number of real (non-padding) tags per row
for row_tagids, row_times in tqdm(zip(data['tagid'], data['time']), total=len(data)):
    tagid_list = np.array(row_tagids)
    time_list = np.array(row_times)
    # Indices of the most recent MAX_TAG_LEN entries, newest first.
    index = np.argsort(time_list)[::-1][:MAX_TAG_LEN]
    # Rescale the timestamps into 30-day units (presumably from milliseconds - TODO confirm).
    sort_time_list = time_list[index] / 1000 / 3600 / 24 / 30
    sort_tagid_list = tagid_list[index]
    if len(index):
        latest_time = sort_time_list[0]
        # Weight is 1.0 for the newest tag and decays towards 0.5 for older ones.
        tag_weight = [(1 + math.exp(t - latest_time)) / 2 for t in sort_time_list]
    else:
        # A row without any tag history becomes pure padding instead of raising IndexError.
        tag_weight = []
    padding = [0] * (MAX_TAG_LEN - len(index))
    all_tag_id.extend(sort_tagid_list.tolist() + padding)
    all_tag_weight.extend(tag_weight + padding)
    all_tag_len.append(len(index))

# Re-index the tag ids (including the padding value 0) into contiguous integers and
# reshape to one row of MAX_TAG_LEN ids per sample.
taglbe = LabelEncoder()
new_tag_id = taglbe.fit_transform(all_tag_id).reshape(-1, MAX_TAG_LEN)

100%|██████████| 400000/400000 [00:23<00:00, 17041.02it/s]


In [4]:
# Attach the padded tag histories, their recency weights and their true lengths to the
# raw frame. The concat is index-aligned, so both frames must have the same row count.
history_features = pd.DataFrame({
    "tagid_history": new_tag_id.tolist(),
    "tagid_weight": np.array(all_tag_weight).reshape(-1, 50).tolist(),
    "tagid_history_len": all_tag_len,
})
assert len(history_features) == len(data), "tag features are not aligned with data rows"
new_data = pd.concat([data, history_features], axis=1)

# Label-encode the categorical columns.
EMBEDDING_DIM = 64
num_dict = {}  # number of distinct values per feature
embedding_dim_dict = {}  # embedding dimension per feature
sparse_features = ["gender", "age", "province", "city"]
target = ['label']
# Missing categories get their own value (-1) before encoding.
new_data[sparse_features] = new_data[sparse_features].fillna(-1)
for feat in sparse_features:
    lbe = LabelEncoder()
    new_data[feat] = lbe.fit_transform(new_data[feat])
    num_dict[feat] = len(lbe.classes_)
    embedding_dim_dict[feat] = EMBEDDING_DIM
num_dict['tagid_history'] = len(taglbe.classes_)
embedding_dim_dict['tagid_history'] = EMBEDDING_DIM

# Rows with a label form the training set; rows without one are the test set.
# Explicit copies keep later column assignments on X_train / X_test from operating
# on slices of new_data (pandas chained-assignment pitfall).
has_label = new_data['label'].notna()
X_train = new_data[has_label].copy()
X_test = new_data[~has_label].copy()
y = X_train['label']

In [None]:
# 5-fold stratified cross-validation: train one deepFM per fold, collect out-of-fold
# predictions for the training rows and accumulate test predictions across folds.
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
oof_nn = np.zeros(len(X_train))
# NOTE: predictions_nn holds the SUM of the per-fold test probabilities (not the mean),
# so any decision threshold applied to it must be scaled by the number of folds.
predictions_nn = np.zeros((len(X_test)))

# Feature groups are the same for every fold, so they are declared once.
sparse_features = ["gender", "age", "province", "city"]
dense_features = ["tagid_history_len"]
varlen_features = ["tagid_history", "tagid_weight"]
target = ['label']


def build_model_input(frame):
    """Return the model's input dict (feature name -> column / 2-D array) for ``frame``."""
    model_input = {name: frame[name] for name in sparse_features + dense_features}
    for name in varlen_features:
        # Each cell holds a fixed-length list; stack them into one 2-D array.
        model_input[name] = np.array(frame[name].values.tolist())
    return model_input


# The test input does not depend on the fold, so it is built a single time.
test_model_input = build_model_input(X_test)

# StratifiedKFold only needs y to build the splits; a zero placeholder for X avoids
# materialising X_train.values (a large object array) just to have it ignored.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(np.zeros(len(y)), y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:', trn_idx)
    print('val_idx:', val_idx)

    trn_data = X_train.iloc[trn_idx]
    val_data = X_train.iloc[val_idx]
    trn_model_input = build_model_input(trn_data)
    val_model_input = build_model_input(val_data)

    model = deepFM(num_dict, embedding_dim_dict, tag_history_max_len=50, dnn_hidden_units=(256, 128),
                   l2_reg_linear=0.00001, l2_reg_embedding=1e-2, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
                   dnn_activation='relu', dnn_use_bn=False, task='binary')
    # Name the AUC metric explicitly: the log key produced for the plain string "AUC"
    # depends on the TensorFlow version (it can come out as 'auc', 'auc_1', ...), and
    # EarlyStopping silently never triggers when 'val_AUC' is missing from the logs.
    model.compile('adagrad', "binary_crossentropy",
                  metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='AUC')])
    es = EarlyStopping(monitor='val_AUC', patience=2, restore_best_weights=True, mode="max")
    history = model.fit(trn_model_input, trn_data[target].values, batch_size=1024, epochs=100,
                        verbose=2, validation_data=(val_model_input, val_data[target].values),
                        callbacks=[es])
    oof_nn[val_idx] = model.predict(val_model_input, batch_size=128).reshape(-1)
    predictions_nn[:] += model.predict(test_model_input, batch_size=128).reshape(-1)

# Out-of-fold evaluation; hard labels use a 0.5 probability cut-off.
oof_labels = (oof_nn >= 0.5).astype(int)
print("AUC score: {}".format(roc_auc_score(y, oof_nn)))
print("F1 score: {}".format(f1_score(y, oof_labels)))
print("Precision score: {}".format(precision_score(y, oof_labels)))
print("Recall score: {}".format(recall_score(y, oof_labels)))


fold n°0
trn_idx: [     0      1      2 ... 299997 299998 299999]
val_idx: [     3      9     14 ... 299992 299993 299994]
Train on 240000 samples, validate on 60000 samples
Epoch 1/100
240000/240000 - 21s - loss: 0.5994 - binary_crossentropy: 0.5809 - AUC: 0.7646 - val_loss: 0.5820 - val_binary_crossentropy: 0.5624 - val_AUC: 0.7818
Epoch 2/100
240000/240000 - 19s - loss: 0.5713 - binary_crossentropy: 0.5511 - AUC: 0.7925 - val_loss: 0.5763 - val_binary_crossentropy: 0.5564 - val_AUC: 0.7877
Epoch 3/100
240000/240000 - 20s - loss: 0.5620 - binary_crossentropy: 0.5415 - AUC: 0.8012 - val_loss: 0.5766 - val_binary_crossentropy: 0.5566 - val_AUC: 0.7899
Epoch 4/100
240000/240000 - 19s - loss: 0.5552 - binary_crossentropy: 0.5343 - AUC: 0.8077 - val_loss: 0.5723 - val_binary_crossentropy: 0.5517 - val_AUC: 0.7913
Epoch 5/100
240000/240000 - 19s - loss: 0.5493 - binary_crossentropy: 0.5279 - AUC: 0.8133 - val_loss: 0.5746 - val_binary_crossentropy: 0.5533 - val_AUC: 0.7915
Epoch 6/100
2400

In [None]:
# predictions_nn is the SUM of the per-fold probabilities, so the 0.5 decision boundary
# is scaled by the number of folds (5 folds -> 2.5) instead of hard-coding the product.
vote_threshold = 0.5 * KF.get_n_splits()
# assign() builds a new frame rather than writing columns into a slice of new_data.
X_test = X_test.assign(
    category_id=(predictions_nn >= vote_threshold).astype(int),
    user_id=X_test['pid'],
)
X_test[['user_id', 'category_id']].to_csv('nn.csv', index=False)