In [1]:
import math
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from datetime import datetime
from deepctr_torch.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# Column layout of the raw, header-less files. The test file carries the same
# columns as the training file except for 'label'.
train_columns = ['pid', 'label', 'gender', 'age', 'tagid',
                 'time', 'province', 'city', 'make', 'model']
test_columns = [col for col in train_columns if col != 'label']

train = pd.read_csv('data/train.txt', header=None, names=train_columns)
test = pd.read_csv('data/test.txt', header=None, names=test_columns)

# Stack train on top of test under a fresh 0..N-1 index; the test rows get NaN
# in 'label' because that column is absent from their file.
data = pd.concat([train, test], ignore_index=True)

In [3]:
# Sort each row's tag ids by timestamp (newest first), keep the 50 most recent
# and right-pad with 0 when a row has fewer than 50.
import ast

MAX_TAG_LEN = 50  # length of the fixed-size tag history fed to the model
PAD_TAG = 0       # filler appended to histories shorter than MAX_TAG_LEN


def _parse_list(value):
    """Parse a stringified list literal; already-parsed values pass through.

    ast.literal_eval only accepts Python literals, so unlike eval() a malformed
    or malicious field in the input file cannot execute code. The isinstance
    guard makes re-running this cell harmless once the columns hold lists.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


data['tagid'] = data['tagid'].apply(_parse_list)
data['time'] = data['time'].apply(_parse_list)

all_tag_id = []
# Walk the two columns in lockstep instead of doing a data.loc lookup per row.
for tagid_list, time_list in tqdm(zip(data['tagid'], data['time']), total=data.shape[0]):
    tagid_list = np.array(tagid_list)
    # argsort is ascending, so reverse it to get newest-first positions.
    index = np.argsort(np.array(time_list))[::-1][:MAX_TAG_LEN]
    sort_tagid_list = tagid_list[index]
    all_tag_id.extend(sort_tagid_list.tolist() + [PAD_TAG] * (MAX_TAG_LEN - len(index)))

# Encode every tag id (padding included) into a dense integer vocabulary and
# fold the flat list back into one fixed-length row per sample.
# NOTE(review): the pad value is encoded together with the real tags; it only
# maps to index 0 if no real tag id is smaller than PAD_TAG — confirm on the data.
taglbe = LabelEncoder()
new_tag_id = taglbe.fit_transform(all_tag_id).reshape(-1, MAX_TAG_LEN)
new_df = pd.DataFrame({"tagid_history": new_tag_id.tolist()})
# Column-wise concat lines up by index; data carries a 0..N-1 index, as does new_df.
new_data = pd.concat([data, new_df], axis=1)

100%|██████████| 400000/400000 [00:25<00:00, 15957.21it/s]


In [4]:
# Categorical columns used as sparse model features, and the label column.
sparse_features = ["gender", "age", "province", "city"]
target = ['label']

# Integer-encode each categorical column; missing values are first replaced
# by -1 so that they form a class of their own.
for feat in sparse_features:
    lbe = LabelEncoder()
    new_data[feat] = lbe.fit_transform(new_data[feat].fillna(-1))

In [18]:
# 5-fold stratified cross-validation of DeepFM. Fills oof_nn with out-of-fold
# predictions for the labelled rows and accumulates test predictions in
# predictions_nn.
N_SPLITS = 5
CKPT_PATH = 'model.ckpt'

# Labelled rows form the training set; rows whose label is NaN are the test rows.
# .copy() detaches both frames from new_data so that later column assignments
# write to an independent frame rather than to a slice.
X_train = new_data[~new_data['label'].isna()].copy()
X_test = new_data[new_data['label'].isna()].copy()
y = X_train['label']
KF = StratifiedKFold(n_splits=N_SPLITS, random_state=2020, shuffle=True)
oof_nn = np.zeros(len(X_train))
predictions_nn = np.zeros((len(X_test)))

# Feature definitions, test inputs and the device do not depend on the fold,
# so they are built once instead of on every iteration.
sparse_features = ["gender", "age", "province", "city"]
target = ['label']
fixlen_feature_columns = [SparseFeat(feat, new_data[feat].nunique()+5, embedding_dim=64)
                          for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('tagid', vocabulary_size=len(
    taglbe.classes_) + 1, embedding_dim=64), maxlen=50, combiner='mean')]
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)


def _build_model_input(frame):
    """Map every model feature name to its input values for the rows of `frame`."""
    model_input = {name: frame[name] for name in feature_names}
    # The raw 'tagid' column holds variable-length lists; the model consumes the
    # fixed-length encoded history instead, stacked into a 2-D array.
    model_input["tagid"] = np.array(frame['tagid_history'].values.tolist())
    return model_input


test_model_input = _build_model_input(X_test)

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# Five-fold cross-validation. StratifiedKFold derives the folds from y alone,
# so a zero placeholder stands in for X and avoids materialising X_train.values.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(np.zeros(len(y)), y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:', trn_idx)
    print('val_idx:', val_idx)

    trn_data = X_train.iloc[trn_idx]
    val_data = X_train.iloc[val_idx]
    trn_model_input = _build_model_input(trn_data)
    val_model_input = _build_model_input(val_data)

    # A fresh model per fold so that no weights leak between folds.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128),
                   l2_reg_embedding=1e-2, task='binary', device=device)
    model.compile("adagrad", "binary_crossentropy",
                  metrics=["binary_crossentropy", "auc"])
    es = EarlyStopping(monitor='val_auc', min_delta=0,
                       verbose=1, patience=1, mode='max')
    # Only the weights are saved so they can be restored with load_state_dict below.
    mdckpt = ModelCheckpoint(filepath=CKPT_PATH, monitor='val_auc', verbose=1,
                             save_best_only=True, save_weights_only=True, mode='max')
    history = model.fit(trn_model_input, trn_data[target].values, batch_size=1024, epochs=10,
                        verbose=2, validation_data=(val_model_input, val_data[target].values),
                        callbacks=[es, mdckpt])

    # Early stopping fires one epoch after the best one, so the in-memory weights
    # are not the best ones seen. Restore the checkpoint written at the best
    # val_auc before predicting.
    model.load_state_dict(torch.load(CKPT_PATH, map_location=device))

    oof_nn[val_idx] = model.predict(val_model_input, 128).reshape(-1)
    # Summed, not averaged, over the folds: values lie in [0, N_SPLITS], and the
    # submission cell thresholds on that summed scale (2.5 for five folds).
    predictions_nn[:] += model.predict(test_model_input, 128).reshape(-1)

# Out-of-fold metrics; the hard labels use a 0.5 probability cut-off.
oof_label = (oof_nn >= 0.5).astype(int)
print("AUC score: {}".format(roc_auc_score(y, oof_nn)))
print("F1 score: {}".format(f1_score(y, oof_label)))
print("Precision score: {}".format(precision_score(y, oof_label)))
print("Recall score: {}".format(recall_score(y, oof_label)))

fold n°0
trn_idx: [     0      1      2 ... 299997 299998 299999]
val_idx: [     3      9     14 ... 299992 299993 299994]
cuda ready...
cuda:1
Train on 240000 samples, validate on 60000 samples, 235 steps per epoch
Epoch 1/10
5s - loss:  0.5817 - binary_crossentropy:  0.5752 - auc:  0.7689 - val_binary_crossentropy:  0.5559 - val_auc:  0.7878
Epoch 00001: val_auc improved from -inf to 0.78783, saving model to model.ckpt
Epoch 2/10
6s - loss:  0.5434 - binary_crossentropy:  0.5329 - auc:  0.8099 - val_binary_crossentropy:  0.5525 - val_auc:  0.7922
Epoch 00002: val_auc improved from 0.78783 to 0.79215, saving model to model.ckpt
Epoch 3/10
7s - loss:  0.5259 - binary_crossentropy:  0.5126 - auc:  0.8269 - val_binary_crossentropy:  0.5534 - val_auc:  0.7920
Epoch 00003: val_auc did not improve from 0.79215
Epoch 00003: early stopping
fold n°1
trn_idx: [     0      1      2 ... 299996 299998 299999]
val_idx: [    10     13     25 ... 299986 299988 299997]
cuda ready...
cuda:1
Train on 24

In [19]:
# predictions_nn holds the *sum* of the per-fold probabilities, so a mean
# probability of 0.5 corresponds to 0.5 * n_folds on the summed scale
# (2.5 for the five folds configured in KF).
vote_threshold = 0.5 * KF.get_n_splits()

# assign() returns a new frame, so nothing is written into a slice of new_data.
# X_test is rebound so that it still carries both columns afterwards.
X_test = X_test.assign(
    category_id=(predictions_nn >= vote_threshold).astype(int),
    user_id=X_test['pid'],
)
X_test[['user_id', 'category_id']].to_csv('nn.csv', index=False)

In [8]:
# Model inputs built from every labelled row: one Series per sparse feature,
# plus the encoded tag history stacked into a 2-D array under the "tagid" key.
tag_history_matrix = np.array(X_train['tagid_history'].values.tolist())
model_input = {
    **{feat: X_train[feat] for feat in sparse_features},
    "tagid": tag_history_matrix,
}


In [11]:
# Train a single DeepFM on all labelled rows, holding out 10% for validation.

# One fixed-length embedding column per categorical feature; the +5 leaves
# head-room above the number of distinct encoded values.
fixlen_feature_columns = [SparseFeat(feat, new_data[feat].nunique()+5, embedding_dim=64)
                            for feat in sparse_features]
# Variable-length tag history: 50 tag ids per row, embeddings mean-pooled.
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('tagid', vocabulary_size=len(
        taglbe.classes_) + 1, embedding_dim=64), maxlen=50, combiner='mean')]
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    # Keep using the second GPU when there is one, but fall back to the first
    # on single-GPU hosts, where 'cuda:1' would be an invalid device ordinal.
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'

model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 128),
               l2_reg_embedding=1e-2, task='binary', device=device)

model.compile("adagrad", "binary_crossentropy", metrics=["binary_crossentropy", "auc"])
history = model.fit(model_input, X_train[target].values, batch_size=1024, epochs=10,
                    verbose=2, validation_split=0.1)


cuda ready...
cuda:1
Train on 270000 samples, validate on 30000 samples, 264 steps per epoch
Epoch 1/10
9s - loss:  0.5792 - binary_crossentropy:  0.5726 - auc:  0.7720 - val_binary_crossentropy:  0.5555 - val_auc:  0.7883
Epoch 2/10
9s - loss:  0.5422 - binary_crossentropy:  0.5316 - auc:  0.8111 - val_binary_crossentropy:  0.5511 - val_auc:  0.7926


KeyboardInterrupt: 

In [None]:
# NOTE(review): bare literal in a cell that was never executed; nothing in the
# notebook reads it. Presumably a scratch note — confirm and delete.
7912