In [1]:
import sys

sys.path.append("/home/xmh/DeepCTR-Torch")

# -*- coding: utf-8 -*-
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
import numpy as np

In [None]:
def load_criteo(datapath=None):
    """Load the preprocessed Criteo Kaggle arrays and return them as one DataFrame.

    The ``.npz`` archive must contain the arrays ``X_int``, ``X_cat``, ``y`` and
    ``counts``.  ``X_int`` is transformed with ``log(x + 1)`` and cast to
    float32, ``X_cat`` is cast to int32, and both are joined with ``y`` into a
    single frame whose columns are ``I1..I13``, ``C1..C26`` and ``label``.

    Parameters
    ----------
    datapath : str or path-like, optional
        Location of the ``.npz`` archive.  When omitted, the two hard-coded
        directories below are probed in order (the second one points at the
        ``kaggle_processed_tiny.npz`` file instead of the full one).

    Returns
    -------
    pandas.DataFrame
        The 13 ``I*`` columns, the 26 ``C*`` columns and ``label``, in that order.

    Raises
    ------
    FileNotFoundError
        If ``datapath`` is omitted and neither default directory exists.
    """
    # Data processing code adapted from https://github.com/facebookresearch/dlrm
    # Follow steps in https://github.com/ylongqi/dlrm/blob/master/data_utils.py to generate kaggle_processed.npz
    # Or using `./download_dataset.sh criteo` command to download the processed data.
    import os
    import numpy as np

    if datapath is None:
        # Only the directory is probed, not the archive itself; a missing file
        # surfaces later as an error from np.load.
        if os.path.exists("/home/web_server/"):
            datapath = "/home/web_server/" + 'kaggle_processed.npz'
        elif os.path.exists("/home/xmh/kuiba/data/"):
            datapath = "/home/xmh/kuiba/data/" + 'kaggle_processed_tiny.npz'
        else:
            raise FileNotFoundError("dataset not found")

    with np.load(datapath) as data:
        X_int = data["X_int"]
        X_cat = data["X_cat"]
        y = data["y"]
        counts = data["counts"]

    print("[xmh] counts is ", counts)
    X_cat = X_cat.astype(np.int32)
    X_int = np.log(X_int + 1).astype(np.float32)

    int_df = pd.DataFrame(X_int, columns=['I' + str(i) for i in range(1, 14)])
    cat_df = pd.DataFrame(X_cat, columns=['C' + str(i) for i in range(1, 27)])
    target_df = pd.DataFrame(y, columns=['label'])
    df = pd.concat([int_df, cat_df, target_df], axis=1)
    # The original fell off the end here and returned None, discarding the frame.
    return df

In [5]:
# NOTE(review): no cell visible in this notebook assigns `df` at module level;
# it has to exist in the kernel before this cell runs (presumably it is meant
# to be the frame assembled by load_criteo() -- confirm).
# `data` is an alias of `df`, not a copy, so the transforms below also modify `df`.
data = df
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

# Fill missing values: the string '-1' for sparse fields, 0 for dense fields.
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    # A fresh encoder per field: each sparse column gets its own integer codes.
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# Rescale every dense column to [0, 1]; the scaler is fitted on the full frame,
# i.e. before the train/test split below.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                          for feat in dense_features]

# The same column list is used for both the DNN part and the linear part.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

# Fixed seed so that the 80/20 partition -- and therefore every metric reported
# downstream -- is reproducible across kernel restarts.
SPLIT_SEED = 2020
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Model inputs are dicts mapping each feature name to its column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}



In [6]:
# 4. Build the DeepFM model, train it for one epoch, then score the held-out split.

# Use the first GPU only when CUDA is both requested and actually available;
# otherwise stay on the CPU.
use_cuda = True
if not (use_cuda and torch.cuda.is_available()):
    device = 'cpu'
else:
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)

# Optimizer, loss and the metrics reported during training.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy', 'AUC'],
    optimizer_sparse=None,
    optimizer_dense_lr=0.001,
    optimizer_sparse_lr=0.001,
)

# One pass over the training split, with 1% of it held out for validation.
model.fit(
    train_model_input,
    train[target].values,
    batch_size=32,
    epochs=1,
    validation_split=0.01,
    verbose=1,
)

# Predict on the test split in batches of 256 and report both metrics,
# rounded to four decimals.
pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))


0it [00:00, ?it/s]

cpu
Train on 36305771 samples, validate on 366725 samples, 1134556 steps per epoch


1it [00:04,  4.31s/it]

Iter1


2it [00:40, 13.76s/it]

1s - loss:  1.3579 - binary_crossentropy:  1.3579 - auc:  1.0240 - val_binary_crossentropy:  0.6157 - val_auc:  0.5531


101it [01:39,  1.66it/s]

Iter101


102it [02:20, 12.75s/it]

101s - loss:  0.5527 - binary_crossentropy:  0.5527 - auc:  0.6506 - val_binary_crossentropy:  0.5347 - val_auc:  0.6717


201it [03:20,  1.76it/s]

Iter201


202it [03:53, 10.33s/it]

201s - loss:  0.5452 - binary_crossentropy:  0.5452 - auc:  0.6661 - val_binary_crossentropy:  0.5241 - val_auc:  0.6901


301it [04:52,  1.66it/s]

Iter301


302it [05:26, 10.49s/it]

301s - loss:  0.5344 - binary_crossentropy:  0.5344 - auc:  0.6785 - val_binary_crossentropy:  0.5201 - val_auc:  0.6976


401it [06:24,  1.62it/s]

Iter401


402it [06:57, 10.45s/it]

401s - loss:  0.5300 - binary_crossentropy:  0.5300 - auc:  0.6868 - val_binary_crossentropy:  0.5163 - val_auc:  0.7054


501it [07:54,  1.75it/s]

Iter501


502it [08:27, 10.22s/it]

501s - loss:  0.5279 - binary_crossentropy:  0.5279 - auc:  0.6914 - val_binary_crossentropy:  0.5136 - val_auc:  0.7106


601it [09:21,  1.92it/s]

Iter601


602it [09:53, 10.20s/it]

601s - loss:  0.5254 - binary_crossentropy:  0.5254 - auc:  0.6942 - val_binary_crossentropy:  0.5115 - val_auc:  0.7138


701it [10:49,  1.64it/s]

Iter701


702it [11:23, 10.66s/it]

701s - loss:  0.5228 - binary_crossentropy:  0.5228 - auc:  0.6952 - val_binary_crossentropy:  0.5113 - val_auc:  0.7166


801it [12:21,  1.69it/s]

Iter801


802it [12:54, 10.38s/it]

801s - loss:  0.5222 - binary_crossentropy:  0.5222 - auc:  0.6991 - val_binary_crossentropy:  0.5118 - val_auc:  0.7195


901it [13:50,  1.66it/s]

Iter901


902it [14:24, 10.52s/it]

901s - loss:  0.5203 - binary_crossentropy:  0.5203 - auc:  0.7010 - val_binary_crossentropy:  0.5086 - val_auc:  0.7213


1001it [15:19,  1.88it/s]

Iter1001


1002it [15:52, 10.36s/it]

1001s - loss:  0.5199 - binary_crossentropy:  0.5199 - auc:  0.7027 - val_binary_crossentropy:  0.5063 - val_auc:  0.7233


1101it [16:45,  1.85it/s]

Iter1101


1102it [17:21, 11.17s/it]

1101s - loss:  0.5172 - binary_crossentropy:  0.5172 - auc:  0.7065 - val_binary_crossentropy:  0.5059 - val_auc:  0.7250


1201it [18:17,  1.70it/s]

Iter1201


1202it [18:49, 10.26s/it]

1201s - loss:  0.5165 - binary_crossentropy:  0.5165 - auc:  0.7076 - val_binary_crossentropy:  0.5041 - val_auc:  0.7266


1301it [19:44,  1.83it/s]

Iter1301


1302it [20:17, 10.26s/it]

1301s - loss:  0.5166 - binary_crossentropy:  0.5166 - auc:  0.7083 - val_binary_crossentropy:  0.5038 - val_auc:  0.7280


1401it [21:13,  1.80it/s]

Iter1401


1402it [21:46, 10.34s/it]

1401s - loss:  0.5155 - binary_crossentropy:  0.5155 - auc:  0.7097 - val_binary_crossentropy:  0.5032 - val_auc:  0.7291


1501it [22:39,  1.75it/s]

Iter1501


1502it [23:19, 12.28s/it]

1501s - loss:  0.5152 - binary_crossentropy:  0.5152 - auc:  0.7104 - val_binary_crossentropy:  0.5018 - val_auc:  0.7306


1601it [24:13,  1.88it/s]

Iter1601


1602it [24:47, 10.39s/it]

1601s - loss:  0.5145 - binary_crossentropy:  0.5145 - auc:  0.7115 - val_binary_crossentropy:  0.5029 - val_auc:  0.7313


1701it [25:40,  1.96it/s]

Iter1701


1702it [26:18, 11.94s/it]

1701s - loss:  0.5147 - binary_crossentropy:  0.5147 - auc:  0.7119 - val_binary_crossentropy:  0.5006 - val_auc:  0.7326


1801it [27:13,  1.85it/s]

Iter1801


1802it [27:46, 10.29s/it]

1801s - loss:  0.5141 - binary_crossentropy:  0.5141 - auc:  0.7127 - val_binary_crossentropy:  0.5003 - val_auc:  0.7333


1901it [28:41,  1.93it/s]

Iter1901


1902it [29:21, 12.45s/it]

1901s - loss:  0.5130 - binary_crossentropy:  0.5130 - auc:  0.7143 - val_binary_crossentropy:  0.4994 - val_auc:  0.7339


2001it [30:15,  1.72it/s]

Iter2001


2002it [30:59, 13.71s/it]

2001s - loss:  0.5127 - binary_crossentropy:  0.5127 - auc:  0.7153 - val_binary_crossentropy:  0.4992 - val_auc:  0.7349


2101it [31:53,  1.80it/s]

Iter2101


2102it [32:37, 13.61s/it]

2101s - loss:  0.5114 - binary_crossentropy:  0.5114 - auc:  0.7170 - val_binary_crossentropy:  0.4988 - val_auc:  0.7351


2201it [33:33,  1.78it/s]

Iter2201


2202it [34:06, 10.22s/it]

2201s - loss:  0.5107 - binary_crossentropy:  0.5107 - auc:  0.7179 - val_binary_crossentropy:  0.4982 - val_auc:  0.7368


2301it [35:03,  1.80it/s]

Iter2301


2301it [35:11,  1.09it/s]


KeyboardInterrupt: 