# Imports

In [7]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

# Config 

In [20]:
# Location of the input dataset. Despite the `_dir` suffix, this is the path
# of a single file (it is handed directly to pd.read_csv), not a directory.
data_dir = '../data/data.txt'

In [27]:
# Load the raw dataset into a DataFrame using read_csv's default parsing
# options (no separator, header or dtype overrides are given).
data = pd.read_csv(data_dir)


In [28]:

# Column groups used by the rest of the notebook: the label column, the 26
# sparse fields C1..C26 (label-encoded later) and the 13 dense fields I1..I13
# (min-max scaled later).
target = ['label']
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]

# Impute missing values: the string '-1' stands in for a missing sparse value
# and 0 for a missing dense value.
data[sparse_features] = data[sparse_features].fillna(value='-1')
data[dense_features] = data[dense_features].fillna(value=0)

In [29]:
# 1. Encode sparse features and rescale dense features.
#
# NOTE(review): both transformers are fit on the whole `data` frame, i.e.
# before the train/test split performed later in this notebook, so rows that
# end up in the test split contribute to the encoding and scaling statistics.

# Each sparse column gets its own encoder, which replaces the raw values with
# integer ids. `lbe` is rebound on every iteration, so after the loop it
# refers to the encoder of the last sparse column only.
for feat in sparse_features:
    lbe = LabelEncoder().fit(data[feat])
    data[feat] = lbe.transform(data[feat])

# All dense columns are rescaled to the [0, 1] range by a single scaler.
mms = MinMaxScaler(feature_range=(0, 1)).fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [30]:
# 2. Describe every input field as a DeepCTR feature column.
#
# Sparse fields become SparseFeat columns with a 4-dimensional embedding.
# Their vocabulary size is the largest id found in the column plus one, which
# assumes the column has already been label-encoded to consecutive ids
# starting at 0 (done by the encoding cell earlier in this notebook).
# The size is cast to a built-in int so that a numpy integer scalar does not
# end up in the model configuration (numpy scalars are not JSON-serializable).
#
# Dense fields become DenseFeat columns of dimension 1.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=4)
    for feat in sparse_features
] + [
    DenseFeat(feat, 1)
    for feat in dense_features
]

# The linear and DNN parts of the model share the very same column list
# (both names are aliases of `fixlen_feature_columns`, not copies).
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Names of the model inputs, used later to build the input dictionaries.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [31]:
# Sanity check: one column per field is expected, i.e. 26 sparse + 13 dense = 39.
len(fixlen_feature_columns)

39

In [34]:
# 3. Split the data and build the model inputs.

# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the split reproducible.
train, test = train_test_split(data, random_state=2020, test_size=0.2)


def _to_model_input(frame):
    """Return a dict mapping each name in `feature_names` to that column of `frame`."""
    return {name: frame[name] for name in feature_names}


train_model_input = _to_model_input(train)
test_model_input = _to_model_input(test)

In [18]:
# 4. Define the model (training, prediction and evaluation follow in the next cell).
# The same feature columns feed both the linear and the DNN inputs of DeepFM,
# which is configured for a binary task.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

# Binary cross-entropy is used both as the training loss and as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)


In [19]:
# Train for 10 epochs with batches of 256 rows; validation_split=0.2 sets
# aside a fifth of the training rows for per-epoch validation.
train_labels = train[target].values
history = model.fit(
    train_model_input,
    train_labels,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Score the held-out test split and report both metrics rounded to 4 decimals.
test_labels = test[target].values
pred_ans = model.predict(test_model_input, batch_size=256)

test_logloss = round(log_loss(test_labels, pred_ans), 4)
print("test LogLoss", test_logloss)

test_auc = round(roc_auc_score(test_labels, pred_ans), 4)
print("test AUC", test_auc)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 128 samples, validate on 32 samples
Epoch 1/10
128/128 - 6s - loss: 0.7748 - binary_crossentropy: 0.7748 - val_loss: 0.7587 - val_binary_crossentropy: 0.7587
Epoch 2/10
128/128 - 0s - loss: 0.7503 - binary_crossentropy: 0.7503 - val_loss: 0.7457 - val_binary_crossentropy: 0.7456
Epoch 3/10
128/128 - 0s - loss: 0.7294 - binary_crossentropy: 0.7294 - val_loss: 0.7324 - val_binary_crossentropy: 0.7324
Epoch 4/10
128/128 - 0s - loss: 0.7089 - binary_crossentropy: 0.7088 - val_loss: 0.7183 - val_binary_crossentropy: 0.7183
Epoch 5/10
128/128 - 0s - loss: 0.6882 - binary_crossentropy: 0.6882 - val_loss: 0.7037 - val_binary_crossentropy: 0.7037
Epoch 6/10
128/128 - 0s - loss: 0.6673 - binary_crossentropy: 0.6673 - val_loss: 0.6884 - val_binary_crossentropy: 0.6884
Epoch 7/10
128/128 - 0s - loss: 0.6456 - binary_crossentropy: 0.6456 - val_loss: 0.6728 - val_binary_crossentropy: 0.6728
Epoch 8/10
128/128 - 0s - loss: 0.6234 - binary_crossentropy: 0.6234 - val_loss: 0.6572 - val_binary_