In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
train_file = "./train.csv"
test_file = "./test.csv"
num_cols = ["ps_reg_01", "ps_reg_02", "ps_reg_03","ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15"]
ignore_cols = ["id", "target", "ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04", "ps_calc_05", 
               "ps_calc_06", "ps_calc_07", "ps_calc_08", "ps_calc_09", "ps_calc_10", "ps_calc_11", 
               "ps_calc_12", "ps_calc_13", "ps_calc_14","ps_calc_15_bin", "ps_calc_16_bin", 
               "ps_calc_17_bin","ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"]

In [3]:
def overview():    
    dfTrain = pd.read_csv(train_file)
    dfTest = pd.read_csv(test_file)
    df = pd.concat([dfTrain,dfTest], sort=False)

    field_size = len(df.columns) - len(ignore_cols)
    feature_dict = {}
    feature_size = 0
    for col in df.columns:
        if col in ignore_cols:
            continue
        elif col in num_cols:
            feature_dict[col] = feature_size
            feature_size += 1
        else:
            unique_val = df[col].unique()
            feature_dict[col] = dict(zip(unique_val,range(feature_size,len(unique_val) + feature_size)))
            feature_size += len(unique_val)
    return dfTrain, field_size, feature_size, feature_dict

In [4]:
def preprocess(train_df, split):
    label_df = train_df[['target']]
    train_df.drop(['target','id'],axis=1,inplace=True)
    feature_idx = train_df.copy()
    feature_val = train_df.copy()
    for col in feature_idx.columns:
        if col in ignore_cols:
            feature_idx.drop(col,axis=1,inplace=True)
            feature_val.drop(col,axis=1,inplace=True)
            continue
        elif col in num_cols:
            feature_idx[col] = feature_dict[col]
        else:
            feature_idx[col] = feature_idx[col].map(feature_dict[col])
            feature_val[col] = 1        
            
    split_idx = feature_idx.shape[0] - round(feature_idx.shape[0]*split)
    train_input = [feature_idx[:split_idx].values, feature_val[:split_idx].values]
    train_y = label_df[:split_idx].values
    validate_input = [feature_idx[split_idx:].values, feature_val[split_idx:].values]
    validate_y = label_df[split_idx:].values

    return train_input, train_y, validate_input, validate_y

In [5]:
dfTrain, field_size, feature_size, feature_dict = overview()

In [6]:
train_input, train_y, validate_input, validate_y = preprocess(dfTrain, 0.1)

In [66]:
train_y.shape

(535691, 1)

In [70]:
train_y_pd = pd.DataFrame(train_y)
train_y_pd.columns = ['label']

In [73]:
true_pd = train_y_pd[train_y_pd['label']==1]
true_pd.shape

(19511, 1)

In [74]:
false_pd = train_y_pd[train_y_pd['label']==0]
false_pd.shape

(516180, 1)

In [75]:
true_pd.shape[0]/(false_pd.shape[0]+true_pd.shape[0])

0.036422116481329724

In [45]:
class DeepFM(tf.keras.Model):
    def __init__(self, cfg):
        super(DeepFM, self).__init__()
        self.feature_size = cfg['feature_size']
        self.field_size = cfg['field_size']
        self.embed_size = cfg['embed_size']
        self.deep_nn = cfg['deep_nn']
        
        self.dropout_fm = cfg['dropout_fm']
        self.dropout_deep = cfg['dropout_deep']
        
        # fm        
        self.feature_weight = tf.keras.layers.Embedding(cfg['feature_size'], 1)
        self.feature_embed = tf.keras.layers.Embedding(cfg['feature_size'], cfg['embed_size'])

        # dnn
        for layer in range(len(cfg['deep_nn'])):
            setattr(self, 'dense_' + str(layer), tf.keras.layers.Dense(self.deep_nn[layer]))
            setattr(self, 'batchNorm_' + str(layer), tf.keras.layers.BatchNormalization())
            setattr(self, 'activation_' + str(layer), tf.keras.layers.Activation('relu'))
            setattr(self, 'dropout_' + str(layer), tf.keras.layers.Dropout(self.dropout_deep))
            
        self.fc = tf.keras.layers.Dense(1, activation='sigmoid', use_bias=True)

    def call(self, inputs, training=True):
        # inputs = [feature_idx, feature_val]
        reshaped_feature_val = tf.cast(tf.reshape(inputs[1], shape=[-1,self.field_size,1]), tf.float32)
        # linear        
        weights = self.feature_weight(inputs[0])
        linear = tf.reduce_sum(tf.multiply(weights,reshaped_feature_val),2)
        
        # fm  
        embeddings = self.feature_embed(inputs[0])
        second_inner = tf.multiply(embeddings,reshaped_feature_val)
        
        summed_features_emb = tf.reduce_sum(second_inner,1)
        summed_features_emb_square = tf.square(summed_features_emb)
        
        squared_features_emb = tf.square(second_inner)
        squared_sum_features_emb = tf.reduce_sum(squared_features_emb,1)
        
        fm = 0.5 * tf.subtract(summed_features_emb_square,squared_sum_features_emb)
        
        # dnn
        y_deep = tf.reshape(embeddings,shape=[-1,self.field_size * self.embed_size])
        for layer in range(0, len(self.deep_nn)):
            y_deep = getattr(self, 'dense_' + str(layer))(y_deep)
            y_deep = getattr(self, 'batchNorm_' + str(layer))(y_deep, training=training)
            y_deep = getattr(self, 'activation_' + str(layer))(y_deep)
            y_deep = getattr(self, 'dropout_' + str(layer))(y_deep, training=training)
            
        # concat
        concat = tf.concat([linear, fm, y_deep], axis=1)                                
        out = self.fc(concat)
        return out

In [46]:
cfg = {
    "feature_size": feature_size,
    "field_size": field_size,
    "embed_size":8,
    "deep_nn":[32,32],
    "dropout_fm": 0,
    "dropout_deep": 0.2,
    "epoch":20,
    "batch":10000
}

In [63]:
model = DeepFM(cfg)
# model.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.compile(optimizer = 'adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['binary_accuracy', 'AUC'])
model.fit(train_input, train_y, epochs=cfg['epoch'], batch_size=cfg['batch'], shuffle=True,
          verbose=1, validation_data=(validate_input, validate_y))

Train on 535691 samples, validate on 59521 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f167238da90>

In [48]:
model.summary()

Model: "deep_fm_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      multiple                  257       
_________________________________________________________________
embedding_9 (Embedding)      multiple                  2056      
_________________________________________________________________
dense_10 (Dense)             multiple                  9504      
_________________________________________________________________
batch_normalization_7 (Batch multiple                  128       
_________________________________________________________________
activation_7 (Activation)    multiple                  0         
_________________________________________________________________
dropout_6 (Dropout)          multiple                  0         
_________________________________________________________________
dense_11 (Dense)             multiple                  10

In [14]:
def prepredict(path):
    predict_df = pd.read_csv(path)
    feature_idx = predict_df.copy()
    feature_val = predict_df.copy()
    for col in feature_idx.columns:
        if col in ignore_cols:
            feature_idx.drop(col,axis=1,inplace=True)
            feature_val.drop(col,axis=1,inplace=True)
            continue
        elif col in num_cols:
            feature_idx[col] = feature_dict[col]
        else:
            feature_idx[col] = feature_idx[col].map(feature_dict[col])
            feature_val[col] = 1 
    return [feature_idx.values, feature_val.values]

In [None]:
# predict_input = prepredict(test_file)
# label_pre = model.predict(predict_input)

In [49]:
validate_input

[array([[  3,   8,  20, ..., 254, 255, 256],
        [  0,   8,  19, ..., 254, 255, 256],
        [  1,   9,  22, ..., 254, 255, 256],
        ...,
        [  1,   9,  24, ..., 254, 255, 256],
        [  2,   8,  19, ..., 254, 255, 256],
        [  3,   9,  23, ..., 254, 255, 256]]),
 array([[1.        , 1.        , 1.        , ..., 0.77539646, 0.39076847,
         3.31662479],
        [1.        , 1.        , 1.        , ..., 0.99036682, 0.32863353,
         3.60555128],
        [1.        , 1.        , 1.        , ..., 0.43593729, 0.35566838,
         0.        ],
        ...,
        [1.        , 1.        , 1.        , ..., 0.59637334, 0.39874804,
         1.73205081],
        [1.        , 1.        , 1.        , ..., 0.76443411, 0.38496753,
         3.16227766],
        [1.        , 1.        , 1.        , ..., 0.9326493 , 0.37802116,
         3.74165739]])]

In [50]:
validate_y[:40]

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [51]:
predict_for_vali = model.predict(validate_input)

In [52]:
predict_for_vali = pd.DataFrame(predict_for_vali)
label_for_vali = pd.DataFrame(validate_y)

In [64]:
predict_for_vali.shape

(59521, 1)

In [65]:
label_for_vali.shape

(59521, 1)

In [53]:
check_pd = pd.concat([predict_for_vali, label_for_vali], axis=1)

In [54]:
check_pd.columns = ['predict', 'true']

In [55]:
check_pd.head(40)

Unnamed: 0,predict,true
0,0.034621,0
1,0.062988,0
2,0.032896,0
3,0.048589,0
4,0.031507,0
5,0.027715,0
6,0.031893,0
7,0.022853,0
8,0.054694,0
9,0.029805,0


In [56]:
check_pd[(check_pd['true'] == 1) & (check_pd['predict'] >= 0.05)]

Unnamed: 0,predict,true
114,0.089955,1
183,0.054906,1
255,0.117817,1
372,0.052466,1
410,0.080825,1
...,...,...
59227,0.053753,1
59277,0.056567,1
59291,0.066664,1
59315,0.078166,1


In [57]:
check_pd[check_pd['true'] == 1]

Unnamed: 0,predict,true
34,0.045300,1
114,0.089955,1
120,0.026723,1
121,0.036562,1
131,0.033138,1
...,...,...
59315,0.078166,1
59355,0.036110,1
59408,0.061828,1
59422,0.026655,1


In [58]:
sort = check_pd.sort_values("predict",ascending=False)

In [59]:
sort[:40]

Unnamed: 0,predict,true
9897,0.329462,0
5962,0.328023,0
51937,0.327398,1
21925,0.307034,1
43358,0.274268,1
28686,0.259295,1
4979,0.254979,1
11241,0.250435,0
9064,0.20433,0
31050,0.188238,0


In [60]:
sort_true = check_pd.sort_values("true",ascending=False)
sort_true[:40]

Unnamed: 0,predict,true
29760,0.032945,1
6630,0.031204,1
6573,0.04075,1
36206,0.068651,1
36212,0.013859,1
53175,0.023155,1
36214,0.043448,1
6594,0.081718,1
36216,0.077886,1
6600,0.038176,1


In [62]:
sort_false = check_pd.sort_values("true",ascending=True)
sort_false[:40]

Unnamed: 0,predict,true
0,0.034621,0
39305,0.025143,0
39306,0.02301,0
39307,0.025231,0
39308,0.015314,0
39309,0.050333,0
39310,0.01762,0
39311,0.02561,0
39312,0.033695,0
39313,0.059581,0
