In [74]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from DataReader import FeatureDictionary, DataParser
import numpy as np
import pandas as pd
import config

In [75]:
#pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple

In [123]:
class NFM(keras.Model):
    def __init__(self,feature_size,field_size,embedding_size,dropout_rate=0.2):
        super(NFM,self).__init__()
        self.feature_size = feature_size
        self.embedding_size = embedding_size
        self.field_size = field_size
        self.dropout_rate = dropout_rate
        self.loss_type = 'logloss'
        
        ## init weights
        ## globle bias
        self.bias = tf.Variable(tf.constant([0.1]),name='glable_bias')

        self.feature_embeddings = tf.Variable(
            tf.random.normal([self.feature_size,self.embedding_size],0.0,0.01),
            name='weight_embeddings')
        
        #self.feature_embeddings = layers.Embedding(self.feature_size,self.embedding_size,
        #                                           embeddings_initializer='uniform')
        self.weight_first = tf.Variable(tf.random.normal([self.feature_size,1],0.0,1.0),name='weight_firstorder')
        
  
        ##
        self.dense_1 = layers.Dense(32,activation='relu')
        self.dense_2 = layers.Dense(32,activation='relu')
        self.dense_out = layers.Dense(1,use_bias=False)
        
    def call(self,inputs_index,inputs_value):
        ## calc embedding
        self.embedding = tf.nn.embedding_lookup(self.feature_embeddings,inputs_index)
        inputs_value = tf.reshape(inputs_value,[-1,self.field_size,1])
        self.embedding = tf.multiply(self.embedding,inputs_value)  ## (N,F,K)
        #print('self.embedding',self.embedding)
        
        ## first order term
        first_order = tf.nn.embedding_lookup(self.weight_first,inputs_index)  ## (N,F,1)
        first_order = tf.reduce_sum(tf.multiply(first_order,inputs_value),axis=2)  ##(N,F)
        #print('first_order*inputs_value',first_order)
        
        ## second order term 
        sum_feature_emb = tf.reduce_sum(self.embedding,axis=1)  ##(N,K)
        sum_feature_emb_squre = tf.square(sum_feature_emb)
        #print('sum_feature_emb',sum_feature_emb)
        #print('sum_feature_emb_squre',sum_feature_emb_squre)
        square_feature_emb = tf.square(self.embedding)
        square_feature_emb_sum = tf.reduce_sum(square_feature_emb,axis=1) ##(N,K)
        #print('square_feature_emb',square_feature_emb)
        #print('square_feature_emb_sum',square_feature_emb_sum)
        
        second_order = 0.5 * tf.subtract(sum_feature_emb_squre,square_feature_emb_sum)
        #print('second_order',second_order)

        second_order = tf.reshape(second_order,[-1,self.embedding_size]) ##(N,K)
        #print('second_order',second_order)
        deep_feature = self.dense_1(second_order)
        layers.Dropout(self.dropout_rate)
        deep_feature = self.dense_2(deep_feature)
        layers.Dropout(self.dropout_rate)
        deep_out = self.dense_out(deep_feature)  ##(N,1)
  
        out = tf.add_n([tf.reduce_sum(first_order,axis=1,keepdims=True),
                             deep_out,
                             self.bias * tf.ones_like(deep_out)],name='out_nfm')
        
        return out
    

In [78]:
def load_data():
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        cols = [c for c in df.columns if c not in ['id','target']]
        #df['missing_feat'] = np.sum(df[df[cols]==-1].values,axis=1)
        df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
        df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ['id','target']]
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]

    X_train = dfTrain[cols].values
    y_train = dfTrain['target'].values

    X_test = dfTest[cols].values
    ids_test = dfTest['id'].values

    cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices

In [79]:
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
#dfTrain.describe()

In [81]:
print('X_train:',X_train.shape)
print('X_test:',X_test.shape)

X_train: (10000, 39)
X_test: (2000, 39)


In [82]:
fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
# Xi_train ：列的序号
# Xv_train ：列的对应的值
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)

#print(dfTrain.dtypes)
nfm_params = dict()
embedding_size = 8
nfm_params['feature_size'] = fd.feat_dim
nfm_params['field_size'] = len(Xi_train[0])
nfm_params['embedding_size'] = embedding_size
print('feature_size',fd.feat_dim)
print('field_size',len(Xi_train[0]))
print('embedding_size',embedding_size)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat([dfTrain,dfTest])


feature_size 256
field_size 39
embedding_size 8


In [124]:
nfm_model = NFM(**nfm_params)
learning_rate = 0.001
#optimizer = tf.optimizers.Adam(learning_rate)
adam = tf.keras.optimizers.Adam(learning_rate)

In [117]:
def nfm_loss(y_pred,y,loss_type='logloss'): 
    #print(loss)
    if loss_type == 'logloss':
        y_pred = tf.reshape(tf.nn.sigmoid(y_pred),[-1,1])
        y = tf.reshape(y,[-1,1])
        loss = tf.compat.v1.losses.log_loss(y,y_pred)
    elif loss_type == 'mse':
        loss = tf.reduce_mean(tf.keras.losses.MSE(y_true=y,y_pred=y_pred))
    return loss

In [118]:
def train_step(train_x,train_y):
    with tf.GradientTape() as tape:  
        x_i, x_v  = train_x
        y_pred = nfm_model(x_i,x_v)
        loss = nfm_loss(y_pred,train_y)
        
    train_vars =  nfm_model.trainable_variables
    grad = tape.gradient(loss,train_vars)
    adam.apply_gradients(zip(grad,train_vars))        

In [119]:
def accuracy(y_pred,y):
    y_pred = tf.cast(y_pred > 0.5,dtype=tf.int32)
    pre_cls = tf.equal(y_pred,tf.cast(y,tf.int32))
    acc = tf.reduce_mean(tf.cast(pre_cls,tf.float32))
    return acc

In [127]:
batch_size = 512
train_data = tf.data.Dataset.from_tensor_slices(((Xi_train,Xv_train),y_train))
train_data = train_data.shuffle(10000).repeat().batch(batch_size)
training_steps = 1000

In [128]:
## RUN Training
display_step = 200
for step , (batch_x,batch_y) in enumerate(train_data.take(training_steps),1):
    train_step(batch_x,batch_y) 
    if step % display_step == 0:
        x_i, x_v = batch_x
        y_pred = nfm_model(x_i,x_v)
        cur_loss = nfm_loss(y_pred,batch_y)
        acc = accuracy(y_pred,batch_y)
        print("step: %i, loss: %f, acc: %f" % (step, cur_loss,acc))

step: 200, loss: 0.015959, acc: 0.950478
step: 400, loss: 0.053189, acc: 0.903404
step: 600, loss: 0.025132, acc: 0.928505
step: 800, loss: 0.031355, acc: 0.932091
step: 1000, loss: 0.019656, acc: 0.923111


In [55]:
nfm_model.save_weights()

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [60]:
arr_y = np.array(y_train)
idx = np.where(arr_y)
print(idx)

(array([   9,   19,   28,   39,   41,   65,   87,  108,  191,  224,  249,
        251,  264,  266,  293,  297,  305,  328,  339,  380,  384,  460,
        518,  528,  535,  559,  610,  631,  640,  647,  688,  702,  845,
        858,  864,  925, 1019, 1030, 1039, 1103, 1106, 1136, 1163, 1166,
       1168, 1185, 1263, 1279, 1352, 1362, 1370, 1387, 1404, 1407, 1417,
       1464, 1466, 1569, 1586, 1607, 1630, 1648, 1661, 1694, 1760, 1768,
       1771, 1777, 1781, 1823, 1840, 1847, 1882, 1886, 1958, 1980, 2005,
       2025, 2031, 2090, 2139, 2168, 2203, 2231, 2234, 2244, 2248, 2299,
       2308, 2343, 2369, 2399, 2433, 2437, 2474, 2483, 2537, 2579, 2583,
       2584, 2596, 2612, 2688, 2714, 2750, 2759, 2781, 2813, 2827, 2886,
       2945, 2959, 2964, 3015, 3084, 3108, 3122, 3173, 3208, 3231, 3273,
       3298, 3317, 3340, 3355, 3420, 3426, 3441, 3442, 3449, 3462, 3534,
       3558, 3567, 3639, 3679, 3696, 3797, 3810, 3824, 3889, 3901, 3930,
       3971, 3977, 3981, 4005, 4012, 4029, 4073, 4

In [None]:
print(model.feature_embeddings)
print(model.weight_first)

In [None]:
test_embeddings = layers.Embedding(5,3,embeddings_initializer='uniform')
#test_embeddings.numpy()
test_embeddings

In [None]:
index = tf.constant([[0,1]],dtype = tf.int32)
print(test_embeddings(index))

In [None]:
index = tf.constant([[0,1],[1,2]],dtype = tf.int32)
print(test_embeddings(index))

In [None]:
w = tf.Variable(tf.random.uniform([2,3]))
v = tf.Variable(tf.random.uniform([3]))
w_res = tf.reshape(w,[2,3,1])
v_res = tf.reshape(v,[1,3,1])


In [None]:
res = tf.reduce_sum(tf.multiply(w_res,v_res),axis=2)
res.shape