In [75]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from DataReader import FeatureDictionary, DataParser
import numpy as np
import pandas as pd
import config
#from sklearn.model_selection import KFold,StratifiedKFold

#pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
#folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
#                            random_state=config.RANDOM_SEED).split(X_train, y_train))

In [131]:
class NFM(keras.Model):
    def __init__(self,feature_size,field_size,embedding_size,dropout_rate=0.2):
        super(NFM,self).__init__()
        self.feature_size = feature_size
        self.embedding_size = embedding_size
        self.field_size = field_size
        self.dropout_rate = dropout_rate
        self.loss_type = 'logloss'
        
        ## init weights
        ## globle bias
        self.bias = tf.Variable(tf.constant([0.1]),name='glable_bias')

        self.feature_embeddings = tf.Variable(
            tf.random.normal([self.feature_size,self.embedding_size],0.0,0.01),
            name='weight_embeddings')
        
        #self.feature_embeddings = layers.Embedding(self.feature_size,self.embedding_size,
        #                                           embeddings_initializer='uniform')
        self.weight_first = tf.Variable(tf.random.normal([self.feature_size,1],0.0,1.0),name='weight_firstorder')
        
  
        ##
        self.dense_1 = layers.Dense(32,activation='relu')
        self.dense_2 = layers.Dense(32,activation='relu')
        self.dense_out = layers.Dense(1,use_bias=False)
        
    def call(self,inputs_index,inputs_value):
        ## calc embedding
        self.embedding = tf.nn.embedding_lookup(self.feature_embeddings,inputs_index)
        inputs_value = tf.reshape(inputs_value,[-1,self.field_size,1])
        self.embedding = tf.multiply(self.embedding,inputs_value)  ## (N,F,K)
        #print('self.embedding',self.embedding)
        
        ## first order term
        first_order = tf.nn.embedding_lookup(self.weight_first,inputs_index)  ## (N,F,1)
        first_order = tf.reduce_sum(tf.multiply(first_order,inputs_value),axis=2)  ##(N,F)
        #print('first_order*inputs_value',first_order)
        
        ## second order term 
        sum_feature_emb = tf.reduce_sum(self.embedding,axis=1)  ##(N,K)
        sum_feature_emb_squre = tf.square(sum_feature_emb)
        #print('sum_feature_emb',sum_feature_emb)
        #print('sum_feature_emb_squre',sum_feature_emb_squre)
        square_feature_emb = tf.square(self.embedding)
        square_feature_emb_sum = tf.reduce_sum(square_feature_emb,axis=1) ##(N,K)
        #print('square_feature_emb',square_feature_emb)
        #print('square_feature_emb_sum',square_feature_emb_sum)
        
        second_order = 0.5 * tf.subtract(sum_feature_emb_squre,square_feature_emb_sum)
        #print('second_order',second_order)

        second_order = tf.reshape(second_order,[-1,self.embedding_size]) ##(N,K)
        #print('second_order',second_order)
        deep_feature = self.dense_1(second_order)
        layers.Dropout(self.dropout_rate)
        deep_feature = self.dense_2(deep_feature)
        layers.Dropout(self.dropout_rate)
        deep_out = self.dense_out(deep_feature)  ##(N,1)
  
        out = tf.add_n([tf.reduce_sum(first_order,axis=1,keepdims=True),
                             deep_out,
                             self.bias * tf.ones_like(deep_out)],name='out_nfm')
        
        return out
    

In [132]:
def load_data():
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    def preprocess(df):
        cols = [c for c in df.columns if c not in ['id','target']]
        #df['missing_feat'] = np.sum(df[df[cols]==-1].values,axis=1)
        df["missing_feat"] = np.sum((df[cols] == -1).values, axis=1)
        df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
        return df

    dfTrain = preprocess(dfTrain)
    dfTest = preprocess(dfTest)

    cols = [c for c in dfTrain.columns if c not in ['id','target']]
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]

    X_train = dfTrain[cols].values
    y_train = dfTrain['target'].values

    X_test = dfTest[cols].values
    ids_test = dfTest['id'].values

    cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain,dfTest,X_train,y_train,X_test,ids_test,cat_features_indices

In [133]:
# load data
dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = load_data()
#dfTrain.describe()

In [137]:
print('X_train:',X_train.shape)
print('X_test:',X_test.shape)

X_train: (10000, 39)
X_test: (2000, 39)


In [141]:
fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols = config.IGNORE_COLS)
data_parser = DataParser(feat_dict= fd)
# Xi_train ：列的序号
# Xv_train ：列的对应的值
Xi_train,Xv_train,y_train = data_parser.parse(df=dfTrain,has_label=True)
Xi_test,Xv_test,ids_test = data_parser.parse(df=dfTest)

#print(dfTrain.dtypes)
nfm_params = dict()
embedding_size = 8
nfm_params['feature_size'] = fd.feat_dim
nfm_params['field_size'] = len(Xi_train[0])
nfm_params['embedding_size'] = embedding_size
print('feature_size',fd.feat_dim)
print('field_size',len(Xi_train[0]))
print('embedding_size',embedding_size)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat([dfTrain,dfTest])


feature_size 256
field_size 39
embedding_size 8


In [142]:
nfm_model = NFM(**nfm_params)
learning_rate = 0.001
#optimizer = tf.optimizers.Adam(learning_rate)
adam = tf.keras.optimizers.Adam(learning_rate)

In [143]:
def nfm_loss(y_pred,y,loss_type='logloss'): 
    #print(loss)
    if loss_type == 'logloss':
        y_pred = tf.reshape(tf.nn.sigmoid(y_pred),[-1,1])
        y = tf.reshape(y,[-1,1])
        loss = tf.compat.v1.losses.log_loss(y,y_pred)
    elif loss_type == 'mse':
        loss = tf.reduce_mean(tf.keras.losses.MSE(y_true=y,y_pred=y_pred))
    return loss

In [144]:
def train_step(train_x,train_y):
    with tf.GradientTape() as tape:  
        x_i, x_v  = train_x
        y_pred = nfm_model(x_i,x_v)
        loss = nfm_loss(y_pred,train_y)
        
    train_vars =  nfm_model.trainable_variables
    grad = tape.gradient(loss,train_vars)
    adam.apply_gradients(zip(grad,train_vars))        

In [145]:
def accuracy(y_pred,y):
    y_pred = tf.cast(y_pred > 0.5,dtype=tf.int32)
    pre_cls = tf.equal(y_pred,tf.cast(y,tf.int32))
    acc = tf.reduce_mean(tf.cast(pre_cls,tf.float32))
    return acc

In [162]:
Xi_val = Xi_train[-2000:]
Xv_val = Xv_train[-2000:]
y_val = y_train[-2000:]

Xi_train = Xi_train[:8000]
Xv_train = Xv_train[:8000]
y_train = y_train[:8000]


In [163]:
batch_size = 1024
train_data = tf.data.Dataset.from_tensor_slices(((Xi_train,Xv_train),y_train))
train_data = train_data.shuffle(8000).repeat().batch(batch_size)

In [165]:
## RUN Training
display_step = 200
steps_per_epoch = int(len(Xi_train) / batch_size)
epoch = 50
training_steps = epoch * steps_per_epoch

_epoch = 0
for step , (batch_x,batch_y) in enumerate(train_data.take(training_steps),1):
    train_step(batch_x,batch_y) 
    if step % steps_per_epoch == 0:
        y_pred = nfm_model(Xi_val,Xv_val)
        cur_loss = nfm_loss(y_pred,y_val)
        acc = accuracy(y_pred,y_val)
        _epoch += 1
        print("_epoch: %i, loss: %f, acc: %f" % (_epoch, cur_loss,acc))
#         x_i, x_v = batch_x
#         y_pred = nfm_model(x_i,x_v)
#         cur_loss = nfm_loss(y_pred,batch_y)
#         acc = accuracy(y_pred,batch_y)
#         print("step: %i, loss: %f, acc: %f" % (step, cur_loss,acc))

_epoch: 1, loss: 0.110856, acc: 0.960500
_epoch: 2, loss: 0.110391, acc: 0.960500
_epoch: 3, loss: 0.111099, acc: 0.960500
_epoch: 4, loss: 0.111154, acc: 0.960500
_epoch: 5, loss: 0.109618, acc: 0.960500
_epoch: 6, loss: 0.107781, acc: 0.959579
_epoch: 7, loss: 0.107024, acc: 0.960039
_epoch: 8, loss: 0.106472, acc: 0.959579
_epoch: 9, loss: 0.104882, acc: 0.960039
_epoch: 10, loss: 0.106376, acc: 0.960500
_epoch: 11, loss: 0.108658, acc: 0.960500
_epoch: 12, loss: 0.104184, acc: 0.960500
_epoch: 13, loss: 0.112681, acc: 0.957277
_epoch: 14, loss: 0.103700, acc: 0.960500
_epoch: 15, loss: 0.115106, acc: 0.960500
_epoch: 16, loss: 0.103015, acc: 0.960500
_epoch: 17, loss: 0.104412, acc: 0.959579
_epoch: 18, loss: 0.102783, acc: 0.960039
_epoch: 19, loss: 0.104082, acc: 0.960500
_epoch: 20, loss: 0.103318, acc: 0.960039
_epoch: 21, loss: 0.099593, acc: 0.959579
_epoch: 22, loss: 0.099755, acc: 0.960500
_epoch: 23, loss: 0.103472, acc: 0.960500
_epoch: 24, loss: 0.098295, acc: 0.958197
_

In [158]:
#y_pred = nfm_model(Xi_val,Xv_val)

In [None]:
#nfm_model.save_weights()

print(model.feature_embeddings)
print(model.weight_first)

test_embeddings = layers.Embedding(5,3,embeddings_initializer='uniform')
#test_embeddings.numpy()
test_embeddings

index = tf.constant([[0,1]],dtype = tf.int32)
print(test_embeddings(index))

index = tf.constant([[0,1],[1,2]],dtype = tf.int32)
print(test_embeddings(index))