In [1]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import tensorflow.keras.backend as bk
from sklearn.preprocessing import LabelEncoder
# import psutil

# Show every column when a DataFrame is rendered (None removes pandas' column display limit).
pd.options.display.max_columns = None

In [2]:
# Explore the data: load only the first 1000 rows of train.csv for a quick look.

# Alternative kept for reference: stream the whole file in chunks of 10000 rows.
# train_iter = pd.read_csv('train.csv', chunksize=10000, iterator=True)
train_df = pd.read_csv("train.csv", nrows=1000)
train_df.head()  # last expression of the cell, so the preview is rendered as the cell output

In [165]:
# train_df['hour'].map(lambda x:int(str(x)[6:])).unique()
# train_df['device_type'].unique()
def my_one_hot(df):
    '''
        One-hot encode every column of the dataframe.

        Each column is label-encoded (its sorted distinct values are mapped to
        0..n-1) and then expanded into n indicator columns named
        "<column>_LE=<label>". Column "<column>_LE=k" is 1.0 exactly for the
        rows whose label is k, and 0.0 everywhere else.

        If the frame has an 'hour' column whose first value is greater than 24,
        every value of that column is replaced by int(str(value)[6:]) before
        encoding (assumes a YYMMDDHH layout, so only the hour is kept --
        TODO confirm against the dataset description).

        Inputs:
            df: dataframe whose columns are all treated as categorical;
                it is read only and never modified

        Return:
            float32 dataframe holding all indicator columns, in the column
            order of df. It always carries a fresh 0..len(df)-1 index, the
            index of df is NOT preserved. An empty dataframe is returned when
            df has no columns.
    '''
    one_hot_cols = list(df.columns)
    print("One hot encoding: ", one_hot_cols)

    row_count = len(df)
    row_positions = np.arange(row_count)
    encoded_frames = []

    for encode_col in one_hot_cols:
        values = df[encode_col]

        # only keep the hour information; work on a derived series so the caller's frame stays untouched
        if encode_col == 'hour' and row_count > 0 and values.iloc[0] > 24:
            values = values.map(lambda x: int(str(x)[6:]))

        # np.unique gives the sorted distinct values plus, for every row, the
        # position of its value in that sorted list, i.e. a 0..n-1 label
        uniques, labels = np.unique(values.to_numpy(), return_inverse=True)
        labels = np.ravel(labels)
        label_count = len(uniques)

        # one row per sample, one column per label; only rows x labels memory is needed
        indicators = np.zeros((row_count, label_count), dtype=np.float32)
        indicators[row_positions, labels] = 1.0

        # name the columns by label index so that name k really belongs to indicator column k
        cols_name = [encode_col + "_LE=" + str(label) for label in range(label_count)]
        encoded_frames.append(pd.DataFrame(indicators, columns=cols_name))

    if not encoded_frames:
        return pd.DataFrame()
    # a single concat at the end instead of growing the frame column block by column block
    return pd.concat(encoded_frames, axis=1)

In [194]:
# NOTE: the defaults only read a tiny sample; pass train_nrows=None / test_nrows=None to read the full files
def get_onehot_data(train_path='train.csv', test_path='test.csv', train_nrows=1000, test_nrows=2):
    '''
        Read the train data and test data, put them together and one hot
        encode them in one go, so that train and test data end up with the
        same indicator columns.

        Inputs:
            train_path: csv file with the training samples; its first two
                        columns (id and click) are passed through unencoded
            test_path: csv file with the test samples
            train_nrows: number of training rows to read, None reads them all
            test_nrows: number of test rows to read, None reads them all

        Return:
            [train, test]: the one hot encoded train dataset and test dataset.
            Both keep the first two columns of the train file in front of the
            indicator columns. The test file presumably has no click column,
            in which case click is NaN for the test rows -- verify.
    '''
    print("Reading data...")
    train_raw = pd.read_csv(train_path, nrows=train_nrows) # read the training data
    ordered_cols = train_raw.columns # record the order of the columns, because the concat operation will change the col order
    passthrough_cols = ordered_cols[:2] # we don't do one hot encoding to id and click
    one_hot_cols = ordered_cols[2:]
    train_sample_num = train_raw.shape[0] # number of train samples, used to split the dataset again after one hot encoding
    print("Number of training sample: ", train_sample_num)

    # stack the test data under the train data so both are encoded with the same categories.
    # The index must be reset: my_one_hot returns a fresh 0..n-1 index and the
    # axis=1 concat below aligns on the index, so the duplicated labels left by
    # the axis=0 concat would break that alignment.
    test_raw = pd.read_csv(test_path, nrows=test_nrows)
    combined = pd.concat([train_raw, test_raw], axis=0).reset_index(drop=True)

    # hand over an independent copy so the encoder never writes into a slice of `combined`
    encoded = my_one_hot(combined[one_hot_cols].copy())
    one_hot_df = pd.concat([combined[passthrough_cols], encoded], axis=1)
    return [one_hot_df[:train_sample_num], one_hot_df[train_sample_num:]]

In [195]:
class FMCrossLayer(tf.keras.layers.Layer):
    '''
        Crossed (second order) features part of a factorization machine.

        With x the input of shape (batch, number of features) and V the
        trainable matrix of hidden vectors of shape
        (number of features, vector_len), the layer computes for every sample

            0.5 * sum over the vector_len axis of ( (x . V)^2 - (x^2 . V^2) )

        which yields one value per sample, i.e. an output of shape (batch, 1).
    '''
    def __init__(self, output_dim, vector_len, **kwargs):
        '''
            output_dim: the dimension of the output of the cross layer, must be 1
            vector_len: the length of the implicit vector

            Raises:
                ValueError: if output_dim is not 1. The cross term is a single
                            number per sample; reshaping it to a wider output
                            would fold several samples into one row.
        '''
        super().__init__(**kwargs)
        if output_dim != 1:
            raise ValueError(
                "FMCrossLayer produces one value per sample, output_dim must be 1, got "
                + str(output_dim))
        self.output_dim = output_dim
        self.vector_len = vector_len


    def build(self, input_shape):
        # one hidden vector of length vector_len per input feature
        self.vectors = self.add_weight(
            name='weights',
            shape=(input_shape[1], self.vector_len), # shape is (number of features, vector_len)
            initializer='uniform',
            trainable=True)
        super().build(input_shape) # to set self.built=True


    def call(self, x):
        first_part = bk.square(bk.dot(x, self.vectors)) # (batch, vector_len)
        second_part = bk.dot(bk.square(x), bk.square(self.vectors)) # (batch, vector_len)
        # keepdims keeps the batch axis intact and gives (batch, 1) without any reshape
        return bk.sum(first_part - second_part, axis=1, keepdims=True) * 0.5


    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return (input_shape[0], self.output_dim)


    def get_config(self):
        # expose the constructor arguments so a model containing this layer can be saved and rebuilt
        config = super().get_config()
        config.update({'output_dim': self.output_dim, 'vector_len': self.vector_len})
        return config


In [196]:
def get_FM_model(feature_len=None, vector_len=5):
    '''
        Here we combine the linear features and cross features and get the full FM model.

        Inputs:
            feature_len: number of input features (columns of the one hot
                         encoded data); required
            vector_len: length of the hidden vector (one hidden vector for each feature)

        Return:
            compiled tf.keras.Model (binary crossentropy loss, Adam with
            learning rate 0.001, binary accuracy metric) mapping a
            (batch, feature_len) input to a (batch, 1) probability

        Raises:
            ValueError: if feature_len is None, because the layers cannot
                        create their weights without knowing the input width
    '''
    if feature_len is None:
        raise ValueError("feature_len is required to build the FM model")

    output_dim = 1
    print("Hidden vector length: ", vector_len)

    # generate the input, linear features part and cross features part of FM
    input_layer = tf.keras.Input(shape=(feature_len, ), name='input')
    linear_layer = tf.keras.layers.Dense(output_dim, name='linear_layer')(input_layer)
    cross_layer = FMCrossLayer(output_dim, vector_len, name='cross_layer')(input_layer)

    # add the linear features part and crossed features part
    combine = tf.keras.layers.Add(name='combine')([linear_layer, cross_layer])

    # use sigmoid to get the final result
    # NOTE(review): this Dense adds one more trainable weight and bias on top of
    # linear + cross before the sigmoid; it is kept so the architecture and any
    # saved weights stay compatible.
    output = tf.keras.layers.Dense(output_dim, activation='sigmoid', name='output')(combine)

    model = tf.keras.Model(inputs=input_layer, outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.optimizers.Adam(0.001),
        metrics=['binary_accuracy']
    )
    # summary() prints the table itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model



def get_hidden_weights(model=None):
    '''
        Look up the layer named 'cross_layer' in the given model and return
        whatever its get_weights() yields (the hidden vectors of FM), which
        helps to understand what the model has learned.
    '''
    fm_cross_layer = model.get_layer('cross_layer')
    hidden_vectors = fm_cross_layer.get_weights()
    return hidden_vectors

In [197]:
def train(vector_len=5, epochs=10, verbose=1, batchsize=1024, test_size=0.2, seed=15):
    '''
        Get the FM model, split the data and train the model
        Inputs:
            vector_len: length of the hidden vector
            epochs: number of passes over the training data
            verbose: verbosity passed on to fit()
            batchsize: number of samples per gradient update
            test_size: fraction of the training data held out by fit() for validation
            seed: seed for the numpy and tensorflow random generators, so that
                  repeated runs start from the same state; None leaves the
                  generators untouched
        Return:
            the fitted FM model
    '''
    target = 'click'

    # seed before the model is built, the weights are initialised at that point
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    # test data from kaggle don't have click result, so just use validation_split while training
    train_data, _ = get_onehot_data()
    feature_cols = train_data.columns[2:] # the first two columns are id and click
    train_x = train_data[feature_cols]
    train_label = train_data[[target]]

    print("Shape of train_x: ", train_x.shape)

    print("Generating the model...")
    fm_model = get_FM_model(train_x.shape[1], vector_len)

    print("Training the model...")
    fm_model.fit(
        train_x,
        train_label,
        epochs=epochs,
        batch_size=batchsize,
        validation_split=test_size,
        verbose=verbose)
    return fm_model


In [198]:
# Entry point: train an FM model with the default hyper-parameters and keep it in `fm` for later cells.
if __name__ == '__main__':
    fm = train()

Reading data...
Number of training sample:  1000
One hot encoding:  ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
Shape of train_x:  (1000, 2024)
Generating the model...
Hidden vector length:  5
Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 2024)]       0                                            
__________________________________________________________________________________________________
linear_layer (Dense)            (None, 1)            2025        input[0][0]                      
_____________________________________________________________________________________________