## DSSM TF2 实现

In [1]:
import os
import numpy as np
import pandas as pd
import argparse
import tensorflow as tf
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, classification_report

In [5]:
def gen_dataset(data_df: pd.DataFrame, columns: dict):
    """Convert selected DataFrame columns into a dict of typed NumPy arrays.

    Args:
        data_df: source frame; must contain every key of ``columns``.
        columns: mapping ``column name -> dtype name``. Recognised dtype names
            are ``"int32"``, ``"float32"``, ``"string"`` and ``"str"``; any
            other name falls back to ``int32``.

    Returns:
        dict mapping each column name to a NumPy array cast to the requested
        dtype, in the iteration order of ``columns``.

    Raises:
        KeyError: if a requested column is missing from ``data_df``.
        ValueError: if a column's values cannot be cast to the requested dtype.
    """
    # The builtin ``str`` replaces the ``np.str`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (it was only ever an alias for ``str``).
    dtype_by_name = {
        "int32": np.int32,
        "float32": np.float32,
        "string": str,
        "str": str,
    }

    def _get_type(type_str):
        # Unknown dtype names keep the historical int32 fallback.
        return dtype_by_name.get(type_str, np.int32)

    return {
        key: np.array(data_df[key]).astype(_get_type(type_str))
        for key, type_str in columns.items()
    }


In [7]:
def dssm_model(feature_inputs, item_feature_columns, user_feature_columns, hidden_units):
    """Assemble the two-tower DSSM Keras model.

    Each tower turns its feature columns into a dense vector with
    ``DenseFeatures`` and then passes it through one ReLU ``Dense`` layer per
    entry of ``hidden_units``. The two tower outputs are combined with a dot
    product along axis 1, and a single sigmoid unit maps that value to the
    model output.

    Args:
        feature_inputs: Keras inputs fed to both towers and used as the model inputs.
        item_feature_columns: feature columns consumed by the item tower.
        user_feature_columns: feature columns consumed by the user tower.
        hidden_units: iterable of layer widths, shared by both towers.

    Returns:
        A ``tf.keras.Model`` mapping ``feature_inputs`` to the sigmoid output.
    """

    def _build_tower(feature_columns):
        # DenseFeatures followed by the stack of ReLU layers for one side.
        tower = tf.keras.layers.DenseFeatures(feature_columns)(feature_inputs)
        for width in hidden_units:
            tower = tf.keras.layers.Dense(width, activation='relu')(tower)
        return tower

    # Item tower is built first, then the user tower (same creation order as before,
    # so auto-generated layer names are unchanged).
    item_vector = _build_tower(item_feature_columns)
    user_vector = _build_tower(user_feature_columns)

    similarity = tf.keras.layers.Dot(axes=1)([item_vector, user_vector])
    score = tf.keras.layers.Dense(1, activation='sigmoid')(similarity)

    return tf.keras.Model(feature_inputs, score)

In [2]:
# Relative locations of the sample CSVs and of the model artefacts.
data_path = "../../data/dssm_data"
model_path = "../../data/model/dssm"
# Training hyper-parameters and the metric the checkpoint callback watches.
batch_size = 12
epoch = 10
monitor = "val_accuracy"
# ====================================================================================
# read data
data_path = os.path.abspath(data_path)  # resolve once so the log shows the real location
print("[DSSM] read file path: {}".format(data_path))
train_data, test_data = (
    pd.read_csv(os.path.join(data_path, csv_name), sep=",")
    for csv_name in ("trainingSamples.csv", "testSamples.csv")
)
# Train and test rows stacked together.
data_pd = pd.concat([train_data, test_data])

[DSSM] read file path: /workspace/user_code/davidwwang/workspace/rec_proj/data/dssm_data


In [3]:
# ====================================================================================
# define input for keras model
# Feature name -> dtype name; the same mapping drives both the Keras inputs
# built here and the array casting done by gen_dataset.
columns_dict = {
    'movieId': 'int32',
    'movieGenre1': 'string',
    'movieAvgRating': 'float32',
    'userId': 'int32',
    'userGenre1': 'string',
    'userAvgRating': 'float32'
}
# One scalar-per-example (shape=()) Input per feature, named after the feature.
inputs = {
    feature_name: tf.keras.layers.Input(name=feature_name, shape=(), dtype=feature_dtype)
    for feature_name, feature_dtype in columns_dict.items()
}
print("[DSSM] input for keras model: \n {}".format(inputs))

[DSSM] input for keras model: 
 {'movieId': <KerasTensor: shape=(None,) dtype=int32 (created by layer 'movieId')>, 'movieGenre1': <KerasTensor: shape=(None,) dtype=string (created by layer 'movieGenre1')>, 'movieAvgRating': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'movieAvgRating')>, 'userId': <KerasTensor: shape=(None,) dtype=int32 (created by layer 'userId')>, 'userGenre1': <KerasTensor: shape=(None,) dtype=string (created by layer 'userGenre1')>, 'userAvgRating': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'userAvgRating')>}


In [4]:
# ====================================================================================
# Item-side (movie) feature columns.
# movieId is used directly as the category id; num_buckets=1001 sizes the id space
# (NOTE(review): presumably every movieId must lie in [0, 1001) — confirm against the data).
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
# 10-dimensional embedding of the movie id.
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)
# Genre vocabulary: distinct non-null values over train + test combined (data_pd).
movie_genre_1_vocab = data_pd['movieGenre1'].dropna().unique()
movie_genre_1_col = tf.feature_column.categorical_column_with_vocabulary_list(key='movieGenre1',
                                                                              vocabulary_list=movie_genre_1_vocab)
# 10-dimensional embedding of the movie's first genre.
movie_genre_1_emb_col = tf.feature_column.embedding_column(movie_genre_1_col, 10)
# Average rating is passed through as a plain numeric feature.
movie_avg_rating = tf.feature_column.numeric_column(key='movieAvgRating')
# User-side feature columns (mirrors the item side).
# NOTE(review): num_buckets=30001 presumably bounds userId to [0, 30001) — confirm.
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
# 10-dimensional embedding of the user id.
user_emb_col = tf.feature_column.embedding_column(user_col, 10)
# Genre vocabulary: distinct non-null values over train + test combined (data_pd).
user_genre_1_vocab = data_pd['userGenre1'].dropna().unique()
user_genre_1_col = tf.feature_column.categorical_column_with_vocabulary_list(key='userGenre1',
                                                                             vocabulary_list=user_genre_1_vocab)
# NOTE(review): this embedding is 100-dimensional while every other embedding here
# uses 10 — confirm whether 100 is intentional or a typo for 10.
user_genre_1_emb_col = tf.feature_column.embedding_column(user_genre_1_col, 100)
# Average rating is passed through as a plain numeric feature.
user_avg_rating = tf.feature_column.numeric_column(key='userAvgRating')


In [8]:
# ====================================================================================
# train model
# Feature columns fed to each tower.
item_tower_columns = [movie_emb_col, movie_genre_1_emb_col, movie_avg_rating]
user_tower_columns = [user_emb_col, user_genre_1_emb_col, user_avg_rating]
model = dssm_model(
    feature_inputs=inputs,
    item_feature_columns=item_tower_columns,
    user_feature_columns=user_tower_columns,
    hidden_units=[30, 10],
)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC')],
)
# Only the weights with the best value of `monitor` are kept on disk.
filepath = os.path.join(model_path, "checkpoint", "dssm-weights-best.hdf5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor=monitor,
    verbose=1,
    save_best_only=True,
    mode='max',
)
train_data_input = gen_dataset(data_df=train_data, columns=columns_dict)
train_labels = train_data["label"].values
# validation_split=0.1 makes Keras hold out a tenth of the training samples for the val_* metrics.
model.fit(
    x=train_data_input,
    y=train_labels,
    epochs=epoch,
    callbacks=[checkpoint],
    verbose=2,
    batch_size=batch_size,
    validation_split=0.1,
)

2022-08-24 16:23:07.947417: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-9.0/lib64:/usr/local/cuda-9.0/extras/CUPTI/lib64:/usr/local/cuda-8.0/lib64:/usr/local/cuda-8.0/extras/CUPTI/lib64:/usr/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib:/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib:/usr/local/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/extras/CUPTI/lib64/:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib:/usr/local/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-1

Epoch 1/10
6662/6662 - 73s - loss: 0.6251 - accuracy: 0.6564 - auc: 0.6992 - val_loss: 0.5765 - val_accuracy: 0.7022 - val_auc: 0.7613

Epoch 00001: val_accuracy improved from -inf to 0.70224, saving model to ../../data/model/dssm/checkpoint/dssm-weights-best.hdf5
Epoch 2/10
6662/6662 - 69s - loss: 0.5616 - accuracy: 0.7151 - auc: 0.7775 - val_loss: 0.5736 - val_accuracy: 0.7029 - val_auc: 0.7667

Epoch 00002: val_accuracy improved from 0.70224 to 0.70292, saving model to ../../data/model/dssm/checkpoint/dssm-weights-best.hdf5
Epoch 3/10
6662/6662 - 66s - loss: 0.5261 - accuracy: 0.7411 - auc: 0.8102 - val_loss: 0.6002 - val_accuracy: 0.6804 - val_auc: 0.7469

Epoch 00003: val_accuracy did not improve from 0.70292
Epoch 4/10
6662/6662 - 63s - loss: 0.4943 - accuracy: 0.7631 - auc: 0.8359 - val_loss: 0.6163 - val_accuracy: 0.6714 - val_auc: 0.7375

Epoch 00004: val_accuracy did not improve from 0.70292
Epoch 5/10
6662/6662 - 64s - loss: 0.4660 - accuracy: 0.7793 - auc: 0.8567 - val_loss

<keras.callbacks.History at 0x7f933afa1590>

In [9]:
# ====================================================================================
# predict, use best model.
test_data_input = gen_dataset(data_df=test_data, columns=columns_dict)
# Restore the checkpointed best weights before scoring the test set.
model.load_weights(filepath=filepath)
pred_ans = model.predict(x=test_data_input, batch_size=batch_size)
test_labels = test_data["label"].values
# Hard 0/1 decisions at the 0.5 threshold, used by the label-based metrics.
pred_is_positive = pred_ans >= 0.5
print("\n[BEST] ===============================================================")
print("[test] LogLoss: {} ".format(round(log_loss(test_labels, pred_ans), 4)))
print("[test] Accuracy: {} ".format(round(accuracy_score(test_labels, pred_is_positive), 4)))
print("[test] AUC: {} ".format(round(roc_auc_score(test_labels, pred_ans), 4)))
print("[test] classification_report: \n{} ".format(
    classification_report(test_labels, pred_is_positive, digits=4)))
# ====================================================================================
# save model
model_path = os.path.abspath(model_path)
print("[DSSM] save model path: {}".format(model_path))
model.summary()
# Export the full model (optimizer state included) under <model_path>/dssm.
export_dir = os.path.join(model_path, "dssm")
tf.keras.models.save_model(
    model,
    export_dir,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # Remove the CWD from sys.path while we load stuff.



[test] LogLoss: 0.5796 
[test] Accuracy: 0.6966 
[test] AUC: 0.7624 
[test] classification_report: 
              precision    recall  f1-score   support

           0     0.7155    0.5132    0.5977      9856
           1     0.6879    0.8402    0.7564     12584

    accuracy                         0.6966     22440
   macro avg     0.7017    0.6767    0.6771     22440
weighted avg     0.7000    0.6966    0.6867     22440
 
[DSSM] save model path: /workspace/user_code/davidwwang/workspace/rec_proj/data/model/dssm
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movieAvgRating (InputLayer)     [(None,)]            0                                            
__________________________________________________________________________________________________
movieGenre1 (InputLayer)        [(None,)]            0               

2022-08-24 16:34:15.673388: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /workspace/user_code/davidwwang/workspace/rec_proj/data/model/dssm/dssm/assets


INFO:tensorflow:Assets written to: /workspace/user_code/davidwwang/workspace/rec_proj/data/model/dssm/dssm/assets
