## Embedding + MLP (TensorFlow)

In [1]:
import os

import tensorflow as tf

#### 1. Load Sample Data

In [8]:
# Base URI of the folder that holds the sample CSV files. The default is the
# original author's local folder; set the SAMPLE_DATA_URI environment variable
# to point at your own copy instead of editing this cell.
sample_data_uri = os.environ.get(
    "SAMPLE_DATA_URI",
    "file:///Users/zliu/Desktop/PythonProjects/SampleData").rstrip("/")

# Training samples path
# NOTE(review): get_file keeps a cached copy under the given file name; a stale
# cached file may be reused instead of re-reading the origin - confirm when the
# source CSV changes.
training_samples_file_path = tf.keras.utils.get_file("trainingSamples.csv",
                                                     sample_data_uri + "/trainingSamples.csv")
# Test samples path
test_samples_file_path = tf.keras.utils.get_file("testSamples.csv",
                                                 sample_data_uri + "/testSamples.csv")

# load sample as tf dataset
def get_dataset(file_path, batch_size=12, shuffle=True):
    """Build a batched ``tf.data`` dataset from a CSV file of samples.

    Args:
        file_path: Path of the CSV file to read.
        batch_size: Number of rows combined into one batch. Defaults to 12,
            the value this notebook has always used.
        shuffle: Whether the rows are shuffled. Defaults to True, which is
            also ``make_csv_dataset``'s own default, so existing callers are
            unaffected. Pass False when a stable row order is needed.

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset``,
        with the ``label`` column split off as the label and a single pass
        over the file (``num_epochs=1``).
    """
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='label',
        # "0" is treated as a missing value in addition to the usual NA markers
        na_value="0",
        num_epochs=1,
        shuffle=shuffle,
        # rows that fail to parse are skipped silently rather than raising
        ignore_errors=True)
    return dataset


# load the training and test samples (two separate CSV files) as tf datasets
train_dataset, test_dataset = (
    get_dataset(samples_path)
    for samples_path in (training_samples_file_path, test_samples_file_path)
)

2022-04-03 16:09:32.873036: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# vocabulary shared by every genre feature
genre_vocab = [
    'Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War', 'Comedy',
    'Western', 'Documentary', 'Sci-Fi', 'Drama', 'Thriller', 'Crime',
    'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical',
]

# every genre feature (five user slots, three movie slots) maps to that same
# vocabulary list
GENRE_FEATURES = dict.fromkeys(
    (
        'userGenre1',
        'userGenre2',
        'userGenre3',
        'userGenre4',
        'userGenre5',
        'movieGenre1',
        'movieGenre2',
        'movieGenre3',
    ),
    genre_vocab,
)

In [14]:
# size of every embedding vector built in this cell
embedding_dim = 10

# all categorical features: one embedding column per genre feature, each
# looking its values up in that feature's vocabulary list
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=vocab),
        embedding_dim)
    for feature, vocab in GENRE_FEATURES.items()
]

# movie id embedding feature
# the id is used directly as the category index, with 1001 buckets
# NOTE(review): assumes every movieId in the data is below 1001 - confirm
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, embedding_dim)
categorical_columns.append(movie_emb_col)

In [15]:
# all numerical features: one plain numeric column per CSV field listed here
numerical_columns = [
    tf.feature_column.numeric_column(feature_name)
    for feature_name in (
        'releaseYear',
        'movieRatingCount',
        'movieAvgRating',
        'movieRatingStddev',
        'userRatingCount',
        'userAvgRating',
        'userRatingStddev',
    )
]

In [16]:
# embedding + MLP model architecture
model = tf.keras.Sequential()
# input layer: turns the feature dict into one dense vector
# (numeric columns first, then the embedding columns)
model.add(tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns))
# two hidden layers of 128 ReLU units each
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
# single sigmoid unit for the binary label
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [17]:
# compile the model, set loss function, optimizer and evaluation metrics
# The two AUC metrics are named explicitly. Left unnamed they appear in the
# training log as "auc" and "auc_1", which does not say which curve is which.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy',
             tf.keras.metrics.AUC(curve='ROC', name='roc_auc'),
             tf.keras.metrics.AUC(curve='PR', name='pr_auc')])

# train the model
model.fit(train_dataset, epochs=5)

# evaluate the model
# evaluate() returns the loss first, then the metrics in the order given to compile()
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
                                                                                   test_roc_auc, test_pr_auc))

# print some predict results
# Score ONE batch and compare it with that same batch's labels. test_dataset is
# shuffled on every pass, so taking predictions from one pass and labels from
# another pairs each prediction with the label of a different row.
# Reading a single batch also avoids materializing the whole test set.
sample_features, sample_labels = next(iter(test_dataset))
# predictions now holds the scores for this one batch only
predictions = model.predict_on_batch(sample_features)
for prediction, goodRating in zip(predictions[:12], sample_labels[:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))

Epoch 1/5


2022-04-03 17:13:04.805055: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 9308 of 10000


     26/Unknown - 18s 4ms/step - loss: 23.8142 - accuracy: 0.5417 - auc: 0.5352 - auc_1: 0.5951

2022-04-03 17:13:10.757151: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Epoch 2/5


2022-04-03 17:13:39.371052: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 8289 of 10000
2022-04-03 17:13:49.357201: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 9941 of 10000
2022-04-03 17:13:49.497900: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Epoch 3/5


2022-04-03 17:14:17.670911: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 8735 of 10000


  20/7403 [..............................] - ETA: 19s - loss: 0.6186 - accuracy: 0.6542 - auc: 0.7080 - auc_1: 0.7108     

2022-04-03 17:14:27.174531: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Epoch 4/5


2022-04-03 17:14:54.542206: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 9466 of 10000
2022-04-03 17:15:01.008632: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Epoch 5/5


2022-04-03 17:15:28.779281: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 8788 of 10000


  15/7403 [..............................] - ETA: 58s - loss: 0.6516 - accuracy: 0.6444 - auc: 0.6860 - auc_1: 0.7568   

2022-04-03 17:15:36.442481: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.




2022-04-03 17:16:04.829842: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 9004 of 10000
2022-04-03 17:16:14.163120: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.




Test Loss 0.6078851222991943, Test Accuracy 0.6698752045631409, Test ROC AUC 0.7406303286552429, Test PR AUC 0.7703824043273926


2022-04-03 17:16:39.770524: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 9682 of 10000
2022-04-03 17:16:43.692995: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Predicted good rating: 44.93%  | Actual rating label:  Bad Rating
Predicted good rating: 69.47%  | Actual rating label:  Bad Rating
Predicted good rating: 34.47%  | Actual rating label:  Good Rating
Predicted good rating: 41.83%  | Actual rating label:  Good Rating
Predicted good rating: 17.55%  | Actual rating label:  Good Rating
Predicted good rating: 28.79%  | Actual rating label:  Bad Rating
Predicted good rating: 31.67%  | Actual rating label:  Good Rating
Predicted good rating: 54.53%  | Actual rating label:  Bad Rating
Predicted good rating: 78.18%  | Actual rating label:  Good Rating
Predicted good rating: 59.13%  | Actual rating label:  Good Rating
Predicted good rating: 48.56%  | Actual rating label:  Good Rating
Predicted good rating: 49.76%  | Actual rating label:  Bad Rating
