In [2]:
import tensorflow as tf

In [6]:
train_data_path = "file:///C:/SourceCode/Demo/tensorflow/dl_recommenders/data/sampledata/modelSamples.csv"
sample_file_path = tf.keras.utils.get_file("modelSamples.csv", train_data_path)

Downloading data from file:///C:/SourceCode/Demo/tensorflow/dl_recommenders/data/sampledata/modelSamples.csv


In [7]:
def make_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=12,
        label_name='label',
        na_value="0",
        num_epochs=1,
        ignore_errors=True
    )

    return dataset

In [8]:
raw_samples_data = make_dataset(sample_file_path)
print(raw_samples_data)

<PrefetchDataset shapes: (OrderedDict([(movieId, (None,)), (userId, (None,)), (rating, (None,)), (timestamp, (None,)), (releaseYear, (None,)), (movieGenre1, (None,)), (movieGenre2, (None,)), (movieGenre3, (None,)), (movieRatingCount, (None,)), (movieAvgRating, (None,)), (movieRatingStddev, (None,)), (userRatedMovie1, (None,)), (userRatedMovie2, (None,)), (userRatedMovie3, (None,)), (userRatedMovie4, (None,)), (userRatedMovie5, (None,)), (userRatingCount, (None,)), (userAvgReleaseYear, (None,)), (userReleaseYearStddev, (None,)), (userAvgRating, (None,)), (userRatingStddev, (None,)), (userGenre1, (None,)), (userGenre2, (None,)), (userGenre3, (None,)), (userGenre4, (None,)), (userGenre5, (None,))]), (None,)), types: (OrderedDict([(movieId, tf.int32), (userId, tf.int32), (rating, tf.float32), (timestamp, tf.int32), (releaseYear, tf.int32), (movieGenre1, tf.string), (movieGenre2, tf.string), (movieGenre3, tf.string), (movieRatingCount, tf.int32), (movieAvgRating, tf.float32), (movieRatingSt

In [9]:
test_dataset = raw_samples_data.take(1000)
train_dataset = raw_samples_data.skip(1000)

In [10]:
genre_vocab = ['Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War', 'Comedy', 'Western', 'Documentary',
               'Sci-Fi', 'Drama', 'Thriller',
               'Crime', 'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical']

In [11]:
GENRE_FEATURES = {
    'userGenre1': genre_vocab,
    'userGenre2': genre_vocab,
    'userGenre3': genre_vocab,
    'userGenre4': genre_vocab,
    'userGenre5': genre_vocab,
    'movieGenre1': genre_vocab,
    'movieGenre2': genre_vocab,
    'movieGenre3': genre_vocab
}

In [12]:
categorical_columns = []
for feature, vocab in GENRE_FEATURES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab
    )
    emb_col = tf.feature_column.embedding_column(cat_col, 10)
    categorical_columns.append(emb_col)
# movie id embedding
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)
categorical_columns.append(movie_emb_col)

In [13]:
# user id embedding
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, 10)
categorical_columns.append(user_emb_col)

In [14]:
# numerical features
numerical_columns = [tf.feature_column.numeric_column('releaseYear'),
                     tf.feature_column.numeric_column('movieRatingCount'),
                     tf.feature_column.numeric_column('movieAvgRating'),
                     tf.feature_column.numeric_column('movieRatingStddev'),
                     tf.feature_column.numeric_column('userRatingCount'),
                     tf.feature_column.numeric_column('userAvgRating'),
                     tf.feature_column.numeric_column('userRatingStddev')]

In [15]:
# cross feature between current movie and historical movie
rated_movie = tf.feature_column.categorical_column_with_identity(key='userRatedMovie1', num_buckets=1001)
crossed_feature = tf.feature_column.indicator_column(tf.feature_column.crossed_column([movie_col, rated_movie], 10000))

In [16]:
inputs = {
    'movieAvgRating': tf.keras.layers.Input(name='movieAvgRating', shape=(), dtype='float32'),
    'movieRatingStddev': tf.keras.layers.Input(name='movieRatingStddev', shape=(), dtype='float32'),
    'movieRatingCount': tf.keras.layers.Input(name='movieRatingCount', shape=(), dtype='int32'),
    'userAvgRating': tf.keras.layers.Input(name='userAvgRating', shape=(), dtype='float32'),
    'userRatingStddev': tf.keras.layers.Input(name='userRatingStddev', shape=(), dtype='float32'),
    'userRatingCount': tf.keras.layers.Input(name='userRatingCount', shape=(), dtype='int32'),
    'releaseYear': tf.keras.layers.Input(name='releaseYear', shape=(), dtype='int32'),

    'movieId': tf.keras.layers.Input(name='movieId', shape=(), dtype='int32'),
    'userId': tf.keras.layers.Input(name='userId', shape=(), dtype='int32'),
    'userRatedMovie1': tf.keras.layers.Input(name='userRatedMovie1', shape=(), dtype='int32'),

    'userGenre1': tf.keras.layers.Input(name='userGenre1', shape=(), dtype='string'),
    'userGenre2': tf.keras.layers.Input(name='userGenre2', shape=(), dtype='string'),
    'userGenre3': tf.keras.layers.Input(name='userGenre3', shape=(), dtype='string'),
    'userGenre4': tf.keras.layers.Input(name='userGenre4', shape=(), dtype='string'),
    'userGenre5': tf.keras.layers.Input(name='userGenre5', shape=(), dtype='string'),
    'movieGenre1': tf.keras.layers.Input(name='movieGenre1', shape=(), dtype='string'),
    'movieGenre2': tf.keras.layers.Input(name='movieGenre2', shape=(), dtype='string'),
    'movieGenre3': tf.keras.layers.Input(name='movieGenre3', shape=(), dtype='string'),
}

In [18]:
# model architecture
deep = tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns)(inputs)
deep = tf.keras.layers.Dense(128, activation='relu')(deep)
deep = tf.keras.layers.Dense(128, activation='relu')(deep)

wide = tf.keras.layers.DenseFeatures(crossed_feature)(inputs)
WAndD = tf.keras.layers.concatenate([deep, wide])
output = tf.keras.layers.Dense(1, activation='sigmoid')(WAndD)
model = tf.keras.Model(inputs, output)

In [19]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2d380af1b08>

In [20]:
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss {test_loss}, Test Accuracy {test_accuracy}')

Test Loss 0.5385934710502625, Test Accuracy 0.7388333082199097


In [None]:
predictions = model.predict(test_dataset)
for prediction, goodRating in zip(predictions[:12], list(test_dataset)[0][1][:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))