In [1]:
import matplotlib
import tensorflow as tf
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from pretty_confusion_matrix import pp_matrix_from_data
import nltk
# Fetch the NLTK 'averaged_perceptron_tagger' resource into the local nltk_data folder.
# NOTE(review): no tagger call is visible in this notebook - confirm the download is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# --- Display options ---
matplotlib.rcParams['figure.figsize'] = [12, 8]
np.set_printoptions(precision=3, suppress=True)

# --- Reproducibility ---
seed = 22
tf.random.set_seed(seed)

# --- Network architecture ---
hidden_neurons = 512
number_of_hidden_layers = 2
activation_func = 'relu'

# --- Optimisation ---
batch_size = 128
learning_rate = 1e-5
max_epochs = 20

# --- Data handling ---
# Despite its name, this is the fraction of rows (counted from the top of the
# frame) that ends the TRAINING split; the split cell puts rows from this
# fraction up to 0.7 into validation and the remainder into test.
test_val_split = 0.6
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

In [3]:
def plot_result(item):
    """Plot a training metric and its validation counterpart per epoch.

    Reads the curves from the notebook-level ``history`` object (the value
    returned by ``model.fit``), so it must be called after training finished.

    Args:
        item: Key in ``history.history`` (e.g. ``"loss"``); the validation
            curve is looked up under ``"val_" + item``.
    """
    # Training curve first, then validation, each labelled with its own key.
    for curve_key in (item, "val_" + item):
        plt.plot(history.history[curve_key], label=curve_key)

    plt.xlabel("Epochs")
    plt.ylabel(item)
    chart_title = "Train and Validation {} Over Epochs".format(item)
    plt.title(chart_title, fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()

In [4]:
def get_number_of_genres(data):
    """Collect the distinct genre names that occur in ``data``.

    Every row is treated as a stringified list such as
    ``"['Fantasy', 'Fiction']"``: the characters ``[``, ``]`` and ``'`` are
    removed and the remainder is split on ``", "``. Empty names (produced by
    rows like ``"[]"``) are ignored.

    Args:
        data: pandas Series or DataFrame. Rows are read with ``.iloc`` and the
            string pieces of a row are concatenated before parsing.

    Returns:
        A tuple ``(genres, count)`` where ``genres`` is a NumPy array with the
        unique names in order of first appearance and ``count`` is its length.
    """
    # One translation table removes all three wrapper characters in a single pass.
    strip_table = str.maketrans('', '', "[]'")

    # A dict keeps insertion order and gives O(1) membership checks, unlike the
    # list scan that made the de-duplication quadratic in the number of genres.
    seen = {}
    for row in range(len(data)):
        raw = ''.join(data.iloc[row])
        for genre in raw.translate(strip_table).split(", "):
            if genre != '':
                seen.setdefault(genre, None)

    unique_genres = list(seen)
    return np.array(unique_genres), len(unique_genres)

In [5]:
def preprocess_target_data(target):
    """Parse stringified genre lists into per-row lists of genre names.

    Every row is treated as a string such as ``"['Fantasy', 'Fiction']"``: the
    characters ``[``, ``]`` and ``'`` are removed and the remainder is split on
    ``", "``. A row like ``"[]"`` therefore yields ``['']``.

    Args:
        target: pandas Series (or DataFrame) read row by row through ``.iloc``;
            the string pieces of a row are concatenated before parsing.

    Returns:
        A 1-D NumPy object array with one Python list of genre names per row.
    """
    strip_table = str.maketrans('', '', "[]'")
    parsed_rows = [
        ''.join(target.iloc[row]).translate(strip_table).split(", ")
        for row in range(len(target))
    ]

    # Fill the array element by element: np.array(parsed_rows, dtype=object)
    # would silently return a 2-D array whenever every row happens to have the
    # same number of genres, making the result shape depend on the data.
    result = np.empty(len(parsed_rows), dtype=object)
    for row, labels in enumerate(parsed_rows):
        result[row] = labels
    return result

In [6]:
def make_dataset(data, target, is_train=True):
    """Build a batched ``tf.data.Dataset`` of (text, multi-hot label) pairs.

    Relies on the notebook-level ``lookup`` layer and ``batch_size`` constant.

    Args:
        data: Text inputs, sliced along the first axis.
        target: Per-example genre lists; converted to a ragged tensor and
            encoded by ``lookup``.
        is_train: When truthy, examples are shuffled with a buffer of
            ``batch_size * 10`` before batching.

    Returns:
        The dataset batched with ``batch_size``.
    """
    multi_hot_labels = lookup(tf.ragged.constant(target)).numpy()
    pairs = tf.data.Dataset.from_tensor_slices((data, multi_hot_labels))
    if is_train:
        pairs = pairs.shuffle(batch_size * 10)
    return pairs.batch(batch_size)

In [7]:
def vectorize_text(vector_text, vector_label):
    """Vectorize text with the notebook-level ``vectorize_layer``.

    Args:
        vector_text: Text tensor; a trailing axis is appended before it is
            passed to the layer.
        vector_label: Labels, returned untouched.

    Returns:
        Tuple ``(vectorized_text, vector_label)``.
    """
    text_column = tf.expand_dims(vector_text, -1)
    vectorized = vectorize_layer(text_column)
    return vectorized, vector_label

In [8]:
# Load the two raw tables from the working directory (descriptions first, then genres).
descriptions, genres = (
    pd.read_csv(csv_path) for csv_path in ('descriptions.csv', 'genres.csv')
)

In [9]:
# Remove the 'Unnamed: 0' column from both frames (presumably a saved row
# index - verify against the CSV files). drop(..., errors='ignore') keeps this
# cell idempotent: the previous pop() raised KeyError when the cell was re-run.
genres = genres.drop(columns='Unnamed: 0', errors='ignore')
descriptions = descriptions.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Put the description and genre columns side by side in a single frame.
df = pd.concat((descriptions, genres), axis='columns')

In [11]:
# Map every column name to its position in the combined frame.
column_indices = dict(zip(df.columns, range(len(df.columns))))

# Sequential (unshuffled) split by row position:
#   train      -> rows [0, test_val_split * n)
#   validation -> rows [test_val_split * n, 0.7 * n)
#   test       -> rows [0.7 * n, n)
n = len(df)
train_end = int(n * test_val_split)
val_end = int(n * 0.7)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [12]:
# Pull the label and text columns out of each split (pop removes them from the
# split frame); descriptions are cast to str before they are turned into tensors.
target_train_data = train_df.pop('genres')
train_data = train_df.pop('description').astype(str)

val_target = val_df.pop('genres')
val_data = val_df.pop('description').astype(str)

target_test_data = test_df.pop('genres')
test_data = test_df.pop('description').astype(str)

# One string tensor per split, built in train / validation / test order.
tensor_train_data, tensor_val_data, tensor_test_data = (
    tf.convert_to_tensor(split_text)
    for split_text in (train_data, val_data, test_data)
)

# The label space is taken from the complete genres table, not just the training rows.
list_genres, number_of_genres = get_number_of_genres(genres)

In [13]:
# Replace each raw genre column with its parsed form (one list of names per row).
# NOTE: this rebinds the same names, so the cell cannot be run twice in a row.
target_train_data, val_target, target_test_data = (
    preprocess_target_data(raw_column)
    for raw_column in (target_train_data, val_target, target_test_data)
)

In [14]:
# Convert every row of genre names into its own NumPy array, split by split.
arrays_train = list(map(np.array, target_train_data))
arrays_val = list(map(np.array, val_target))
arrays_test = list(map(np.array, target_test_data))

In [15]:
# Multi-hot encoder for genre names; its vocabulary is learned from the list of
# distinct genres collected earlier.
lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
terms = tf.ragged.constant(list_genres)
lookup.adapt(terms)

In [16]:
# Batched (text, multi-hot label) datasets; only the training split is shuffled.
raw_train_ds = make_dataset(data=tensor_train_data, target=arrays_train, is_train=True)
raw_val_ds = make_dataset(data=tensor_val_data, target=arrays_val, is_train=False)
raw_test_ds = make_dataset(data=tensor_test_data, target=arrays_test, is_train=False)

In [17]:
# Count the distinct lower-cased, whitespace-separated tokens in the training text.
vocabulary = set()
for tokens in train_data.str.lower().str.split():
    vocabulary.update(tokens)
vocabulary_size = len(vocabulary)

In [18]:
# TF-IDF features over unigrams and bigrams, capped at `vocabulary_size` tokens.
# pad_to_max_tokens=True pins the feature axis to exactly `max_tokens`, so the
# output width always matches the model's Input(shape=(vocabulary_size,)) even
# if the adapted vocabulary ends up smaller than the cap.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    ngrams=2,
    output_mode='tf_idf',
    pad_to_max_tokens=True
)

# Learn the vocabulary / IDF weights from the training text only (labels are
# dropped by the map), with the adapt step placed on the CPU.
with tf.device('/CPU:0'):
    vectorize_layer.adapt(raw_train_ds.map(lambda text, label: text))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [19]:
# Turn the raw text of every split into TF-IDF vectors; labels pass through unchanged.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [20]:
# Fully connected multi-label classifier: TF-IDF vector in, one sigmoid unit per
# entry of the label lookup's vocabulary out. The number of hidden layers now
# follows `number_of_hidden_layers` from the configuration cell instead of being
# hard-coded (the network is identical at the configured value of 2).
model = tf.keras.Sequential([
    tf.keras.Input(shape=(vocabulary_size,)),
    *(
        tf.keras.layers.Dense(hidden_neurons, activation=activation_func)
        for _ in range(number_of_hidden_layers)
    ),
    tf.keras.layers.Dense(lookup.vocabulary_size(), activation='sigmoid')
])

# Sigmoid outputs are probabilities, hence from_logits=False; each label is
# scored independently with binary cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['binary_accuracy'],
)

In [21]:
# Train on the already-batched tf.data pipeline. `batch_size` and
# `use_multiprocessing` are intentionally not passed: the dataset produces its
# own batches, multiprocessing only concerns generator / Sequence inputs, and
# newer Keras releases reject `use_multiprocessing` in fit() altogether.
history = model.fit(train_ds, validation_data=val_ds, epochs=max_epochs, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

KeyboardInterrupt: 

In [None]:
# Learning curves for the loss and the tracked accuracy metric.
for metric_name in ("loss", "binary_accuracy"):
    plot_result(metric_name)

In [None]:
# Evaluate on the held-out split. The model was compiled with the
# 'binary_accuracy' metric, so the report is labelled accordingly (it was
# previously printed as "Categorical accuracy").
test_loss, binary_acc = model.evaluate(test_ds)
print(f"Binary accuracy on the test set: {round(binary_acc * 100, 2)}%.")