In [None]:
#install and import all required libraries
from sklearn.model_selection import train_test_split
from tensorflow.python.client import device_lib
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from operator import itemgetter
from tensorflow import keras
from ast import literal_eval
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import re

pd.set_option('display.max_columns', None)

In [None]:
#import train, validation and test set
train_df = pd.read_csv('/content/699/699/Data 20230612/Youtube/train_set_youtube.csv', lineterminator='\n', converters={'top_tags': eval})
validation_df = pd.read_csv('/content/699/699/Data 20230612/Youtube/validation_set_youtube.csv', lineterminator='\n', converters={'top_tags': eval})
test_df = pd.read_csv('/content/699/699/Data 20230612/Youtube/test_set_youtube.csv', lineterminator='\n', converters={'top_tags': eval})

In [None]:
### Multi-label binarization
### Reference: https://keras.io/examples/nlp/multi_label_classification/#multilabel-binarization

with tf.device("/CPU:0"):
    terms = tf.ragged.constant(train_df["top_tags"].values)
    lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
    lookup.adapt(terms)
    vocab = lookup.get_vocabulary()

def invert_multi_hot(encoded_labels):
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)

sample_label = train_df["top_tags"].iloc[0]
label_binarized = lookup([sample_label])

In [None]:
#combine "title" and "description" to form a column "combined"
train_df["description"] = train_df["description"].astype(str)
test_df["description"] = test_df["description"].astype(str)
validation_df["description"] = validation_df["description"].astype(str)

train_df["title"] = train_df["title"].astype(str)
test_df["title"] = test_df["title"].astype(str)
validation_df["title"] = validation_df["title"].astype(str)

train_df["combined"] = train_df["title"] + '' + train_df["description"]
test_df["combined"] = test_df["title"] + '' + test_df["description"]
validation_df["combined"] =validation_df["title"] + '' + validation_df["description"]

In [None]:
### Prepare Dataset for training
### Reference: https://keras.io/examples/nlp/multi_label_classification/#data-preprocessing-and-tfdatadataset-objects

max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

def make_dataset(dataframe):
    labels = tf.ragged.constant(dataframe["top_tags"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["combined"].values, label_binarized)).shuffle(batch_size * 10)
    return dataset.batch(batch_size)

with tf.device("/CPU:0"):
    train_dataset = make_dataset(train_df)
    validation_dataset = make_dataset(validation_df)
    test_dataset = make_dataset(test_df)

In [None]:
### Prepare Dataset for training
### Reference: https://keras.io/examples/nlp/multi_label_classification/#data-preprocessing-and-tfdatadataset-objects

with tf.device("/CPU:0"):
    text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]

vocabulary = set()
train_df["combined"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)

In [None]:
### Text Vectorization
### Reference: https://keras.io/examples/nlp/multi_label_classification/#vectorization

text_vectorizer = layers.TextVectorization(max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf")

tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)
with strategy.scope():
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
validation_dataset = validation_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

In [None]:
### Model Training with Early Stopping using Nvidia A100 40GB GPU
### Note: 2 hidden layers with size 1024 consumes 39GB GPU Memory

epochs = 20
layer_size = 768

dnn_model = keras.Sequential(
    [layers.Dense(layer_size, activation="relu"),
     layers.Dense(layer_size, activation="relu"),
     layers.Dense(lookup.vocabulary_size(), activation="sigmoid")])

METRICS = ['binary_accuracy',keras.metrics.Precision(name='precision'),]

callback = keras.callbacks.EarlyStopping(monitor='val_precision', patience=3)
dnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=METRICS)

with tf.device("/GPU:0"):
  history = dnn_model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs, callbacks=[callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [None]:
dnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 768)               331398912 
                                                                 
 dense_1 (Dense)             (None, 768)               590592    
                                                                 
 dense_2 (Dense)             (None, 78586)             60432634  
                                                                 
Total params: 392,422,138
Trainable params: 392,422,138
Non-trainable params: 0
_________________________________________________________________


In [None]:
tf.keras.saving.save_model(dnn_model, '/content/699/699/model/dnn_768.keras', overwrite=True)

In [None]:
x1,x2,x3 = dnn_model.evaluate(test_dataset)

print("Test dataset loss: " + str(x1))
print("Test dataset binary accuracy: " + str(x2))
print("Test dataset precision: " + str(x3))

Test dataset loss: 0.0013816292630508542
Test dataset binary accuracy: 0.999862551689148
Test dataset precision: 0.8361828327178955


In [None]:
#create a make_inf_dataset function to create dataset for the whole test set without splitting into batches
def make_inf_dataset(dataframe):
    labels = tf.ragged.constant(dataframe["top_tags"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["combined"].values, label_binarized))
    return dataset

#create tag_generator
tag_generator = keras.Sequential([text_vectorizer, dnn_model])

inference_dataset = make_inf_dataset(test_df)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = tag_generator.predict(test_df['combined'])



In [None]:
tf.keras.saving.save_model(tag_generator, '/content/699/699/model/dnn_768', save_format='tf', overwrite=True)



In [None]:
%%time
### Generate Prediction Results against Test Set
### Reference:  https://keras.io/examples/nlp/multi_label_classification/#inference

test_df['dnn'] = None
for i, text in enumerate(test_df['combined']):
    label = label_batch[i].numpy()[None, ...]

    predicted_proba = [proba for proba in predicted_probabilities[i]]
    top_10_labels = [x for _, x in sorted(zip(predicted_probabilities[i], lookup.get_vocabulary()), key=lambda pair: pair[0], reverse=True)][:10]

    x = '"'+'", "'.join([label for label in top_10_labels])
    x = '['+x+'"]'
    test_df['dnn'].iloc[i] = x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 26min 38s, sys: 3.09 s, total: 26min 41s
Wall time: 26min 34s


In [None]:
test_df.sample(10)

Unnamed: 0.1,Unnamed: 0,video_id,title,description,en_tag_list,top_tags,combined,dnn
315,14920,Mgj9Rp8h0F8,"VALORANT | New ZEDD Skins - SPECTRUM Guns, Fin...",GIVEAWAY - https://gleam.io/r1SKh/10000vp-valo...,"['hitscan', 'mysca', 'ryancentral', 'valorant'...","[hitscan, mysca, ryancentral, valorant, valora...","VALORANT | New ZEDD Skins - SPECTRUM Guns, Fin...","[""valorant"", ""apex"", ""fps"", ""gaming"", ""launch""..."
4350,37130,SI1j_Y0N-ug,I Created YouTubers Their Own Fortnite Skin!,► I Created NickEh30 & Tfue Their Own Fortnite...,"['fortnite', 'fortnite icon series', 'fortnite...","[fortnite, fortnite youtuber skins, fortnite s...",I Created YouTubers Their Own Fortnite Skin!► ...,"[""fortnite"", ""fortnite live"", ""fortnite update..."
4412,37664,BV3r6oZb2wI,"The Lion Awakens – Warhammer 40,000","Long has he walked the shadowed paths. Now, th...","['40k', 'aos', 'black library', 'citadel minia...","[40k, aos, black library, citadel miniatures, ...","The Lion Awakens – Warhammer 40,000Long has he...","[""warhammer 40000"", ""black library"", ""citadel ..."
2386,49293,KrOVwV96vfY,My first Christmas living alone,OPEN FOR LINKSGo to https://www.casetify.com/f...,"['littlemissflossie', 'makeup', 'tutorial', 'r...","[littlemissflossie, makeup, tutorial, relation...",My first Christmas living aloneOPEN FOR LINKSG...,"[""24 hour"", ""week"", ""spotify playlist"", ""get l..."
3148,19411,Y1kkUlLweNc,SECURITY BREACH IS FINALLY HERE... - Five Nigh...,Leave a like if you enjoyed today's video! Lot...,"['fnaf security breach', 'security breach', 'f...","[fnaf security breach, security breach, fnaf, ...",SECURITY BREACH IS FINALLY HERE... - Five Nigh...,"[""fnaf security breach"", ""fnaf"", ""security bre..."
4717,51151,frMaw9SbZLc,Angers vs PSG (0-3) | Mbappe and Ramos inspire...,PSG cruised to victory against Angers as Mbapp...,"['bt sport', 'bt sport official']","[bt sport, bt sport official]",Angers vs PSG (0-3) | Mbappe and Ramos inspire...,"[""football"", ""soccer"", ""highlights"", ""sport"", ..."
3501,60775,XLeACMS8R9k,"Miky Woodz, Myke Towers - Ta To Saldo (Video L...","Miky Woodz, Myke Towers - Ta To Saldo (Video L...","['miky woodz', 'living life', 'living life ep'...","[miky woodz, living life, living life miky, mi...","Miky Woodz, Myke Towers - Ta To Saldo (Video L...","[""bad bunny"", ""jam"", ""hop"", ""universal"", ""mexi..."
478,20144,0jcUoo03YPU,NEON: All Agent Interactions & OP Tricks To Ab...,Valorant: NEON: All Agent Interactions & OP Tr...,"['valorant', 'valorant', 'valorant moments', '...","[valorant, valorant, valorant moments, valoran...",NEON: All Agent Interactions & OP Tricks To Ab...,"[""valorant immortal"", ""valorant battles"", ""val..."
456,25806,a6H6azkS-LQ,"Bought it, washed it, and took REALLY good pic...",Today we will do an experiment. I found this T...,"['mtb', 'mountain bike', 'bike repair', 'bikin...","[mtb, mountain bike, bike repair, biking, bike...","Bought it, washed it, and took REALLY good pic...","[""outdoors"", ""mountain bike"", ""biking"", ""bike ..."
2042,48638,jgYYOUC10aM,Planning a Heist - Key & Peele,"This heist plan is foolproof, except for one l...","['key and peele', 'jordan peele', 'keegan-mich...","[key and peele, jordan peele, keegan-michael k...",Planning a Heist - Key & PeeleThis heist plan ...,"[""funny clips"", ""comedy videos"", ""keegan-micha..."


In [None]:
#test_df.to_csv('/content/699/699/Data 20230612/Youtube/dnn_results_1024.csv')
#test_df.to_csv('/content/699/699/Data 20230612/Youtube/dnn_results_1024.csv.backup')
#backup_df = test_df.copy()

In [26]:
### Save Vocab list

with open('/content/699/699/model/dnn_768/dnn_vocab.txt', 'w') as f:
    for line in lookup.get_vocabulary():
        f.write(f"{line}\n")

In [38]:
test_df.iloc[200].description

'Stephen A. Smith, Max Kellerman and Jay Williams weigh in on whether Michael Jordan and the Charlotte Hornets made the right choice by drafting LaMelo Ball with the No. 3 overall pick in the 2020 NBA Draft.#FirstTake #NBA✔️ Subscribe to ESPN+https://plus.espn.com/✔️ Get the ESPN App: http://www.espn.com/espn/apps/espn✔️ Subscribe to ESPN on YouTube: http://es.pn/SUBSCRIBEtoYOUTUBE✔️ Subscribe to NBA on ESPN on YouTube: http://bit.ly/SUBSCRIBEtoNBAonESPN✔️ Watch ESPN on YouTube TV: http://es.pn/YouTubeTVESPN on Social Media:► Follow on Twitter: http://www.twitter.com/espn► Like on Facebook: http://www.facebook.com/espn► Follow on Instagram: http://www.instagram.com/espnVisit ESPN on YouTube to get up-to-the-minute sports news coverage, scores, highlights and commentary for NFL, NHL, MLB, NBA, College Football, NCAA Basketball, soccer and more. More on ESPN.com: https://www.espn.com'