<a href="https://colab.research.google.com/github/vmunteanu/devcon/blob/main/tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports and utility functions:

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np


def dataframe_to_dataset(dataframe):
    """Convert a labelled DataFrame into a ``tf.data.Dataset``.

    Args:
        dataframe: DataFrame holding a ``"label"`` column convertible to int;
            all remaining columns are converted to strings and used as the
            features.

    Returns:
        A ``tf.data.Dataset`` created with ``from_tensor_slices`` from the
        ``(features, labels)`` pair, i.e. one element per DataFrame row.

    Raises:
        KeyError: If ``dataframe`` has no ``"label"`` column.
    """
    # Read the label column instead of popping it: the caller's frame is left
    # untouched, so the function can be called again on the same DataFrame.
    label = dataframe["label"].to_numpy(dtype="int")
    val = dataframe.drop(columns="label").to_numpy(dtype="str")
    return tf.data.Dataset.from_tensor_slices((val, label))


def dataframe_split(dataframe):
    """Partition a DataFrame by row position into train/validation/test parts.

    The first tenth of the rows (integer division) forms the test part, the
    second tenth the validation part, and all remaining rows the training
    part. Rows keep their existing order; nothing is shuffled here.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrame slices.
    """
    tenth = len(dataframe) // 10

    test_part = dataframe.iloc[:tenth]
    val_part = dataframe.iloc[tenth:2 * tenth]
    train_part = dataframe.iloc[2 * tenth:]

    return train_part, val_part, test_part

Load the labelled data and split it into training, validation, and test sets:

In [None]:
# Fixed seed so the shuffled row order (and the preview printed below) is the
# same on every run of this cell.
SHUFFLE_SEED = 42

# Positive examples: every row of words.txt goes into the 'val' column, label 1.
words = pd.read_csv("/content/sample_data/words.txt", header=None, names=['val'])
words['label'] = 1

# Negative examples: every row of not_words.txt goes into 'val', label 0.
ids = pd.read_csv("/content/sample_data/not_words.txt", header=None, names=['val'])
ids['label'] = 0

# Split each class separately so both classes appear in all three sets.
words_tr, words_val, words_test = dataframe_split(words)
ids_tr, ids_val, ids_test = dataframe_split(ids)

train = pd.concat([ids_tr, words_tr], axis=0)
validate = pd.concat([ids_val, words_val], axis=0)
test = pd.concat([ids_test, words_test], axis=0)

# Shuffle so the two classes are interleaved instead of stacked one after the other.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
validate = validate.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test = test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Call head() — printing the bare attribute only shows the bound-method repr.
print(train.head())

train_ds = dataframe_to_dataset(train)
validate_ds = dataframe_to_dataset(validate)

# One batch per dataset: the batch size equals the number of rows.
train_ds = train_ds.batch(len(train))
validate_ds = validate_ds.batch(len(validate))

Set up the model:

In [17]:
# Character vocabulary: hyphen, digits, lower-case and upper-case ASCII letters.
tokens = "-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
tokens = list(tokens)
max_len = 64

# Splits each input string into characters and maps them to integer ids from
# `tokens`, producing sequences of exactly `max_len` ids.
# NOTE(review): standardize="strip_punctuation" appears to remove "-" before
# the vocabulary lookup, so the "-" token would never be matched — confirm
# whether standardize=None was intended.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize="strip_punctuation",
    output_mode='int',
    split="character",
    output_sequence_length=max_len,
    vocabulary=tokens)

# One string per example goes straight into the vectorizer; the integer ids
# are then fed to a small dense network with a sigmoid output.
inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(inputs)
x = tf.keras.layers.Dense(32, activation="relu")(x)

output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs, output)

model.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])

Train the model:

In [None]:
# Train for 100 epochs, evaluating on the validation dataset after each one.
model.fit(
    x=train_ds,
    validation_data=validate_ds,
    epochs=100,
)

Test the model:

In [None]:
# Separate labels from features without mutating `test`, so this cell can be
# re-run without a KeyError on an already-removed "label" column.
y = test["label"].to_numpy(dtype="int")
x = test.drop(columns="label").to_numpy(dtype="str")

# return_dict=True yields the metrics as a name -> value mapping.
score = model.evaluate(x=x,
                       y=y,
                       return_dict=True, verbose=2, batch_size=32)

print('Test: ', score)

Export the model to be used in TensorFlow Serving:

In [23]:
# Write the trained model to disk.
# NOTE(review): TensorFlow Serving loads models from numeric version
# subdirectories (e.g. .../words/1) — confirm the deployment step adds one.
model.save(filepath='/content/saved_model/words')

Try the model on a few sample inputs:

In [None]:
# Score above which an input is reported as a word.
WORD_THRESHOLD = 0.8

values = np.array([
    "george",
    "net-banking",
    "kyc",
    "unsecured-lending",
    "conturi",
    "14343123",
    "lkdsj0q93ure",
    "abd19198e731231dfdas2d",
    "abd19198e7-d02e-4d13-8709-9a9f46"])

prediction = model.predict(values)

# Pair every input string with its prediction row; the score is the row's
# first entry.
for word, scores in zip(values, prediction):
    confidence = scores[0]
    pred = "WORD" if confidence > WORD_THRESHOLD else "NOT"

    print(word, "-", pred, "({:.2f})".format(confidence))