In [8]:
import pandas as pd
import numpy as np
from unidecode import unidecode
import tensorflow as tf
import random
import pickle

In [20]:
# English vocabulary for synthetic shop names.
# NOTE: the Vietnamese cell that follows rebinds these same three names, so on
# a top-to-bottom run these English values are overwritten before any use.
adjectives = ["Ancient", "Modern", "Eternal", "Mystic", "Golden", "Emerald", "Silent", "Royal"]
nouns = [
    "Dragon",
    "Lotus",
    "Phoenix",
    "River",
    "Mountain",
    "Sea",
    "Forest",
    "Sky",
]
business_types = ["Café", "Bakery", "Restaurant", "Boutique", "Salon", "Bookstore", "Gallery", "Market"]

In [24]:
# Vietnamese vocabulary used by generate_business_names: each generated name is
# built from one business type, one adjective and one noun.
adjectives = ["Cổ Kính", "Hiện Đại", "Vĩnh Cửu", "Huyền Bí", "Vàng", "Ngọc Bích", "Yên Tĩnh", "Hoàng Gia"]
nouns = [
    "Rồng",
    "Sen",
    "Phượng",
    "Sông",
    "Núi",
    "Biển",
    "Rừng",
    "Bầu Trời",
]
business_types = ["Quán Cà Phê", "Tiệm Bánh", "Nhà Hàng", "Cửa Hàng", "Salon Tóc", "Hiệu Sách", "Phòng Trưng Bày", "Chợ"]


# Function to generate names
def generate_business_names(num_names):
    """Return a list of ``num_names`` randomly assembled business names.

    Each name has the form ``"<business type> <adjective> <noun>"``. The three
    parts are drawn independently with ``random.choice`` from the module-level
    lists ``business_types``, ``adjectives`` and ``nouns``, so the same name
    can appear more than once.
    """

    def _draw_name():
        # Draw order (adjective, noun, business type) is kept so a seeded
        # `random` produces the same sequence of names.
        picked_adjective = random.choice(adjectives)
        picked_noun = random.choice(nouns)
        picked_type = random.choice(business_types)
        return f"{picked_type} {picked_adjective} {picked_noun}"

    return [_draw_name() for _ in range(num_names)]


# Generate 100 business names
business_names = generate_business_names(100)

In [2]:
def load_texts(path):
    """Read a UTF-8 text file and return its lines without the line terminator.

    Parameters
    ----------
    path : str or path-like
        Location of the text file, one entry per line.

    Returns
    -------
    list of str
        One string per line of the file, in file order. Empty lines are kept
        as empty strings; an empty file yields an empty list.
    """
    # Explicit encoding: the dataset holds Vietnamese text, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        # Strip only the newline itself, so a final line that has no trailing
        # newline keeps its last character.
        return [line.rstrip("\n") for line in f]


# Load the three text files of the signboard dataset, one list of lines each.
names = load_texts("signboard_text_dataset/names.txt")
addresses = load_texts("signboard_text_dataset/addresses.txt")
phone_numbers = load_texts("signboard_text_dataset/phone_numbers.txt")

# Sanity check: first entry of each file (address, name, phone number).
print(*(entries[0] for entries in (addresses, names, phone_numbers)))

An Châu, Sơn Động, Bắc Giang Azure Apparel 7345831361


In [3]:
def remove_accents(X):
    """Return a copy of ``X`` in which every element went through ``unidecode``.

    ``X`` itself is not modified: results are written into ``X.copy()`` at the
    position of the element they were computed from.
    """
    transliterated = X.copy()
    for position, text in enumerate(X):
        transliterated[position] = unidecode(text)
    return transliterated


# Run names and addresses through remove_accents (phone numbers are left as
# loaded), then show the first entry of each as a check.
names, addresses = remove_accents(names), remove_accents(addresses)
print(names[0], "||", addresses[0])

Azure Apparel || An Chau, Son Dong, Bac Giang


In [4]:
# One string per row, stacked as names, then addresses, then phone numbers.
X = np.concatenate([names, addresses, phone_numbers])[:, np.newaxis]
# Matching integer labels: 0 for names, 1 for addresses, 2 for phone numbers.
y = np.array(
    [
        label
        for label, group in enumerate((names, addresses, phone_numbers))
        for _ in group
    ]
).reshape(-1, 1)
print(X.shape, y.shape)

(2403, 1) (2403, 1)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows. stratify=y splits in a stratified fashion using the
# labels, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
print(X_train.shape, y_train.shape)

(2162, 1) (2162, 1)


In [26]:
# Character-level tokenizer: split="character" makes each character a token.
text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    # standardize="lower",
)
# NOTE(review): the vocabulary is learned from all of X, i.e. including the
# rows that train_test_split placed in X_test.
text_vec_layer.adapt([X])
vocab_size = text_vec_layer.vocabulary_size()
vocab_size

39

In [27]:
# Pickle the config and weights
import pickle

# A context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open("tv_layer.pkl", "wb") as tv_file:
    pickle.dump(
        {
            "config": text_vec_layer.get_config(),
            "weights": text_vec_layer.get_weights(),
        },
        tv_file,
    )

In [28]:
# Display the vocabulary learned by adapt().
text_vec_layer.get_vocabulary()

['',
 '[UNK]',
 ' ',
 'n',
 'a',
 'h',
 'o',
 'i',
 'u',
 'e',
 'g',
 't',
 'c',
 'r',
 'l',
 's',
 '3',
 '4',
 '9',
 '0',
 '5',
 '6',
 '7',
 '1',
 '8',
 '2',
 'b',
 'd',
 'm',
 'y',
 'p',
 'k',
 'v',
 'q',
 'f',
 'x',
 'w',
 'j',
 'z']

In [29]:
# Run both splits through the adapted vectorization layer (train first, then test).
X_train_encoded, X_test_encoded = (
    text_vec_layer(split) for split in (X_train, X_test)
)

In [30]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
# Character classifier: embedding -> GRU(128) -> 3-way softmax (one output per
# label value used in y: 0, 1, 2).
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=16,
            # Token id 0 is the '' entry of the vocabulary; mask it out.
            mask_zero=True,
        ),
        tf.keras.layers.GRU(128),
        tf.keras.layers.Dense(3, activation="softmax"),
    ]
)
# "sparse" cross-entropy because y holds integer class ids, not one-hot rows.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"]
)
# Save to "gru_model" only when val_accuracy improves (save_best_only=True).
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "gru_model", monitor="val_accuracy", save_best_only=True
)
# NOTE(review): patience=10 with epochs=10 below looks like early stopping can
# never trigger before training ends anyway — confirm this is intended.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy", patience=10
)
# NOTE(review): validation_data is the held-out test split, so checkpoint
# selection is driven by the same rows that serve as the test set.
history = model.fit(
    X_train_encoded,
    tf.constant(y_train),
    validation_data=(X_test_encoded, tf.constant(y_test)),
    epochs=10,
    callbacks=[model_ckpt, early_stopping_cb],
)

Epoch 1/10


INFO:tensorflow:Assets written to: gru_model/assets


Epoch 2/10


INFO:tensorflow:Assets written to: gru_model/assets


Epoch 3/10


INFO:tensorflow:Assets written to: gru_model/assets


Epoch 4/10
Epoch 5/10


INFO:tensorflow:Assets written to: gru_model/assets


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


INFO:tensorflow:Assets written to: gru_model/assets


Epoch 10/10


In [31]:
# Smoke test on one unaccented string: vectorize it, predict, and take the
# index of the largest output as the predicted class id.
model.predict(text_vec_layer(["Bun bo Hue"]), verbose=False).argmax()

0

In [32]:
# NOTE: pickle.load can execute arbitrary code; only load a tv_layer.pkl that
# this notebook wrote itself.
# The context manager closes the file handle as soon as loading is done.
with open("tv_layer.pkl", "rb") as tv_file:
    from_disk = pickle.load(tv_file)
# Rebuild a layer with the same constructor arguments as text_vec_layer, then
# restore its state from the pickled weights.
new_v = tf.keras.layers.TextVectorization(
    split="character",
    # standardize="lower",
)
# Workaround kept for reference: some Keras versions reportedly need an `adapt`
# call on dummy data before `set_weights` — TODO confirm whether still needed.
# new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk["weights"])

In [33]:
# Largest absolute element-wise difference between the restored layer and the
# original one on a single training row; 0 means both yield the same token ids.
# tf.abs is required: reduce_max over a signed difference would still report 0
# when the only mismatches are negative.
tf.reduce_max(
    tf.abs(new_v([X_train[100]]) - text_vec_layer([X_train[100]]))
).numpy()

0

In [34]:
# Reload the model saved under "gru_model" by the ModelCheckpoint callback.
new_model = tf.keras.models.load_model("gru_model")

2024-07-04 18:40:58.782637: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2024-07-04 18:40:58.796649: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2024-07-04 18:40:59.275142: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2024-07-04 18:40:59.287779: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2024-07-04 18:40:59.454380: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_sh

In [37]:
# End-to-end check of the restored layer + reloaded model on raw, accented text:
# one name, one address and one phone number.
samples = [
    "Sữa chua Hạ Long",
    "Đường Thiện Khánh, Bích Nhôi 3, Minh Tân, Kinh Môn, Hải Dương",
    "ĐT: 0375565858",
]
# Names and addresses were passed through remove_accents before training, so
# the same preprocessing is applied here; otherwise accented characters are
# absent from the learned vocabulary and fall back to the [UNK] token.
new_model.predict(new_v(remove_accents(samples)), verbose=False).argmax(axis=1)

array([0, 1, 2])

hihi