In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import pickle as pkl

from model.preprocess import Preprocessor
from model.callbacks import SeqGenerateCallback
from model.model import Poet
from model.metrics import MaskedAccuracy, MaskedLoss

In [2]:
# Notebook-wide configuration. Values mirror what the cells below actually use.
MAX_VOCAB_SIZE = 10000  # cap on the tokenizer vocabulary
EMBEDDING_DIM = 100     # matches the "embedding-100.pkl" vectors and embedding_dims=100 passed to Poet
DFF = 512               # dff argument passed to Poet
D_MODEL = 256           # d_model argument passed to Poet
MAX_SEQ_LEN = 14        # matches seq_len=14 given to Preprocessor (tokenizer output_sequence_length)
TEMP_DATA = False       # True -> read datasets/preprocessed_data.tsv instead of datasets/data.tsv

In [3]:
# Pick the input file from the TEMP_DATA switch, then read it as a tab-separated table.
data_path = "datasets/preprocessed_data.tsv" if TEMP_DATA else "datasets/data.tsv"
data = pd.read_csv(data_path, sep="\t")

In [4]:
# Preview the first rows of the loaded table (default is 5 rows).
data.head()

Unnamed: 0,lines
0,सदियों से बदनाम हुआ हूं
1,किसी एक के नही सभी के
2,हाथों से मै छला गया हूं
3,"भला , बुरा , बीता और गुजरा"
4,दगाबाज धोखा देता हूं


In [5]:
# Ordered hold-out split: the first 90% of rows train, the remaining rows validate.
# No shuffling is applied, so row order in the file decides which lines end up where.
test_split = 0.1
total_dataset_len = len(data)
train_index_limit = int((1 - test_split) * total_dataset_len)

# Keep only the first column; slicing with :1 keeps the result a DataFrame.
first_column = data.iloc[:, :1]
train_data = first_column.iloc[:train_index_limit]
val_data = first_column.iloc[train_index_limit:]

In [6]:
# Sanity check: (rows, columns) of the training split.
train_data.shape

(8379, 1)

In [7]:
# Text preprocessor from model.preprocess; it exposes the tokenizer used below.
# The vocabulary cap is taken from the config cell so it is defined in one place.
# seq_len stays at the literal 14 used for the recorded run (the tokenizer config
# printed further down reports output_sequence_length 14).
# NOTE(review): drive seq_len from MAX_SEQ_LEN once that config value is confirmed to match.
preprocessor = Preprocessor(max_vocab_size=MAX_VOCAB_SIZE, seq_len=14)

In [8]:
# Build the vocabulary from the training split only, then run the preprocessor on
# each split. Each call returns a pair of tensors (named x_* / y_* here).
vocab = preprocessor.build_vocab(train_data.values)
x_train_seq, y_train_seq = preprocessor(train_data, training=True)
# NOTE(review): the validation split is also processed with training=True —
# presumably this is what makes the call return the (x, y) pair; confirm in Preprocessor.
x_val_seq, y_val_seq = preprocessor(val_data, training=True)

In [9]:
# Inspect the tokenizer's configuration (vocabulary cap, split mode, output length, ...).
preprocessor.tokenizer.get_config()

{'name': 'text_vectorization',
 'trainable': True,
 'dtype': 'string',
 'max_tokens': 10000,
 'standardize': <bound method Preprocessor._custom_standardize of <model.preprocess.Preprocessor object at 0x000001DA57ECDEB0>>,
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 14,
 'pad_to_max_tokens': False,
 'vocabulary_size': 8934}

In [10]:
# Shapes of the train/validation input and target tensors, shown together as a tuple.
x_train_seq.shape, y_train_seq.shape, x_val_seq.shape, y_val_seq.shape

(TensorShape([8379, 14]),
 TensorShape([8379, 14]),
 TensorShape([932, 14]),
 TensorShape([932, 14]))

In [11]:
# Look at the first encoded training sequence (slice keeps the batch dimension).
x_train_seq[:1]

<tf.Tensor: shape=(1, 14), dtype=int64, numpy=
array([[   2,  458,    8, 1333,   84,   54,    3,    0,    0,    0,    0,
           0,    0,    0]], dtype=int64)>

In [12]:
# Show the first ten vocabulary entries returned by build_vocab.
print("few vocab tokens:", vocab[:10])

few vocab tokens: ['', '[UNK]', '[SURU]', '[KHATAM]', ',', 'है', '।', 'में', 'से', 'की']


In [13]:
# Report the vocabulary size exposed by the preprocessor.
print("Vocab Size: ", preprocessor.vocab_size)

Vocab Size:  8934


In [14]:
# Load the pre-trained embeddings object that is handed to Poet below.
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# this file if it comes from a trusted source.
# NOTE(review): the file name suggests 100-dimensional vectors — confirm it matches
# the embedding_dims value passed to the model.
with open("embedding-100.pkl", "rb") as f:
    embeddings = pkl.load(f)

In [22]:
# Build the Poet model. d_model and dff come from the config cell instead of
# repeating the literals here.
model = Poet(
    preprocessor=preprocessor,
    num_blocks=4,
    d_model=D_MODEL,
    dff=DFF,
    heads=8,
    # Kept literal: matches the "embedding-100.pkl" vectors loaded above.
    embedding_dims=100,
    rate=0.5,  # NOTE(review): presumably the dropout rate — confirm in model.model.Poet
    embeddings=embeddings,
)
# The model is intentionally NOT compiled here: the next cell compiles it (learning
# rate 1e-4) before training, which made the former compile in this cell (learning
# rate 1e-3) dead code with a misleading hyper-parameter.

Embeddings hits: 8922, misses: 716 from the trained embeddings
False


In [23]:
# Compile with the masked loss/accuracy and Adam at learning rate 1e-4.
# This is the configuration model.fit uses; any earlier compile on this model is superseded.
model.compile(loss=MaskedLoss(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=[MaskedAccuracy()])

In [24]:
# Reduce the learning rate when the training loss has not improved for 3 epochs.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor="loss", patience=3)

# Encode a one-word seed and hand it to the project's sequence-generation callback.
seed_text = [["यूं"]]
trigger_inputs = preprocessor(seed_text, training=False)
generator_callback = SeqGenerateCallback(trigger_inputs)

In [28]:
# Train for 10 epochs with batch size 16.
# The hold-out tensors built earlier are passed as validation_data so that every
# epoch also reports val_* metrics; previously the validation split was prepared
# but never evaluated, leaving no signal for over-fitting.
history = model.fit(
    x=x_train_seq,
    y=y_train_seq,
    validation_data=(x_val_seq, y_val_seq),
    batch_size=16,
    epochs=10,
    callbacks=[generator_callback, lr_scheduler],
)

Epoch 1/10
current_seq:  [[  2 204   0   0   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 204   0   0   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[   2  204 7340    0    0    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878    0    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902    0    0    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 8757    0    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 8757 2333    0    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 8757 2333  129    0    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 8757 2333  129 8494    0    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 8757 2333  129 8494 1003    0    0    0]]
current_seq:  [[   2  204 7340 5878 5546 2902 875

In [29]:
# Persist the trained weights to an HDF5 file in the working directory.
# The file name is spelled "wieghts" (sic); the load call uses the same spelling,
# so rename both together if it is ever corrected.
model.save_weights("model_wieghts.h5")

In [30]:
# Restore weights from the HDF5 file; the name must match the one used when saving
# (including its "wieghts" spelling).
model.load_weights("model_wieghts.h5")

In [32]:
# Encode a short prompt and let the model continue it.
# Alternative prompt that was tried: [["आशा"]]
prompt = [["मैं क्या "]]
inputs = preprocessor(prompt, training=False)
model.generate(inputs, temperature=1)

current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[   2   20   32 1207    0    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982 4410    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982 4410 8506    0    0]]
current_seq:  [[   2   20   

'मैं क्या गायब प्रांगण फ़नकार सहमी रिमझिम दुराचारी देवता क़िताबों उर्दू गुज़ारे पड़ोसी'