In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import pickle as pkl

from model.preprocess import Preprocessor
from model.callbacks import SeqGenerateCallback
from model.model import Poet
from model.metrics import MaskedAccuracy, MaskedLoss

In [10]:
MAX_VOCAB_SIZE = 10000
EMBEDDING_DIM = 200
DFF = 512
D_MODEL = 256
MAX_SEQ_LEN = 10
TEMP_DATA = False

In [11]:
if not TEMP_DATA:
    data = pd.read_csv("datasets/data.tsv", sep="\t")
else:
    data = pd.read_csv("datasets/preprocessed_data.tsv", sep="\t")

In [4]:
data.head()

Unnamed: 0,lines
0,सदियों से बदनाम हुआ हूं
1,किसी एक के नही सभी के
2,हाथों से मै छला गया हूं
3,"भला , बुरा , बीता और गुजरा"
4,दगाबाज धोखा देता हूं


In [17]:
total_dataset_len = len(data.iloc[:, :])
test_split = 0.1
train_index_limit = int((1-test_split)*total_dataset_len)
train_data = data.iloc[:train_index_limit, :1]
val_data = data.iloc[train_index_limit:, :1]

In [13]:
train_data.shape

(9436, 1)

In [14]:
preprocessor = Preprocessor(max_vocab_size=5000, seq_len=14)

In [18]:
vocab = preprocessor.build_vocab(train_data.values)
x_train_seq, y_train_seq = preprocessor(train_data, training=True)
x_val_seq, y_val_seq = preprocessor(val_data, training=True)

In [9]:
preprocessor.tokenizer

<tensorflow_text.python.ops.wordpiece_tokenizer.WordpieceTokenizer at 0x17405b7e970>

In [10]:
len(preprocessor.word_ids.keys())

1600

In [11]:
# preprocessor.max_vocab_size

5000

In [19]:
x_train_seq.shape, y_train_seq.shape, x_val_seq.shape, y_val_seq.shape

(TensorShape([9436, 14]),
 TensorShape([9436, 14]),
 TensorShape([1049, 14]),
 TensorShape([1049, 14]))

In [43]:
print(x_train_seq[21:22])
preprocessor.get_text(x_train_seq[21:22], return_subtokens=True)

tf.Tensor([[   2 1346  871  114  304  204   36  147 1274  138    3    0    0    0]], shape=(1, 14), dtype=int64)


(array([['जिसके बारे में कल तक अनजान था']], dtype='<U29'),
 array([['जिसके बारे में कल तक अ ##न ##जान था']], dtype='<U35'))

In [44]:
print("few vocab tokens:", vocab[:50])

few vocab tokens: ['[PAD]', '[UNK]', '[SURU]', '[KHATAM]', '!', '"', '*', ',', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '=', '?', '_', 'l', 'o', '|', '©', '¸', '×', 'ب', 'ر', 'ش', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख']


In [45]:
print("Vocab Size: ", preprocessor.vocab_size)

Vocab Size:  1600


In [25]:
threshold = 5
d = dict()
with open("datasets/data.tsv", "r") as f:
    for line in f:
        words = line.split()
        for word in words:
            try:
                d[word] += 1 
            except:
                d[word] = 0

low_freq_list = []
for key in d.keys():
    if d[key]<threshold:
        low_freq_list.append(key)
print(len(low_freq_list), len(d.keys()))

8582 10267


In [26]:
with open("embedding-100.pkl", "rb") as f:
    embeddings = pkl.load(f)

In [46]:
model = Poet(
    preprocessor=preprocessor, 
    num_blocks=4, 
    d_model=256, 
    dff=512, 
    heads=8, 
    embedding_dims=100, 
    rate=0.5,  
)
model.compile(loss=MaskedLoss(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=[MaskedAccuracy()])

In [47]:
model.compile(loss=MaskedLoss(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=[MaskedAccuracy()])

In [48]:
trigger_inputs = [["यूं"]]
trigger_inputs = preprocessor(trigger_inputs, training=False)
generator_callback = SeqGenerateCallback(trigger_inputs)

lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor="loss", patience=3)

In [49]:
history = model.fit(x=x_train_seq, y=y_train_seq, batch_size=16, epochs=10, callbacks=[generator_callback, lr_scheduler])

Epoch 1/10
current_seq:  [[  2 368   0   0   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368   0   0   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368 884   0   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368 884 578   0   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368 884 578 514   0   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368 884 578 514 201   0   0   0   0   0   0   0   0]]
current_seq:  [[  2 368 884 578 514 201 735   0   0   0   0   0   0   0]]
current_seq:  [[   2  368  884  578  514  201  735 1265    0    0    0    0    0    0]]
current_seq:  [[   2  368  884  578  514  201  735 1265 1584    0    0    0    0    0]]
current_seq:  [[   2  368  884  578  514  201  735 1265 1584  470    0    0    0    0]]
current_seq:  [[   2  368  884  578  514  201  735 1265 1584  470 1360    0    0    0]]
current_seq:  [[   2  368  884  578  514  201  735 1265 1584  470 1360 1102    0    0]]
current_seq:  [[   2  368  884 

KeyboardInterrupt: 

In [29]:
model.save_weights("model_wieghts.h5")

In [30]:
model.load_weights("model_wieghts.h5")

In [32]:
inputs = [["मैं क्या "]]
# inputs = [["आशा"]]
inputs = preprocessor(inputs, training=False)
model.generate(inputs, temperature=1)

current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[ 2 20 32  0  0  0  0  0  0  0  0  0  0  0]]
current_seq:  [[   2   20   32 1207    0    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686    0    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695    0    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464    0    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607    0    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069    0    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982    0    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982 4410    0    0    0]]
current_seq:  [[   2   20   32 1207 3686 4695 1464 5607 7069 1982 4410 8506    0    0]]
current_seq:  [[   2   20   

'मैं क्या गायब प्रांगण फ़नकार सहमी रिमझिम दुराचारी देवता क़िताबों उर्दू गुज़ारे पड़ोसी'