In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns

from model.preprocess import Preprocessor
from model.callbacks import SeqGenerateCallback
from model.model import Poet
from model.metrics import MaskedAccuracy, MaskedLoss

In [2]:
# Hyperparameters and run configuration, collected in one place.
MAX_VOCAB_SIZE = 10000
EMBEDDING_DIM = 200
DFF = 512
D_MODEL = 256
MAX_SEQ_LEN = 10

# Seed NumPy and TensorFlow up front so that stochastic steps (weight
# initialisation, batch shuffling) are repeatable after a kernel restart.
# NOTE(review): some GPU kernels can still be nondeterministic.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Load the tab-separated corpus from the working directory.
data = pd.read_csv(
    "data.tsv",
    sep="\t",
)

In [4]:
# Preview the first rows to sanity-check how the file was parsed.
data.head()

Unnamed: 0,lines
0,मैं तो केवल एक तथ्य हूं
1,सदियों से बदनाम हुआ हूं
2,किसी एक के नही सभी के
3,हाथों से मै छला गया हूं ।
4,"भला ,बुरा ,बीता और गुजरा"


In [5]:
# Hold out the final 5 rows for validation; every row before them is training data.
train_data = data.head(-5)
val_data = data.tail(5)

In [6]:
# Check how many rows/columns ended up in the training split.
train_data.shape

(15, 1)

In [7]:
# Take the sequence length from the MAX_SEQ_LEN config constant rather than a
# repeated literal, so there is a single place to change it.
# NOTE(review): vocab_size=1000 differs from MAX_VOCAB_SIZE (10000); the literal
# is kept to preserve current behavior — confirm which value is intended.
preprocessor = Preprocessor(vocab_size=1000, seq_len=MAX_SEQ_LEN)

In [8]:
# Build the vocabulary from the training rows only; `vocab` is kept so it can be inspected later.
vocab = preprocessor.build_vocab(train_data)
# Each call is unpacked into an (input, target) pair of encoded sequences.
x_train_seq, y_train_seq = preprocessor(train_data, training=True)
# NOTE(review): the validation rows are also encoded with training=True — presumably this flag
# is what makes the preprocessor return targets as well; confirm against Preprocessor.
x_val_seq, y_val_seq = preprocessor(val_data, training=True)

In [9]:
# Inputs and targets should have matching shapes within each split.
x_train_seq.shape, y_train_seq.shape, x_val_seq.shape, y_val_seq.shape

(TensorShape([15, 10]),
 TensorShape([15, 10]),
 TensorShape([5, 10]),
 TensorShape([5, 10]))

In [10]:
# Look at the first three encoded training inputs.
x_train_seq[:3]

<tf.Tensor: shape=(3, 10), dtype=int64, numpy=
array([[ 2, 26, 41, 51, 18, 44,  4,  3,  0,  0],
       [ 2, 13, 11, 33, 10,  4,  3,  0,  0,  0],
       [ 2,  8, 18,  7, 36, 24,  7,  3,  0,  0]], dtype=int64)>

In [11]:
# Show the first ten vocabulary entries (index in this list == token id).
print("few vocab tokens:", vocab[:10])

few vocab tokens: ['', '[UNK]', '[SURU]', '[KHATAM]', 'हूं', '।', 'नहीं', 'के', 'किसी', 'है']


In [12]:
# Report the vocabulary size the preprocessor exposes after build_vocab().
print("Vocab Size: ", preprocessor.vocab_size)

Vocab Size:  60


In [23]:
# Build the model from the config constants (D_MODEL, DFF) instead of repeating
# their values as literals.
# NOTE(review): embedding_dims=100 differs from EMBEDDING_DIM (200); the literal
# is kept to preserve current behavior — confirm which value is intended.
model = Poet(
    preprocessor=preprocessor,
    num_blocks=1,
    d_model=D_MODEL,
    dff=DFF,
    heads=8,
    embedding_dims=100,
)
# Loss and accuracy use the project's masked variants.
model.compile(
    loss=MaskedLoss(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=[MaskedAccuracy()],
)

In [24]:
# Encode a one-word prompt with training=False and hand the encoded prompt to
# the generation callback used during fit().
trigger_inputs = preprocessor([["मै"]], training=False)
generator_callback = SeqGenerateCallback(trigger_inputs)

In [28]:
# Train the model and evaluate on the held-out split after every epoch so that
# validation loss/metrics are recorded in `history` alongside the training ones.
# NOTE(review): assumes Poet works with Keras's standard evaluation step — confirm.
history = model.fit(
    x=x_train_seq,
    y=y_train_seq,
    validation_data=(x_val_seq, y_val_seq),
    batch_size=5,
    epochs=10,
    callbacks=[generator_callback],
)

Epoch 1/10
after epoch 0 model generates:
actual sequence:  [[13 55 29 10  4  3  3  3  3  3]]
generated text sequence:  [['सदियों इतिहासो में हुआ हूं']]
Epoch 2/10
after epoch 1 model generates:
actual sequence:  [[42 35 17  4  3  3  3  3  3  3]]
generated text sequence:  [['तेरे पलटा और हूं']]
Epoch 3/10
after epoch 2 model generates:
actual sequence:  [[40 35 38  4  4  3  3  3  5  3]]
generated text sequence:  [['दगाबाज पलटा देता हूं हूं']]
Epoch 4/10
after epoch 3 model generates:
actual sequence:  [[16 37 38 39  4  3  3  3  3  3]]
generated text sequence:  [['भला धोखा देता दबकर हूं']]
Epoch 5/10
after epoch 4 model generates:
actual sequence:  [[13 13 17 10  4  3  3  3  3  3]]
generated text sequence:  [['सदियों सदियों और हुआ हूं']]
Epoch 6/10
after epoch 5 model generates:
actual sequence:  [[40 37 17 46  4  3  3  3  3  3]]
generated text sequence:  [['दगाबाज धोखा और जाता हूं']]
Epoch 7/10
after epoch 6 model generates:
actual sequence:  [[42 42 17  4  4  3  3  3  3  3]]
generated