In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed,Input,LayerNormalization,Dropout
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Read the parallel corpus into a DataFrame; the 'English' and 'Hindi'
# columns of this frame are what the rest of the notebook consumes.
data = pd.read_csv(filepath_or_buffer='/content/datasethindi.csv')

In [None]:
# Drop incomplete pairs *before* casting to str: astype(str) would otherwise
# turn a missing cell into the literal sentence "nan", which would then be
# tokenized and trained on like real text. Dropping on both columns in a single
# call keeps the two lists aligned index-for-index.
pairs = data.dropna(subset=['English', 'Hindi'])
eng = pairs['English'].astype(str).tolist()     # source sentences
hin = pairs['Hindi'].astype(str).tolist()       # target sentences

In [None]:
# Surround every (whitespace-trimmed) Hindi sentence with explicit
# <start>/<end> markers so the decoder has begin/stop tokens to work with.
hintok = [f"<start> {sentence.strip()} <end>" for sentence in hin]

In [None]:
numwords = 20000    # vocabulary cap shared by both languages

# English tokenizer: rarer / unseen words map to <unk>; filters='' disables
# the default punctuation stripping, lower=True lowercases the text.
en = Tokenizer(
    num_words=numwords,
    oov_token="<unk>",
    filters='',
    lower=True,
)
en.fit_on_texts(eng)        # builds the word -> id table from the English side

# Hindi tokenizer, configured identically and fitted on the marker-wrapped
# sentences so that <start> and <end> receive ids of their own.
hi = Tokenizer(
    num_words=numwords,
    oov_token="<unk>",
    filters='',
    lower=True,
)
hi.fit_on_texts(hintok)

In [None]:
import pickle
# Persist both fitted tokenizers so the same word <-> id mapping can be
# reloaded later without refitting.
for _path, _tokenizer in (("en.pkl", en), ("hi.pkl", hi)):
    with open(_path, "wb") as f:
        pickle.dump(_tokenizer, f)

In [None]:
# Vocabulary sizes handed to the embedding / output layers: the number of
# fitted words plus one slot for padding id 0, capped at numwords.
invocab = min(len(en.word_index) + 1, numwords)
outvocab = min(len(hi.word_index) + 1, numwords)

# Ids the Hindi tokenizer assigned to the sentence markers (None if absent).
startid, endid = (hi.word_index.get(marker) for marker in ("<start>", "<end>"))

print("invocab", invocab, "outvocab", outvocab, "startid", startid, "endid", endid)

invocab 20000 outvocab 20000 startid 2 endid 3


In [None]:
mxlen = 150       # fixed sequence length; longer sentences are truncated at the end

# English: sentences -> id lists -> right-padded matrix of width mxlen
engseqs = en.texts_to_sequences(eng)
engpad = pad_sequences(
    engseqs,
    maxlen=mxlen,
    padding='post',
    truncating='post',
)

# Hindi (already wrapped in <start>/<end>): same treatment
hinseqs = hi.texts_to_sequences(hintok)
hinpad = pad_sequences(
    hinseqs,
    maxlen=mxlen,
    padding='post',
    truncating='post',
)

# 80/20 split with a fixed seed so the partition is reproducible
xtrain, xtest, ytrainfull, ytestfull = train_test_split(
    engpad,
    hinpad,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Teacher forcing: at step t the decoder is fed the gold tokens up to t and
# must predict gold token t+1.
#
# The Hindi sequences already begin with <start> (it was added when the
# sentences were wrapped), so they ARE the decoder input as-is. Prepending
# startid a second time would give the decoder "<start> <start> w1 ..." and
# make the first target the <start> token itself: a wasted position during
# training and an extra step for every decode.
decintrain = ytrainfull.copy()
decintest = ytestfull.copy()        # used for test loss/metrics only, not for real predictions

# Targets are the same sequences shifted left by one step; the freed last
# column is filled with padding id 0 so the width stays the same as the input.
# The trailing axis is added because the sparse cross-entropy labels are fed
# as (batch, time, 1).
ytrain = np.pad(ytrainfull[:, 1:], ((0, 0), (0, 1)))[..., np.newaxis]
ytest = np.pad(ytestfull[:, 1:], ((0, 0), (0, 1)))[..., np.newaxis]

In [None]:
class transblock(tf.keras.layers.Layer):
    """Encoder block: self-attention and a feed-forward net, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        embdim: width of the token representations entering and leaving the block.
        heads: number of attention heads.
        ffdim: hidden width of the feed-forward network.
        rate: dropout rate applied to both sub-layer outputs.
    """

    def __init__(self, embdim, heads, ffdim, rate=0.1, **kw):
        super().__init__(**kw)
        per_head = max(1, embdim // heads)      # size of each head, never below 1
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=per_head)
        self.ff = tf.keras.Sequential([
            Dense(ffdim, activation="relu"),
            Dense(embdim),                      # project back to embdim for the residual add
        ])
        self.ln1 = LayerNormalization(epsilon=1e-6)
        self.ln2 = LayerNormalization(epsilon=1e-6)
        self.dp1 = Dropout(rate)
        self.dp2 = Dropout(rate)

    def call(self, x, training=None):
        """Run the block on `x`; `training` only switches the dropout layers."""
        # sub-layer 1: the sequence attends to itself
        attended = self.dp1(self.att(x, x), training=training)
        hidden = self.ln1(x + attended)
        # sub-layer 2: position-wise feed-forward
        expanded = self.dp2(self.ff(hidden), training=training)
        return self.ln2(hidden + expanded)


In [None]:
class decoderblock(tf.keras.layers.Layer):
    """Decoder block: causally masked self-attention, cross-attention over the
    encoder output, then a feed-forward net. Every sub-layer is followed by
    dropout, a residual connection and layer normalisation.

    Args:
        embdim: width of the token representations entering and leaving the block.
        heads: number of attention heads (used by both attention layers).
        ffdim: hidden width of the feed-forward network.
        rate: dropout rate applied to each sub-layer output.
    """

    def __init__(self, embdim, heads, ffdim, rate=0.1, **kw):
        super().__init__(**kw)
        per_head = max(1, embdim // heads)      # size of each head, never below 1
        self.selfatt = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=per_head)
        self.crossatt = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=per_head)
        self.ff = tf.keras.Sequential([
            Dense(ffdim, activation="relu"),
            Dense(embdim),
        ])
        # one layer norm and one dropout per sub-layer
        self.ln1 = LayerNormalization(epsilon=1e-6)
        self.ln2 = LayerNormalization(epsilon=1e-6)
        self.ln3 = LayerNormalization(epsilon=1e-6)
        self.dp1 = Dropout(rate)
        self.dp2 = Dropout(rate)
        self.dp3 = Dropout(rate)

    def call(self, x, encout, training=None):
        """Run the block on decoder states `x` given encoder output `encout`."""
        steps = tf.shape(x)[1]
        # Lower-triangular ones (diagonal included): position i may attend to
        # positions <= i only, so no token can look at later ones.
        causal = tf.linalg.band_part(tf.ones((steps, steps)), -1, 0)

        # sub-layer 1: masked self-attention
        own = self.dp1(self.selfatt(x, x, attention_mask=causal), training=training)
        hidden = self.ln1(x + own)

        # sub-layer 2: decoder states query the encoder output
        crossed = self.dp2(self.crossatt(hidden, encout), training=training)
        hidden = self.ln2(hidden + crossed)

        # sub-layer 3: position-wise feed-forward
        expanded = self.dp3(self.ff(hidden), training=training)
        return self.ln3(hidden + expanded)

In [None]:
class tokposemb(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of positions the position table can represent.
        vocab: number of token ids the token table can represent.
        embdim: width of both embeddings.
    """

    def __init__(self, maxlen, vocab, embdim, **kw):
        super().__init__(**kw)
        self.tokemb = Embedding(vocab, embdim)
        self.posemb = Embedding(maxlen, embdim)

    def call(self, x):
        """Embed the id tensor `x`, adding the embedding of each position index."""
        seqlen = tf.shape(x)[-1]            # read at run time from the last axis
        positions = tf.range(seqlen)        # 0, 1, ..., seqlen-1
        tokens = self.tokemb(x)
        return tokens + self.posemb(positions)

In [None]:
numlayers = 2          # number of encoder blocks and of decoder blocks
heads = 4              # attention heads per block
embdim = 128           # embedding / model width
ffdim = 256            # hidden width of the feed-forward sub-layers
opt = Adam(learning_rate=1e-4, clipnorm=1.0)            # gradient-norm clipping guards against exploding gradients


def masked_scce(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding targets only.

    Sequences are right-padded with id 0 up to mxlen. Averaging the plain loss
    over every position lets the (trivial) padding positions dominate, so the
    reported loss says little about the real tokens.

    Args:
        y_true: integer-valued target ids, shape (batch, time) or (batch, time, 1).
        y_pred: per-step probability distributions, shape (batch, time, vocab).

    Returns:
        Scalar mean loss over positions whose target id is not 0.
    """
    labels = tf.reshape(tf.cast(y_true, tf.int32), tf.shape(y_pred)[:-1])
    per_token = sparse_categorical_crossentropy(labels, y_pred)
    keep = tf.cast(tf.not_equal(labels, 0), per_token.dtype)
    # max(..., 1) avoids 0/0 on a batch that contains padding only
    return tf.reduce_sum(per_token * keep) / tf.maximum(tf.reduce_sum(keep), 1.0)


def masked_accuracy(y_true, y_pred):
    """Token accuracy over non-padding targets only (same masking as the loss).

    Args:
        y_true: integer-valued target ids, shape (batch, time) or (batch, time, 1).
        y_pred: per-step probability distributions, shape (batch, time, vocab).

    Returns:
        Scalar fraction of non-padding positions whose argmax equals the target.
    """
    labels = tf.reshape(tf.cast(y_true, tf.int64), tf.shape(y_pred)[:-1])
    guesses = tf.argmax(y_pred, axis=-1)
    keep = tf.cast(tf.not_equal(labels, 0), tf.float32)
    hits = tf.cast(tf.equal(labels, guesses), tf.float32) * keep
    return tf.reduce_sum(hits) / tf.maximum(tf.reduce_sum(keep), 1.0)


encinp = Input(shape=(mxlen,))        # English token ids
decinp = Input(shape=(mxlen,))        # Hindi token ids fed to the decoder (teacher forcing)
enc = tokposemb(mxlen, invocab, embdim)(encinp)         # token + position embedding
dec = tokposemb(mxlen, outvocab, embdim)(decinp)
for _ in range(numlayers):          # encoder stack
    enc = transblock(embdim, heads, ffdim)(enc)
for _ in range(numlayers):          # decoder stack; every block attends to the final encoder output
    dec = decoderblock(embdim, heads, ffdim)(dec, enc)
out = TimeDistributed(Dense(outvocab, activation="softmax"))(dec)           # per-step distribution over the Hindi vocabulary

model = tf.keras.Model([encinp, decinp], out)
# Padding-aware loss/metric: the training log now reports `masked_accuracy`
# (token accuracy on real tokens) instead of an accuracy inflated by padding.
model.compile(loss=masked_scce, optimizer=opt, metrics=[masked_accuracy])
model.summary()

In [None]:
# Train with teacher forcing: the encoder gets the English ids, the decoder
# gets the Hindi decoder-input ids, and the labels are the Hindi target ids.
# The held-out split is evaluated after every epoch.
history = model.fit(
    x=[xtrain, decintrain],
    y=ytrain,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=([xtest, decintest], ytest),
)

Epoch 1/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 122ms/step - accuracy: 0.9654 - loss: 0.2314 - val_accuracy: 0.9630 - val_loss: 0.2693
Epoch 2/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 118ms/step - accuracy: 0.9669 - loss: 0.2152 - val_accuracy: 0.9637 - val_loss: 0.2637
Epoch 3/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 122ms/step - accuracy: 0.9684 - loss: 0.1998 - val_accuracy: 0.9641 - val_loss: 0.2607
Epoch 4/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 119ms/step - accuracy: 0.9696 - loss: 0.1873 - val_accuracy: 0.9646 - val_loss: 0.2578
Epoch 5/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 119ms/step - accuracy: 0.9711 - loss: 0.1730 - val_accuracy: 0.9649 - val_loss: 0.2568
Epoch 6/10
[1m1832/1832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 122ms/step - accuracy: 0.9727 - loss: 0.1601 - val_accuracy: 0.9651 - val_loss:

In [None]:
def greedy_decode(model, src, maxlen, startid, endid):
    """Translate one source sequence by picking the most probable token at every step.

    Args:
        model: callable taking ``[src, dec]`` and returning per-step
            probabilities of shape (1, maxlen, vocab), e.g. the Keras model.
        src: padded source ids, shape (1, maxlen).
        maxlen: length of the decoder buffer (and upper bound on output length).
        startid: id placed in the first decoder slot.
        endid: id that stops decoding once it has been produced.

    Returns:
        1-D integer array of length ``maxlen``: ``startid``, the generated ids
        (including ``endid`` if it was produced), then zeros.
    """
    # Integer buffer: these are token ids, not measurements.
    dec = np.zeros((1, maxlen), dtype=np.int32)
    dec[0, 0] = startid
    for t in range(1, maxlen):
        # Call the model directly instead of model.predict(): predict() is
        # designed for large batched inputs and adds per-call overhead that
        # dominates in a token-by-token loop over a single example.
        probs = np.asarray(model([src, dec], training=False))
        nid = int(np.argmax(probs[0, t - 1]))       # distribution that follows the last filled slot
        dec[0, t] = nid
        if nid == endid:
            break
    return dec[0]

def ids_to_text(ids, tok, skip_ids=None):
    """Turn a sequence of token ids back into a space-separated sentence.

    Args:
        ids: iterable of token ids (ints or integer-valued floats).
        tok: fitted tokenizer exposing an ``index_word`` dict (id -> word).
        skip_ids: ids to leave out of the text. When omitted, padding id 0 and
            the notebook-level ``startid`` / ``endid`` are skipped, which is
            the behaviour this function always had; pass it explicitly to
            avoid depending on those globals.

    Returns:
        The words of all non-skipped ids that the tokenizer knows, joined by
        single spaces ("" when nothing is left).
    """
    if skip_ids is None:
        skip_ids = (0, startid, endid)
    skipped = {int(s) for s in skip_ids if s is not None}

    words = []
    for raw in ids:
        idx = int(raw)                      # normalise numpy / float ids once
        if idx in skipped:
            continue
        word = tok.index_word.get(idx, "")  # ids unknown to the tokenizer are dropped
        if word:
            words.append(word)
    return " ".join(words)

# Quick qualitative check on a couple of hand-written English inputs.
samps = ["leave door open","king organised meet"]
for s in samps:
    seq = en.texts_to_sequences([s])    # words -> token ids
    # Pad/truncate exactly as the training data was prepared. Without
    # truncating='post' the default ('pre') would cut the *beginning* of an
    # over-long input, whereas the training sequences were cut at the end.
    padseq = pad_sequences(seq, maxlen=mxlen, padding='post', truncating='post')
    pred = greedy_decode(model, padseq, mxlen, startid, endid)    # greedy decoding, one token at a time
    print("EN:", s)
    print("HI:", ids_to_text(pred, hi))

EN: leave door open
HI: दरवाज़ा खोलो।
EN: king organised meet
HI: राजा रूप रामचरितमानस तुलसी घर ले जाता


In [None]:
# Write the trained model to disk in the .keras format.
model.save(filepath="besteng2hindi2.keras")