In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the English-Hindi parallel corpus (the '/content/' prefix is presumably a Colab path).
# Rows with a missing sentence on either side are dropped here: the notebook later casts every
# sentence with str(), which would otherwise turn a NaN into the literal text "nan" and feed
# those bogus pairs to the tokenizers and the model.
hindi=(
    pd.read_csv('/content/datasethindi.csv')
    .dropna(subset=["English","Hindi"])
    .reset_index(drop=True)
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs; the fixed random_state keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    hindi["English"],
    hindi["Hindi"],
    random_state=42,
    test_size=0.2,
)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Cast every sentence to str so the tokenizers only ever receive string inputs.
xtrain, ytrain, xtest, ytest = (
    list(map(str, column)) for column in (xtrain, ytrain, xtest, ytest)
)

In [None]:
# One word-level tokenizer per language.
# num_words=20000 -> only the 20000 most frequent words are used when texts are converted to ids.
# oov_token=None  -> no special token is reserved for out-of-vocabulary words.
en = Tokenizer(oov_token=None, num_words=20000)
hi = Tokenizer(oov_token=None, num_words=20000)

# Both vocabularies are built from the train and the test sentences together.
en.fit_on_texts([*xtrain, *xtest])
hi.fit_on_texts([*ytrain, *ytest])

In [None]:
import pickle

# Write both fitted tokenizers to disk (presumably so the same vocabularies can be reloaded later).
with open("en_tokenizer.pkl", "wb") as f:
    f.write(pickle.dumps(en))
with open("hi_tokenizer.pkl", "wb") as f:
    f.write(pickle.dumps(hi))

In [None]:
# (English, Hindi) count of distinct words seen by each tokenizer.
tuple(map(len, (en.word_index, hi.word_index)))

(28706, 35696)

In [None]:
# Number of texts the English tokenizer was fitted on (train and test sentences were passed together).
en.document_count

73243

In [None]:
def _vocab_size(tok):
    """Return how many distinct ids (padding id 0 included) `tok` can emit.

    A Tokenizer created with `num_words` only emits ids strictly below `num_words`
    from texts_to_sequences, even though `word_index` still lists every word it saw.
    Sizing the embedding table / softmax layer from `word_index` would therefore
    allocate rows and output classes that can never be used.
    """
    full=len(tok.word_index)+1          # +1 for the reserved padding id 0
    return min(full,tok.num_words) if tok.num_words else full

invocab=_vocab_size(en)     # size of the English (input) id space
outvocab=_vocab_size(hi)    # size of the Hindi (output) id space

In [None]:
# Replace every word by its integer id from the matching tokenizer
# (English tokenizer for the inputs, Hindi tokenizer for the targets).
xtrain, xtest = (en.texts_to_sequences(texts) for texts in (xtrain, xtest))
ytrain, ytest = (hi.texts_to_sequences(texts) for texts in (ytrain, ytest))

In [None]:
mxlen = 150    # fixed sequence length used for both inputs and targets

# Bring every id sequence to exactly mxlen entries: shorter ones are zero-padded at the end,
# longer ones are cut at the end.
xtrain, xtest, ytrain, ytest = (
    pad_sequences(seqs, maxlen=mxlen, padding='post', truncating='post')
    for seqs in (xtrain, xtest, ytrain, ytest)
)

In [None]:
# Append a trailing axis of size 1 to the targets, i.e. (batch, seqlen) -> (batch, seqlen, 1),
# so they line up with the (batch, seqlen, vocabsize) predictions under sparse categorical cross-entropy.
ytrain = ytrain[..., np.newaxis]
ytest = ytest[..., np.newaxis]

In [None]:
class transblock(tf.keras.layers.Layer):
    """Encoder-style transformer block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection and
    layer normalisation.

    Args:
        embdim: embedding dimension; also used as `key_dim` of the attention layer and
            as the output width of the feed-forward network.
        heads: number of attention heads.
        ffdim: hidden width of the feed-forward network.
        rate: dropout rate applied after each sub-layer.
        **kwargs: forwarded to the base Keras `Layer`.
    """

    def __init__(self, embdim, heads, ffdim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are kept on the instance so get_config() can report them.
        self.embdim = embdim
        self.heads = heads
        self.ffdim = ffdim
        self.rate = rate

        # Multi-head self-attention over all tokens of the sequence.
        self.att = tf.keras.layers.MultiHeadAttention(key_dim=embdim, num_heads=heads)

        # Position-wise feed-forward network: expand to ffdim, project back to embdim.
        ff_layers = [
            tf.keras.layers.Dense(ffdim, activation="relu"),
            tf.keras.layers.Dense(embdim),
        ]
        self.ff = tf.keras.Sequential(ff_layers)

        # One layer-norm and one dropout per sub-layer.
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dp1 = tf.keras.layers.Dropout(rate)
        self.dp2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        """Return the base layer config extended with this block's constructor arguments."""
        return {
            **super().get_config(),
            'embdim': self.embdim,
            'heads': self.heads,
            'ffdim': self.ffdim,
            'rate': self.rate,
        }

    def call(self, x, training=None):
        """Apply attention and feed-forward sub-layers to `x` and return the result."""
        # Sub-layer 1: self-attention (query and value are both x) -> dropout -> add & norm.
        attended = self.dp1(self.att(x, x), training=training)
        normed = self.ln1(x + attended)
        # Sub-layer 2: feed-forward -> dropout -> add & norm.
        projected = self.dp2(self.ff(normed), training=training)
        return self.ln2(normed + projected)


In [None]:
class tokposemb(tf.keras.layers.Layer):
    """Token + position embedding layer.

    Looks up a learned vector for every token id and a learned vector for every
    position index, both of width `embdim`, and returns their sum.

    Args:
        maxlen: number of distinct positions the position embedding can represent.
        vocab: number of distinct token ids the token embedding can represent.
        embdim: width of both embeddings.
        **kw: forwarded to the base Keras `Layer`.
    """

    def __init__(self, maxlen, vocab, embdim, **kw):
        super().__init__(**kw)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab = vocab
        self.embdim = embdim
        # Token embedding: maps word indices to dense vectors.
        self.tokemb = tf.keras.layers.Embedding(output_dim=embdim, input_dim=vocab)
        # Position embedding: same width as the token embedding so the two can be added.
        self.posemb = tf.keras.layers.Embedding(output_dim=embdim, input_dim=maxlen)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            'maxlen': self.maxlen,
            'vocab': self.vocab,
            'embdim': self.embdim,
        }

    def call(self, x):
        """Return token embeddings of `x` plus the embeddings of positions 0..len-1."""
        # Positions 0 .. L-1, where L is the dynamic size of the last axis of the input.
        pos_vecs = self.posemb(tf.range(tf.shape(x)[-1]))
        tok_vecs = self.tokemb(x)
        return tok_vecs + pos_vecs


In [None]:
#Mini transformer seq2seq model
heads=2        # fewer att heads for light attn
ffdim=32
embdim=128     # smaller emb to save memory
opt=Adam(learning_rate=0.003)   #optimiser

inp=tf.keras.layers.Input(shape=(mxlen,))
emb=tokposemb(mxlen, invocab, embdim)(inp)    #token+pos emb layer
x=transblock(embdim, heads, ffdim)(emb)         #encoder block (self att+FF layer)
x=TimeDistributed(Dense(128, activation="relu"))(x)  #applying dense to each timestamp separately
out=TimeDistributed(Dense(outvocab, activation="softmax"))(x)        #softmax for output vocab

model=tf.keras.Model(inputs=inp,outputs=out)
model.compile(loss=sparse_categorical_crossentropy,optimizer=opt,metrics=['accuracy'])    #loss is scce coz targets are int ids not one hot encoded ones
model.summary()


In [None]:
history=model.fit(
    xtrain,
    ytrain,
    validation_data=(xtest,ytest),
    verbose=1,
    batch_size=16,
    epochs=5,
)


Epoch 1/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m632s[0m 143ms/step - accuracy: 0.9573 - loss: 0.4828 - val_accuracy: 0.9598 - val_loss: 0.3472
Epoch 2/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 112ms/step - accuracy: 0.9600 - loss: 0.3396 - val_accuracy: 0.9602 - val_loss: 0.3418
Epoch 3/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 118ms/step - accuracy: 0.9606 - loss: 0.3225 - val_accuracy: 0.9606 - val_loss: 0.3296
Epoch 4/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m431s[0m 118ms/step - accuracy: 0.9616 - loss: 0.2949 - val_accuracy: 0.9611 - val_loss: 0.3207
Epoch 5/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 112ms/step - accuracy: 0.9625 - loss: 0.2676 - val_accuracy: 0.9615 - val_loss: 0.3132


In [None]:
def translate(sentences):
    """Print the model's Hindi output for each English sentence.

    All work happens on local variables, so the notebook-level names used to build the
    model (e.g. the `out` output tensor) are not overwritten, and the whole list is
    predicted in a single batch instead of one predict() call per sentence.

    Args:
        sentences: list of English strings to translate.
    """
    if not sentences:       # nothing to tokenize or predict
        return
    # English sentences -> token ids, padded/truncated to the fixed length the model expects.
    padded=pad_sequences(en.texts_to_sequences(sentences),maxlen=mxlen,padding='post',truncating='post')
    # Predicted Hindi token probabilities -> most probable token id at every position.
    token_ids=model.predict(padded,verbose=0).argmax(axis=-1)
    for sentence,hindi_text in zip(sentences,hi.sequences_to_texts(token_ids)):
        print("EN:",sentence)
        print("HI:",hindi_text,"\n")

samps = ["i love you","she is beautiful"]
translate(samps)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step
EN: i love you
HI: मुझे प्यार प्यार 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
EN: she is beautiful
HI: ये सुंदर 



In [None]:
# Save the trained model to a single .keras file.
# NOTE(review): the model contains the custom layers transblock and tokposemb, so loading it
# presumably requires making those classes available (e.g. via custom_objects) — confirm.
model.save("besteng2hindi.keras")