In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Parallel corpus; later cells read its "English" and "Hinglish" columns.
DATA_PATH='/content/datasethinglish.csv'
hinglish=pd.read_csv(DATA_PATH)

In [None]:
from sklearn.model_selection import train_test_split
# Drop pairs where either side is missing before splitting. Otherwise the
# later str() coercion turns NaN into the literal word "nan", which would be
# tokenized and trained on as if it were real text.
pairs=hinglish.dropna(subset=["English","Hinglish"])
# 80/20 train/test split with a fixed seed so the split is reproducible.
xtrain,xtest,ytrain,ytest=train_test_split(pairs["English"],pairs["Hinglish"],test_size=0.2,random_state=42)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Coerce every entry of the four splits to str and materialize them as plain
# lists (the tokenizer cells below concatenate and tokenize these lists).
xtrain,ytrain,xtest,ytest=(
    list(map(str,split)) for split in (xtrain,ytrain,xtest,ytest)
)

In [None]:
# Source (English) and target (Hinglish) tokenizers, both limited by
# num_words=20000.
# The source tokenizer gets an explicit OOV token so rare or unseen English
# words map to a placeholder id instead of being dropped, which would shorten
# the sequence and shift every following token to the left.
# NOTE(review): the target tokenizer is deliberately left without an OOV token
# so decoding predictions with sequences_to_texts behaves as before — confirm
# how padding ids are decoded before adding one.
en = Tokenizer(num_words=20000, oov_token="<unk>")
hi = Tokenizer(num_words=20000, oov_token=None)
# Fit on the training split only: building the vocabulary (and its frequency
# ranking) from held-out sentences leaks test information into preprocessing.
en.fit_on_texts(xtrain)
hi.fit_on_texts(ytrain)

In [None]:
import pickle
# Persist both fitted tokenizers so inference code can reuse the same vocabularies.
for pkl_name,tokenizer in (("eng_tokenizer.pkl",en),("hing_tokenizer.pkl",hi)):
    with open(pkl_name, "wb") as f:
        pickle.dump(tokenizer, f)

In [None]:
# (English, Hinglish) counts of distinct words seen while fitting.
tuple(len(tok.word_index) for tok in (en,hi))

(28706, 30350)

In [None]:
# Vocabulary sizes used for the embedding input_dim and the softmax width.
# texts_to_sequences only emits ids below num_words, so ids at or above that
# cap never occur in the data. Sizing the layers to the whole word_index would
# allocate embedding rows and softmax units that can never be used.
en_full=len(en.word_index)+1   # +1 because id 0 is reserved for padding
hi_full=len(hi.word_index)+1
# Fall back to the full vocabulary when num_words is unset (None) or larger
# than the number of words actually seen.
invocab=min(en.num_words or en_full,en_full)
outvocab=min(hi.num_words or hi_full,hi_full)

In [None]:
# Replace each sentence with its list of integer word ids:
# English splits go through `en`, Hinglish splits through `hi`.
xtrain,xtest=(en.texts_to_sequences(texts) for texts in (xtrain,xtest))
ytrain,ytest=(hi.texts_to_sequences(texts) for texts in (ytrain,ytest))

In [None]:
# Fixed sequence length: shorter sequences are zero-padded at the end and
# longer ones are cut at the end.
mxlen=150
padopts=dict(maxlen=mxlen,padding='post',truncating='post')
xtrain,xtest,ytrain,ytest=(
    pad_sequences(seqs,**padopts) for seqs in (xtrain,xtest,ytrain,ytest)
)

In [None]:
# Add a trailing length-1 axis: (samples, timesteps) -> (samples, timesteps, 1).
# The target shape is spelled out from the first two axes rather than appended
# to the current shape, so re-running this cell leaves an already reshaped
# array unchanged instead of stacking one more axis each time.
ytrain=ytrain.reshape(ytrain.shape[0],ytrain.shape[1],1)
ytest=ytest.reshape(ytest.shape[0],ytest.shape[1],1)

In [None]:
class transblock(tf.keras.layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        embdim: feature width of the block's output; also passed as
            ``key_dim`` to the attention layer.
        heads: number of attention heads.
        ffdim: hidden width of the feed-forward network.
        rate: dropout rate used after each sub-layer.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self,embdim,heads,ffdim,rate=0.1,**kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embdim,self.heads,self.ffdim,self.rate=embdim,heads,ffdim,rate

        self.att=tf.keras.layers.MultiHeadAttention(num_heads=heads,key_dim=embdim)

        # Feed-forward net: expand to ffdim with ReLU, project back to embdim.
        expand=tf.keras.layers.Dense(ffdim,activation="relu")
        project=tf.keras.layers.Dense(embdim)
        self.ff=tf.keras.Sequential([expand,project])

        self.ln1=tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln2=tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dp1=tf.keras.layers.Dropout(rate)
        self.dp2=tf.keras.layers.Dropout(rate)

    def get_config(self):
        """Return the base layer config extended with this block's constructor args."""
        return {
            **super().get_config(),
            'embdim':self.embdim,
            'heads':self.heads,
            'ffdim':self.ffdim,
            'rate':self.rate,
        }

    def call(self,x,training=None):
        """Apply attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer: x serves as both query and value.
        attended=self.dp1(self.att(x,x),training=training)
        normed=self.ln1(x+attended)
        # Feed-forward sub-layer with its own residual connection.
        transformed=self.dp2(self.ff(normed),training=training)
        return self.ln2(normed+transformed)


In [None]:
class tokposemb(tf.keras.layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: number of distinct positions in the position table.
        vocab: number of distinct token ids in the token table.
        embdim: width of both embeddings.
        **kw: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self,maxlen,vocab,embdim,**kw):
        super().__init__(**kw)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen,self.vocab,self.embdim=maxlen,vocab,embdim
        make_table=tf.keras.layers.Embedding
        self.tokemb=make_table(input_dim=vocab,output_dim=embdim)
        self.posemb=make_table(input_dim=maxlen,output_dim=embdim)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor args."""
        return {
            **super().get_config(),
            'maxlen':self.maxlen,
            'vocab':self.vocab,
            'embdim':self.embdim,
        }

    def call(self,x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions 0 .. (size of x's last axis - 1).
        seqlen=tf.shape(x)[-1]
        positions=tf.range(start=0,limit=seqlen,delta=1)
        pos_vectors=self.posemb(positions)
        tok_vectors=self.tokemb(x)
        return tok_vectors+pos_vectors


In [None]:
# Hyperparameters (kept small to limit memory use and attention cost).
embdim=128     # embedding width
heads=2        # attention heads
ffdim=32       # feed-forward hidden width inside the transformer block
opt=Adam(learning_rate=0.003)

# Token ids -> token+position embedding -> one transformer block
# -> per-timestep dense layer -> per-timestep softmax over the target vocabulary.
inp=tf.keras.layers.Input(shape=(mxlen,))
emb=tokposemb(maxlen=mxlen, vocab=invocab, embdim=embdim)(inp)
x=transblock(embdim=embdim, heads=heads, ffdim=ffdim)(emb)
x=TimeDistributed(Dense(units=128, activation="relu"))(x)
out=TimeDistributed(Dense(units=outvocab, activation="softmax"))(x)

# NOTE(review): the padded positions (id 0) appear to be counted in both the
# loss and the accuracy metric, so the reported accuracy may be inflated —
# confirm, and consider masking padding.
model=tf.keras.Model(inputs=inp,outputs=out)
model.compile(
    optimizer=opt,
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train for 5 epochs in batches of 16, evaluating on the held-out split after each epoch.
fit_options=dict(
    batch_size=16,
    epochs=5,
    verbose=1,
    validation_data=(xtest,ytest),
)
history=model.fit(xtrain,ytrain,**fit_options)

Epoch 1/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m591s[0m 130ms/step - accuracy: 0.9554 - loss: 0.4951 - val_accuracy: 0.9584 - val_loss: 0.3525
Epoch 2/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m431s[0m 105ms/step - accuracy: 0.9587 - loss: 0.3421 - val_accuracy: 0.9591 - val_loss: 0.3298
Epoch 3/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 100ms/step - accuracy: 0.9595 - loss: 0.3109 - val_accuracy: 0.9596 - val_loss: 0.3173
Epoch 4/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 106ms/step - accuracy: 0.9604 - loss: 0.2806 - val_accuracy: 0.9600 - val_loss: 0.3132
Epoch 5/5
[1m3663/3663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 106ms/step - accuracy: 0.9616 - loss: 0.2562 - val_accuracy: 0.9601 - val_loss: 0.3137


In [None]:
# Quick sanity check: translate a few English sentences with the trained model.
samps = ["i love you","she is beautiful"]
# Tokenize and pad all samples together so the model runs once on a single
# batch instead of once per sentence.
padseqs=pad_sequences(en.texts_to_sequences(samps), maxlen=mxlen,padding='post',truncating='post')
# argmax over the vocabulary axis gives one predicted target id per position.
preds=model.predict(padseqs).argmax(-1)
# Decoded text uses its own name so the model's output tensor `out` from the
# model-building cell is not overwritten with a string.
for s,hinglish_text in zip(samps,hi.sequences_to_texts(preds)):
    print("EN:",s)
    print("HI:",hinglish_text,"\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15s/step
EN: i love you
HI: mujhe pyara pyara 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
EN: she is beautiful
HI: eka sumdara sumdara 



In [None]:
# Write the trained model to disk in the .keras format.
# NOTE(review): loading it elsewhere presumably needs the custom layers
# (transblock, tokposemb) to be importable or passed as custom objects — confirm.
model.save(filepath="besteng2hinglish.keras")