In [None]:
!pip install fr_core_news_sm
import json

import numpy as np
import pandas as pd
import sentencepiece as sp
import spacy
from datasets import load_dataset
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Input, Embedding, Dot, Concatenate, Activation
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

In [None]:
# Load the English-French configuration of the "opus_books" dataset.
data = load_dataset("opus_books", "en-fr")
print(data)

# Every row of the 'train' split exposes a 'translation' mapping; pull its
# 'en' and 'fr' entries out into two flat text columns.
df = pd.DataFrame(data['train'])
df['english'] = df['translation'].apply(lambda x: x['en'])
# Fixed: the column was misspelled 'translataion', which raised a KeyError.
df['french'] = df['translation'].apply(lambda x: x['fr'])

# Keep only the two text columns, then a 30,000-row subset of them.
df2 = df[['english', 'french']]
df3 = df2.head(30000)

In [None]:
# spaCy pipelines used for smart casing. The dependency parser is disabled
# because only entity types and part-of-speech tags are read downstream.
nlp_en = spacy.load('en_core_web_sm', disable = ['parser'])
# Fixed: the French package is named 'fr_core_news_sm' (not 'fr_news_core_sm').
# If it is missing, install it with `python -m spacy download fr_core_news_sm`.
nlp_fr = spacy.load('fr_core_news_sm', disable = ['parser'])

def smart_case(sentences, nlp, batch_size = 1000, n_process = -1):
    """Lowercase every token except those that look like names or acronyms.

    A token keeps its original casing when it has an entity type, is tagged
    as a proper noun ('PROPN'), or is written entirely in upper case. All
    other tokens are lowercased. Tokens are re-joined with single spaces.

    Args:
        sentences: Sized collection of raw sentence strings (``len()`` is
            used for the progress bar total).
        nlp: Pipeline object exposing ``pipe(sentences, batch_size=...,
            n_process=...)`` that yields iterables of tokens with ``text``,
            ``ent_type_`` and ``pos_`` attributes (e.g. a spaCy pipeline).
        batch_size: Batch size forwarded to ``nlp.pipe``.
        n_process: Worker-process count forwarded to ``nlp.pipe``. Defaults
            to -1, the value that was previously hard-coded (spaCy treats it
            as "use all CPU cores"). Pass 1 for tiny inputs to avoid the
            cost of spawning a process pool.

    Returns:
        A list with one processed string per input sentence, in input order.
    """
    processed = []
    docs = nlp.pipe(sentences, batch_size = batch_size, n_process = n_process)

    for doc in tqdm(docs, total = len(sentences)):
        tokens = [
            token.text
            if (token.ent_type_ or token.pos_ == 'PROPN' or token.text.isupper())
            else token.text.lower()
            for token in doc
        ]
        processed.append(' '.join(tokens))
    return processed


# Smart-case each side of the corpus with the spaCy pipeline of its language.
# NOTE(review): this processes every row of `df`; the 30,000-row `df3` subset
# built earlier is not used here — confirm which one is intended.
english_sentences = smart_case(df['english'].tolist(), nlp_en)
french_sentences = smart_case(df['french'].tolist(), nlp_fr)    

In [None]:
# Persist both smart-cased corpora as indented UTF-8 JSON so the (slow)
# spaCy pass does not have to be repeated. `ensure_ascii = False` keeps
# accented characters readable in the files.
for out_path, corpus in (
    ("english_smartCased_sentences.json", english_sentences),
    ("french_smartCased_sentences.json", french_sentences),
):
    with open(out_path, "w", encoding = 'utf-8') as f:
        json.dump(corpus, f, ensure_ascii = False, indent = 2)


In [None]:
# Teacher-forcing text pairs: the decoder input starts with '<start>', the
# decoder target ends with '<end>'.
french_input_sentences = ['<start> ' + s for s in french_sentences]
# Fixed: this was misspelled `french_output_senteces`, leaving the name
# `french_output_sentences` used by the tokenization step undefined.
french_output_sentences = [s + ' <end>' for s in french_sentences]

In [None]:
# Write one sentence per line; these files are the SentencePiece training input.
with open("eng.txt", "w", encoding = 'utf-8') as f:
    for s in english_sentences:
        f.write(s + "\n")

with open("fr.txt", "w", encoding = 'utf-8') as f:
    for s in french_sentences:
        f.write(s + "\n")

# Train one BPE tokenizer per language.
# Fixed: the English trainer read "en.txt", but the file written above is "eng.txt".
# pad_id = 0 reserves id 0 for padding so that it no longer collides with
# SentencePiece's default <unk> id; the models below pad with 0 and mask it.
sp.SentencePieceTrainer.Train(
    input = "eng.txt", vocab_size = 3000, model_prefix = "eng_sp", model_type = "bpe",
    pad_id = 0, unk_id = 1, bos_id = 2, eos_id = 3,
)
# '<start>' and '<end>' are registered as user-defined symbols so each marker
# is always encoded as a single piece with its own id.
sp.SentencePieceTrainer.Train(
    input = "fr.txt", vocab_size = 3000, model_prefix = "fr_sp", model_type = "bpe",
    pad_id = 0, unk_id = 1, bos_id = 2, eos_id = 3,
    user_defined_symbols = ["<start>", "<end>"],
)

eng_sp = sp.SentencePieceProcessor()
eng_sp.Load("eng_sp.model")

fr_sp = sp.SentencePieceProcessor()
fr_sp.Load("fr_sp.model")

# Encode every sentence as a list of piece ids.
fr_in_seq = [fr_sp.EncodeAsIds(s) for s in french_input_sentences]
fr_out_seq = [fr_sp.EncodeAsIds(s) for s in french_output_sentences]
eng_seq = [eng_sp.EncodeAsIds(s) for s in english_sentences]

In [None]:
# Padded length = 90th percentile of the *token* counts.
# Fixed: the lengths were measured on the raw strings (character counts),
# while the value is used as `maxlen` for the token-id sequences. The result
# is also cast to int because np.percentile returns a float, which is not a
# valid `maxlen` / Input shape.
length = [len(ids) for ids in eng_seq]
eng_max_len = int(np.percentile(length, 90))

lengths2 = [len(ids) for ids in fr_in_seq]
fr_max_len = int(np.percentile(lengths2, 90))

# Vocabulary sizes, used as the Embedding input size and the output Dense width.
eng_vocab = eng_sp.GetPieceSize()
fr_vocab = fr_sp.GetPieceSize()

In [None]:
# Right-pad every id sequence with zeros up to its language's maximum length.
# `maxlen` must be an integer, so the percentile-based lengths are cast here.
# Sequences longer than `maxlen` are truncated by pad_sequences (its default
# `truncating='pre'` drops ids from the front).
eng_seq = pad_sequences(eng_seq, maxlen = int(eng_max_len), padding = 'post')
fr_in_seq = pad_sequences(fr_in_seq, maxlen = int(fr_max_len), padding = 'post')
fr_out_seq = pad_sequences(fr_out_seq, maxlen = int(fr_max_len), padding = 'post')


# ------- Parameters ------
embedding_dim = 64  # output size passed to the encoder and decoder Embedding layers
units = 128  # LSTM size per encoder direction; the decoder LSTM is built with units*2
dropout = 0.2  # rate passed to the Dropout layers and to the LSTM dropout arguments


# ------ Encoder -------
# Source ids -> embeddings (id 0 is masked as padding) -> dropout.
encoder_input = Input(shape = (eng_seq.shape[1], ))
encoder_emb = Embedding(eng_vocab, embedding_dim, mask_zero = True)(encoder_input)
encoder_emb = Dropout(dropout)(encoder_emb)

# Bidirectional LSTM: returns the per-timestep outputs plus the final hidden (h)
# and cell (c) state of the forward and of the backward direction.
encoder_biLstm = Bidirectional(LSTM(units, return_sequences = True, return_state = True, dropout = dropout, recurrent_dropout = dropout))
encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_biLstm(encoder_emb)

# Concatenate both directions' final states; they are passed to the decoder
# LSTM as its initial state.
combined_h = Concatenate()([forward_h, backward_h])
combined_c = Concatenate()([forward_c, backward_c])


# -------- Decoder --------
# Fixed: the shape referenced the undefined `fr_seq_out` and the closing
# parenthesis was misplaced, so `shape` was not a tuple.
decoder_input = Input(shape = (fr_in_seq.shape[1], ))
# The embedding layer object is kept so the inference decoder can reuse the
# trained weights instead of creating a fresh, untrained Embedding.
decoder_emb_layer = Embedding(fr_vocab, embedding_dim, mask_zero = True)
decoder_emb = decoder_emb_layer(decoder_input)
decoder_emb = Dropout(dropout)(decoder_emb)

# units*2 so the state size matches the concatenated forward/backward encoder states.
decoder_lstm = LSTM(units*2, return_sequences = True, return_state = True, dropout = dropout)
decoder_output, _, _ = decoder_lstm(decoder_emb, initial_state = [combined_h, combined_c])


# -------- Luong (Multiplicative) Attention ---------
# Luong attention scores each decoder state against each encoder state with a
# dot product (Bahdanau attention uses an additive score instead).
# Fixed: Keras' Dot layer takes `axes`, not `axis`.
# score: (batch, decoder_steps, encoder_steps), contracting the feature axis of both inputs.
score = Dot(axes = [2, 2])([decoder_output, encoder_output])
# Softmax over the last axis (encoder steps) gives the attention weights.
# Padded source positions are not masked out of this softmax.
attention_weights = Activation('softmax', name = 'attention_weights')(score)

# Context vector: attention-weighted sum of the encoder outputs,
# (batch, decoder_steps, units*2).
context_vector = Dot(axes = [2, 1])([attention_weights, encoder_output])

# Combined context: concatenate context and decoder output along the feature axis.
decoder_combined_context = Concatenate(axis = -1)([context_vector, decoder_output])

# Output layer: per-timestep distribution over the French vocabulary, computed
# from the attention-augmented features rather than the raw LSTM output.
decoder_dense = Dense(fr_vocab, activation = 'softmax')
decoder_output_final = decoder_dense(decoder_combined_context)


# Model Training
model = Model([encoder_input, decoder_input], decoder_output_final)
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()
model.fit([eng_seq, fr_in_seq], np.expand_dims(fr_out_seq, -1), epochs = 100, validation_split = 0.1)


# ------- Inference Model ---------
# Encoder Inference Model: returns the per-timestep outputs (needed by the
# attention at every decoding step) and the combined final states.
encoder_model = Model(encoder_input, [encoder_output, combined_h, combined_c])
# Placeholder through which the encoder outputs are fed to the decoder model.
# The padded source length is read from the data so the shape is an integer.
encoder_output_input = Input(shape = (eng_seq.shape[1], units*2))


# Decoder Inference Model
decoder_state_h = Input(shape = (units*2, ))
decoder_state_c = Input(shape = (units*2, ))
decoder_input_inf = Input(shape = (1, )) # One token at a time.
# Fixed: reuse the trained embedding layer (a new Embedding here would be
# randomly initialised and never trained).
decoder_emb2 = decoder_emb_layer(decoder_input_inf)
decoder_output2, decoder_output_h, decoder_output_c = decoder_lstm(decoder_emb2, initial_state = [decoder_state_h, decoder_state_c])


# Attention at inference (same wiring as in training: decoder output first,
# encoder outputs second).
score2 = Dot(axes = (2, 2))([decoder_output2, encoder_output_input])
attention_weights2 = Activation('softmax')(score2)
context_vector2 = Dot(axes = (2, 1))([attention_weights2, encoder_output_input])
decoder_combined_context2 = Concatenate()([context_vector2, decoder_output2])

decoder_final_output2 = decoder_dense(decoder_combined_context2)

# The encoder outputs are a required input because the decoder needs them to
# compute the context vector. The attention weights are returned for
# visualisation only.
# Fixed: the last output was the training-graph tensor `attention_weights`,
# which is not reachable from this model's inputs.
decoder_model = Model([decoder_input_inf, decoder_state_h, decoder_state_c, encoder_output_input],
                     [decoder_final_output2, decoder_output_h, decoder_output_c, attention_weights2])


def translate_sentence(sentence):
    """Greedily translate one English sentence into French.

    The sentence is smart-cased, tokenized and padded like the training data,
    encoded once, and then decoded one token at a time, feeding each
    predicted id (and the updated LSTM states) back into the decoder.

    Args:
        sentence: Raw English sentence.

    Returns:
        The decoded French text, without the start/end markers.
    """
    sentence = ' '.join(smart_case([sentence], nlp_en))
    seq = eng_sp.EncodeAsIds(sentence)
    seq = pad_sequences([seq], padding = 'post', maxlen = int(eng_max_len))
    en_output, state_h_, state_c_ = encoder_model.predict(seq, verbose = 0)

    # Fixed: the markers belong to the *French* tokenizer (the original looked
    # '<start>' up in the English one). Encoding the marker text works whether
    # the marker is a single piece or is split into several pieces.
    start_ids = fr_sp.EncodeAsIds('<start>')
    end_ids = fr_sp.EncodeAsIds('<end>')
    max_steps = int(fr_max_len)

    # Feed all but the last start piece only to advance the decoder state.
    for token_id in start_ids[:-1]:
        _, state_h_, state_c_, _ = decoder_model.predict(
            [np.array([[token_id]]), state_h_, state_c_, en_output], verbose = 0)
    input_id = np.array([[start_ids[-1]]])

    decoded_ids = []
    while len(decoded_ids) < max_steps:
        # Fixed: the decoder model returns four outputs (the last one is the
        # attention weights); the input order must match the model definition.
        decoder_output_, state_h_, state_c_, _ = decoder_model.predict(
            [input_id, state_h_, state_c_, en_output], verbose = 0)
        # argmax yields the id of the most probable piece; cast to a plain int
        # because SentencePiece does not accept numpy scalars.
        output_id = int(np.argmax(decoder_output_[0, -1, :]))
        decoded_ids.append(output_id)

        # Stop once the generated ids end with the end marker, and drop it.
        if end_ids and decoded_ids[-len(end_ids):] == end_ids:
            decoded_ids = decoded_ids[:-len(end_ids)]
            break

        input_id = np.array([[output_id]])

    # Decode the whole id sequence at once so subword pieces are merged
    # correctly (decoding piece by piece and joining with spaces splits words).
    return fr_sp.DecodeIds(decoded_ids)

# Using Self Attention:

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, MultiHeadAttention
from import pandas as pd 
import numpy as np
import json
import spacy
!pip install fr_core_news_sm
from datasets import load_dataset
import sentencepiece as sp

In [None]:
# Reload the smart-cased corpora saved by the preprocessing step.
# Fixed: the result of json.load() was discarded and the file handle itself
# was assigned, so the comprehensions below iterated over a closed file.
with open("/kaggle/input/dataset/english_smartCased_sentences.json", encoding = 'utf-8') as f:
    english_sentences = json.load(f)

with open("/kaggle/input/dataset/french_smartCased_sentences.json", encoding = 'utf-8') as f:
    french_sentences = json.load(f)

# Teacher-forcing text pairs: decoder input starts with '<start>', decoder
# target ends with '<end>'.
french_input_sentences = ['<start> ' + s for s in french_sentences]
french_output_sentences = [s + ' <end>' for s in french_sentences]



In [None]:
# Right-pad the id sequences with zeros; `maxlen` must be an integer.
# NOTE(review): eng_seq / fr_in_seq / fr_out_seq, the max lengths and the
# vocab sizes are expected to exist already (tokenization is not repeated in
# this section) — confirm they are produced before this cell runs.
eng_seq = pad_sequences(eng_seq, maxlen = int(eng_max_len), padding = 'post')
# Fixed: `fr_seq` is not defined anywhere; the decoder input ids are `fr_in_seq`.
fr_in_seq = pad_sequences(fr_in_seq, maxlen = int(fr_max_len), padding = 'post')
fr_out_seq = pad_sequences(fr_out_seq, maxlen = int(fr_max_len), padding = 'post')
# Padded lengths, used as the Input shapes of the encoder and decoder.
eng_input = eng_seq.shape[1]
fr_input = fr_in_seq.shape[1]

# ------
# Encoder
# ------
encoder_input = Input(shape = (eng_input, ))
encoder_emb = Embedding(eng_vocab, embedding_dim, mask_zero = True)(encoder_input)
encoder_biLstm = Bidirectional(LSTM(units, return_sequences = True, return_state = True, dropout = dropout))

encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_biLstm(encoder_emb)

# Both directions' final states, concatenated, initialise the decoder LSTM.
combined_h = Concatenate()([forward_h, backward_h])
combined_c = Concatenate()([forward_c, backward_c])

# Self Attention on Encoder outputs
# The encoder outputs are passed as both query and value (the key defaults to
# the value), so every source position attends over the whole source sentence.
# Fixed: the keyword is `num_heads` (not `num_head`).
enc_att = MultiHeadAttention(num_heads = 4, key_dim = units*2)(encoder_outputs, encoder_outputs)
# Dropout is applied on the attention output rather than right after the BiLSTM.
# Fixed: `enc_att` was referenced below but never assigned.
enc_att = Dropout(dropout)(enc_att)
# Residual connection followed by layer normalisation.
att_output = LayerNormalization()(enc_att + encoder_outputs)


# ------
# Decoder
# ------
decoder_input = Input(shape = (fr_input, ))
# Layer objects are kept in variables so the inference graph below can reuse
# the trained weights.
decoder_emb_layer = Embedding(fr_vocab, embedding_dim, mask_zero = True)
decoder_emb = decoder_emb_layer(decoder_input)
# Fixed: the decoder LSTM must have units*2 cells because it is initialised
# with the concatenated forward/backward encoder states (size units*2).
decoder_lstm = LSTM(units*2, return_sequences = True, return_state = True, dropout = dropout)

decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state = [combined_h, combined_c])

# Cross Attention (decoder outputs are the queries, the self-attended encoder
# outputs are the keys/values).
# Fixed: the keyword is `num_heads` (not `num_head`).
cross_att_layer = MultiHeadAttention(num_heads = 4, key_dim = units*2)
cross_att_norm = LayerNormalization()
cross_att = cross_att_layer(decoder_outputs, att_output)
cross_att = Dropout(dropout)(cross_att)
# Residual connection followed by layer normalisation.
cross_att = cross_att_norm(cross_att + decoder_outputs)

# Output Dense: per-timestep distribution over the French vocabulary.
decoder_dense = Dense(fr_vocab, activation = 'softmax')
decoder_outputs = decoder_dense(cross_att)


# ----------
# Seq2Seq Model
# ----------
model = Model([encoder_input, decoder_input], decoder_outputs)
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()
model.fit([eng_seq, fr_in_seq], np.expand_dims(fr_out_seq , -1), epochs = 100, validation_split = 0.2)


# ---------------
# Inference Model
# ---------------
# Encoder Inference Model
# Fixed: return the self-attended encoder outputs (`att_output`), because that
# is what the cross-attention was trained to attend over.
encoder_model = Model(encoder_input, [att_output, combined_h, combined_c])
# Placeholder through which those encoder features are fed to the decoder
# model; the padded source length is taken from the data so it is an integer.
enc_att_input = Input(shape = (eng_input, units*2))

# Decoder Inference Model
decoder_input2 = Input(shape = (1, )) # One token at a time.
decoder_input_h = Input(shape = (units*2, ))
decoder_input_c = Input(shape = (units*2, ))

# Fixed: embed the single-step input `decoder_input2` (not the training
# input) with the trained embedding layer.
decoder_emb2 = decoder_emb_layer(decoder_input2)
decoder_outputs2, decoder_output_h, decoder_output_c = decoder_lstm(decoder_emb2, initial_state = [decoder_input_h, decoder_input_c])

# Cross Attention: reuse the trained attention and normalisation layers
# (new layers here would be untrained).
cross_att2 = cross_att_layer(decoder_outputs2, enc_att_input)
cross_att2 = cross_att_norm(cross_att2 + decoder_outputs2)

decoder_outputs2_inf = decoder_dense(cross_att2)
# Fixed: misspelled names (`decoder_output2_inf`, `decodeer_output_c`) and the
# wrong encoder placeholder (`encoder_output_input` instead of `enc_att_input`).
decoder_model = Model([decoder_input2, decoder_input_h, decoder_input_c, enc_att_input],
                      [decoder_outputs2_inf, decoder_output_h, decoder_output_c])



