# **ML Translation Model**

## **Import Libraries**

In [None]:
import numpy as np
from numpy import array
import pandas as pd
import nltk
import re
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.callbacks import Callback
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Data Collecting and Cleansing**

In [None]:
df=pd.read_csv("ara_eng.txt", delimiter="\t", names=["english", "arabic"])

In [None]:
ds = df.loc[0:12000, :].copy()
ds

Unnamed: 0,english,arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!
...,...,...
11996,according to the united nations educational sc...,وفقا لمنظمة الامم المتحدة للتربية والعلم والثق...
11997,he was passionate about work and convinced tha...,كان متحمسا لعمله ومقتنع بان العمارة قبل ان تكو...
11998,is the saudi government monitoring women globa...,السعودية اتراقب الحكومة النساء؟ الاصوات العالمية
11999,is the saudi government monitoring women blogg...,هل تراقب الحكومة السعودية النساء؟ يوضح لنا الم...


## **Lowercasing**

In [None]:
def preprocess(text):
  text = re.sub(r'\b\w+\b',lambda match: match.group(0).lower(),text)
  return text
ds.loc[:, 'english'] = ds['english'].apply(preprocess)
ds['english']

0                                                      hi.
1                                                     run!
2                                                    help!
3                                                    jump!
4                                                    stop!
                               ...                        
11996    according to the united nations educational sc...
11997    he was passionate about work and convinced tha...
11998    is the saudi government monitoring women globa...
11999    is the saudi government monitoring women blogg...
12000    mexican women are murdered each day global voi...
Name: english, Length: 12001, dtype: object

## **Tokenization And Indexing**

In [None]:
def tokenization(dataset):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(dataset)
  return tokenizer

tokenizer=tokenization(ds)

tokenizer_eng=tokenization(ds['english'])
VOCAB_SIZE1=len(tokenizer_eng.word_index) + 1
eng_length=8

word_to_index_english = {word: idx for idx, word in enumerate(tokenizer_eng.word_index)}
index_to_word_english = {index: word for word, index in tokenizer_eng.word_index.items()}

tokenizer_ara=tokenization(ds['arabic'])
VOCAB_SIZE2=len(tokenizer_ara.word_index) + 1
ara_length=8

word_to_index_arabic = {word: idx for idx, word in enumerate(tokenizer_ara.word_index.items())}
index_to_word_arabic = {index: word for word, index in tokenizer_ara.word_index.items()}

print("\nEnglish vocabulary size:",VOCAB_SIZE1)
print("\nArabic vocabulary size:",VOCAB_SIZE2)


English vocabulary size: 7607

Arabic vocabulary size: 18880


## **Encoding and Pad Sequence**

In [None]:
def encode_sequence(tokenizer, length, ds):
  #integer encode sequences
  seq = tokenizer.texts_to_sequences(ds)
  #pad sequences with 0 value
  seq = pad_sequences(seq, maxlen=length, padding = 'post')
  return seq

## **Splitting the data**

In [None]:
main_data,test_data,main_label,test_label=train_test_split(ds['english'],ds['arabic'],test_size=0.2,random_state=12)
train_data,val_data,train_label,val_label=train_test_split(main_data,main_label,test_size=0.2,random_state=12)

## **Preparing the data**

In [None]:
#prepare training data
trainX = encode_sequence(tokenizer_eng, eng_length , train_data)
trainY = encode_sequence(tokenizer_ara , ara_length , train_label)

#prepare validating data
valX = encode_sequence(tokenizer_eng, eng_length, val_data)
valY = encode_sequence(tokenizer_ara, ara_length, val_label)

#prepare test data
testX = encode_sequence(tokenizer_eng, eng_length , test_data)
testY = encode_sequence(tokenizer_ara , ara_length , test_label)

## **Model Building**

In [None]:
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE1, output_dim=100, input_length=trainX.shape[1]))
model.add(Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(units=VOCAB_SIZE2, activation='sigmoid', use_bias=True, kernel_initializer='glorot_uniform'))

## **Model Compilation and Training**

In [None]:
class BLEUMetric(Callback):
    def __init__(self, validation_data, index_to_word_english, index_to_word_arabic):
        super(BLEUMetric, self).__init__()
        self.validation_data = validation_data
        self.index_to_word_english = index_to_word_english
        self.index_to_word_arabic = index_to_word_arabic

    def on_epoch_end(self, epoch, logs=None):
        y_true = self.validation_data[1]  # Assuming validation_data is a tuple (X_val, y_val)
        y_pred = self.model.predict(self.validation_data[0])

        bleu_scores = []

        for true_seq, pred_seq in zip(y_true, y_pred):
            # Convert sequences from indices to actual words
            reference_words = [self.index_to_word_english.get(idx, 'UNKNOWN') for idx in true_seq]
            prediction_words = [self.index_to_word_arabic.get(np.argmax(pred_word_probs), 'UNKNOWN') for pred_word_probs in pred_seq]

            # Calculate BLEU score for each pair of reference and prediction
            bleu = nltk.translate.bleu_score.sentence_bleu([reference_words], prediction_words)
            bleu_scores.append(bleu)

        # Calculate the average BLEU score for the entire validation set
        avg_bleu = np.mean(bleu_scores)
        print(f'Epoch {epoch + 1}, Average BLEU Score on Validation Set: {avg_bleu}')

validation_data = (valX, valY)
bleu_metric = BLEUMetric(validation_data, index_to_word_english, index_to_word_arabic)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
history = model.fit(trainX, trainY, epochs=5, batch_size=32, validation_data=(valX, valY), callbacks=[bleu_metric])


Epoch 1/5


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Epoch 1, Average BLEU Score on Validation Set: 0.3269798531238043
Epoch 2/5
Epoch 2, Average BLEU Score on Validation Set: 0.3269798531238043
Epoch 3/5
Epoch 3, Average BLEU Score on Validation Set: 0.3269798531238043
Epoch 4/5
Epoch 4, Average BLEU Score on Validation Set: 0.3265655208718608
Epoch 5/5
Epoch 5, Average BLEU Score on Validation Set: 0.32605218386773677


## **Evaluation**

In [None]:
score, acc = model.evaluate(testX, testY, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 5.275645732879639
Test accuracy: 0.43237191438674927


## **Prediction**

In [None]:
english_sentences = ds['english'].head(5)

encoded_english = encode_sequence(tokenizer_eng, eng_length, english_sentences)
encoded_english

predicted_arabic_sequences = model.predict(encoded_english)

for idx, pred_seq in enumerate(predicted_arabic_sequences):
    translated_sentence = ' '.join([index_to_word_arabic.get(idx) for idx in np.argmax(pred_seq, axis=-1) if idx != 0])
    print(f"English Sentence {idx}: {english_sentences.iloc[idx-1]}")
    print(f"Arabic Translation {idx}: {translated_sentence}\n")
    if idx == 5:
        break

English Sentence 0: stop!
Arabic Translation 0: هل

English Sentence 1: hi.
Arabic Translation 1: هل

English Sentence 2: run!
Arabic Translation 2: 

English Sentence 3: help!
Arabic Translation 3: هل

English Sentence 4: jump!
Arabic Translation 4: هل



## **Transformer**


In [None]:
!pip install transformers[sentencepiece]
!pip install sentencepiece

!pip install --upgrade transformers

# Use a pipeline as a high-level helper
from transformers import pipeline
import sentencepiece

pipe = pipeline("translation_en_to_ar", model="marefa-nlp/marefa-mt-en-ar")

# Translate text
text = "how are you?"
output = pipe(text)

print(output)

Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



[{'translation_text': 'كيف الحال؟'}]
