In [None]:
# One-off data preparation, kept commented out for reference: it converts the
# tab-separated "ara.txt" into "translations.csv" with an English and an
# Arabic column. Uncomment and run it only if "translations.csv" is missing.

# import csv

# # Read the input data
# with open("ara.txt", "r", encoding="utf-8") as f:
#     lines = f.readlines()

# pairs = []

# for line in lines:
#     line = line.strip()
#     if not line:
#         continue

#     # Split line by tab
#     segments = line.split('\t')

#     # Every two consecutive segments = [English, Arabic]
#     for i in range(0, len(segments) - 1, 2):
#         english = segments[i].strip()
#         arabic = segments[i + 1].strip()
#         pairs.append([english, arabic])

# # Write to CSV
# with open("translations.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(["English", "Arabic"])
#     writer.writerows(pairs)

# print("✅ CSV file 'translations.csv' created successfully.")

✅ CSV file 'translations.csv' created successfully.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the English/Arabic sentence pairs from the CSV and preview the first rows.
df = pd.read_csv('translations.csv')
df.head(20)

Unnamed: 0,English,Arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Duck!,اخفض رأسك!
3,Duck!,اخفضي رأسك!
4,Duck!,اخفضوا رؤوسكم!
5,Help!,النجدة!
6,Jump!,اقفز!
7,Stop!,قف!
8,Stop!,توقف !
9,Wait!,إنتظر


In [None]:
# Show the column dtypes and the non-null count of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12523 entries, 0 to 12522
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   English  12523 non-null  object
 1   Arabic   12523 non-null  object
dtypes: object(2)
memory usage: 195.8+ KB


In [None]:
!pip install pyarabic

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl.metadata (10 kB)
Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarabic
Successfully installed pyarabic-0.6.15


In [None]:
from pyarabic.araby import strip_tashkeel, normalize_hamza
import re
def clean_english(text):
    """Normalise an English sentence for tokenisation.

    Lower-cases the text, removes every character that is not an ASCII
    letter, a digit or whitespace, and collapses each run of whitespace
    (spaces, tabs, newlines) into a single space.

    Args:
        text: Raw English sentence (``str``).

    Returns:
        The cleaned sentence without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace runs: removed punctuation can leave double spaces,
    # and a tab or newline would otherwise glue two words into one token when
    # the text is later split on single spaces. Mirrors clean_arabic.
    return re.sub(r'\s+', ' ', text).strip()


def clean_arabic(text):
    """Normalise an Arabic sentence for tokenisation.

    Steps, in order:
      1. strip diacritics (tashkeel);
      2. unify the alef variants (إ أ ٱ آ) to a bare alef;
      3. normalise the remaining hamza forms with pyarabic;
      4. map ta marbuta to ha and alef maqsura to ya;
      5. replace Arabic punctuation with a space and drop every character
         outside the Arabic Unicode block, ASCII digits and whitespace;
      6. collapse whitespace runs into a single space.

    Args:
        text: Raw Arabic sentence (``str``).

    Returns:
        The cleaned sentence without leading or trailing whitespace.
    """
    text = strip_tashkeel(text)

    # Alef unification has to run BEFORE normalize_hamza: that helper rewrites
    # the hamza-carrying alefs to a standalone hamza, which left this regex
    # with nothing to match and produced forms such as 'ءنتظر' for 'إنتظر'.
    text = re.sub(r'[إأٱآا]', 'ا', text)
    text = normalize_hamza(text)

    text = text.replace('ة', 'ه').replace('ى', 'ي')

    # Arabic comma, semicolon, question mark and full stop live inside the
    # U+0600-U+06FF block, so the filter below would keep them attached to the
    # neighbouring word (e.g. 'حالك؟'). Turn them into spaces first.
    text = re.sub(r'[\u060C\u061B\u061F\u06D4]', ' ', text)
    text = re.sub(r'[^\u0600-\u06FF0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Normalise both text columns, then wrap every cleaned Arabic sentence in the
# <start>/<end> boundary markers in a single vectorised concatenation.
df['English'] = df['English'].map(clean_english)
df['Arabic'] = '<start> ' + df['Arabic'].map(clean_arabic) + ' <end>'

In [None]:
# Preview the first rows of the data frame after cleaning.
df.head(10)

Unnamed: 0,English,Arabic
0,hi,<start> مرحبا <end>
1,run,<start> اركض <end>
2,duck,<start> اخفض رءسك <end>
3,duck,<start> اخفضي رءسك <end>
4,duck,<start> اخفضوا رءوسكم <end>
5,help,<start> النجده <end>
6,jump,<start> اقفز <end>
7,stop,<start> قف <end>
8,stop,<start> توقف <end>
9,wait,<start> ءنتظر <end>


In [None]:
import pandas as pd
import numpy as np
import re
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pyarabic.araby import strip_tashkeel, normalize_hamza
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Embedding, LSTM, Dense, Dropout
from keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# One word-level tokenizer per language. filters='' disables the tokenizer's
# own character filtering, so the text is split exactly as it was cleaned and
# the "<start>" / "<end>" markers are kept as ordinary tokens.
eng_tokenizer = Tokenizer(filters='')
ar_tokenizer = Tokenizer(filters='')
eng_tokenizer.fit_on_texts(df['English'])
ar_tokenizer.fit_on_texts(df['Arabic'])

# Encode every sentence as a list of integer word ids.
eng_seqs = eng_tokenizer.texts_to_sequences(df['English'])
ar_seqs = ar_tokenizer.texts_to_sequences(df['Arabic'])

# Length of the longest encoded sentence in each language.
max_eng_len = max(map(len, eng_seqs))
max_ar_len = max(map(len, ar_seqs))

In [None]:
# Integer ids assigned to the sentence-boundary markers
# (None if a marker is not in the vocabulary).
start_token = ar_tokenizer.word_index.get('<start>')
end_token = ar_tokenizer.word_index.get('<end>')

# Sanity check on the first 5 encoded sentences: each one must begin with
# <start> and finish with <end>.
for idx, seq in enumerate(ar_seqs[:5]):
    print(f"Seq {idx}: {seq}")
    if seq[0] == start_token and seq[-1] == end_token:
        print("✅ Start and end tokens OK")
    else:
        print("❌ Missing <start> or <end> token in sequence", idx)

Seq 0: [1, 598, 2]
✅ Start and end tokens OK
Seq 1: [1, 3061, 2]
✅ Start and end tokens OK
Seq 2: [1, 4897, 998, 2]
✅ Start and end tokens OK
Seq 3: [1, 4898, 998, 2]
✅ Start and end tokens OK
Seq 4: [1, 4899, 4900, 2]
✅ Start and end tokens OK


In [None]:
# Pad the English id sequences at the end (padding='post') up to max_eng_len.
encoder_input_data_pad = pad_sequences(eng_seqs, maxlen=max_eng_len, padding='post')

In [None]:
# Start with empty lists for the decoder input and decoder target sequences.
decoder_input_data = []
decoder_target_data = []

In [None]:
# Teacher forcing: the decoder input is the sentence without its last token
# (<end>) and the target is the same sentence without its first token
# (<start>), so input position t is paired with the token at position t + 1.
# Both lists are rebuilt from ar_seqs rather than appended to, so re-running
# this cell never duplicates rows and still works after the two names have
# been rebound to padded arrays later in the notebook.
decoder_input_data = [seq[:-1] for seq in ar_seqs]
decoder_target_data = [seq[1:] for seq in ar_seqs]

In [None]:
# Show only the first few sequences; displaying the whole list floods the
# notebook output with thousands of rows.
decoder_input_data[:5]

[[1, 598],
 [1, 3061],
 [1, 4897, 998],
 [1, 4898, 998],
 [1, 4899, 4900],
 [1, 1172],
 [1, 3062],
 [1, 1401],
 [1, 270],
 [1, 669],
 [1, 3063],
 [1, 855],
 [1, 598],
 [1, 1402],
 [1, 598],
 [1, 4901],
 [1, 4902],
 [1, 165, 1721],
 [1, 13, 4903],
 [1, 1403],
 [1, 3064],
 [1, 7, 1404],
 [1, 8, 4904],
 [1, 3065],
 [1, 56],
 [1, 91, 15],
 [1, 13, 91],
 [1, 13, 7, 4905],
 [1, 13, 498],
 [1, 670],
 [1, 159, 3066],
 [1, 1173],
 [1, 143],
 [1, 143],
 [1, 60, 3067],
 [1, 544],
 [1, 331, 3068],
 [1, 2208, 19, 1722],
 [1, 4906],
 [1, 392, 197],
 [1, 271, 1405],
 [1, 198, 12, 3069],
 [1, 3070, 35],
 [1, 199],
 [1, 199],
 [1, 1723, 5, 23],
 [1, 599],
 [1, 1723],
 [1, 1174, 29, 2209],
 [1, 600, 601],
 [1, 4907],
 [1, 54, 3071],
 [1, 12, 1406],
 [1, 669],
 [1, 22, 320],
 [1, 21, 999],
 [1, 460],
 [1, 1172, 460],
 [1, 3072],
 [1, 461],
 [1, 13, 2210],
 [1, 13, 4908],
 [1, 13, 226],
 [1, 13, 671],
 [1, 4909],
 [1, 4910],
 [1, 3073],
 [1, 602, 1724],
 [1, 4911],
 [1, 4912],
 [1, 174],
 [1, 4, 3074],
 [

In [None]:
# Pad correctly: both decoder arrays are padded at the end to max_ar_len - 1,
# because dropping <end> (inputs) or <start> (targets) made every sequence one
# token shorter than the full sentence.
decoder_input_data = pad_sequences(decoder_input_data, padding='post', maxlen=max_ar_len - 1)
decoder_target_data = pad_sequences(decoder_target_data, padding='post', maxlen=max_ar_len - 1)
# Append a trailing axis of size 1 to the targets.
decoder_target_data = decoder_target_data[..., np.newaxis]

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
import tensorflow as tf
# Vocabulary sizes: one more than the number of known words.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
ar_vocab_size = len(ar_tokenizer.word_index) + 1

# ---- Encoder ----
encoder_inputs = Input(shape=(max_eng_len,))
encoder_embedding = Embedding(
    input_dim=eng_vocab_size,
    output_dim=256,
)(encoder_inputs)

# Bidirectional LSTM; return_state=True also yields the final hidden/cell
# state of the forward and of the backward pass.
encoder_bi_lstm = Bidirectional(LSTM(256, return_state=True))
(encoder_outputs,
 forward_h, forward_c,
 backward_h, backward_c) = encoder_bi_lstm(encoder_embedding)

# Join the two directions so each state has 2 * 256 = 512 features; these
# become the decoder's initial state.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# ---- Decoder ----
decoder_inputs = Input(shape=(max_ar_len - 1,))
decoder_embedding = Embedding(
    input_dim=ar_vocab_size,
    output_dim=512,  # 512 = 256*2
)(decoder_inputs)

# 512 units so the LSTM state matches the concatenated encoder states.
decoder_lstm = LSTM(512, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
# Softmax over the Arabic vocabulary at every decoder position.
decoder_dense = Dense(ar_vocab_size, activation='softmax')(decoder_outputs)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_dense)

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Hold out a random 20% of the pairs for validation.
# Keras' validation_split takes the LAST 20% of the rows without shuffling,
# and this corpus appears to be ordered from short to long sentences, so that
# split would validate only on the longest sentences. A seeded random split
# keeps both sets comparable and the run reproducible.
(enc_train, enc_val,
 dec_in_train, dec_in_val,
 dec_target_train, dec_target_val) = train_test_split(
    encoder_input_data_pad, decoder_input_data, decoder_target_data,
    test_size=0.2, random_state=42)

# Train the model.
history = model.fit([enc_train, dec_in_train], dec_target_train,
                    batch_size=32,
                    epochs=20,  # adjust the number of epochs as needed
                    validation_data=([enc_val, dec_in_val], dec_target_val))

model.summary()

Epoch 1/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 68ms/step - accuracy: 0.8709 - loss: 1.5945 - val_accuracy: 0.8180 - val_loss: 1.5717
Epoch 2/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 67ms/step - accuracy: 0.9022 - loss: 0.7520 - val_accuracy: 0.8233 - val_loss: 1.5340
Epoch 3/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 70ms/step - accuracy: 0.9062 - loss: 0.6808 - val_accuracy: 0.8260 - val_loss: 1.5486
Epoch 4/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 70ms/step - accuracy: 0.9113 - loss: 0.5988 - val_accuracy: 0.8301 - val_loss: 1.5536
Epoch 5/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 68ms/step - accuracy: 0.9164 - loss: 0.5237 - val_accuracy: 0.8312 - val_loss: 1.5832
Epoch 6/20
[1m314/314[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 70ms/step - accuracy: 0.9222 - loss: 0.4456 - val_accuracy: 0.8329 - val_loss: 1.5937
Epoch 7/20
[1m3

In [None]:
def translate_sentence(sentence):
    """Greedily translate one English sentence into Arabic.

    The sentence is cleaned and encoded like the training data. The decoder
    input starts as ``<start>`` followed by padding and is filled in one token
    at a time with the most probable next word, until ``<end>`` (or padding)
    is predicted or the maximum target length is reached.

    Args:
        sentence: English text to translate.

    Returns:
        The predicted Arabic words joined by single spaces (possibly empty).
    """
    # Clean the English sentence.
    sentence = clean_english(sentence)

    # Convert the sentence to a padded id sequence.
    eng_seq = eng_tokenizer.texts_to_sequences([sentence])
    eng_seq = pad_sequences(eng_seq, maxlen=max_eng_len, padding='post')

    # Decoder input: <start> at position 0, padding (0) everywhere else.
    max_steps = max_ar_len - 1
    target_seq = np.zeros((1, max_steps))
    target_seq[0, 0] = start_token

    output_sentence = []

    for i in range(max_steps):
        # Predict with the tokens generated so far; only position i is read.
        pred = model.predict([eng_seq, target_seq], verbose=0)

        # Take the most probable token at this position.
        pred_token = int(np.argmax(pred[0, i, :]))

        # Stop at <end>. Id 0 is the padding value of the training targets and
        # has no word, so it also ends the sentence instead of being emitted
        # as an empty word and fed back into the decoder.
        if pred_token == end_token or pred_token == 0:
            break

        # Keep the predicted word (ids without a vocabulary entry are skipped).
        output_word = ar_tokenizer.index_word.get(pred_token, '')
        if output_word:
            output_sentence.append(output_word)

        # Feed the prediction back as the decoder input of the next step.
        if i + 1 < max_steps:
            target_seq[0, i + 1] = pred_token

    return ' '.join(output_sentence)

In [None]:
# Translate one hard-coded example sentence.
sentence = "give me your money"  # enter the English sentence here
translation = translate_sentence(sentence)
print("Translation:", translation)

Translation: اعطني مهله


In [None]:
# Smoke-test the translator on a few common phrases.
test_sentences = ["hello", "good morning", "how are you", "thank you"]

for english in test_sentences:
    print(f"English: {english}")
    arabic = translate_sentence(english)
    print(f"Arabic: {arabic}")
    print("---")

English: hello
Arabic: ءهلا
---
English: good morning
Arabic: صباح الخير
---
English: how are you
Arabic: كيف حالك؟
---
English: thank you
Arabic: شكرا لك
---


In [None]:
# Interactive demo: read one English sentence from the user and print its translation.
# NOTE(review): input() waits for the user, so an unattended "Run All" stops
# here; confirm this cell is meant for manual use only.
sentence = input("Enter a sentence in English: ")
translation = translate_sentence(sentence)
print("Translation:", translation)

Enter a sentence in English: I love you
Translation: ءحبك
