In [3]:
import numpy as np
import pandas as pd
import re
import pickle
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [4]:
def load_dataset(de_path, en_path):
    with open(de_path, encoding='utf-8') as f_de, open(en_path, encoding='utf-8') as f_en:
        de_sentences = f_de.readlines()
        en_sentences = f_en.readlines()

    # Truncate to shortest length to avoid mismatch
    min_len = min(len(de_sentences), len(en_sentences))
    de_sentences = de_sentences[:min_len]
    en_sentences = en_sentences[:min_len]

    return pd.DataFrame({'de': de_sentences, 'en': en_sentences})

europarl = load_dataset('/content/drive/MyDrive/DATASETS/training-parallel-europarl-v7/training/europarl-v7.de-en.de', '/content/drive/MyDrive/DATASETS/training-parallel-europarl-v7/training/europarl-v7.de-en.en')
commoncrawl = load_dataset('/content/drive/MyDrive/DATASETS/training-parallel-commoncrawl/commoncrawl.de-en.de', '/content/drive/MyDrive/DATASETS/training-parallel-commoncrawl/commoncrawl.de-en.en')
news = load_dataset('/content/drive/MyDrive/DATASETS/training-parallel-nc-v9/training/news-commentary-v9.de-en.de', '/content/drive/MyDrive/DATASETS/training-parallel-nc-v9/training/news-commentary-v9.de-en.en')

full_data = pd.concat([europarl, commoncrawl, news], ignore_index=True)
print(f'Total dataset size: {full_data.shape}')

Total dataset size: (4521186, 2)


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-ZäöüÄÖÜß?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

full_data['de'] = full_data['de'].apply(clean_text)
full_data['en'] = full_data['en'].apply(clean_text)
full_data.dropna(inplace=True)
full_data = full_data[(full_data['de'] != '') & (full_data['en'] != '')]

In [6]:
full_data.head()

Unnamed: 0,de,en
0,wiederaufnahme der sitzungsperiode,resumption of the session
1,"ich erkläre die am freitag, dem . dezember unt...",i declare resumed the session of the european ...
2,"wie sie feststellen konnten, ist der gefürchte...","although, as you will have seen, the dreaded m..."
3,im parlament besteht der wunsch nach einer aus...,you have requested a debate on this subject in...
4,heute möchte ich sie bitten das ist auch der w...,"in the meantime, i should like to observe a mi..."


In [7]:
src_tokenizer = Tokenizer()
tgt_tokenizer = Tokenizer()

src_tokenizer.fit_on_texts(full_data['de'])
tgt_tokenizer.fit_on_texts(full_data['en'])

src_seq = src_tokenizer.texts_to_sequences(full_data['de'])
tgt_seq = tgt_tokenizer.texts_to_sequences(full_data['en'])

max_src_len = 20
max_tgt_len = 20

src_padded = pad_sequences(src_seq, maxlen=max_src_len, padding='post')
tgt_padded = pad_sequences(tgt_seq, maxlen=max_tgt_len, padding='post')

tgt_input = tgt_padded[:, :-1]
tgt_output = tgt_padded[:, 1:]

In [None]:
vocab_src = len(src_tokenizer.word_index) + 1
vocab_tgt = len(tgt_tokenizer.word_index) + 1
embedding_dim = 256
units = 512

def build_rnn_model():
    encoder_input = Input(shape=(max_src_len,))
    x = Embedding(vocab_src, embedding_dim)(encoder_input)
    x = SimpleRNN(units, return_sequences=False)(x)
    x = Dense(units)(x)

    decoder_input = Input(shape=(max_tgt_len - 1,))
    y = Embedding(vocab_tgt, embedding_dim)(decoder_input)
    y = SimpleRNN(units, return_sequences=True)(y, initial_state=[x])
    output = Dense(vocab_tgt, activation='softmax')(y)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

rnn_model = build_rnn_model()
rnn_model.summary()
rnn_model.fit([src_padded, tgt_input], tgt_output[..., np.newaxis], batch_size=64, epochs=5, validation_split=0.1)

Epoch 1/5




[1m    2/63409[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2902:34:09[0m 165s/step - loss: 13.2528

In [None]:
def build_lstm_model():
    encoder_input = Input(shape=(max_src_len,))
    x = Embedding(vocab_src, embedding_dim)(encoder_input)
    x, state_h, state_c = LSTM(units, return_state=True)(x)

    decoder_input = Input(shape=(max_tgt_len - 1,))
    y = Embedding(vocab_tgt, embedding_dim)(decoder_input)
    y = LSTM(units, return_sequences=True)(y, initial_state=[state_h, state_c])

    output = Dense(vocab_tgt, activation='softmax')(y)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

lstm_model = build_lstm_model()
lstm_model.summary()
lstm_model.fit([src_padded, tgt_input], tgt_output[..., np.newaxis], batch_size=64, epochs=5, validation_split=0.1)

In [None]:
def build_bidir_lstm_model():
    encoder_input = Input(shape=(max_src_len,))
    x = Embedding(vocab_src, embedding_dim)(encoder_input)
    x = Bidirectional(LSTM(units, return_sequences=False))(x)

    decoder_input = Input(shape=(max_tgt_len - 1,))
    y = Embedding(vocab_tgt, embedding_dim)(decoder_input)
    y = Bidirectional(LSTM(units, return_sequences=True))(y)

    output = Dense(vocab_tgt, activation='softmax')(y)

    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

bidir_model = build_bidir_lstm_model()
bidir_model.summary()
bidir_model.fit([src_padded, tgt_input], tgt_output[..., np.newaxis], batch_size=64, epochs=5, validation_split=0.1)

In [None]:
def build_encoder_decoder():
    encoder_inputs = Input(shape=(max_src_len,))
    enc_emb = Embedding(vocab_src, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(units, return_state=True)
    _, state_h, state_c = encoder_lstm(enc_emb)

    decoder_inputs = Input(shape=(max_tgt_len - 1,))
    dec_emb = Embedding(vocab_tgt, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True)
    decoder_outputs = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
    decoder_dense = Dense(vocab_tgt, activation='softmax')
    output = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

encdec_model = build_encoder_decoder()
encdec_model.summary()
encdec_model.fit([src_padded, tgt_input], tgt_output[..., np.newaxis], batch_size=64, epochs=5, validation_split=0.1)

In [None]:
lstm_model.save('best_translation_model.h5')
with open('tokenizers.pkl', 'wb') as f:
    pickle.dump((src_tokenizer, tgt_tokenizer), f)

In [None]:
def translate_sentence(input_sentence, model, tokenizer_src, tokenizer_tgt):
    seq = tokenizer_src.texts_to_sequences([clean_text(input_sentence)])
    padded = pad_sequences(seq, maxlen=max_src_len, padding='post')
    decoder_input = np.zeros((1, max_tgt_len - 1))
    pred = model.predict([padded, decoder_input])
    translated = np.argmax(pred[0], axis=-1)
    words = [word for i in translated for word, index in tokenizer_tgt.word_index.items() if index == i]
    return ' '.join(words)

In [None]:
# 🔍 Test Translation Example
sample_input = "ich liebe maschinelles lernen"
translated_sentence = translate_sentence(sample_input, lstm_model, src_tokenizer, tgt_tokenizer)
print(f"German Input: {sample_input}")
print(f"English Translation: {translated_sentence}")