In [9]:
import numpy as np
import tensorflow as tf
import unicodedata
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import MarianMTModel, MarianTokenizer
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Attention, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [10]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [11]:


def create_model(input_vocab_size, target_vocab_size, embedding_dim, hidden_units, dropout_rate, learning_rate):
    # Encoder input
    encoder_input = Input(shape=(None,), dtype=tf.int32, name='encoder_input')
    encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_input)
    encoder_lstm = LSTM(hidden_units, return_state=True, dropout=dropout_rate, recurrent_dropout=dropout_rate)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

    # Decoder input
    decoder_input = Input(shape=(None,), dtype=tf.int32, name='decoder_input')
    decoder_embedding = Embedding(target_vocab_size, embedding_dim)(decoder_input)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True, dropout=dropout_rate, recurrent_dropout=dropout_rate)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
    
    # Output layer
    output = Dense(target_vocab_size, activation='softmax')(decoder_outputs)
    
    # Create and compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model = Model([encoder_input, decoder_input], output)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Example usage
input_vocab_size = 30000  # example value
target_vocab_size = 30000 # example value
embedding_dim = 256
hidden_units = 512
dropout_rate = 0.3
learning_rate = 0.0005

model = create_model(input_vocab_size, target_vocab_size, embedding_dim, hidden_units, dropout_rate, learning_rate)

model.summary()

In [12]:
 # Indonesian sentences file

def load_and_preprocess(file_en, file_id):
    # Load files with UTF-8 decoding
    with open(file_en, 'r', encoding='utf-8') as f_en, open(file_id, 'r', encoding='utf-8') as f_id:
        english_lines = f_en.readlines()
        indonesian_lines = f_id.readlines()   
    english_lines=english_lines[:500000]
    indonesian_lines=indonesian_lines[:500000]
    # Ensure line counts match
    print(len(english_lines))
    print(len(indonesian_lines))
    
    assert len(english_lines) == len(indonesian_lines), "Mismatched line counts in dataset!"


    # Preprocess lines
    english_lines = ['<start> ' + line.strip().lower() + ' <end>' for line in english_lines]
    indonesian_lines = ['<start> ' + line.strip().lower() + ' <end>' for line in indonesian_lines]
    
    return english_lines, indonesian_lines

# Tokenize and pad sequences
def tokenize_and_pad(lines, num_words, max_length):
    tokenizer = Tokenizer(num_words=num_words, filters='')
    tokenizer.fit_on_texts(lines)
    sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    return tokenizer, padded

# Load dataset
file_en = "/kaggle/input/opensub-en-id/OpenSubtitles.en-id.en"  # English sentences file
file_id = "/kaggle/input/opensub-en-id/OpenSubtitles.en-id.id" 

english_lines, indonesian_lines = load_and_preprocess(file_en, file_id)

# Tokenize and pad
num_words = 30000  # Maximum vocabulary size
max_length = 50    # Maximum sentence length

input_tokenizer, input_padded = tokenize_and_pad(english_lines, num_words, max_length)
output_tokenizer, output_padded = tokenize_and_pad(indonesian_lines, num_words, max_length)

# Target data needs to be shifted for teacher forcing
output_input = output_padded[:, :-1]  # Remove last token for decoder input
output_target = output_padded[:, 1:]  # Remove first token for target

500000
500000


In [13]:
batch_size = 64
epochs = 5

history = model.fit(
    [input_padded, output_input],
    output_target[..., np.newaxis],  # Add extra dimension for sparse categorical crossentropy
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)
model.save("MT.h5")

Epoch 1/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2036s[0m 325ms/step - accuracy: 0.9020 - loss: 0.8712 - val_accuracy: 0.9249 - val_loss: 0.5482
Epoch 2/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2044s[0m 327ms/step - accuracy: 0.9221 - loss: 0.5321 - val_accuracy: 0.9322 - val_loss: 0.4761
Epoch 3/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2044s[0m 327ms/step - accuracy: 0.9292 - loss: 0.4488 - val_accuracy: 0.9353 - val_loss: 0.4452
Epoch 4/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2046s[0m 327ms/step - accuracy: 0.9332 - loss: 0.3989 - val_accuracy: 0.9370 - val_loss: 0.4292
Epoch 5/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2047s[0m 328ms/step - accuracy: 0.9362 - loss: 0.3626 - val_accuracy: 0.9381 - val_loss: 0.4202


In [14]:
def beam_search_decode(input_sequence, model, input_tokenizer, output_tokenizer, max_length, beam_width=3):
    # Initialize variables
    start_token = output_tokenizer.word_index['<start>']
    end_token = output_tokenizer.word_index['<end>']
    sequences = [[list(), 0.0]]  # Each sequence: (tokens, score)

    # Preprocess input
    input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')

    for _ in range(max_length):
        all_candidates = []
        for seq, score in sequences:
            if seq and seq[-1] == end_token:
                all_candidates.append((seq, score))
                continue
            target_sequence = pad_sequences([seq], maxlen=max_length, padding='post')
            predictions = model.predict([input_sequence, target_sequence], verbose=0)
            for i in np.argsort(predictions[0, len(seq) - 1])[-beam_width:]:
                candidate = seq + [i]
                candidate_score = score - np.log(predictions[0, len(seq) - 1, i])
                all_candidates.append((candidate, candidate_score))
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:beam_width]

    final_sequence = sequences[0][0]
    translated_sentence = ' '.join([output_tokenizer.index_word.get(token, '') for token in final_sequence if token != start_token and token != end_token])
    return translated_sentence

In [15]:
def translate_sentence(input_sentence, model, input_tokenizer, output_tokenizer, max_length):
    # Preprocess input sentence
    input_sentence = '<start> ' + input_sentence.strip().lower() + ' <end>'
    input_sequence = input_tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')
    
    # Initialize the decoder input with <start> token
    start_token = output_tokenizer.word_index['<start>']
    end_token = output_tokenizer.word_index['<end>']
    target_sequence = np.zeros((1, max_length))
    target_sequence[0, 0] = start_token
    
    translated_sentence = []
    for i in range(1, max_length):
        # Predict next token
        predictions = model.predict([input_sequence, target_sequence], verbose=0)
        predicted_id = np.argmax(predictions[0, i - 1])
        
        # Stop if <end> token is predicted
        if predicted_id == end_token:
            break
        
        # Append the predicted word
        translated_sentence.append(output_tokenizer.index_word.get(predicted_id, ''))
        target_sequence[0, i] = predicted_id
    
    return ' '.join(translated_sentence)

# Example usage
manual_input = input("Masukan kalimat: ")
translated_output = translate_sentence(manual_input, model, input_tokenizer, output_tokenizer, max_length)
debug = beam_search_decode(input_padded[0:1], model, input_tokenizer, output_tokenizer, max_length)
print("Input:", manual_input)
print("Translated Output:", translated_output)

Masukan kalimat:  this is a test


Input: this is a test
Translated Output: ini adalah sebuah


In [16]:
def test_translate_sentence(model, input_tokenizer, output_tokenizer, max_length):
    test_cases = [
        # Simple test cases (common phrases)
        ("hello", "halo"),  # Should translate to the expected Indonesian translation
        ("how are you", "apa kabar"),
        ("this is good", "ini bagus"),
        
        # More complex sentences
        ("nothing to do here", "tidak ada yang bisa dilakukan disini"),
        ("the weather is nice today", "cuaca bagus hari ini"),
        
        # Edge cases (sentences that may be harder for the model)
        ("I am happy", "saya senang"),
        ("good morning", "selamat pagi"),
        
        # Test for empty input (should ideally return an empty string or a specific error message)
        ("", ""),
    ]
    
    # Run test cases
    for input_sentence, expected_output in test_cases:
        print(f"Testing input: {input_sentence}")
        
        translated_output = translate_sentence(input_sentence, model, input_tokenizer, output_tokenizer, max_length)
        print(f"Expected output: {expected_output}")
        print(f"Model output: {translated_output}")
# Run the test
test_translate_sentence(model, input_tokenizer, output_tokenizer, max_length)

Testing input: hello
Expected output: halo
Model output: hello
Testing input: how are you
Expected output: apa kabar
Model output: bagaimana anda
Testing input: this is good
Expected output: ini bagus
Model output: ini adalah baik.
Testing input: nothing to do here
Expected output: tidak ada yang bisa dilakukan disini
Model output: tidak ada yang perlu di sini.
Testing input: the weather is nice today
Expected output: cuaca bagus hari ini
Model output: hari ini sangat baik hari ini.
Testing input: I am happy
Expected output: saya senang
Model output: aku senang
Testing input: good morning
Expected output: selamat pagi
Model output: selamat pagi.
Testing input: 
Expected output: 
Model output: 
