In [None]:
!wget  http://www.manythings.org/anki/por-eng.zip
!unzip -q /kaggle/working/por-eng.zip


In [3]:
import keras
import keras_nlp
import random
import re
import string
import numpy as np
import tensorflow as tf
import torch



Using TensorFlow backend


In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [5]:
text_file = "/kaggle/working/por.txt"

with open(text_file) as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []

for line in lines:
    # Split the line into fields using tabs
    fields = line.split("\t")
    
    # Keep only the first two fields
    english, portuguese = fields[:2]

    # Modify the 'portuguese' value
    portuguese = "[start] " + portuguese + " [end]"

    # Append the modified pair to text_pairs
    text_pairs.append((english, portuguese))


In [6]:
print(random.choice(text_pairs))

('The alarm went off.', '[start] O alarme disparou. [end]')


In [7]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]


In [21]:
# [ and ] cannot be in the list of excluded characters!
punct_to_exclude = '!"#$%&()*+-/:;<=>@\\^_`{|}~'
punct_to_tokenize = '.,:;!¡?¿'

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    stripped = tf.strings.regex_replace(
        lowercase, f"[{re.escape(punct_to_exclude)}]", "")
    stripped = tf.strings.regex_replace(
        stripped, f"([{re.escape(punct_to_tokenize)}])", r" \1")
    return stripped

vocab_size = 15000            # looking only at the most frequent 15,000 words
sequence_length = 45

source_vectorization = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=custom_standardization
)

target_vectorization = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization
)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)


In [22]:
source_vectorization.get_vocabulary()[:10]

['', '[UNK]', '.', 'tom', 'i', 'to', 'you', 'the', '?', 'a']

In [23]:
batch_size = 256

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({"english": eng,
             "spanish": spa[:, :-1]
            }, spa[:, 1:])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(buffer_size=tf.data.AUTOTUNE).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


In [24]:
eng_texts, spa_texts = zip(*text_pairs)
eng_texts = list(eng_texts)
spa_texts = list(spa_texts)

In [25]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (256, 45)
inputs['spanish'].shape: (256, 45)
targets.shape: (256, 45)


In [26]:
embed_dim = 512
dense_dim = 2048
num_heads = 16


In [27]:
encoder_inputs = keras.layers.Input(shape=(None,), dtype="int64", name="english")

In [28]:
token_embeddings = keras.layers.Embedding(
        input_dim=vocab_size, output_dim=embed_dim
    )(encoder_inputs)
position_embeddings = keras_nlp.layers.PositionEmbedding(
    sequence_length=sequence_length)(token_embeddings)
x = token_embeddings + position_embeddings


In [29]:
encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=dense_dim, num_heads=num_heads)(x)

In [30]:
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
token_embeddings = keras.layers.Embedding(
        input_dim=vocab_size, output_dim=embed_dim
    )(decoder_inputs)
position_embeddings = keras_nlp.layers.PositionEmbedding(
    sequence_length=sequence_length)(token_embeddings)
x = token_embeddings + position_embeddings

In [31]:
x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=dense_dim, num_heads=num_heads)(x, encoder_outputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [32]:
transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7eb6637d1ea0>

In [34]:
por_vocab = target_vectorization.get_vocabulary()
por_index_lookup = dict(zip(range(len(por_vocab)), por_vocab))
max_decoded_sentence_length = 45

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = por_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
test_por_texts = [pair[1] for pair in test_pairs]
test_texts = list(zip(test_eng_texts, test_por_texts))
for _ in range(20):
    input_sentence, target_sentence = random.choice(test_texts)
    print("-")
    print("Input: ", input_sentence)
    print("Reference: ", target_sentence)
    print("Prediction: ", decode_sequence(input_sentence))

-
Input:  Tom dropped Mary off at the library.
Reference:  [start] Tom deixou Mary na biblioteca. [end]
Prediction:  [start] tom deixou mary na biblioteca . [end]
-
Input:  I know the reason that she quit her job.
Reference:  [start] Eu sei o porquê dela ter largado o emprego. [end]
Prediction:  [start] eu sei que ela tem razão para deixar o trabalho deles . [end]
-
Input:  Tom never mentioned your name.
Reference:  [start] Tom nunca mencionou o seu nome. [end]
Prediction:  [start] tom nunca mencionou o nome deles . [end]
-
Input:  Have you really lost your wallet again?
Reference:  [start] Você já perdeu sua carteira de novo? [end]
Prediction:  [start] você realmente perdeu sua carteira de novo ? [end]
-
Input:  I wish I had something to drink.
Reference:  [start] Eu queria ter algo para beber. [end]
Prediction:  [start] queria ter uma coisa para beber . [end]
-
Input:  There's no more salt.
Reference:  [start] Não tem mais sal. [end]
Prediction:  [start] não tem mais sal . [end]
-
In

In [35]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

# Example machine-generated translations
hypotheses = ["está chovendo muito cedo hoje de manhã , então eu não quero"]
# Example reference translations
references = [["Está chovendo forte desde esta manhã, então não quero ir a lugar algum", "Está a chover muito desde esta manhã, por isso não quero ir a lado nenhum"]]

# Tokenize the translations
hypotheses = [nltk.word_tokenize(sent) for sent in hypotheses]
references = [[nltk.word_tokenize(sent) for sent in ref] for ref in references]

# Calculate BLEU score
bleu_score = corpus_bleu(references, hypotheses)

print(f"BLEU Score: {bleu_score * 100:.2f}")

BLEU Score: 30.06


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [37]:
transformer.save("veysel_kaan_bati.mtl.keras")