In [1]:
# CS 898 AW - Spring 2025
# Group 17
# Initial neural machine translation code for testing
# Open-source LSTM & attnention based model
# Reference: https://paperswithcode.com/paper/effective-approaches-to-attention-based
# Reference: https://github.com/philipperemy/keras-attention
# Reused under Apache 2.0 License

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import kagglehub
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import Input
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import load_model, Model
import matplotlib.pyplot as plt
import pickle

!pip install sacrebleu



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Import datasets
# WMT 2014 English-German dataset
train_df = pd.read_csv('/content/drive/MyDrive/School/CS 770 - ML/Group Project/WMT Data/wmt14_translate_de-en_train.csv', nrows=50000) # Reduced number of rows due to TPU & Colab memory constraints
validation_df = pd.read_csv('/content/drive/MyDrive/School/CS 770 - ML/Group Project/WMT Data/wmt14_translate_de-en_validation.csv')
test_df = pd.read_csv('/content/drive/MyDrive/School/CS 770 - ML/Group Project/WMT Data/wmt14_translate_de-en_test.csv', nrows=1000)

In [5]:
# Load tokenizers for test dataset
# Load the German tokenizer
with open('/content/drive/MyDrive/School/CS 770 - ML/Group Project/tokenizer_de.pickle', 'rb') as handle:
    tokenizer_de = pickle.load(handle)

# Load the English tokenizer
with open('/content/drive/MyDrive/School/CS 770 - ML/Group Project/tokenizer_en.pickle', 'rb') as handle:
    tokenizer_en = pickle.load(handle)
print("English word indices:", tokenizer_en.word_index)



In [6]:
# Import trained model
model = load_model('/content/drive/MyDrive/School/CS 770 - ML/Group Project/g17_nmt_lstm_val_df.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [7]:
# Prepare test data for use (apply tokens)
# Useful module for preprocessing input data across the three input dfs
def prepare_data(input_df, max_length_de=85, max_length_en=81): # Add default values for max sequence lengths
  import tensorflow as tf
  from tensorflow.keras.preprocessing.sequence import pad_sequences
  from tensorflow.keras.utils import to_categorical

  df = input_df.copy()
  encoder_input_sequences_de = tokenizer_de.texts_to_sequences(df['de'])

  # Create decoder input sequences (English)
  decoder_input_sequences_en = tokenizer_en.texts_to_sequences(df['en'])

  # Pad sequences using the provided max sequence lengths
  encoder_input_sequences_de = pad_sequences(
      encoder_input_sequences_de, maxlen=max_length_de, padding="post" # Use max_length_de=85
  )
  decoder_input_sequences_en = pad_sequences(
      decoder_input_sequences_en, maxlen=max_length_en, padding="post" # Use max_length_en=68
  )

  # 2. Create decoder target data (shifted by one timestep)
  decoder_target_sequences_en = []
  for sequence in decoder_input_sequences_en:
    target_sequence = sequence[1:]  # Shift by one timestep
    padded_sequence = tf.concat([target_sequence, tf.constant([0], dtype=tf.int32)], axis=0)  # Append 0
    decoder_target_sequences_en.append(padded_sequence)
  decoder_target_sequences_en = pad_sequences(decoder_target_sequences_en, maxlen=max_length_en, padding='post') # Use max_length_en=68
  decoder_target_sequences_en = tf.convert_to_tensor(decoder_target_sequences_en)  # Convert to tensor, pad to max length

  # 3. One-hot encode target data
  # Ensure num_classes matches the output layer's vocabulary size
  num_decoder_tokens = len(tokenizer_en.word_index) + 1  # Update num_decoder_tokens
  decoder_target_sequences_en = to_categorical(
      decoder_target_sequences_en, num_classes=num_decoder_tokens  # Use updated num_decoder_tokens
  )

  # Print the shapes to verify
  print("Encoder input shape:", encoder_input_sequences_de.shape)
  print("Decoder input shape:", decoder_input_sequences_en.shape)
  print("Decoder target shape:", decoder_target_sequences_en.shape)

  return encoder_input_sequences_de, decoder_input_sequences_en, decoder_target_sequences_en

In [8]:
# Encode/decode test dataset
encoder_input_sequences_de, decoder_input_sequences_en, decoder_target_sequences_en = prepare_data(test_df)

Encoder input shape: (1000, 85)
Decoder input shape: (1000, 81)
Decoder target shape: (1000, 81, 8882)


In [9]:
# Predict using model and tokenized data
predictions = model.predict([encoder_input_sequences_de, decoder_input_sequences_en], batch_size=32)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 204ms/step


In [14]:
# Evaluate accuracy, F1, BLEU, and perplexity of the predictions
# Accuracy
accuracy = np.mean(np.argmax(decoder_target_sequences_en, axis=-1) == np.argmax(predictions, axis=-1))
print("Accuracy:", accuracy)

# F1 Score
from sklearn.metrics import f1_score

# Convert one-hot encoded targets and predictions to single-label format
y_true = np.argmax(decoder_target_sequences_en, axis=-1)
y_pred = np.argmax(predictions, axis=-1)

# Flatten the arrays to a single dimension for f1_score
y_true = y_true.flatten()
y_pred = y_pred.flatten()

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print("F1 Score:", f1)

# BLEU
from sacrebleu.metrics import BLEU
bleu = BLEU()
# Ensure predictions are in a list of strings
predicted_sentences = tokenizer_en.sequences_to_texts(np.argmax(predictions, axis=-1).tolist()) # Change here
reference_sentences = [[s] for s in test_df['en']] # BLEU expects a list of lists for references
bleu_score = bleu.corpus_score(predicted_sentences, reference_sentences)
print(bleu_score)

# Perplexity
perplexity = np.exp(np.mean(np.log(np.max(predictions, axis=-1))))
print("Perplexity:", perplexity)

Accuracy: 0.814679012345679
F1 Score: 0.8062234916590552
BLEU = 7.78 100.0/10.0/2.6/1.4 (BP = 1.000 ratio = 1.000 hyp_len = 21 ref_len = 21)
Perplexity: 0.74335957
