https://www.kaggle.com/datasets/mohamedlotfy50/wmt-2014-english-french

In [1]:
# 11b
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds

# Step 1: Read the English/French sentence pairs from the Kaggle CSV into pandas
data_path = '/kaggle/input/wmt-2014-english-french/wmt14_translate_fr-en_test.csv'
data = pd.read_csv(data_path)

# Sanity check: show the leading rows and the column names pandas parsed
print(data.head())
print("Columns in the DataFrame:", list(data.columns))

# Stop early unless both language columns are present
expected_columns = ['en', 'fr']
assert set(expected_columns).issubset(data.columns), f"CSV must contain {expected_columns} columns"

# Step 2: Convert the DataFrame to a TensorFlow Dataset of (english, french) string pairs.
# NOTE: the CSV read into `data` is the *test* split (see data_path); the name
# `train_dataset` is kept because the rest of the cell refers to it.
#
# Rows with a missing sentence are dropped first: a NaN in either column makes
# the extracted NumPy array a mix of str and float objects, which
# from_tensor_slices cannot convert into a string tensor.
sentence_pairs = data.dropna(subset=['en', 'fr'])
train_dataset = tf.data.Dataset.from_tensor_slices(
    (sentence_pairs['en'].values, sentence_pairs['fr'].values))

# Print the first example to verify conversion (elements are UTF-8 byte strings)
for english, french in train_dataset.take(1):
    print(f'English: {english.numpy().decode("utf-8")}, French: {french.numpy().decode("utf-8")}')

# Pipeline constants: examples per batch, and the per-sentence length cap
# applied by filter_max_length further down.
BATCH_SIZE, MAX_LENGTH = 64, 40

# Subword tokenizers, one per language. Each is built from its own side of the
# sentence pairs with a target vocabulary of 2**13 (= 8192) entries.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (english_text.numpy() for english_text, _ in train_dataset),
    target_vocab_size=8192,
)
tokenizer_fr = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (french_text.numpy() for _, french_text in train_dataset),
    target_vocab_size=8192,
)

# Encoding function
def encode(en_t, fr_t):
    """Turn one (English, French) pair of string tensors into token-id lists.

    Each sentence is decoded from UTF-8, tokenized with the tokenizer for its
    language, and then framed by two extra ids: `vocab_size` is prepended and
    `vocab_size + 1` is appended.

    Args:
        en_t: eager string tensor holding the English sentence.
        fr_t: eager string tensor holding the French sentence.

    Returns:
        A tuple `(english_ids, french_ids)` of Python lists of ints.
    """
    def _frame(tokenizer, text_tensor):
        # The two framing ids are the first two values at or past vocab_size.
        first_id = tokenizer.vocab_size
        body_ids = tokenizer.encode(text_tensor.numpy().decode('utf-8'))
        return [first_id, *body_ids, first_id + 1]

    english_ids = _frame(tokenizer_en, en_t)
    french_ids = _frame(tokenizer_fr, fr_t)
    return english_ids, french_ids

def tf_encode(en_t, fr_t):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    Args:
        en_t: string tensor holding the English sentence.
        fr_t: string tensor holding the French sentence.

    Returns:
        A tuple `(en_ids, fr_ids)` of int64 tensors, each declared as rank 1
        with unknown length.
    """
    en_ids, fr_ids = tf.py_function(encode, [en_t, fr_t], [tf.int64, tf.int64])
    # Outputs of tf.py_function carry no static shape information. Declaring
    # them as 1-D lets shape-dependent downstream ops (e.g. padded batching
    # without explicit padded_shapes) work on the mapped dataset.
    en_ids.set_shape([None])
    fr_ids.set_shape([None])
    return en_ids, fr_ids

# Tokenize every sentence pair by mapping the py_function wrapper over the dataset
train_dataset = train_dataset.map(map_func=tf_encode)

# Filter sequences longer than MAX_LENGTH
def filter_max_length(en, fr, max_length=MAX_LENGTH):
    """Predicate for `Dataset.filter`: keep a pair only if both sides are short enough.

    Args:
        en: tensor of English token ids.
        fr: tensor of French token ids.
        max_length: largest allowed element count per tensor (inclusive).

    Returns:
        A scalar boolean tensor, true when neither tensor has more than
        `max_length` elements.
    """
    english_fits = tf.size(en) <= max_length
    french_fits = tf.size(fr) <= max_length
    return tf.logical_and(english_fits, french_fits)

# Drop over-long pairs, then shuffle, pad each batch to a common length, and prefetch
train_dataset = (
    train_dataset
    .filter(filter_max_length)
    .shuffle(20000)
    .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Show one fully processed batch of token ids for each language
for en, fr in train_dataset.take(1):
    print(f'Encoded English: {en.numpy()}', f'Encoded French: {fr.numpy()}', sep='\n')

                                                  en  \
0              Spectacular Wingsuit Jump Over Bogota   
1  Sportsman Jhonathan Florez jumped from a helic...   
2  Wearing a wingsuit, he flew past over the famo...   
3                           A black box in your car?   
4  As America's road planners struggle to find th...   

                                                  fr  
0  Spectaculaire saut en "wingsuit" au-dessus de ...  
1  Le sportif Jhonathan Florez a sauté jeudi d'un...  
2  Equipé d'un wingsuit (une combinaison munie d'...  
3               Une boîte noire dans votre voiture ?  
4  Alors que les planificateurs du réseau routier...  
Columns in the DataFrame: ['en', 'fr']
English: Spectacular Wingsuit Jump Over Bogota, French: Spectaculaire saut en "wingsuit" au-dessus de Bogota
Encoded English: [[7639 1879   28 ...    0    0    0]
 [7639 7417   96 ...    0    0    0]
 [7639   12 2850 ...    0    0    0]
 ...
 [7639   12 1488 ...    0    0    0]
 [7639  729 415