In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [3]:
import pandas as pd

def load_sentences(file_path):
    """Load sentences from CSV file, handling potential formatting issues"""
    try:
        # First try standard CSV reading
        df = pd.read_csv(file_path, header=None)
        return df[0].tolist()
    except pd.errors.ParserError:
        # If that fails, read line by line
        sentences = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Take everything before first comma if multiple exist
                sentence = line.strip().split(',')[0]
                sentences.append(sentence)
        return sentences

# Load data
en_sentences = load_sentences('/kaggle/input/english-to-french/small_vocab_en.csv')
fr_sentences = load_sentences('/kaggle/input/english-to-french/small_vocab_fr.csv')

# Add start/end tokens to French sentences
fr_sentences = ['<start> ' + sent + ' <end>' for sent in fr_sentences]

# Verify
print(f"English samples: {en_sentences[:3]}")
print(f"French samples: {fr_sentences[:3]}")
print(f"\nTotal English sentences: {len(en_sentences)}")
print(f"Total French sentences: {len(fr_sentences)}")

English samples: ['new jersey is sometimes quiet during autumn ', 'the united states is usually chilly during july ', 'california is usually quiet during march ']
French samples: ["<start> new jersey est parfois calme pendant l' automne  <end>", '<start> les états-unis est généralement froid en juillet  <end>', '<start> california est généralement calme en mars  <end>']

Total English sentences: 137860
Total French sentences: 137860


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenization
en_tokenizer = Tokenizer(filters='', lower=False)  # Preserve case and punctuation
en_tokenizer.fit_on_texts(en_sentences)
en_sequences = en_tokenizer.texts_to_sequences(en_sentences)

fr_tokenizer = Tokenizer(filters='', lower=False)
fr_tokenizer.fit_on_texts(fr_sentences)
fr_sequences = fr_tokenizer.texts_to_sequences(fr_sentences)

# Calculate vocab sizes
en_vocab_size = len(en_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1

# Padding
max_en_len = max(len(seq) for seq in en_sequences)
max_fr_len = max(len(seq) for seq in fr_sequences)
print(f"Max English length: {max_en_len}, Max French length: {max_fr_len}")

en_padded = pad_sequences(en_sequences, maxlen=max_en_len, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=max_fr_len, padding='post')

# Split data
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    en_padded, 
    fr_padded, 
    test_size=0.2,
    random_state=42
)

Max English length: 12, Max French length: 18


In [7]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from tensorflow.keras.models import Model

# Hyperparameters
embedding_dim = 256
latent_dim = 512
batch_size = 64
epochs = 13

# Encoder
encoder_inputs = Input(shape=(max_en_len,))
enc_emb = Embedding(en_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder
decoder_inputs = Input(shape=(max_fr_len - 1,))
dec_emb = Embedding(fr_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention
attention = Attention()([decoder_outputs, encoder_outputs])
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attention])
decoder_dense = Dense(fr_vocab_size, activation='softmax')
outputs = decoder_dense(decoder_concat)

# Model
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [8]:
print(y_train.shape) 

(110288, 18)


In [11]:
# Prepare decoder inputs/outputs - FIXED VERSION
def prepare_decoder_io(sequences):
    # Input: All tokens except last
    decoder_input = sequences[:, :-1] 
    # Output: All tokens except first (shifted by one)
    decoder_output = sequences[:, 1:]  
    return decoder_input, decoder_output

train_decoder_input, train_decoder_output = prepare_decoder_io(y_train)
val_decoder_input, val_decoder_output = prepare_decoder_io(y_val)

# Verify shapes
print(f"Training shapes - Encoder: {X_train.shape}, Decoder in: {train_decoder_input.shape}, Decoder out: {train_decoder_output.shape}")
print(f"Validation shapes - Encoder: {X_val.shape}, Decoder in: {val_decoder_input.shape}, Decoder out: {val_decoder_output.shape}")

# Training with corrected shapes
history = model.fit(
    [X_train, train_decoder_input],
    train_decoder_output,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, val_decoder_input], val_decoder_output)
)

Training shapes - Encoder: (110288, 12), Decoder in: (110288, 17), Decoder out: (110288, 17)
Validation shapes - Encoder: (27572, 12), Decoder in: (27572, 17), Decoder out: (27572, 17)
[1m1724/1724[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 18ms/step - accuracy: 0.9911 - loss: 0.0261 - val_accuracy: 0.9923 - val_loss: 0.0232


In [12]:

# Evaluate on validation set
loss, accuracy = model.evaluate(
    [X_val, val_decoder_input],
    val_decoder_output,
    batch_size=batch_size
)
print(f"\nValidation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9921 - loss: 0.0239

Validation Loss: 0.0232
Validation Accuracy: 0.9923
