In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only) so later cells can read
# and write files under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import random
import os
import time

In [10]:
# Load the English/French sentence pairs from the mounted Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/English French Translation.csv')

In [11]:
# Pick the compute device once; every later cell moves tensors/modules to it.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")
if use_cuda:
    # Only query the GPU name when a CUDA device is actually present.
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU Name: Tesla T4


In [12]:
# Preview the first rows to check the column names before preprocessing.
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [14]:
# Text normalisation: lower-case both languages and wrap every French target
# in explicit <start>/<end> markers (first decoder input and stop token).
df['English words/sentences'] = df['English words/sentences'].str.lower()
df['French words/sentences'] = (
    '<start> ' + df['French words/sentences'].str.lower() + ' <end>'
)

# One tokenizer shared by both languages. filters='' turns off Keras' default
# punctuation stripping, which would otherwise remove the '<' and '>' of the
# markers; unseen words map to '<unk>'.
tokenizer = Tokenizer(filters='', oov_token='<unk>')
corpus = [
    *df["English words/sentences"].tolist(),
    *df["French words/sentences"].tolist(),
]
tokenizer.fit_on_texts(corpus)

# Text -> integer id sequences, stored alongside the raw text.
df['tokenized_English'] = tokenizer.texts_to_sequences(df['English words/sentences'])
df['tokenized_French'] = tokenizer.texts_to_sequences(df['French words/sentences'])

# Pad both sides (with trailing zeros) to the longest sequence in either language.
longest_english = max(map(len, df['tokenized_English']))
longest_french = max(map(len, df['tokenized_French']))
max_len = max(longest_english, longest_french)
print(f"Max Sequence Length: {max_len}")
X = pad_sequences(df['tokenized_English'], maxlen=max_len, padding='post')
y = pad_sequences(df['tokenized_French'], maxlen=max_len, padding='post')

# Build the id tensors directly on the selected device.
X = torch.tensor(X, dtype=torch.long, device=device)
y = torch.tensor(y, dtype=torch.long, device=device)

# +1 because index 0 is reserved for padding and never appears in word_index.
vocab_size = 1 + len(tokenizer.word_index)

# Define Encoder
class Encoder(nn.Module):
    """Source-side network: a token embedding feeding a batch-first LSTM.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        # Dropout between LSTM layers is only requested when the LSTM is stacked.
        if num_layers > 1:
            lstm_dropout = 0.2
        else:
            lstm_dropout = 0
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

    def forward(self, x):
        """Embed the token ids in ``x`` and run them through the LSTM.

        Returns:
            A 3-tuple ``(outputs, hidden, cell)``: the LSTM's per-step outputs
            followed by its final hidden and cell states.
        """
        outputs, final_state = self.lstm(self.embedding(x))
        hidden, cell = final_state
        return outputs, hidden, cell

# Define Decoder
class Decoder(nn.Module):
    """Target-side network that advances exactly one token per call.

    Args:
        vocab_size: size of the embedding table and of the output projection.
        embedding_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        # Dropout between LSTM layers is only requested when the LSTM is stacked.
        if num_layers > 1:
            lstm_dropout = 0.2
        else:
            lstm_dropout = 0
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: one token id per batch element (a length-1 time axis is added here).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            vocabulary scores from ``fc`` and the states are the updated ones.
        """
        step_input = self.embedding(x.unsqueeze(1))
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        # Drop the length-1 time axis again before projecting to the vocabulary.
        prediction = self.fc(step_output.squeeze(1))
        return prediction, hidden, cell

# Define Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: callable returning ``(outputs, hidden, cell)`` for a source batch.
        decoder: callable taking ``(token, hidden, cell)`` and returning
            ``(prediction, hidden, cell)``; must expose ``fc.out_features``
            (the size of the output vocabulary).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.size(1) - 1`` steps conditioned on ``src``.

        Args:
            src: batch-first source token ids.
            trg: batch-first target token ids; column 0 is fed as the first
                decoder input.
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax prediction.

        Returns:
            A ``(batch, trg_len, vocab)`` tensor of scores. Position 0 along
            the time axis is never written and stays all zeros, so callers
            should exclude it when computing a loss.
        """
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.fc.out_features

        # Allocate on the same device as the inputs rather than relying on a
        # notebook-level global, so the module works wherever its inputs live.
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=src.device)

        _, hidden, cell = self.encoder(src)
        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input_token = trg[:, t] if teacher_force else output.argmax(1)
        return outputs



Max Sequence Length: 57


In [15]:
# Model parameters
embedding_dim = 512
hidden_dim = 256
num_layers = 2
batch_size = 32
epochs = 20

encoder = Encoder(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# Loss and optimizer. ignore_index=0 excludes padded target positions.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Checkpoints live on Drive so training can be resumed after a Colab disconnect.
checkpoint_dir = '/content/drive/MyDrive/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, 'model_checkpoint.pt')

# Resume from the last saved epoch if a checkpoint exists.
start_epoch = 0
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming from epoch {start_epoch}")

for epoch in range(start_epoch, epochs):
    model.train()
    total_loss = 0
    epoch_start = time.time()
    for i, (batch_X, batch_y) in enumerate(dataloader):
        batch_start = time.time()
        optimizer.zero_grad()
        outputs = model(batch_X, batch_y)
        # Seq2Seq.forward starts decoding at t=1, so outputs[:, 0] is all zeros
        # while batch_y[:, 0] is the (non-pad) <start> id. Scoring that position
        # only adds a constant ln(vocab_size) term per sequence, which inflates
        # the reported loss and dilutes the mean-reduced gradient. Skip it.
        # reshape() is needed because the sliced tensors are not contiguous.
        logits = outputs[:, 1:, :].reshape(-1, vocab_size)
        targets = batch_y[:, 1:].reshape(-1)
        loss = loss_function(logits, targets)
        loss.backward()
        # Clip the global gradient norm to 1 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        total_loss += loss.item()
        if i % 500 == 0:
            print(f"Epoch {epoch+1}, Batch {i}/{len(dataloader)}, Batch Time: {time.time() - batch_start:.2f}s")

    avg_loss = total_loss / len(dataloader)
    epoch_time = time.time() - epoch_start
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s')

    # Save every 5th epoch; 'epoch' is stored 0-based and resumed as epoch + 1.
    if (epoch + 1) % 5 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, checkpoint_path)
        print(f"Checkpoint saved at epoch {epoch+1}")


Epoch 1, Batch 0/5489, Batch Time: 1.16s
Epoch 1, Batch 500/5489, Batch Time: 1.09s
Epoch 1, Batch 1000/5489, Batch Time: 0.98s
Epoch 1, Batch 1500/5489, Batch Time: 0.97s
Epoch 1, Batch 2000/5489, Batch Time: 0.98s


KeyboardInterrupt: 

In [None]:
def translate_sentence(sentence, model, tokenizer, max_len, device):
    """Greedily translate one sentence with a trained encoder-decoder model.

    Args:
        sentence: raw source text; it is lower-cased before tokenization.
        model: object exposing ``eval()``, ``encoder(input)`` returning
            ``(outputs, hidden, cell)`` and ``decoder(token, hidden, cell)``
            returning ``(scores, hidden, cell)``.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` (with ``'<start>'`` and ``'<end>'`` entries) and
            ``index_word``.
        max_len: padded source length and the maximum number of decoding steps.
        device: device the input tensors are moved to.

    Returns:
        The decoded words joined by single spaces. The ``<end>`` marker is
        never included. If decoding stops because ``max_len`` steps were
        reached without ``<end>``, every decoded word is kept.

    Raises:
        KeyError: if the tokenizer has no ``'<start>'`` or ``'<end>'`` entry.
    """
    model.eval()
    # Look the marker ids up once instead of on every decoding step.
    start_id = tokenizer.word_index['<start>']
    end_id = tokenizer.word_index['<end>']

    with torch.no_grad():
        tokens = tokenizer.texts_to_sequences([sentence.lower()])
        padded = pad_sequences(tokens, maxlen=max_len, padding='post')
        input_tensor = torch.tensor(padded, dtype=torch.long).to(device)
        _, hidden, cell = model.encoder(input_tensor)

        decoded_ids = []
        prev_token = start_id
        for _ in range(max_len):
            trg_tensor = torch.tensor([prev_token], dtype=torch.long).to(device)
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
            prev_token = output.argmax(1).item()
            if prev_token == end_id:
                # Stop without recording the marker, so nothing has to be
                # sliced off afterwards (the old unconditional [:-1] also
                # dropped a real word when <end> was never produced).
                break
            decoded_ids.append(prev_token)

    return ' '.join(tokenizer.index_word.get(idx, '<unk>') for idx in decoded_ids)

# Quick sanity check: greedy-decode one hand-written sentence.
test_sentence = "Hello, how are you?"
translation = translate_sentence(
    sentence=test_sentence,
    model=model,
    tokenizer=tokenizer,
    max_len=max_len,
    device=device,
)
print(f"English: {test_sentence}")
print(f"French: {translation}")

# Persist only the weights (state dict) to Drive.
final_model_path = '/content/drive/MyDrive/final_model.pt'
torch.save(model.state_dict(), final_model_path)
print("Model saved successfully.")