In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader,TensorDataset
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the English/French sentence pairs from the CSV kept on the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/eng_-french.csv')

In [None]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [None]:
# One Keras tokenizer instance with default settings; it is fitted on both languages below.
tokenizer = Tokenizer()

In [None]:
# Lower-case the English text; this overwrites the original column in place.
df['English words/sentences'] = df['English words/sentences'].str.lower()

In [None]:
# Lower-case the French text; this overwrites the original column in place.
df['French words/sentences'] = df['French words/sentences'].str.lower()

In [None]:
# Fit one shared vocabulary: every English sentence followed by every French
# sentence goes into a single flat list that the tokenizer is fitted on.
corpus = [
    *df["English words/sentences"].tolist(),
    *df["French words/sentences"].tolist(),
]
tokenizer.fit_on_texts(corpus)

In [None]:
# Convert each sentence column into lists of integer word ids using the fitted
# tokenizer. The destination column names are kept exactly as later cells read them.
for text_col, ids_col in (
    ('English words/sentences', 'tokenized_English'),
    ('French words/sentences', 'tokenized French'),
):
    df[ids_col] = tokenizer.texts_to_sequences(df[text_col])

In [None]:
# Longest tokenized sentence across both languages; both sides are padded to it.
max_len = int(max(
    df['tokenized_English'].map(len).max(),
    df['tokenized French'].map(len).max(),
))

# Right-pad every id list to max_len and store one fixed-length array per row.
for ids_col, padded_col in (
    ('tokenized_English', 'padded_English'),
    ('tokenized French', 'padded_French'),
):
    padded_matrix = pad_sequences(df[ids_col], maxlen=max_len, padding='post')
    df[padded_col] = [row for row in padded_matrix]

In [None]:
# Size used for the embedding and output layers further down: one slot per word
# known to the tokenizer, plus one extra because Keras Tokenizer word indices
# start at 1 (index 0 is never assigned to a word).
word_count = len(tokenizer.index_word)+1

In [None]:
# Show the resulting vocabulary size.
word_count

43046

In [None]:
# Each cell of the padded columns holds its own ndarray. Stack them into a single
# 2-D (rows, max_len) array before handing them to torch: building a tensor
# directly from a Series of separate ndarrays is very slow and emits a warning.
X = torch.tensor(np.stack(df['padded_English'].to_numpy()), dtype=torch.long)
y = torch.tensor(np.stack(df['padded_French'].to_numpy()), dtype=torch.long)

  X = torch.tensor(df['padded_English'],dtype=torch.long)


In [None]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a stacked, batch-first LSTM.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (LSTM input size).
        hidden_layer: LSTM hidden-state size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_layer,num_layers):
        super().__init__()
        # Attribute names are left unchanged: they form the state_dict keys.
        self.Embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_layer,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, input_seq):
        """Return ``(output, hidden, cell)`` produced by the LSTM for ``input_seq``."""
        lstm_out, final_state = self.LSTM(self.Embedding(input_seq))
        hidden, cell = final_state
        return lstm_out, hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per batch element in, vocabulary logits out.

    Args:
        output_dim: target vocabulary size (embedding rows and logit width).
        embedding_dim: size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden-state size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning.
        lstm_dropout = 0.1 if num_layers > 1 else 0.0
        self.LSTM = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_seq, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input_seq: 1-D tensor of token ids, one per batch element.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(logits, hidden, cell)`` where ``logits`` has one row per batch
            element and ``output_dim`` columns.
        """
        # Add a length-1 time axis so the batch-first LSTM sees (batch, 1).
        input_token = input_seq.unsqueeze(1)
        embedded = self.embedding(input_token)
        output, (hidden, cell) = self.LSTM(embedded, (hidden, cell))
        # Drop the time axis again before projecting to vocabulary logits.
        output = self.out(output.squeeze(1))
        return output, hidden, cell

In [None]:
import random
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time with teacher forcing.

    Args:
        encoder: module returning ``(outputs, hidden, cell)`` for a source batch.
        decoder: module taking ``(token_ids, hidden, cell)`` and returning
            ``(logits, hidden, cell)``; it must expose ``out.out_features`` as
            the vocabulary size.

    NOTE(review): decoding is seeded with ``target_seq[:, 0]`` and that position
    is never predicted. The preprocessing visible in this notebook does not
    appear to add an explicit start marker to the target sentences - confirm
    this is intended.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq, teacher_forcing_ratio=0.5):
        """Return logits of shape ``(batch, target_len, vocab)``.

        Position 0 along the time axis is left as zeros because decoding starts
        at step 1. At every step the next decoder input is the ground-truth
        token with probability ``teacher_forcing_ratio``, otherwise the
        decoder's own arg-max prediction.
        """
        batch_size = input_seq.size(0)
        target_len = target_seq.size(1)
        vocab_size = self.decoder.out.out_features

        # Allocate directly on the input's device; creating the buffer on the
        # CPU first and copying it over would repeat a large transfer per call.
        outputs = torch.zeros(
            batch_size, target_len, vocab_size, device=input_seq.device
        )

        # Only the final encoder states are used to initialise the decoder.
        _, hidden, cell = self.encoder(input_seq)

        # The first decoder input is the first target token.
        decoder_input = target_seq[:, 0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t, :] = output

            # One random draw per step, shared by the whole batch.
            if random.random() < teacher_forcing_ratio:
                decoder_input = target_seq[:, t]
            else:
                decoder_input = output.argmax(1)

        return outputs

In [None]:
# Hyper-parameters.
vocab_size = word_count
embedding_dim =  512
hidden_dim  = 256
num_layers = 2

# Encoder and decoder share hidden size and depth so the encoder's final
# (hidden, cell) states can initialise the decoder directly.
encoder = Encoder(vocab_size,embedding_dim,hidden_dim,num_layers)
decoder1 = Decoder(vocab_size,embedding_dim,hidden_dim,num_layers)
model = Seq2Seq(encoder,decoder1)

# Index 0 is the fill value pad_sequences writes after each sentence. Without
# ignore_index the loss would be dominated by trivially predicting padding.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
X,y = X.to(device),y.to(device)
dataset =TensorDataset(X,y)
dataloader = DataLoader(dataset,batch_size=32,shuffle=True)
epochs = 10

# Make the training mode explicit so dropout is active even if an earlier cell
# switched the model to eval().
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch_X, batch_y in dataloader:
        # No-op while the whole dataset already lives on `device`; kept so the
        # loop still works if the dataset is left on the CPU.
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X, batch_y)
        # Seq2Seq.forward never predicts time step 0 (it stays all zeros), so
        # that position is excluded from the loss. reshape() is required because
        # the slices are not contiguous.
        outputs_flat = outputs[:, 1:].reshape(-1, outputs.size(-1))
        targets_flat = batch_y[:, 1:].reshape(-1)
        loss = loss_function(outputs_flat, targets_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}')