In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import gzip
import time
import math
import spacy
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [24]:
spacy_de=spacy.load('de_core_news_sm')
spacy_en=spacy.load('en_core_web_sm')

In [None]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

In [3]:
SEED=123
torch.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

<torch._C.Generator at 0x157dd74b3b0>

In [33]:
#load the dataset
class Multi30kDataset(Dataset):
    def __init__(self,src_file, trg_file, src_transform=None, trg_transform=None):
        self.src_data=self.load_data(src_file)
        self.trg_data=self.load_data(trg_file)
        self.src_transform=src_transform
        self.trg_transform=trg_transform
    
    def load_data(self, file_path):
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            data = f.readlines()
        return data
    
    def __len__(self):
        return len(self.src_data)
    
    def __getitem__(self, idx):
        src_sentence = self.src_data[idx].strip()
        trg_sentence = self.trg_data[idx].strip()
        
        if self.src_transform:
            src_sentence = self.src_transform(src_sentence)
        if self.trg_transform:
            trg_sentence = self.trg_transform(trg_sentence)
        
        return {"src": src_sentence, "trg": trg_sentence}



In [34]:
def tokenize_de(text):
    return [token.text.lower() for token in spacy_de.tokenizer(text)]

In [35]:
def tokenize_en(text):
    return [token.text.lower() for token in spacy_en.tokenizer(text)]

In [36]:
train_de_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\train.de.gz"
train_en_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\train.en.gz"
val_de_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\val.de.gz"
val_en_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\val.en.gz"
test_de_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\test_2016_flickr.de.gz"
test_en_path="C:\\Complete_Content\\GENERATIVEAI\\NEW_E2E_COURSE\\genai_bootcamp\\data\\test_2016_flickr.en.gz"

In [37]:
# Load datasets
train_data = Multi30kDataset(train_de_path, train_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
val_data = Multi30kDataset(val_de_path, val_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
test_data = Multi30kDataset(test_de_path, test_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)


# Define special tokens
PAD_TOKEN = '<pad>'
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'
UNK_TOKEN = '<unk>'

In [38]:
def create_vocab(tokenized_sentences,special_tokens):
    vocab = {token: idx for idx, token in enumerate(special_tokens)}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab
    

In [39]:
# Tokenize all sentences
train_de_tokenized = [tokenize_de(sentence.strip()) for sentence in train_data.src_data]
train_en_tokenized = [tokenize_en(sentence.strip()) for sentence in train_data.trg_data]

In [40]:
train_de_tokenized

[['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 ['mehrere',
  'männer',
  'mit',
  'schutzhelmen',
  'bedienen',
  'ein',
  'antriebsradsystem',
  '.'],
 ['ein',
  'kleines',
  'mädchen',
  'klettert',
  'in',
  'ein',
  'spielhaus',
  'aus',
  'holz',
  '.'],
 ['ein',
  'mann',
  'in',
  'einem',
  'blauen',
  'hemd',
  'steht',
  'auf',
  'einer',
  'leiter',
  'und',
  'putzt',
  'ein',
  'fenster',
  '.'],
 ['zwei',
  'männer',
  'stehen',
  'am',
  'herd',
  'und',
  'bereiten',
  'essen',
  'zu',
  '.'],
 ['ein',
  'mann',
  'in',
  'grün',
  'hält',
  'eine',
  'gitarre',
  ',',
  'während',
  'der',
  'andere',
  'mann',
  'sein',
  'hemd',
  'ansieht',
  '.'],
 ['ein', 'mann', 'lächelt', 'einen', 'ausgestopften', 'löwen', 'an', '.'],
 ['ein',
  'schickes',
  'mädchen',
  'spricht',
  'mit',
  'dem',
  'handy',
  'während',
  'sie',
  'langsam',
  'die',
  'straße',
  'entlangschwebt',
  '.'],

In [41]:
# Create vocabularies with special tokens
SRC_VOCAB = create_vocab(train_de_tokenized, [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])
TRG_VOCAB = create_vocab(train_en_tokenized, [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])

In [42]:
SRC_VOCAB

{'<pad>': 0,
 '<sos>': 1,
 '<eos>': 2,
 '<unk>': 3,
 'zwei': 4,
 'junge': 5,
 'weiße': 6,
 'männer': 7,
 'sind': 8,
 'im': 9,
 'freien': 10,
 'in': 11,
 'der': 12,
 'nähe': 13,
 'vieler': 14,
 'büsche': 15,
 '.': 16,
 'mehrere': 17,
 'mit': 18,
 'schutzhelmen': 19,
 'bedienen': 20,
 'ein': 21,
 'antriebsradsystem': 22,
 'kleines': 23,
 'mädchen': 24,
 'klettert': 25,
 'spielhaus': 26,
 'aus': 27,
 'holz': 28,
 'mann': 29,
 'einem': 30,
 'blauen': 31,
 'hemd': 32,
 'steht': 33,
 'auf': 34,
 'einer': 35,
 'leiter': 36,
 'und': 37,
 'putzt': 38,
 'fenster': 39,
 'stehen': 40,
 'am': 41,
 'herd': 42,
 'bereiten': 43,
 'essen': 44,
 'zu': 45,
 'grün': 46,
 'hält': 47,
 'eine': 48,
 'gitarre': 49,
 ',': 50,
 'während': 51,
 'andere': 52,
 'sein': 53,
 'ansieht': 54,
 'lächelt': 55,
 'einen': 56,
 'ausgestopften': 57,
 'löwen': 58,
 'an': 59,
 'schickes': 60,
 'spricht': 61,
 'dem': 62,
 'handy': 63,
 'sie': 64,
 'langsam': 65,
 'die': 66,
 'straße': 67,
 'entlangschwebt': 68,
 'frau': 69,
 '

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self):
        pass

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        pass

In [14]:
class PositionwiseFeedforward(nn.Module):
    def __init__(self):
        pass

In [15]:
class EncoderLayer(nn.Module):
    def __init__(self):
        pass
    

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self):
        pass

In [16]:
class Transformer(nn.Module):
    def __init__(self,src_vocab_size, trg_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(trg_vocab_size, d_model)
        
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([d_model]))
        
    def generate_mask(self,src,trg):
        pass

    def forward(self,src,trg):
        pass
        
        
        
        
        
        
        
    