In [1]:
# Packages
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [4]:
# ! python -m spacy download de

Collecting de_core_news_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 133 kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25ldone
[?25h  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.1.0-py3-none-any.whl size=11073066 sha256=9bdae4b917cab9f06d7c743b175ac0f950035d46056c7630f7c22246b1f8c12d
  Stored in directory: /tmp/pip-ephem-wheel-cache-yapx0c5f/wheels/19/c6/ac/c9e47e5851255b175d864b74fef866e331c8375a5517a912cb
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
  Attempting uninstall: de-core-news-sm
    Found existing installation: de-core-news-sm 2.2.5
    Uninstalling de-core-news-sm-2.2.5:
      Successfully uninstalled de-core-news-sm-2.2.5
Successfully installed de-core-news-sm-2.1.0
[38;5;2m✔ Downl

In [6]:
# ! python -m spacy download en

Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 125 kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.1.0-py3-none-any.whl size=11074433 sha256=e03ca909e6758da794722a633f59ba13f600af9add4c22577d6f30cad73916e6
  Stored in directory: /tmp/pip-ephem-wheel-cache-ve4z2h3u/wheels/59/4f/8c/0dbaab09a776d1fa3740e9465078bfd903cc22f3985382b496
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-2.1.0
[38;5;2m✔ Download and instal

In [2]:
# Create the tokenizers.
# The pipelines are loaded by their full package names (the names pip reports
# when the models are downloaded). The short 'de' / 'en' aliases only resolve
# when a shortcut link was created successfully, and spaCy v3 removed them.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [3]:
def tokenize_de(text):
    """Split a German string into a list of token strings.

    Only the tokenizer of the loaded German spaCy pipeline (`spacy_de`) is run.
    """
    german_tokens = spacy_de.tokenizer(text)
    return [token.text for token in german_tokens]

def tokenize_en(text):
    """Split an English string into a list of token strings.

    Only the tokenizer of the loaded English spaCy pipeline (`spacy_en`) is run.
    """
    english_tokens = spacy_en.tokenizer(text)
    return [token.text for token in english_tokens]

In [4]:
# The model expects data to be fed in with batch dimension first, so we use batch_first=True

In [5]:
# Source (German) field: tokenized with tokenize_de, lower-cased, wrapped in
# <sos> ... <eos>, and batched with the batch dimension first.
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

# Target (English) field, configured like SRC but tokenized with tokenize_en.
# Fixed: eos_token was '<sos>', identical to init_token, so target sequences
# had no distinct end-of-sequence marker. It is now '<eos>', matching SRC.
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

In [6]:
# We load the Multi30k dataset and build the vocabulary

In [7]:
# Load the three Multi30k splits. The '.de' side of every pair is processed
# by SRC and the '.en' side by TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [8]:
# Build both vocabularies from the training split only (train_data is the
# sole dataset passed in), using min_freq=2 for source and target alike.
# NOTE(review): per torchtext, min_freq is the minimum count a token needs to
# enter the vocabulary; rarer tokens presumably map to <unk> — confirm.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [9]:
# Finally we define the device and the data iterator

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [11]:
# Batch size shared by the train / validation / test iterators.
BATCH_SIZE = 64

# One BucketIterator per split, in the same order as the datasets passed in.
# `device` is forwarded so the iterators create their batches for that device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Building the model

## Encoder

In [13]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of EncoderLayer blocks.

    Args:
        input_dim: number of rows in the token embedding table (source
            vocabulary size).
        hid_dim: width of the token / positional embeddings and of every
            tensor passed between layers.
        n_layers: number of EncoderLayer blocks to stack.
        n_heads: forwarded to each EncoderLayer.
        pf_dim: forwarded to each EncoderLayer.
        dropout: dropout probability applied to the summed embeddings and
            forwarded to each EncoderLayer.
        device: device on which the position indices and the scale tensor
            are created.
        max_length: number of rows in the positional embedding table; a
            sequence longer than this has no embedding for its last positions.
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        # Fixed: was `super.__init()`, which raises AttributeError before any
        # submodule can be registered on the nn.Module.
        super().__init__()

        self.device = device

        # Fixed: the embedding width was the undefined name `output_dim`.
        # It must be hid_dim so it can be added to pos_embedding below.
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList(
            [EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)]
        )

        self.dropout = nn.Dropout(dropout)

        # sqrt(hid_dim): token embeddings are multiplied by this before the
        # positional embeddings are added.
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        """Embed `src` and run it through every encoder layer.

        Args:
            src: token indices, [batch_size, src_len].
            src_mask: passed unchanged to every layer
                (assumed [batch_size, src_len] per the original comment —
                TODO confirm against the attention layer).

        Returns:
            Tensor of shape [batch_size, src_len, hid_dim].
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]

        # pos = [batch_size, src_len]; row i is 0, 1, ..., src_len - 1.
        pos = torch.arange(0, src_len, device=self.device).unsqueeze(0).repeat(batch_size, 1)

        # src = [batch_size, src_len, hid_dim]
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        # src = [batch_size, src_len, hid_dim]
        return src

## Encoder Layer

In [14]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a position-wise
    feed-forward sublayer, each wrapped in dropout, a residual connection
    and layer normalisation.

    Args:
        hid_dim: width of the input / output tensors and of the LayerNorm.
        n_heads: forwarded to MultiHeadAttentionLayer.
        pf_dim: forwarded to PositionwiseFeedforwardLayer.
        dropout: dropout probability for the residual branches; also
            forwarded to both sublayers.
        device: forwarded to MultiHeadAttentionLayer.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        # NOTE(review): one LayerNorm instance is applied after both
        # sublayers, so they share its parameters. Kept as written — confirm
        # that sharing is intended.
        self.layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    # Fixed: forward used to be indented inside __init__, which made it a
    # local function there and left the class without a forward method.
    def forward(self, src, src_mask):
        """Apply self-attention and the feed-forward sublayer to `src`.

        Args:
            src: [batch_size, src_len, hid_dim].
            src_mask: passed to the self-attention layer as its mask
                (assumed [batch_size, src_len] per the original comment —
                TODO confirm against the attention layer).

        Returns:
            Tensor of shape [batch_size, src_len, hid_dim].
        """
        # Self-attention: query, key and value are all `src`. The second
        # return value of the attention layer is discarded.
        _src, _ = self.self_attention(src, src, src, src_mask)

        # Dropout, residual connection and layer norm.
        # src = [batch_size, src_len, hid_dim]
        src = self.layer_norm(src + self.dropout(_src))

        # Position-wise feed-forward.
        _src = self.positionwise_feedforward(src)

        # Dropout, residual connection and layer norm.
        # src = [batch_size, src_len, hid_dim]
        src = self.layer_norm(src + self.dropout(_src))

        return src

## MultiHead Attention Layer

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim%n_heads=0
        
        self.hid_dim=hid_dim
        self.n_heads=n_heads
        self.head_dim=hid_dim//n_heads
        
        self.fc_q=nn.Linear(hid_dim, hid_dim)
        self.fc_k=nn.Linear(hid_dim, hid_dim)
        self.fc_v=nn.Linear(hid_dim, hid_dim)
        
        self.fc_o=nn.Linear(hid_dim, hid_dim)
        
        self.dropout=nn.Dropout(dropout)
        
        self.scale=torch.sqrt(torch.FloatTensor([self.hid_dim])).to(device)
        
        def forward(self, query, key, value, mask=None):
            
            batch_size=query.shape[0]
            
            # query = [batch_size, query_len, hid_dim]
            # key = [batch_size, key_len, hid_dim]
            # value = [batch_size, value_len, hid_dim]
            
            Q = self.fc_q(query)
            K = self.fc_k(key)
            V = self.fc_v(value)
            
            Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
            K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
            V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
            # Q = [batch_size, query_len, hid_dim]
            # K = [batch_size, key_len, hid_dim]
            # V = [batch_size, value_len, hid_dim]
            
            