In [0]:
import torch
import  numpy
import spacy
import torchtext
from torchtext.data import Field,BucketIterator,TabularDataset


In [0]:
#loading datafiles in english and french seperately
europarl_en = open('/content/europarl-v7.fr-en.en', encoding='utf-8').read().split('\n')
europarl_fr = open('/content/europarl-v7.fr-en.fr', encoding='utf-8').read().split('\n')

In [0]:
en=spacy.load('en')#loading english language framework for processing

In [5]:
!python -m spacy download fr
fr=spacy.load('fr')#loading french language framework for processing

Collecting fr_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.1.0/fr_core_news_sm-2.1.0.tar.gz (13.1MB)
[K     |████████████████████████████████| 13.1MB 611kB/s 
[?25hBuilding wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.1.0-cp36-none-any.whl size=13156209 sha256=f0eacb6e1d8075b0840a766aa68a9a3d7a87553d08cb2f779e3b9ca2cc4e1723
  Stored in directory: /tmp/pip-ephem-wheel-cache-x9xw1c27/wheels/ab/82/2a/61dd0ff02e22f10eef65a5aa35453a0eb745c84b4c874b612f
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
/usr/

In [0]:
import pandas as pd
#creat ing a dictionary with 'english' and 'french' as key
raw_data={'English':[line for line in europarl_en],'French':[line for line in europarl_fr]}


In [0]:
df=pd.DataFrame.from_dict(raw_data,orient='index')


In [0]:
df=df.transpose()

In [9]:
#4th row
df.iloc[4,:]

English    In the meantime, I should like to observe a mi...
French     En attendant, je souhaiterais, comme un certai...
Name: 4, dtype: object

In [0]:
#adding columns with the length of the sentences
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')

In [11]:
df #we need to del the rows with the nan values

Unnamed: 0,English,French,eng_len,fr_len
0,Resumption of the session,Reprise de la session,3,3.0
1,I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,37,32.0
2,"Although, as you will have seen, the dreaded '...","Comme vous avez pu le constater, le grand ""bog...",30,36.0
3,You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,18,18.0
4,"In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...",39,37.0
...,...,...,...,...
646473,So please go ahead and ask the President-in-Of...,,15,
646474,"That is very gracious of you, Madam Preside...",,27,
646475,That is very fair.,,4,
646476,We shall continue to discuss what is to be don...,,13,


In [12]:
df=df.dropna(axis=0,subset=['fr_len'])
df.shape

(565963, 4)

In [0]:
#remove long sentences and others with large difference in the translations 
df = df.query('fr_len < 80 & eng_len < 80')
df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')


In [14]:
df.shape

(249594, 4)

In [0]:
from sklearn.model_selection import train_test_split
train,val=train_test_split(df,test_size=0.1)


In [16]:
train.shape

(224634, 4)

In [0]:
#saving the train and val to csv files
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)

In [0]:
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]
def tokenize_fr(sentence):
    return [tok.text for tok in fr.tokenizer(sentence)]
#creating two field object to  tokemize and split the data and passing the custom tokenizer function
EN_TEXT = Field(tokenize=tokenize_en)
FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>")

In [0]:
# associate the text in the 'English' column with the EN_TEXT field, # and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]

train,val = TabularDataset.splits(path='/content/', train='train.csv', validation='val.csv', format='csv', fields=data_fields)


In [0]:
sz=3
mask=torch.triu(torch.ones(sz, sz))
mask=(torch.triu(torch.ones(sz, sz)) == 1)


In [21]:
mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))

tensor([[0., 0., 0.],
        [-inf, 0., 0.],
        [-inf, -inf, 0.]])

In [0]:
FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

In [23]:
#check the index of any word
FR_TEXT.vocab.stoi ['the']  #<unk>:0,<pad>:1,<sos>:2,<eos>:3

6548

In [0]:
#Defines an iterator that batches examples of similar lengths together hence reducing the amount of the padding
train_iter = BucketIterator(train, batch_size=20, sort_key=lambda x: len(x.French), shuffle=True)

In [0]:
#batch=(next(iter(train_iter)))

In [26]:
for i,n in enumerate(train_iter):
  print(i,"\n",n)

0 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 47x20]
	[.French]:[torch.LongTensor of size 60x20]
1 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 63x20]
	[.French]:[torch.LongTensor of size 63x20]
2 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 73x20]
	[.French]:[torch.LongTensor of size 80x20]
3 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 73x20]
	[.French]:[torch.LongTensor of size 78x20]
4 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 83x20]
	[.French]:[torch.LongTensor of size 80x20]
5 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 58x20]
	[.French]:[torch.LongTensor of size 67x20]
6 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torch.LongTensor of size 57x20]
	[.French]:[torch.LongTensor of size 65x20]
7 
 
[torchtext.data.batch.Batch of size 20]
	[.English]:[torc

In [0]:
#tr_en=(batch.English)# sentence_length*batch_size
#tr_target=(batch.French)#20 is the batch
#tr_en.shape

In [0]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

In [0]:
class TransformerModel(nn.Module):
  def __init__(self, vocab_size_en,vocab_size_fr, dim_input, nos_head, fd_frwd, nlayers, dropout=0.5):
    super(TransformerModel, self).__init__()
    from torch.nn import TransformerEncoderLayer,TransformerEncoder,TransformerDecoder,TransformerDecoderLayer, Embedding
    self.src_mask=None
    self.embed_en =Embedding(vocab_size_en,dim_input)
    self.pos_encoder=PositionalEncoding(dim_input)
    encoder_layers=TransformerEncoderLayer(dim_input,nos_head,fd_frwd,dropout)
    self.encoder=TransformerEncoder(encoder_layers,nlayers)
    self.dim_input=dim_input
    self.nlayers=nlayers
    self.embed_fr=Embedding(vocab_size_fr,dim_input)
    self.pos_decoder=PositionalEncoding(dim_input)
    dec_layers=TransformerDecoderLayer(dim_input,nos_head,fd_frwd,dropout)
    self.decoder=TransformerDecoder(dec_layers,nlayers)
    self.decoder_out_layer=nn.Linear(dim_input,vocab_size_fr)
    self.output_final=nn.Softmax()
    self.init_weights()
  def _generate_square_subsequent_mask(self, sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
  
  def init_weights(self):
    initrange = 0.1
    self.embed_en.weight.data.uniform_(-initrange, initrange)
    self.embed_fr.weight.data.uniform_(-initrange, initrange)
    self.decoder_out_layer.bias.data.zero_()
    self.decoder_out_layer.weight.data.uniform_(-initrange, initrange)
  
  def forward(self,inp,target):
    inp=self.embed_en(inp)
    inp=self.pos_encoder(inp)
    encoder_output = self.encoder(inp)
    #print(encoder_output.shape)
    #print(self.src_mask == None or self.src_mask.size(0) != len(target))
    if (self.src_mask == None or self.src_mask.size(0) != len(target)): 
      device = target.device
      #print(len(target))
      mask = self._generate_square_subsequent_mask(len(target)).to(device)
      src_mask = mask
      #print(src_mask)
    target = self.embed_fr(target) 
    target = self.pos_decoder(target)
    #print(target.shape)
    output = self.decoder(target,encoder_output,src_mask,memory_mask=None,tgt_key_padding_mask=None, memory_key_padding_mask=None)
    output = self.decoder_out_layer(output)
    out=self.output_final(output)
    return out
    



In [0]:
vocab_size_fr=len(FR_TEXT.vocab.stoi)
vocab_size_en=len(EN_TEXT.vocab.stoi)
dim_input=256
nos_head=4
nlayers=4
fd_frwd=512


In [0]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_seq_len=120):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros((max_seq_len, d_model))
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return (x)

In [0]:
model=TransformerModel(vocab_size_en,vocab_size_fr,dim_input,nos_head,fd_frwd,nlayers)

In [39]:
import time
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) 
start_time=time.time()
tot_loss=0
for i,batch in enumerate(train_iter):
  #print("batch",i)
 
  tr_en=batch.English
  tr_target=batch.French
  optimizer.zero_grad()                     
  model.train() 
  output = model(tr_en,tr_target)
  loss = criterion(output.view(-1, vocab_size_fr), tr_target.view(-1))
  loss.backward()
  optimizer.step()
  log_interval = 200
  tot_loss+=loss
  #print the loss and time per 200 batches
  if i%200 ==0 :
    elapsed = time.time() - start_time
    print("loss:",tot_loss,"\t","time:",elapsed)
    tot_loss=0
    




loss: tensor(11.0471, grad_fn=<AddBackward0>) 	 time: 10.830580711364746


KeyboardInterrupt: ignored

In [0]:
vocab_size_fr

In [0]:
output.view(-1, vocab_size_fr).shape
            

In [0]:
tr_target.shape

In [0]:
tr_target.view(-1).shape