<a href="https://colab.research.google.com/github/whoami-Lory271/DL-project/blob/andrea/transformer_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from numpy import array
from sklearn.preprocessing import OneHotEncoder
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.normalization import LayerNorm

In [2]:
def create_vocabulary(sentences):
  vocabulary = {}
  vocabulary['<sos>'] = 1
  vocabulary['<eos>'] = 2
  vocabulary['<pad>'] = 0
  index = 3
  for s in sentences:
    tokens = s.split()
    for t in tokens:
      if t not in vocabulary:
        vocabulary[t] = index
        index += 1
  return vocabulary

In [3]:
def encoding(vocabulary,sentences):
  X = [[vocabulary[token]] for sentence in sentences for token in sentence.split()]
  X = torch.tensor(X)
  X = X.reshape((len(sentences),X.shape[0] // len(sentences)))
  
  pad = X == vocabulary['<pad>']
  padding_mask = pad.repeat(1,1,X.shape[1]).reshape((X.shape[0],X.shape[1],X.shape[1]))
  padding_mask[pad] = True

  return X,padding_mask

In [4]:
class EmbeddingLayer(nn.Module):
  def __init__(self,vocabulary_size,embedding_dim,padding_idx=0):
    super().__init__()
    self.E = nn.Embedding(vocabulary_size,embedding_dim,padding_idx=0)
  
  def forward(self,x):
    return self.E(x)

In [5]:
def position_embedding(batch_size,seq_length,emb_dim,pad_mask):
  res = torch.zeros((batch_size,seq_length,emb_dim),dtype=torch.float32)
  for pos in range(seq_length):
    for i in range(emb_dim):
      if i%2 == 0:
        res[:,pos,i] = math.sin(pos/10000**(2*i/emb_dim))
      else:
        res[:,pos,i] = math.cos(pos/10000**(2*i/emb_dim))
  res[pad_mask[:,0,:]] = 0
  return res

In [6]:
class SelfAttention(nn.Module):
  def __init__(self,dmodel,dk,dv):
    super().__init__()
    self.dk = dk
    self.Wq = nn.Linear(dmodel,dk)
    self.Wk = nn.Linear(dmodel,dk)
    self.Wv = nn.Linear(dmodel,dv)
    self.softmax = nn.Softmax(dim=2)
  
  def forward(self,x,padding_mask,enc=None,mask=False,other_mask=None):
    q = self.Wq(x)
    if enc == None:
      k = self.Wk(x)
      v = self.Wv(x)
    else:
      k = self.Wk(enc)
      v = self.Wv(enc)
    sc = torch.matmul(q,k.permute(0,2,1)) / math.sqrt(self.dk)

    if other_mask == None:
      sc[padding_mask] = float('-inf')
    else:
      qmod = q.clone()
      kmod = k.clone()
      qmod[padding_mask[:,0,:]] = 0
      kmod[other_mask[:,0,:]] = 0
      sc = torch.matmul(qmod,kmod.permute(0,2,1)) / math.sqrt(self.dk)
      sc[sc == 0] = float('-inf')

    if mask==True:
      for i in range(sc.shape[1]):
        sc[:,i,i+1:] = float('-inf')
    score = torch.matmul(torch.nan_to_num(self.softmax(sc)),v)
    return score

In [7]:
class MultiHeadAttention(nn.Module):
  def __init__(self,dmodel,dk,dv,nhead,dropout=0.1):
    super().__init__()
    self.nhead = nhead
    self.att_layers = nn.ModuleList([SelfAttention(dmodel,dk,dv) for i in range(nhead)])
    self.Wo = nn.Linear(dv * nhead, dmodel)
    self.drop = nn.Dropout(p=dropout)
  
  def forward(self,x,padding_mask,enc=None,mask=False,other_mask=None):
    y = self.att_layers[0](x,padding_mask,enc=enc,mask=mask,other_mask=other_mask)
    for i in range(1,self.nhead):
      y = torch.cat([y,self.att_layers[i](x,padding_mask,enc=enc,mask=mask,other_mask=other_mask)],dim=2)
    y = self.Wo(y)
    y = self.drop(y)
    return y

In [None]:
class FFN(nn.Module):
  def __init__(self,dmodel,df,dropout=0.1):
    super().__init__()
    self.W1 = nn.Linear(dmodel,df)
    self.W2 = nn.Linear(df,dmodel)
    self.drop = nn.Dropout(p=dropout)
  
  def forward(self,x):
    x = self.W1(x)
    x = F.relu(x)
    x = self.W2(x)
    x = self.drop(x)
    return x

In [None]:
class Encoder(nn.Module):
  def __init__(self,dmodel,dk,dv,df,nhead):
    super().__init__()
    self.mha = MultiHeadAttention(dmodel,dk,dv,nhead)
    self.norm1 = LayerNorm(dmodel)
    self.ffn = FFN(dmodel,df)
    self.norm2 = LayerNorm(dmodel)
  
  def forward(self,x,padding_mask):
    z = self.mha(x,padding_mask)
    z = self.norm1(x+z)
    y = self.ffn(z)
    return self.norm2(z+y)

In [None]:
class Decoder(nn.Module):
  def __init__(self,dmodel,dk,dv,df,nhead):
    super().__init__()
    self.masked_mha = MultiHeadAttention(dmodel,dk,dv,nhead)
    self.norm1 = LayerNorm(dmodel)
    self.enc_dec_attention = MultiHeadAttention(dmodel,dk,dv,nhead)
    self.norm2 = LayerNorm(dmodel)
    self.ffn = FFN(dmodel,df)
    self.norm3 = LayerNorm(dmodel)
  
  def forward(self,x,enc,padding_mask,other_mask):
    z1= self.masked_mha(x,padding_mask,mask=True)
    z1 = self.norm1(x+z1)
    z2= self.enc_dec_attention(z1,padding_mask,enc=enc,mask=False,other_mask=other_mask)
    z2 = self.norm2(z1+z2)
    y = self.ffn(z2)
    return self.norm3(z2+y)

In [None]:
class Transformer(nn.Module):
  def __init__(self,in_voc_size,out_voc_size,dmodel,dk,dv,df,nhead,nlayers,dropout=0.1):
    super().__init__()
    self.nlayers = nlayers
    self.in_embedding = EmbeddingLayer(in_voc_size,dmodel)
    self.out_embedding = EmbeddingLayer(out_voc_size,dmodel)
    self.encoders = nn.ModuleList([Encoder(dmodel,dk,dv,df,nhead) for i in range(nlayers)])
    self.decoders = nn.ModuleList([Decoder(dmodel,dk,dv,df,nhead) for i in range(nlayers)])
    self.drop1 = nn.Dropout(p=dropout)
    self.drop2 = nn.Dropout(p=dropout)

  def forward(self,x,z,in_padding_mask,out_padding_mask,encoding=True):
    if encoding:
      emb = self.in_embedding(x) * math.sqrt(dmodel)
      t = position_embedding(x.shape[0],x.shape[1],dmodel,in_padding_mask)
      x = emb + t
      x = self.drop1(x)
      for i in range(self.nlayers):
        x = self.encoders[i](x,in_padding_mask)

    emb = self.out_embedding(z) * math.sqrt(dmodel)
    t = position_embedding(z.shape[0],z.shape[1],dmodel,out_padding_mask)
    z = emb + t
    z = self.drop2(z)
    for i in range(self.nlayers):
      z = self.decoders[i](z,x,out_padding_mask,in_padding_mask)
    
    z = z @ self.out_embedding.E.weight.T

    return x,z

In [None]:
in_sentences = ["<sos> dai ragazzi per una volta che ci andiamo non scegliamo il posto che fa pagare poco <eos>","<sos> altrimenti tanto vale andare a mensa <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>","<sos> importante è la compagnia <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"]
in_vocabulary = create_vocabulary(in_sentences)

In [None]:
out_sentences = ["<sos> come on guys for once let's not choose the place that charges little <eos>","<sos> otherwise we might as well go to the canteen <eos> <pad> <pad> <pad> <pad>","<sos> important is company <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"]
out_vocabulary = create_vocabulary(out_sentences)

In [None]:
input,in_pad_mask = encoding(in_vocabulary,in_sentences)
teacher,out_pad_mask = encoding(out_vocabulary,out_sentences)

In [None]:
dmodel = 512
dk,dv = 64,64
nhead = 8
df = 2048
nlayers = 6
in_vocabulary_size = len(in_vocabulary.keys())
out_vocabulary_size = len(out_vocabulary.keys())

In [None]:
tran = Transformer(in_vocabulary_size,out_vocabulary_size,dmodel,dk,dv,df,nhead,nlayers)

In [None]:
loss = nn.CrossEntropyLoss()
opt = torch.optim.Adam(tran.parameters(),lr=1e-04, betas=(0.9, 0.98), eps=1e-09)

tran.train()

for i in range(100):
    
  opt.zero_grad()
  x,output = tran(input,teacher,in_pad_mask,out_pad_mask)
  pred = output[:,:-1,:].reshape(output.shape[0]*(output.shape[1] - 1),output.shape[2])
  target = teacher[:,1:].reshape(teacher.shape[0]*(teacher.shape[1] - 1))
  l = loss(pred,target)
  l.backward()
  if i%10 == 0 or i == 99:
    print(l)
  #nn.utils.clip_grad_norm_(tran.parameters(), 0.1)
  opt.step()
  

tensor(234.5838, grad_fn=<NllLossBackward0>)
tensor(12.6249, grad_fn=<NllLossBackward0>)
tensor(9.2264, grad_fn=<NllLossBackward0>)
tensor(6.2455, grad_fn=<NllLossBackward0>)
tensor(5.6942, grad_fn=<NllLossBackward0>)
tensor(4.5833, grad_fn=<NllLossBackward0>)
tensor(4.6332, grad_fn=<NllLossBackward0>)
tensor(4.3645, grad_fn=<NllLossBackward0>)
tensor(3.5775, grad_fn=<NllLossBackward0>)
tensor(4.2612, grad_fn=<NllLossBackward0>)
tensor(3.3563, grad_fn=<NllLossBackward0>)


In [None]:
output.argmax(-1)[:,:-1]

tensor([[ 5,  4,  5,  4,  7,  7, 12,  2, 10, 12, 12, 12,  2,  2],
        [17, 16, 17, 18, 19, 20, 21, 17, 18, 23, 17,  0,  0,  0],
        [26, 24,  2, 26,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [None]:
teacher[:,1:]

tensor([[ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  2],
        [16, 17, 18, 19, 20, 21, 22, 11, 23,  2,  0,  0,  0,  0],
        [24, 25, 26,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [None]:
in_test = ["<sos> ragazzi andiamo a mensa <eos> <pad> <pad>","<sos> il posto fa pagare la compagnia <eos>"]
out_test = ["<sos> guys let's go to the canteen <eos>","<sos> the place charges the company <eos> <pad>"]

In [None]:
input_test,in_test_pad_mask = encoding(in_vocabulary,in_test)
target_test,out_test_pad_mask = encoding(out_vocabulary,out_test)

In [None]:
x = input_test
enc = True
trg = target_test.clone()
for i in range(1,target_test.shape[1]):
  x,output = tran(x,trg[:,:i],in_test_pad_mask,out_test_pad_mask[:,:i,:i],encoding = enc)
  trg[:,i] = output.argmax(-1)[:,-1]
  enc = False

In [None]:
output.argmax(-1)

tensor([[16, 17, 18, 19, 20, 21, 22],
        [ 3,  4,  5,  6,  7,  8,  9]])

In [None]:
target_test[:,1:]

tensor([[ 5,  8, 21, 22, 11, 23,  2],
        [11, 12, 14, 11, 26,  2,  0]])