<a href="https://colab.research.google.com/github/talha1503/DoubtGuess/blob/master/Model/Pytorch_Chatbot_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [0]:
class Embedding():
  '''
  This embedding block acts as a lookup table over pre-trained vector spaces.
  The tables are consulted in priority order: FastText first, then GloVe, then
  word2vec. If the word is found in none of them, a random embedding is generated.
  Input:
    word from the vocabulary
  Output:
    embedding from the pre-trained vector spaces.
  '''
  def __init__(self,embedding_fast_text,embedding_glove,embedding_word2vec):
    '''
    Each argument is a mapping-like object whose ``get(word)`` returns the
    stored vector, or None when the word is absent (e.g. a plain dict).
    '''
    self.fast_text = embedding_fast_text
    self.glove = embedding_glove
    self.w2v = embedding_word2vec

  def lookup(self,word,dimensions):
    '''
    Return the embedding of ``word``.

    Input:
      word       : key to look up in the tables.
      dimensions : width of the random fallback embedding.
    Output:
      The first stored vector found (FastText, then GloVe, then word2vec),
      returned exactly as stored. If no table knows the word, a random tensor
      of shape (1, dimensions) created on the module-level ``device``.
    '''
    for table in (self.fast_text,self.glove,self.w2v):
      vector = table.get(word)
      # Compare with None explicitly: truth-testing a multi-element tensor raises,
      # and an all-zero vector is still a valid hit.
      if vector is not None:
        return vector
    return torch.rand(1,dimensions,device=device)


SyntaxError: ignored

In [0]:
class Positional_Embeddings(nn.Module):
  '''
  Word embeddings of a sentence combined with sinusoidal positional encodings.

  The word embeddings alone carry nothing about the position of each word, so
  the positional encoding of Vaswani et al. ("Attention Is All You Need") is
  added to them.
  Input :
    Sentence (sequence of words), embedded into a matrix of Sequence_Length X Dimensions
  Output:
    Embedding with positional encoding of the word.
  '''
  def __init__(self,dimensions,sentence,embedding=None):
    '''
    Input:
      dimensions : width of every word embedding (the model dimension).
      sentence   : sequence of words to embed.
      embedding  : optional object exposing ``lookup(word, dimensions)``, such
                   as an ``Embedding`` instance. When omitted, an ``Embedding``
                   with empty tables is used, so every word gets a random vector.
    '''
    super().__init__()
    self.dimensions = dimensions
    self.sentence = sentence
    self.embedding_table = embedding if embedding is not None else Embedding({},{},{})

  def positional_encoding(self,sequence_length):
    '''
    Build the sinusoidal table of shape (sequence_length, dimensions):
      PE[pos, 2i]   = sin(pos / 10000^(2i / dimensions))
      PE[pos, 2i+1] = cos(pos / 10000^(2i / dimensions))
    '''
    positions = torch.arange(sequence_length,dtype = torch.float,device = device).unsqueeze(1)
    even_indices = torch.arange(0,self.dimensions,2,dtype = torch.float,device = device)
    angles = positions / torch.pow(10000.0,even_indices / self.dimensions)
    encoding = torch.zeros(sequence_length,self.dimensions,device = device)
    encoding[:,0::2] = torch.sin(angles)
    # With an odd ``dimensions`` there is one cosine column fewer than sine columns.
    encoding[:,1::2] = torch.cos(angles[:,:self.dimensions // 2])
    return encoding

  def forward(self):
    '''
    Output:
      Tensor of shape (len(sentence), dimensions): the scaled word embeddings
      plus the positional encodings. The scaled embeddings are also kept on
      ``self.embeddings``.
    '''
    sequence_length = len(self.sentence)
    self.embeddings = torch.zeros(sequence_length,self.dimensions,device = device)  #matrix of sequence length X model dimensions.
    for index,word in enumerate(self.sentence):
      vector = self.embedding_table.lookup(word,self.dimensions)
      # Flatten so that both (dimensions,) and (1, dimensions) vectors fit a row.
      self.embeddings[index] = torch.as_tensor(vector).reshape(-1)
    # Scale the embeddings up so that the positional encodings added below do not dominate them.
    self.embeddings = self.embeddings * math.sqrt(self.dimensions)
    return self.embeddings + self.positional_encoding(sequence_length)


In [0]:
class MultiHeadAttention(nn.Module):
  r'''
    This is the multi-head attention block which also covers the masked multi head attention part of the decoder.
    In this block we will receive 3 inputs -> queries, keys and values.
    Our flow will be like this:

      q->linear->split->scale
                            \
                            matmul --> (mask) --> softmax ----> matmul --> merge-->output_linear
                           /                                   /
          k->linear->split                                    /
                                                             /
                                           v->linear->split

    Inputs may be unbatched, [seq_len, input_dims], or batched,
    [batch_size, seq_len, input_dims]; the output keeps the same leading
    dimensions with ``output_dims`` as its last dimension.
  '''
  def __init__(self,heads,input_dims,q_dims,k_dims,v_dims,output_dims,mask = None):  #q_dims == k_dims since we need dot product of both of them together.
    '''
    Input:
      heads       : number of attention heads.
      input_dims  : size of the last dimension of queries, keys and values.
      q_dims      : projected query size (must equal k_dims, divisible by heads).
      k_dims      : projected key size.
      v_dims      : projected value size (divisible by heads).
      output_dims : size of the output projection.
      mask        : optional tensor broadcastable to the score matrix
                    [..., query_len, key_len]; positions where it equals 0
                    are blocked from being attended to.
    Raises:
      ValueError if q_dims != k_dims or the sizes are not divisible by heads.
    '''
    super(MultiHeadAttention, self).__init__()
    if q_dims != k_dims:
      raise ValueError(f"q_dims ({q_dims}) and k_dims ({k_dims}) must match for the dot product")
    if q_dims % heads != 0 or v_dims % heads != 0:
      raise ValueError(f"q_dims ({q_dims}) and v_dims ({v_dims}) must be divisible by heads ({heads})")
    self.q_linear = nn.Linear(input_dims,q_dims,bias= False)
    self.k_linear = nn.Linear(input_dims,k_dims,bias= False)
    self.v_linear = nn.Linear(input_dims,v_dims,bias= False)
    self.output_linear = nn.Linear(v_dims,output_dims,bias= False)
    self.heads = heads
    self.scaling_factor = (k_dims//self.heads)**-0.5    # 1/sqrt(per-head key size)
    self.mask = mask

  def split_heads(self,input_):
    '''
    Split the last dimension into ``heads`` chunks and move the head axis in
    front of the sequence axis.
      [batch_size, seq_len, hidden] --> [batch_size, heads, seq_len, hidden/heads]
      [seq_len, hidden]             --> [heads, seq_len, hidden/heads]
    Raises ValueError for any other rank.
    '''
    if input_.dim() == 3:
      batch_size,seq_len,hidden = input_.shape
      # Reshape first, then transpose: a direct view to [batch, heads, seq, dim]
      # would mix features of different tokens into the same head.
      return input_.reshape(batch_size,seq_len,self.heads,hidden//self.heads).transpose(1,2)
    if input_.dim() == 2:
      seq_len,hidden = input_.shape
      return input_.reshape(seq_len,self.heads,hidden//self.heads).transpose(0,1)
    raise ValueError(f"Invalid Shapes: expected a 2-D or 3-D tensor, got {tuple(input_.shape)}")

  def merge_heads(self,input_):
    '''
    Inverse of ``split_heads``.
      [batch_size, heads, seq_len, dim] --> [batch_size, seq_len, heads*dim]
      [heads, seq_len, dim]             --> [seq_len, heads*dim]
    Raises ValueError for any other rank.
    '''
    if input_.dim() == 4:
      batch_size,heads,seq_len,head_dim = input_.shape
      return input_.transpose(1,2).reshape(batch_size,seq_len,heads*head_dim)
    if input_.dim() == 3:
      heads,seq_len,head_dim = input_.shape
      return input_.transpose(0,1).reshape(seq_len,heads*head_dim)
    raise ValueError(f"Invalid Shapes: expected a 3-D or 4-D tensor, got {tuple(input_.shape)}")

  def forward(self,queries,keys,values):
    '''
    Scaled dot-product attention over all heads, followed by the output projection.
    '''
    queries = self.split_heads(self.q_linear(queries))   # no batches --> [heads , seq_len , dims // heads]
    keys = self.split_heads(self.k_linear(keys))         # batches    --> [batches , heads , seq_len , dims // heads]
    values = self.split_heads(self.v_linear(values))

    queries = queries * self.scaling_factor

    # Swapping the last two axes of the keys works for both the batched and unbatched layout.
    scores = torch.matmul(queries,keys.transpose(-2,-1))
    if self.mask is not None:
      blocked = torch.as_tensor(self.mask,device = scores.device) == 0
      # -inf scores become zero probability after the softmax.
      # A row that is blocked everywhere yields NaN, so masks should leave at least one key per query.
      scores = scores.masked_fill(blocked,float("-inf"))
    # Normalise over the key axis so that each query's weights sum to 1.
    probabilities = F.softmax(scores,dim = -1)

    z = torch.matmul(probabilities , values)
    z = self.merge_heads(z)

    outputs = self.output_linear(z)
    return  outputs

In [0]:
class FeedForwardNetwork(nn.Module):
  '''
  This is a feedforward network consisting of three layers (1 input, 1 hidden and 1 output).
  Each layer is of the config --> Linear -> ReLU -> Dropout
  '''
  def __init__(self,hidden_size,input_size,output_size,dropout_val):
    '''
    Input:
      hidden_size : width of the two intermediate representations.
      input_size  : size of the last dimension of the input.
      output_size : size of the last dimension of the output.
      dropout_val : dropout probability applied after every ReLU.
    '''
    super(FeedForwardNetwork,self).__init__()
    self.hidden_size = hidden_size
    self.input_size = input_size
    self.output_size = output_size
    self.first_linear = nn.Linear(self.input_size,self.hidden_size)
    self.second_linear = nn.Linear(self.hidden_size,self.hidden_size)
    self.third_linear = nn.Linear(self.hidden_size,self.output_size)
    self.dropout = nn.Dropout(dropout_val)
    self.relu = nn.ReLU()

  def forward(self,inputs):
    '''
    Run ``inputs`` through the three Linear -> ReLU -> Dropout stages.
    Because the last stage also ends in a ReLU, the output is non-negative.
    '''
    hidden = inputs
    for linear in (self.first_linear,self.second_linear,self.third_linear):
      hidden = self.dropout(self.relu(linear(hidden)))
    return hidden
    

In [0]:
class LayerNorm(nn.Module):
  '''
    Layer normalisation over the last dimension with a learnable gain (gamma)
    and shift (beta).
    Reference -> https://github.com/pytorch/pytorch/issues/1959
  '''
  def __init__(self,features,eps=1e-12):
    '''
    Input:
      features : size of the last (normalised) dimension.
      eps      : small constant added to the standard deviation to avoid division by zero.
    '''
    super(LayerNorm,self).__init__()
    # The gain starts at 1 and the shift at 0, so a fresh layer is a plain normalisation.
    self.gamma = nn.Parameter(torch.ones(features))
    self.beta = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self,input_):
    '''
    Return gamma * (input_ - mean) / (std + eps) + beta, where mean and std
    are taken over the last dimension of ``input_``.
    '''
    mean = input_.mean(-1,keepdim = True)
    std = input_.std(-1,keepdim = True)
    norm = self.gamma*(input_-mean)/(self.eps+std) + self.beta
    return norm

In [0]:
class Encoder_Layer(nn.Module):
  '''
  One encoder layer: a multi-head self-attention sub-layer followed by a
  feedforward sub-layer. Each sub-layer is preceded by its own layer
  normalisation and followed by a residual connection and dropout.
  '''
  def __init__(self,hidden_size,q_dims,k_dims,v_dims,heads,input_dims_attention,output_dims_attention,dropout_ffn,dropout_enc):
    '''
    Input:
      hidden_size           : size used for both layer norms and for every layer of the feedforward network.
      q_dims,k_dims,v_dims  : projection sizes handed to the attention block.
      heads                 : number of attention heads.
      input_dims_attention  : input size of the attention block.
      output_dims_attention : output size of the attention block.
      dropout_ffn           : dropout probability inside the feedforward network.
      dropout_enc           : dropout probability applied after each residual sum.

    NOTE(review): the residual sums in forward() only line up when
    input_dims_attention == output_dims_attention == hidden_size -- confirm with callers.
    '''
    # nn.Module must be initialised before any sub-module can be assigned.
    super(Encoder_Layer,self).__init__()
    self.q_dims = q_dims
    self.v_dims = v_dims
    self.k_dims = k_dims
    self.hidden_size = hidden_size
    self.heads = heads
    self.input_dims_attention = input_dims_attention
    self.output_dims_attention = output_dims_attention
    self.dropout_ffn = dropout_ffn

    self.mha = MultiHeadAttention(self.heads,self.input_dims_attention,self.q_dims,self.k_dims,self.v_dims,self.output_dims_attention)
    self.ffn = FeedForwardNetwork(self.hidden_size,self.hidden_size,self.hidden_size,self.dropout_ffn)

    self.layernorm_mha = LayerNorm(self.hidden_size)
    self.layernorm_ffn = LayerNorm(self.hidden_size)

    self.dropout = nn.Dropout(dropout_enc)

  def forward(self,input_):
    '''
    Apply self-attention and then the feedforward network to ``input_``.
    '''
    # Self-attention sub-layer: queries, keys and values are all the normalised input.
    layer_normed_input = self.layernorm_mha(input_)
    mha_output = self.mha(layer_normed_input,layer_normed_input,layer_normed_input)
    # NOTE(review): dropout is applied to the residual sum here; Vaswani et al. apply it
    # to the sub-layer output before the sum. The existing order is kept -- confirm intent.
    input_conc_mha_drop = self.dropout(mha_output + input_)

    # Feedforward sub-layer, normalised by its own LayerNorm.
    layer_normed_ffn = self.layernorm_ffn(input_conc_mha_drop)
    ffn_output = self.ffn(layer_normed_ffn)
    ffn_output_concat_dropout = self.dropout(ffn_output + input_conc_mha_drop)
    return ffn_output_concat_dropout


In [0]:
class Decoder_Layer(nn.Module):
  '''
  Placeholder for a single decoder layer; no sub-layers are defined yet.
  '''
  def __init__(self):
    # Initialise nn.Module so that sub-modules can be registered once they are added.
    super(Decoder_Layer,self).__init__()

  def forward(self):
    # TODO: not implemented yet; currently returns None.
    return None

In [0]:
class Encoder(nn.Module):
  '''
  Placeholder for the full encoder; no layers are defined yet.
  '''
  def __init__(self):
    # Without this call the module has no parameter/hook registries and cannot be called.
    super(Encoder,self).__init__()

  def forward(self):
    # TODO: not implemented yet; currently returns None.
    return None

In [0]:
class Decoder(nn.Module):
  '''
  Placeholder for the full decoder; no layers are defined yet.
  '''
  def __init__(self):
    # Without this call the module has no parameter/hook registries and cannot be called.
    super(Decoder,self).__init__()

  def forward(self):
    # TODO: not implemented yet; currently returns None.
    return None