In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [2]:
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
# @title Tokenize
def tokenize(text):
  text = text.lower()
  text = text.replace("?", "")
  text = text.replace("'", "")
  return text.split()

In [4]:
tokenize("What is the capital of US?")

['what', 'is', 'the', 'capital', 'of', 'us']

In [5]:
# @title Vocab

vocab = {'<UNK>': 0} # UNK represents unknown tokens, not in our dataset, set at index 0

def build_vocab(row):
  print(row['question'], row['answer'])
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer # Combined list of tokens in Q&A

  for token in merged_tokens:
    if token not in vocab:
      vocab[token] = len(vocab) # If there are 3 tokens in vocab (including UNK) then
      # current len = 3, the next token added will have value 3
      # vocab = {
          # UNK: 0
          # token1: 1
          # token2: 2 -> current len = 3
          # token3: 3
      # }

df.apply(build_vocab, axis = 1)


What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [6]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [7]:
def text_to_indices(text, vocab):

  text_to_indices = []
  for text in tokenize(text):

    if text in vocab:
      text_to_indices.append(vocab[text])
    else:
      text_to_indices.append(vocab['<UNK>'])

  return text_to_indices

In [8]:
from torch.utils.data import Dataset, DataLoader

In [9]:
class QAdataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, idx):
    numerical_question = text_to_indices(self.df.iloc[idx]['question'], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[idx]['answer'], self.vocab)
    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [10]:
dataset = QAdataset(df, vocab)
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [11]:
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)

In [12]:
for question, answer in dataloader:
  print(question, answer)

for question, answer in dataloader:
  print(question, answer[0])

tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([[155]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([[298]])
tensor([[ 42, 137,   2,  62,  39,   3, 322, 323]]) tensor([[6]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([[179]])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([[166]])
tensor([[

In [13]:
# @title RNN

dataset[0][0] # Question 1

tensor([1, 2, 3, 4, 5, 6])

In [14]:
x = nn.Embedding(324, embedding_dim=50)
c = x(dataset[0][0])
x(dataset[0][0]).shape # Each of the six words in the question are converted to a vector of size 50
# Total 6 vectors each of size 50

torch.Size([6, 50])

In [15]:
a = nn.RNN(50, 64)
y = a(c)
# y will be a tuple with 2 values
print(y[0].shape) # All o1 to o6, o1 is output for first word, then we send second word and o1
# to RNN layer to get o2, then we send third word and o2... till o6

# first value in the tuple is all values o1 through o6

# Second value in the tuple is the final output (i.e. o6) after all words have been sent
# to hidden layer
print(y[1].shape) # the second value in the tuple will be the last tensor in the first value of
# the tuple (which contains all outputs through time) in this case y[1] will be o6

# We can't use nn.Sequential because it expects one single value output from previous layer
# But y is a tuple with 2 values (all output values through time, final value)

torch.Size([6, 64])
torch.Size([1, 64])


In [16]:
print(f"Input tensor shape: {dataset[15][0]} = (8,) 1D tensor")
# Reshape will convert this to (1,8) 2D tensor, 1 is for batch

# Without reshape
a = nn.Embedding(324, embedding_dim=50)
b = a(dataset[15][0]) # .reshape converts tensor of size 8,1 to 1,8
print(f"Without reshaping input, Shape of b: {b.shape}")

# With reshape
a = nn.Embedding(324, embedding_dim=50)
b = a(dataset[15][0].reshape(1,8)) # .reshape converts tensor of size 8,1 to 1,8
print(f"With reshaping input, Shape of b: {b.shape}")

# RNN layer without batch_first = True (this keeps batch dimension first)
y = nn.RNN(50, 64)
d = y(b)
print(f"Without batch_first- Hidden output shape: {d[0].shape}, Final output shape: {d[1].shape}")

# RNN layer with batch_first = True (this keeps batch dimension first)
y = nn.RNN(50, 64, batch_first=True)
c = y(b)
print(f"With batch_first- Hidden output shape: {c[0].shape}, Final output shape: {c[1].shape}")
print("Final output: ", c[1])

# Output of final layer without batch_first
z1 = nn.Linear(64, 324)
e1 = z1(d[1])
print(f"Output of final layer without batch_first: {e1.shape}")

# Output of final layer with batch_first
z2 = nn.Linear(64, 324)
e2 = z2(c[1])
print(f"Output of final layer without batch_first: {e2.shape}")

# Even with batch_first we are getting 1, 8, 324 but we need a 1, 324 vector for loss calculation
e = e2.squeeze(0) # To remove the first 1
print(f"Final output logits shape after sequeezing: {e.shape}")

Input tensor shape: tensor([ 1,  2,  3, 69,  5,  3, 70, 71]) = (8,) 1D tensor
Without reshaping input, Shape of b: torch.Size([8, 50])
With reshaping input, Shape of b: torch.Size([1, 8, 50])
Without batch_first- Hidden output shape: torch.Size([1, 8, 64]), Final output shape: torch.Size([1, 8, 64])
With batch_first- Hidden output shape: torch.Size([1, 8, 64]), Final output shape: torch.Size([1, 1, 64])
Final output:  tensor([[[-0.6299,  0.5277,  0.7710,  0.4048,  0.7291, -0.3848, -0.5765,
           0.3769,  0.1551,  0.4197, -0.3704,  0.1178,  0.4352, -0.1122,
          -0.1843, -0.5328,  0.0880, -0.3082, -0.0777, -0.0728,  0.1416,
           0.6438,  0.8491,  0.5066,  0.5593,  0.4331,  0.1472, -0.3177,
          -0.5275, -0.9150, -0.4234,  0.8432, -0.4150,  0.1030,  0.6830,
          -0.2814,  0.7304, -0.2990,  0.1383,  0.3723,  0.0532, -0.0361,
          -0.5078,  0.3440,  0.2258, -0.3793,  0.4569, -0.5576, -0.6436,
          -0.2556, -0.5936,  0.2188, -0.8297, -0.6415,  0.1542, -0.

In [17]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.RNN = nn.RNN(50, 64, batch_first=True) # Since we have 50 dimensional embeddings, input to RNN layer will be 50
    # We have 64 neurons in this layer

    self.fc = nn.Linear(64, vocab_size) # Output is vocab size as each neuron corresponds to
    # a word in the vocab, the answers are also part of vocab, neuron with highest value
    # output corresponds to predicted answer

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.RNN(embedded_question)
    output = self.fc(final.squeeze(0))

    return output


In [18]:
lr = 0.001
epochs = 20
model = SimpleRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [19]:
for epoch in range(epochs):
  total_loss = 0
  for question, answer in dataloader:
    optimizer.zero_grad()
    out = model(question)
    loss = criterion(out, answer[0]) # Answer has shape 1, 1 we just want 1D tensor (single answer
    # to which logit with max probability will be used to calculate loss)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  print(f"Epoch {epoch+1}: {total_loss:.4f}")

Epoch 1: 527.7548
Epoch 2: 455.5189
Epoch 3: 376.1973
Epoch 4: 317.6182
Epoch 5: 267.2670
Epoch 6: 219.8879
Epoch 7: 173.9641
Epoch 8: 135.1649
Epoch 9: 102.9909
Epoch 10: 78.1373
Epoch 11: 59.0848
Epoch 12: 46.1155
Epoch 13: 36.2798
Epoch 14: 29.2895
Epoch 15: 24.0680
Epoch 16: 19.7963
Epoch 17: 16.9372
Epoch 18: 14.4775
Epoch 19: 12.4468
Epoch 20: 10.8186


In [20]:
# @title Understanding dims=k

# dims = 1
probabilities1 = torch.tensor([
    [0.1, 0.3, 0.6],
    [0.8, 0.1, 0.1]
]) # (2, 3)

values_1, indices_1 = torch.max(probabilities1, dim = 1)
print(f"Output of values, indices across dimension 1: {values_1, indices_1}")

# dims = 2
probabilities2 = torch.tensor([
    [[0.1, 0.7, 0.2], [0.3, 0.4, 0.3]],
    [[0.8, 0.1, 0.1], [0.2, 0.2, 0.6]]
])  # (2, 2, 3)

values_2, indices_2 = torch.max(probabilities2, dim = 2)
print(f"Output of values, indices across dimension 1: {values_2, indices_2}")

Output of values, indices across dimension 1: (tensor([0.6000, 0.8000]), tensor([2, 0]))
Output of values, indices across dimension 1: (tensor([[0.7000, 0.4000],
        [0.8000, 0.6000]]), tensor([[1, 1],
        [0, 2]]))


In [21]:
# For questions the model hasn't seen
def predict(model, question, threshold=0.5):
  numerical_question = text_to_indices(question, vocab)
  question_tensor = torch.tensor(numerical_question).unsqueeze(0) # Add 1 to first dimension
  out = model(question_tensor) # Get logits

  # Convert logits to probs
  probs = torch.nn.functional.softmax(out, dim=1) # dim=1 => keep dim as 1

  value, index_of_max_prob = torch.max(probs, dim=1) # dim=1, take max along dimension 1

  if value < 0.5:
    print("I don't know")

  else:
    indices_in_vocab = list(vocab.keys())
    print(indices_in_vocab[index_of_max_prob])

In [22]:
predict(model, "What is the capital city of France?")

paris


In [23]:
predict(model, "What do I do with I don't know?")

I don't know
