In [121]:
import pandas as pd
import numpy as np
import torch

In [122]:
df = pd.read_csv("100_Unique_QA_Dataset.csv")

In [123]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [124]:
# tokenize: lowercase, strip '?' and '!', split on whitespace
def tokenkize(text):
  """Lowercase ``text``, delete every '?' and '!', and split on whitespace.

  Returns the resulting list of token strings (empty for blank input).
  """
  # one translation pass removes both punctuation marks
  strip_marks = str.maketrans('', '', '?!')
  return text.lower().translate(strip_marks).split()

In [125]:
# vocab
vocab = {'<UNK>':0}

def build_vocab(row):
  """Register every unseen token of one question/answer row in the global ``vocab``.

  Echoes the row's question and answer, tokenizes both with ``tokenkize``,
  and assigns each token not yet in ``vocab`` the next free integer id
  (the size of ``vocab`` at the moment of insertion). Returns None.
  """
  question, answer = row['question'], row['answer']
  print(question, answer)

  for token in tokenkize(question) + tokenkize(answer):
    # setdefault leaves already-known tokens untouched
    vocab.setdefault(token, len(vocab))

In [126]:
df.apply(build_vocab,axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [127]:
len(vocab)

326

In [128]:
# convert words to numerical index
def text_to_indices(text,vocab):
  """Translate ``text`` into a list of integer ids using ``vocab``.

  The text is tokenized with ``tokenkize``; any token that is not a key of
  ``vocab`` is replaced by the id stored under ``'<UNK>'``. The returned
  list has one id per token, in token order.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenkize(text)
  ]

In [129]:
text_to_indices("What is Ratn",vocab)

[1, 2, 0]

In [130]:
import torch
from torch.utils.data import Dataset, DataLoader

In [131]:
class QADataset(Dataset):
  """Dataset yielding one (question_ids, answer_ids) tensor pair per frame row.

  Args:
    df: frame with 'question' and 'answer' text columns.
    vocab: token -> integer id mapping; must contain an '<UNK>' entry,
      which ``text_to_indices`` uses for unknown tokens.
  """

  def __init__(self,df,vocab) -> None:
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of rows in the frame."""
    return self.df.shape[0]

  def __getitem__(self, index):
    """Return the (question_ids, answer_ids) tensors for the row at position ``index``."""
    # look the row up once instead of once per column
    row = self.df.iloc[index]
    num_question = text_to_indices(row['question'],self.vocab)
    num_answer = text_to_indices(row['answer'],self.vocab)

    # Pin the dtype: torch.tensor([]) defaults to float32, which embedding
    # lookups and class-index loss targets reject. Non-empty id lists were
    # already int64, so existing rows are unaffected.
    return (torch.tensor(num_question, dtype=torch.long),
            torch.tensor(num_answer, dtype=torch.long))


In [132]:
dataset = QADataset(df,vocab)

In [133]:
dataset[4]

(tensor([ 1,  2,  3, 24, 25,  5, 26, 19, 27]), tensor([28]))

In [134]:
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [135]:
for question,answer in dataloader:
  print(question, answer)

tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[ 42, 320,   2,  62,  63,   3, 321,   5, 322]]) tensor([[323]])
tensor([[ 42, 217, 118, 218, 219,  19,  14, 220,  43]]) tensor([[221]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[ 10, 310,   3, 311, 312]]) tensor([[313]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[  1,   2,   3, 147, 148,  19, 149]]) tensor([[150]])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([[68]])
tensor([[ 78,  79, 196,  81,  19,   3, 197, 198, 199]]) tensor([[200]])
tensor([[ 42, 252, 253, 118, 254, 255]]) tensor([[256]])
tensor([[  1,   2,   3,  69,   5, 156]]) tensor([[157]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[ 10,  75, 209]]) tensor([[210]])
tensor([[ 10,  11, 190, 159, 191]]) tensor([[192]])
tensor([[ 78,  79, 290,  81,  19,  14, 291]]) tensor([[85]])
tensor([[ 42, 314,   2, 315,  62,  63,   3, 316, 317]]) tensor([[318]])
tensor([

In [136]:
import torch.nn as nn
class SimpleRNN(nn.Module):
  """Single-layer RNN mapping a sequence of token ids to logits over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embedding_dim: width of each token embedding (default 50).
    hidden_size: width of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64) -> None:
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc = nn.Linear(hidden_size,vocab_size)

  def forward(self,question):
    """Return (batch, vocab_size) logits for a (batch, seq_len) tensor of token ids."""
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final state is used
    _, final = self.rnn(embedded_question)
    # final is (1, batch, hidden) for this single-layer RNN; drop the layer axis
    output = self.fc(final.squeeze(0))
    return output


In [137]:
dataset[15][0]

tensor([ 1,  2,  3, 69,  5,  3, 70, 71])

In [138]:
# Size the embedding table from the vocabulary itself rather than a hard-coded
# count, so every token id in `vocab` is a valid row index.
x = nn.Embedding(len(vocab),embedding_dim=50)

In [139]:
x(dataset[15][0]).shape

torch.Size([8, 50])

In [140]:
a = x(dataset[15][0])

In [141]:
y  = nn.RNN(50,64)

In [142]:
y(a)

(tensor([[ 6.0875e-01,  3.2783e-02, -5.3725e-01, -5.4554e-03,  1.8149e-01,
          -1.2239e-01,  4.0680e-01, -1.9375e-01, -4.9047e-01,  3.1743e-01,
          -4.7732e-01,  5.9959e-01,  5.1170e-01, -6.7028e-01,  7.5059e-01,
          -4.4853e-01,  2.4362e-01, -7.3007e-01, -4.9046e-01, -5.9707e-01,
           4.6494e-02, -4.4739e-01,  2.2071e-01, -3.1814e-01,  5.6302e-01,
          -3.3791e-01,  5.3213e-01,  2.4644e-01,  2.4268e-01, -1.7234e-01,
          -8.4262e-01, -8.0606e-02,  2.0620e-01, -2.6445e-01, -1.5486e-01,
          -4.0731e-01, -1.1530e-01, -4.2498e-02,  6.1428e-01,  6.7840e-01,
          -4.2266e-01,  9.3435e-02, -4.3205e-01,  3.9222e-02,  4.7004e-01,
          -1.2018e-01, -2.1207e-01,  4.6941e-01, -2.9752e-01,  3.2326e-01,
          -9.3981e-02,  7.4849e-01,  1.3379e-01, -2.2751e-01,  4.6915e-01,
          -8.0713e-02, -1.3054e-01, -1.5693e-01,  2.8799e-01, -3.5498e-01,
           2.4536e-01,  1.0461e-01, -3.9932e-01, -4.8920e-01],
         [ 9.0894e-01, -5.1049e-01, -

In [143]:
# per-time-step hidden states (first element returned by the RNN)
y(a)[0]

tensor([[ 6.0875e-01,  3.2783e-02, -5.3725e-01, -5.4554e-03,  1.8149e-01,
         -1.2239e-01,  4.0680e-01, -1.9375e-01, -4.9047e-01,  3.1743e-01,
         -4.7732e-01,  5.9959e-01,  5.1170e-01, -6.7028e-01,  7.5059e-01,
         -4.4853e-01,  2.4362e-01, -7.3007e-01, -4.9046e-01, -5.9707e-01,
          4.6494e-02, -4.4739e-01,  2.2071e-01, -3.1814e-01,  5.6302e-01,
         -3.3791e-01,  5.3213e-01,  2.4644e-01,  2.4268e-01, -1.7234e-01,
         -8.4262e-01, -8.0606e-02,  2.0620e-01, -2.6445e-01, -1.5486e-01,
         -4.0731e-01, -1.1530e-01, -4.2498e-02,  6.1428e-01,  6.7840e-01,
         -4.2266e-01,  9.3435e-02, -4.3205e-01,  3.9222e-02,  4.7004e-01,
         -1.2018e-01, -2.1207e-01,  4.6941e-01, -2.9752e-01,  3.2326e-01,
         -9.3981e-02,  7.4849e-01,  1.3379e-01, -2.2751e-01,  4.6915e-01,
         -8.0713e-02, -1.3054e-01, -1.5693e-01,  2.8799e-01, -3.5498e-01,
          2.4536e-01,  1.0461e-01, -3.9932e-01, -4.8920e-01],
        [ 9.0894e-01, -5.1049e-01, -4.2572e-01,  5

In [144]:
# final hidden state (second element returned by the RNN)
b = y(a)[1]

In [145]:
# One output logit per vocabulary entry; derive the count from `vocab`
# instead of a hard-coded number that can drift out of sync with it.
z = nn.Linear(64,len(vocab))

In [146]:
z(b)

tensor([[ 0.0579,  0.0137, -0.2523,  0.1690, -0.0653,  0.3145,  0.1593, -0.1760,
         -0.0835, -0.1022,  0.0358,  0.2363, -0.0920, -0.0088, -0.1763, -0.2607,
          0.4056, -0.3487, -0.4256, -0.1526,  0.3152, -0.2150,  0.1667, -0.3029,
         -0.0199,  0.1103,  0.1838, -0.2477, -0.1775, -0.1494,  0.3157, -0.1850,
         -0.1850, -0.1806,  0.1057, -0.0751,  0.1902, -0.2699, -0.3145, -0.2515,
          0.0358,  0.0637, -0.0606,  0.0356,  0.0404,  0.0544,  0.0899,  0.0937,
         -0.2368,  0.1355,  0.0180,  0.4322,  0.3502,  0.1797,  0.0973, -0.2403,
          0.0076, -0.2773,  0.1757, -0.3118, -0.2060, -0.3520, -0.1630,  0.2438,
         -0.0306, -0.2359, -0.2934,  0.3379, -0.1286,  0.2448,  0.1880, -0.6000,
         -0.0612, -0.2974,  0.1479, -0.1526, -0.1277,  0.2492, -0.4722, -0.0809,
          0.0199, -0.1480, -0.2685,  0.1425, -0.4159, -0.0580, -0.2418, -0.4081,
          0.2372, -0.5011,  0.0248, -0.1213,  0.0111,  0.2797, -0.1021, -0.3901,
          0.2269, -0.1870, -

In [147]:
learning_rate = 0.001
epochs = 30

In [148]:
model = SimpleRNN(len(vocab))

In [149]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [151]:
# training loop

# Run `epochs` full passes over `dataloader`, updating `model` after every batch
# and printing the loss summed over all batches of the pass.
for epoch in range(epochs):
  # running sum (not mean) of the per-batch losses for this epoch
  total_loss = 0

  for question, answer in dataloader:

    # clear gradients left over from the previous batch
    optimizer.zero_grad()

    # forward pass: logits over the vocabulary

    output = model(question)

    # loss: answer[0] drops the leading batch axis of the target
    # NOTE(review): assumes batch_size=1 and single-token answers, so the
    # target has one class index per logit row — confirm before changing the loader
    loss = criterion(output,answer[0])

    # backpropagate to compute gradients
    loss.backward()

    # apply the optimizer update
    optimizer.step()

    total_loss +=loss.item()

  print(f"Epoch : {epoch+1} , Loss : {total_loss}")







Epoch : 1 , Loss : 525.1888113021851
Epoch : 2 , Loss : 455.5463185310364
Epoch : 3 , Loss : 375.7082896232605
Epoch : 4 , Loss : 319.5743246078491
Epoch : 5 , Loss : 270.0499427318573
Epoch : 6 , Loss : 223.51463282108307
Epoch : 7 , Loss : 181.21713399887085
Epoch : 8 , Loss : 141.81009888648987
Epoch : 9 , Loss : 109.93685558438301
Epoch : 10 , Loss : 84.3723674416542
Epoch : 11 , Loss : 65.84823963046074
Epoch : 12 , Loss : 51.1991323530674
Epoch : 13 , Loss : 40.72987599670887
Epoch : 14 , Loss : 33.0695867985487
Epoch : 15 , Loss : 27.419420212507248
Epoch : 16 , Loss : 22.800765067338943
Epoch : 17 , Loss : 19.28801777213812
Epoch : 18 , Loss : 16.250550597906113
Epoch : 19 , Loss : 13.970291815698147
Epoch : 20 , Loss : 12.091766256839037
Epoch : 21 , Loss : 10.497291903942823
Epoch : 22 , Loss : 9.230716485530138
Epoch : 23 , Loss : 8.130901098251343
Epoch : 24 , Loss : 7.244007244706154
Epoch : 25 , Loss : 6.468312628567219
Epoch : 26 , Loss : 5.829500524327159
Epoch : 27 , L

In [155]:
def predict(model,question,threshold=0.5):
  """Print the model's single-token answer to ``question``, or "I don't know".

  Args:
    model: callable taking a (1, seq_len) tensor of token ids and returning
      logits with the vocabulary on dim 1.
    question: raw question text; tokenized and indexed against the global ``vocab``.
    threshold: minimum softmax probability the top token needs to be printed.

  Prints the result; returns None.
  """
  # convert question to a list of vocabulary indices
  num_question = text_to_indices(question,vocab)
  if not num_question:
    # Nothing survived tokenization (empty or punctuation-only input): an
    # empty sequence cannot be fed to the model, so answer like a low-confidence case.
    print("I don't know")
    return

  # add the batch axis; dtype pinned so the indices are always integer
  tensor_question = torch.tensor(num_question, dtype=torch.long).unsqueeze(0)

  # inference only: no autograd graph is needed
  with torch.no_grad():
    # send question to model
    output = model(tensor_question)
    # convert logits to probabilities
    probs = nn.functional.softmax(output,dim=1)
    # most probable token and its probability
    value, index = torch.max(probs, dim=1)

  if value.item() < threshold:
    print("I don't know")
  else:
    # vocab ids were assigned in insertion order, so key position == token id
    print(list(vocab.keys())[index.item()])





In [156]:
predict(model,"What is Camp")

I don't know


In [158]:
predict(model,"What is capitol of france")

paris
