In [1]:
import pandas as pd


In [2]:
# Read the question/answer pairs; the rendered table shows `question` and `answer` columns.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path — confirm before running elsewhere.
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
# bare expression so the notebook renders the frame
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [3]:
#tokenize
def tokenize(text):
  """Lowercase `text`, delete newlines, '?' and apostrophes, then split on whitespace.

  Returns a list of word tokens (empty list for empty input).
  """
  cleaned = text.lower()
  # characters that are removed outright (not replaced by a space)
  for unwanted in ('\n', '?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

In [4]:
tokenize('What is the capital of France')

['what', 'is', 'the', 'capital', 'of', 'france']

In [5]:
#vocab
# token -> integer id; id 0 is reserved for out-of-vocabulary words
vocab = {'<UNK>': 0}

def build_vocab(row):
  """Add every unseen token from one row's question and answer to the global `vocab`.

  `row` must support `row['question']` and `row['answer']`. Each new token
  receives the next free id (the current size of `vocab`). Returns None.
  """
  tokens = tokenize(row['question']) + tokenize(row['answer'])
  for token in tokens:
    # setdefault leaves existing ids untouched and assigns len(vocab) to new tokens
    vocab.setdefault(token, len(vocab))



In [6]:
# Called only for its side effect of filling the module-level `vocab`;
# build_vocab returns nothing, so the displayed result is a column of empty values.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [7]:
df.shape

(90, 2)

In [8]:
len(vocab)

324

In [9]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [10]:
def text_to_indices(text , vocab):
  """Convert `text` into a list of vocabulary ids.

  The text is split with `tokenize`; tokens missing from `vocab` are mapped
  to the id stored under '<UNK>'.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(text)
  ]

In [11]:
# With this dataset "you" is not in the vocabulary, so it maps to the <UNK> id 0.
text_to_indices("How are you?", vocab)

[78, 81, 0]

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader


In [13]:
class QADataset(Dataset):
  """Dataset that serves each row of `df` as (question ids, answer ids) tensors."""

  def __init__(self, df, vocab):
    # keep references only; rows are converted to ids lazily in __getitem__
    self.vocab = vocab
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return the idx-th row (positional) as a pair of id tensors."""
    row = self.df.iloc[idx]
    question_ids, answer_ids = (
      text_to_indices(row[column], self.vocab)
      for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [14]:
dataset = QADataset(df, vocab)

In [15]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [16]:
# One sample per batch, reshuffled on every pass.
# NOTE(review): presumably batch_size=1 because questions differ in length and no padding/collate_fn is defined — confirm.
dataloader = DataLoader(dataset, batch_size=1 , shuffle= True)

In [17]:
len(vocab)

324

In [54]:
# Dump every batch to inspect the tensors the DataLoader yields.
for question, answer in dataloader:
  # batched question ids (the printed output shows a leading batch dimension of 1)
  print(question)
  # index 0 strips the batch dimension from the answer
  print(answer[0])



tensor([[ 10,  29, 130, 131]])
tensor([132])
tensor([[ 10, 308,   3, 309, 310]])
tensor([311])
tensor([[ 10,  11, 157, 158, 159]])
tensor([160])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]])
tensor([128])
tensor([[ 78,  79, 129,  81,  19,   3,  21,  22]])
tensor([36])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]])
tensor([173])
tensor([[ 42, 216, 118, 217, 218,  19,  14, 219,  43]])
tensor([220])
tensor([[ 1,  2,  3,  4,  5, 99]])
tensor([100])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]])
tensor([194])
tensor([[10, 75, 76]])
tensor([77])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]])
tensor([166])
tensor([[  1,   2,   3,  33,  34,   5, 245]])
tensor([246])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]])
tensor([113])
tensor([[  1,   2,   3, 234,   5, 235]])
tensor([131])
tensor([[ 42, 250, 251, 118, 252, 253]])
tensor([254])
tensor([[ 42, 117, 118,   3, 119,  94, 120]])
tensor([121])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]])
tensor([52])
tensor([[  1,   2,  

In [21]:
import torch
import torch.nn as nn

In [28]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of tokens; sizes both the embedding table and the output logits.
    embedding_dim: width of the token embeddings (default 50).
    hidden_size: width of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.linear = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return vocabulary logits computed from the RNN's final hidden state.

    Args:
      question: integer tensor of token ids, (batch, seq_len) or unbatched (seq_len,).

    Returns:
      Logits of shape (1, batch, vocab_size) for batched input, or
      (1, vocab_size) for unbatched input. The leading dimension is the
      RNN's num_layers axis, which callers index away.
    """
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state) in that order;
    # only the final hidden state feeds the classifier.
    _, final_hidden = self.rnn(embedded_question)
    return self.linear(final_hidden)



In [29]:
# Shape walkthrough: the same layer types SimpleRNN uses, applied to the first question.
# Sizes are derived from the data instead of hard-coding 324 tokens / 6 words,
# so the cell keeps working if the dataset (or its first row) changes.
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

# add a batch dimension: (seq_len,) -> (1, seq_len)
a = dataset[0][0].unsqueeze(0)
print("shape of a:", a.shape)
# token ids -> embeddings: (1, seq_len, 50)
b = x(a)
print("shape of b:", b.shape)
# c: output at every time step (1, seq_len, 64); d: final hidden state (1, 1, 64)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# drop the leading num_layers dimension before the linear layer: (1, 64) -> (1, vocab size)
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [30]:
# step size passed to the Adam optimizer
learning_rate = 0.001
# number of full passes over the dataloader in the training loop
epochs = 20

In [31]:
# the embedding table and the output layer are both sized to the vocabulary
model = SimpleRNN(len(vocab))

In [32]:
# CrossEntropyLoss takes raw logits and integer class targets (here: the answer's token id)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [57]:
# training loop

for epoch in range(epochs):

  # running sum of the per-sample losses for this epoch
  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # Index away the leading dimension of both tensors (with batch_size=1):
    # logits (1, 1, vocab) -> (1, vocab), answer (1, 1) -> (1,)
    loss = criterion(output[0], answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # `.4f` prints four decimal places; the earlier `4f` only set a minimum field width
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 1.716037
Epoch: 2, Loss: 1.603643
Epoch: 3, Loss: 1.503223
Epoch: 4, Loss: 1.410219
Epoch: 5, Loss: 1.324971
Epoch: 6, Loss: 1.247235
Epoch: 7, Loss: 1.173694
Epoch: 8, Loss: 1.106338
Epoch: 9, Loss: 1.041087
Epoch: 10, Loss: 0.982961
Epoch: 11, Loss: 0.928525
Epoch: 12, Loss: 0.877428
Epoch: 13, Loss: 0.827315
Epoch: 14, Loss: 0.782080
Epoch: 15, Loss: 0.740523
Epoch: 16, Loss: 0.701740
Epoch: 17, Loss: 0.663901
Epoch: 18, Loss: 0.629345
Epoch: 19, Loss: 0.596556
Epoch: 20, Loss: 0.565966


In [60]:
def predict(model, question, threshold=0.5):
    """Print the model's single-token answer to `question`, or "I don't know".

    Args:
        model: network mapping a 1-D tensor of token ids to logits of shape (1, vocab size).
        question: raw question text; it is tokenized and indexed with the global `vocab`.
        threshold: minimum softmax probability required to print the top answer.

    Returns:
        None. The answer (or "I don't know") is printed.
    """
    # convert question to numbers (unknown words map to <UNK>)
    numerical_question = text_to_indices(question, vocab)

    # Empty or punctuation-only input yields no tokens; an empty list would
    # become a float tensor, which nn.Embedding rejects.
    if not numerical_question:
        print("I don't know")
        return

    # tensor
    question_tensor = torch.tensor(numerical_question)

    # inference only: skip autograd bookkeeping
    with torch.no_grad():
        # send to model
        output = model(question_tensor)

        # convert logits to probs
        probs = torch.nn.functional.softmax(output, dim=1)

        # find index of max prob
        value, index = torch.max(probs, dim=1)

    if value.item() < threshold:
        print("I don't know")
        return

    # ids were handed out in insertion order, so position in keys() equals the id
    print(list(vocab.keys())[index.item()])


In [61]:
predict(model, "What is the largest planet in our solar system?")

jupiter


In [65]:
predict(model, "What is the capital of Spain?")

madrid


In [69]:
predict(model, "What is the capital of France?")

paris
