In [3]:
import pandas as pd


In [4]:
# Load the question/answer CSV into a DataFrame.
# NOTE(review): absolute Colab path — presumably needs Google Drive mounted at /content/drive; confirm before running elsewhere.
df=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/100_Unique_QA_Dataset.csv")

In [5]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [6]:
def tokenize(text):
  """Lowercase ``text``, delete every '?' and "'" character, and split on whitespace.

  Returns the list of resulting tokens (empty list for blank input).
  """
  # One translation table removes both punctuation characters in a single pass.
  strip_table=str.maketrans('','',"?'")
  return text.lower().translate(strip_table).split()

In [7]:
tokenize("What is the capital of india")

['what', 'is', 'the', 'capital', 'of', 'india']

In [8]:
# Token -> index mapping; index 0 is held by '<UKN>', the entry text_to_indices falls back to for tokens not in the vocabulary.
vocab={'<UKN>':0}

In [9]:
def build_vocab(row):
  """Register every unseen token of one row's question and answer in the global ``vocab``.

  ``row`` must support ``row['question']`` and ``row['answer']``. Each new token
  receives the size of ``vocab`` at the moment it is added as its index.
  Returns None; the function is used for its side effect on ``vocab``.
  """
  row_tokens=[*tokenize(row['question']),*tokenize(row['answer'])]

  for word in row_tokens:
    # len(vocab) is evaluated before the insertion; existing tokens are left untouched.
    vocab.setdefault(word,len(vocab))

In [10]:
# Run build_vocab on every row (axis=1) purely for its side effect on the global vocab;
# build_vocab returns None, so the resulting Series carries no information.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [11]:
len(vocab)

324

In [12]:
# convert words to numeric indices


def text_to_indices(text,vocab):
  """Translate ``text`` into a list of vocabulary indices.

  The text is split with ``tokenize``; each token is replaced by ``vocab[token]``,
  or by ``vocab['<UKN>']`` when the token is not a key of ``vocab``.
  """
  return [
    vocab[word] if word in vocab else vocab['<UKN>']
    for word in tokenize(text)
  ]

In [13]:
text_to_indices("Who is zaid",vocab)

[10, 2, 0]

In [14]:
import torch
from torch.utils.data import Dataset,DataLoader

In [15]:
class QADataset(Dataset):
  """Dataset that serves each DataFrame row as a pair of index tensors.

  Item ``i`` is ``(question_tensor, answer_tensor)``, where both tensors are
  produced by running ``text_to_indices`` on the row's 'question' and 'answer'
  values with the stored vocabulary.
  """

  def __init__(self,df,vocab):
    # df: frame indexed positionally via .iloc, with 'question' and 'answer' entries
    # vocab: token -> index mapping handed to text_to_indices
    self.df=df
    self.vocab=vocab

  def __len__(self):
    # One sample per row of the frame.
    return self.df.shape[0]

  def __getitem__(self,index):
    record=self.df.iloc[index]
    question_ids,answer_ids=(
      text_to_indices(record[column],self.vocab) for column in ('question','answer')
    )

    return torch.tensor(question_ids),torch.tensor(answer_ids)


In [16]:
dataset=QADataset(df,vocab)

In [17]:
# Shuffled loader yielding one (question, answer) pair per batch.
# NOTE(review): batch_size=1 presumably because sequences differ in length and no padding/collate_fn is defined — confirm.
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [18]:
# Sanity check: walk the loader once and print each batched question tensor
# together with the first (and, with batch_size=1, only) answer in the batch.
for question,answer in dataloader:
  print(question,answer[0])

tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([54])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([23])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([179])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([317])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([205])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([188])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]]) tensor([307])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([185])
tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([124])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([74])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([298])
tensor([[10, 

In [19]:
import torch.nn as nn

In [20]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear head producing one logit per vocabulary entry.

  Args:
    vocab_size: number of vocabulary entries; sizes both the embedding table
      and the output layer.
    embedding_dim: width of each token embedding (default 50).
    hidden_size: width of the RNN hidden state (default 64).
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    # The embedding width must equal the RNN input width, and the RNN hidden width
    # must equal the linear input width, so each size is defined once and shared.
    self.embedding=nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc=nn.Linear(hidden_size,vocab_size)


  def forward(self,question):
    """Return vocabulary logits for a batch of token-index sequences.

    Args:
      question: integer tensor of token indices, shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) holding unnormalised scores.
    """
    embedded_question=self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final hidden
    # state, shaped (1, batch, hidden_size) for this single-layer RNN, is used.
    _,final=self.rnn(embedded_question)
    output=self.fc(final.squeeze(0))

    return output

In [21]:
# Training hyperparameters: learning_rate is passed to the optimizer as lr,
# epochs is the number of full passes the training loop makes over the dataloader.
learning_rate=0.001
epochs=20

In [22]:
model=SimpleRNN(len(vocab))

In [23]:
# Cross-entropy loss over the model's vocabulary logits; Adam updates every model parameter.
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [24]:
# training loop

# Put the model in training mode explicitly before optimising.
model.train()

for epoch in range(epochs):
  # Sum of the per-batch losses over this epoch (not an average).
  total_loss=0.0

  for question,answer in dataloader:

    optimizer.zero_grad()

    # forward pass -> logits of shape (batch, vocab_size)
    output=model(question)
    # with batch_size=1 the answer batch is (1, answer_len); answer[0] is the
    # 1-D tensor of target class indices expected by CrossEntropyLoss
    loss=criterion(output,answer[0])
    # gradient
    loss.backward()
    # update
    optimizer.step()

    total_loss+=loss.item()

  # ':.4f' prints four decimal places (a bare '4f' only sets a minimum field width)
  print(f"Epoch {epoch+1} , Loss : {total_loss:.4f}")

Epoch 1 , Loss : 526.737341
Epoch 2 , Loss : 457.925278
Epoch 3 , Loss : 379.563951
Epoch 4 , Loss : 317.387781
Epoch 5 , Loss : 266.655551
Epoch 6 , Loss : 218.438889
Epoch 7 , Loss : 174.385578
Epoch 8 , Loss : 136.528508
Epoch 9 , Loss : 106.012015
Epoch 10 , Loss : 81.999218
Epoch 11 , Loss : 63.000867
Epoch 12 , Loss : 49.515089
Epoch 13 , Loss : 39.661264
Epoch 14 , Loss : 32.026273
Epoch 15 , Loss : 26.232638
Epoch 16 , Loss : 21.789473
Epoch 17 , Loss : 18.389851
Epoch 18 , Loss : 15.655568
Epoch 19 , Loss : 13.345774
Epoch 20 , Loss : 11.686578


In [59]:
def predict(model,question,threshold=0.5):
  """Print the model's top answer token for ``question``, or "I don't know".

  Args:
    model: callable mapping a (1, seq_len) tensor of token indices to
      (1, vocab_size) logits.
    question: question text; converted to indices with ``text_to_indices`` and
      the global ``vocab``.
    threshold: minimum softmax probability the top token needs to be printed.

  Returns:
    None. The result is printed.
  """
  numerical_question=text_to_indices(question,vocab)

  # A question without tokens (e.g. "" or "?") would become an empty float tensor,
  # which the embedding layer cannot index, so answer directly instead of crashing.
  if not numerical_question:
    print("I don't know")
    return

  question_tensor=torch.tensor(numerical_question).unsqueeze(0)

  # Inference only: no gradients are needed, so skip autograd bookkeeping.
  with torch.no_grad():
    output=model(question_tensor)

  probs=torch.nn.functional.softmax(output,dim=1)
  value,index=torch.max(probs,dim=1)

  # .item() turns the single-element tensors into plain Python numbers.
  if value.item() < threshold:
    print("I don't know")
  else:
    # vocab indices are assigned in insertion order, so key position equals index.
    print(list(vocab)[index.item()])

In [60]:
predict(model,"What is the largest planet in our solar system?")

jupiter


In [61]:
predict(model,"What is the capital of france? ")

paris


In [62]:
predict(model,"Which ocean is the largest?")

pacific-ocean


In [63]:
predict(model,"Who is zaid?")

I don't know
