In [101]:
import pandas as pd
# Load the question/answer pairs; the preview below shows the columns
# 'Unnamed: 0', 'question' and 'answer'.
df=pd.read_csv('100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [102]:
def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    Question marks are turned into spaces and apostrophes are removed before
    the text is split on whitespace. Other punctuation (e.g. hyphens) is kept
    as part of the token.
    """
    cleaned = text.lower().replace('?', " ").replace("'", "")
    return cleaned.split()

In [103]:
# Sanity check of the tokenizer on a sample question.
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [104]:
# Token -> integer id mapping. Id 0 belongs to '<UNK>', the entry that
# text_to_indices falls back to for tokens missing from the vocabulary.
vocab={'<UNK>':0}

In [105]:
def build_vocab(row):
    """Add every unseen token of one row to the module-level ``vocab``.

    ``row`` must provide 'question' and 'answer' entries. Both are tokenized,
    and each token not yet in ``vocab`` receives the next free integer id
    (question tokens first). Nothing is returned; ``vocab`` is mutated.
    """
    question_words=tokenize(row['question'])
    answer_words=tokenize(row['answer'])
    for word in question_words+answer_words:
        # setdefault leaves known words untouched; len(vocab) is evaluated
        # before the insertion, so a new word gets the current size as its id.
        vocab.setdefault(word,len(vocab))




In [106]:
# Run build_vocab on every row (axis=1) for its side effect of filling `vocab`.
# build_vocab returns nothing, hence the Series of None displayed below.
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [107]:
# Inspect the populated token -> id mapping.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [108]:
# Vocabulary size, counting the '<UNK>' entry.
len(vocab)

324

In [109]:
def text_to_indices(text,vocab):
    """Convert ``text`` into a list of vocabulary ids.

    The text is split with ``tokenize``; every token found in ``vocab`` is
    replaced by its id, and every other token by the id stored under '<UNK>'.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenize(text)
    ]


In [110]:
# 'my' and 'name' are not in the vocabulary, so they map to the '<UNK>' id (0).
text_to_indices("what is my name",vocab)

[1, 2, 0, 0]

In [111]:
import torch
from torch.utils.data import Dataset,DataLoader


In [112]:
class QADataset(Dataset):
    """Dataset that yields (question ids, answer ids) tensor pairs.

    Rows are read from a DataFrame with 'question' and 'answer' columns and
    encoded on access with ``text_to_indices`` and the supplied vocabulary.
    """

    def __init__(self,df,vocab):
        # Only references are stored; encoding happens lazily in __getitem__.
        self.vocab=vocab
        self.df=df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        row_count=self.df.shape[0]
        return row_count

    def __getitem__(self,index):
        """Return row ``index`` as a (question_tensor, answer_tensor) pair."""
        row=self.df.iloc[index]
        question_ids=text_to_indices(row['question'],self.vocab)
        answer_ids=text_to_indices(row['answer'],self.vocab)
        return torch.tensor(question_ids),torch.tensor(answer_ids)


In [113]:
# Wrap the DataFrame and the vocabulary in a QADataset.
dataset=QADataset(df,vocab)

In [114]:
# Spot-check one encoded (question, answer) pair.
dataset[6]

(tensor([ 1,  2,  3, 33, 34,  5, 35]), tensor([36]))

In [115]:
# One question per batch, visited in random order each pass.
# NOTE(review): batch_size=1 is presumably required because questions have
# different lengths and no padding/collate_fn is defined — confirm before raising it.
dataLoader=DataLoader(dataset,batch_size=1,shuffle=True)

In [116]:
# Iterate once over the loader and print every batch to inspect the
# batched tensors (each gains a leading batch axis of size 1).
for question,answer in dataLoader:
    print(question,answer)

tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([[273]])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([[9]])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([[249]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([[246]])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([[136]])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([[321]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([[205]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[  1,   2,   3,  17, 115,  83,  84]]) tensor([[116]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 42, 174,   2,  62,  39, 

In [117]:
import torch.nn as nn
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: number of token ids; used as the embedding table size and
            as the number of output classes.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        # The layer widths are parameters so the three layers cannot drift apart.
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits computed from the final hidden state.

        ``x`` holds token ids, either batched ``(batch, seq_len)`` or a single
        unbatched sequence ``(seq_len,)``. The result is ``(batch, vocab_size)``
        or ``(vocab_size,)`` respectively.
        """
        embedded_question = self.embedding(x)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final hidden state feeds the classifier.
        _, final = self.rnn(embedded_question)
        # `final` carries a leading num_layers axis of size 1; drop it.
        output = self.fc(final.squeeze(0))
        return output

In [118]:
# First encoded (question, answer) pair, used below to trace tensor shapes.
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [119]:
# Question ids of the first pair only.
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [120]:
# Stand-alone embedding layer for shape experiments; 324 is the vocabulary
# size reported by len(vocab) earlier in the notebook.
x=nn.Embedding(324,embedding_dim=50)

In [121]:
# Embed the first question: one 50-dimensional vector per token id.
x(dataset[0][0])

tensor([[ 0.5223,  1.6158, -1.4547, -1.9465,  1.4318,  1.0655,  0.5848,  0.3456,
          1.6265, -1.2918, -1.1763, -0.0710, -0.7008,  0.4292, -0.0055,  0.7451,
         -0.0847, -0.4744,  0.3825,  1.6595, -1.7710,  0.6053,  0.2199,  0.1254,
         -0.1482, -0.6393, -3.0452, -0.4129, -0.1451,  0.9331,  0.0606,  0.9325,
         -1.2037, -0.7824,  0.9377,  0.0611,  2.1154, -0.3592,  1.6228,  0.3884,
         -0.9902,  2.1665, -0.7269,  0.8341, -0.0821, -1.8306,  0.3545,  0.0359,
         -1.1664,  1.8332],
        [-0.7967, -0.4313,  1.1157,  0.0148,  0.6420,  1.2715, -0.8557, -0.4052,
          1.2917, -0.5205,  0.0920,  0.2177,  0.5751,  0.3590,  1.5541, -0.6194,
          0.2355,  1.3861,  0.7591,  1.2759,  0.0301, -1.5087, -0.1097,  0.9628,
         -0.5738, -0.5156, -1.9417, -0.8922, -0.7772,  1.3434, -0.6593, -0.6217,
         -1.2754,  0.3999,  0.0274, -1.1163, -1.0271, -0.5479,  0.2524, -0.5727,
          0.4945, -0.3735,  1.4920, -0.6753,  0.3018, -0.5356,  0.3953, -1.3905,


In [122]:
# Confirm the embedded shape: (number of tokens, embedding_dim).
x(dataset[0][0]).shape

torch.Size([6, 50])

In [123]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate=0.001
epochs=20


In [124]:
# One embedding row and one output class per vocabulary entry.
model=SimpleRNN(len(vocab))


In [125]:
# Cross-entropy loss on the model's vocabulary logits, optimised with Adam.
criteria=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [126]:
# Walk one question through the same layer stack SimpleRNN uses and print the
# tensor shape after every stage. Sizes are derived from `vocab` and from the
# question itself instead of being hard-coded, so the cell keeps working when
# the dataset changes.
x=nn.Embedding(len(vocab),embedding_dim=50)
y=nn.RNN(50,64,batch_first=True)
z=nn.Linear(64,len(vocab))
# Add a leading batch axis of size 1 without assuming the question length.
a=dataset[0][0].unsqueeze(0)
print("shape of a:",a.shape)
b=x(a)
print("shape of b",b.shape)
# c: RNN output at every time step, d: final hidden state.
c,d=y(b)
print("shape of c",c.shape)
print("shape of d",d.shape)
# Drop the leading num_layers axis of the hidden state before the linear layer.
e=z(d.squeeze(0))
print("shape of e",e.shape)

shape of a: torch.Size([1, 6])
shape of b torch.Size([1, 6, 50])
shape of c torch.Size([1, 6, 64])
shape of d torch.Size([1, 1, 64])
shape of e torch.Size([1, 324])


In [127]:
# Train for `epochs` passes over the data, one question per optimizer step.
# Batches come from the shuffling DataLoader rather than from iterating the
# Dataset directly: a Dataset is only iterable through the legacy
# __getitem__/IndexError protocol and would always be visited in file order.
for epoch in range(epochs):
    total_loss=0
    for question,answer in dataLoader:
        optimizer.zero_grad()
        # question: (1, seq_len) -> output: (1, vocab_size)
        output=model(question)
        # The target is the first token id of each answer, shape (batch,).
        loss=criteria(output,answer[:,0])
        loss.backward()
        optimizer.step()
        total_loss=total_loss+loss.item()
    # total_loss is the sum of the per-question losses of this epoch.
    # '.4f' prints four decimals; the previous '4f' only set a minimum width.
    print(f"Epochs:{epoch+1}, Loss:{total_loss:.4f}")

Epochs:1, Loss:521.883983
Epochs:2, Loss:453.657301
Epochs:3, Loss:374.704909
Epochs:4, Loss:315.736787
Epochs:5, Loss:264.640112
Epochs:6, Loss:216.184650
Epochs:7, Loss:171.690679
Epochs:8, Loss:133.133920
Epochs:9, Loss:101.791103
Epochs:10, Loss:77.710552
Epochs:11, Loss:59.867732
Epochs:12, Loss:46.850151
Epochs:13, Loss:37.360934
Epochs:14, Loss:30.372882
Epochs:15, Loss:25.132005
Epochs:16, Loss:21.117618
Epochs:17, Loss:17.982947
Epochs:18, Loss:15.494016
Epochs:19, Loss:13.482852
Epochs:20, Loss:11.818743


In [132]:
def predict(model,question,threshold=0.5):
    """Print the model's one-word answer to ``question``.

    The question is encoded with ``text_to_indices`` and the module-level
    ``vocab``, passed through ``model`` as a batch of one, and the highest
    softmax probability is compared with ``threshold``.

    Args:
        model: module mapping a ``(1, seq_len)`` id tensor to ``(1, vocab_size)`` logits.
        question: raw question text.
        threshold: minimum probability required to print an answer.

    Prints "I don't know" when the question has no tokens or when the best
    probability is below ``threshold``; otherwise prints the predicted
    vocabulary word. Returns None.
    """
    numerical_question=text_to_indices(question,vocab)
    if not numerical_question:
        # An empty id list would become a float tensor and break the embedding lookup.
        print("I don't know")
        return
    question_tensor=torch.tensor(numerical_question,dtype=torch.long).unsqueeze(0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output=model(question_tensor)
    probs=torch.nn.functional.softmax(output,dim=1)
    value,index=torch.max(probs,dim=1)
    if value.item()<threshold:
        print("I don't know")
        # Stop here so the low-confidence guess is not printed as well.
        return
    # vocab preserves insertion order, so position `index` is the word with that id.
    print(list(vocab.keys())[index.item()])

In [130]:
# Question containing words that are not in the vocabulary ('my', 'name').
predict(model,"what is my name")

I don't know


In [133]:
# Question taken from the training data (first row of the dataset).
predict(model,"what is the capital of france")

paris
