## QA summarization

1. Convert data (answer source) to embeddings.
2. Convert questions to embeddings.
3. Calculate similarity between question and probable answers.
4. Extract top-k most relevant texts (with higher similarity).
5. Generate the answer using a pre-trained language model (T5).

In [None]:
# Load the dataset with possible answers.

import pandas as pd

# CSV holding the candidate answer texts; its `text` column is consumed
# by the filtering step that follows.
df = pd.read_csv('data/migri_data.csv')
df.shape  # (rows, columns) of the loaded table

In [None]:
# Filter out non-relevant texts (such as questions or introductions):
# any text ending with ':' is treated as a heading rather than an answer.
# Non-string values (e.g. NaN from empty CSV cells) and empty strings are
# skipped as well; indexing them with text[-1] would raise TypeError or
# IndexError and abort the whole cell.
answer_texts = [
    text
    for text in df.text
    if isinstance(text, str) and text and not text.endswith(':')
]

len(answer_texts)  # number of texts kept as candidate answers

In [None]:
# Load the sentence-embedding model.
# The same model instance is used to encode both the answer texts and the
# question, so their vectors can be compared directly.

from sentence_transformers import SentenceTransformer
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Convert the candidate answer texts to embeddings.
# embeddings[i] corresponds to answer_texts[i]; later cells rely on this
# shared index to map similarity scores back to texts.

embeddings = emb_model.encode(answer_texts)

In [None]:
# Convert the question to an embedding.
question = 'How can I get Finnish citizenship?'
# encode() receives a one-element list, so the question vector is q_emb[0].
q_emb = emb_model.encode([question])

### Similarity calculation

In [None]:
# Calculate cosine similarity between the question and every candidate text.

from sklearn.metrics.pairwise import cosine_similarity

# similarity maps text index -> cosine similarity with the question.
# A single vectorised call scores the question against all embeddings at
# once (one row of scores, one column per text) instead of invoking
# cosine_similarity separately for each text. With no candidate texts the
# mapping is simply empty.
if len(embeddings):
    similarity = dict(enumerate(cosine_similarity(q_emb, embeddings)[0]))
else:
    similarity = {}

In [None]:
# Inspect the mapping of text index -> similarity score.
similarity

In [None]:
# Rank the (text index, score) pairs from most to least similar.
sorted_x = list(similarity.items())
sorted_x.sort(key=lambda pair: pair[1], reverse=True)
sorted_x

### Source of the possible answer

In [None]:
# Keep the indexes of the 10 highest-ranked texts (fewer if less are available).
answers_index = [text_idx for text_idx, _score in sorted_x[:10]]
answers_index

In [None]:
# Build one bulk answer text from the most relevant texts.
# Every text is prefixed with a single space, so the result starts with a space.
text_answer = ''.join(' ' + str(answer_texts[idx]) for idx in answers_index)

In [None]:
# Length of the concatenated source text, in characters.
len(text_answer)

### Generation

In [None]:
# Load the tokenizer and the encoder-decoder model used for generation.
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is
# the documented replacement for encoder-decoder checkpoints such as T5.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE(review): the checkpoint id is usually written in lower case ('t5-small');
# confirm that 'T5-small' resolves in the target environment.
tokenizer = AutoTokenizer.from_pretrained('T5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('T5-small', return_dict=True)

In [None]:
# Encode the source answer text for the model.
# T5 selects its task from a text prefix; the summarization prefix is
# "summarize: " (the previous spelling "sumarize: " is not a known task prefix).
# Input longer than 512 tokens is truncated.
inputs = tokenizer.encode(
    "summarize: " + text_answer,
    return_tensors='pt',
    max_length=512,
    truncation=True,
)

In [None]:
# Generate the answer.
# min_length / max_length bound the length of the generated sequence
# (presumably counted in tokens — see the transformers generate() docs).
output = model.generate(inputs, min_length=80, max_length=100)

In [None]:
# Decode and print the answer.
# output[0] is the first (only) generated sequence. skip_special_tokens=True
# strips markers such as <pad> and </s>, which would otherwise appear in the
# printed text.
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print(summary)