# Environment setup

In [None]:
!pip install transformers

In [None]:
# faiss (Facebook AI Similarity Search) installation is best with conda
!pip install -q condacolab
import condacolab
condacolab.install()
!conda install -c pytorch faiss-gpu

In [None]:
!pip install datasets
!pip install gradio

In [None]:
!pip install transformers[onnx]

# Semantic search model: create context

In [9]:
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering

In [10]:
from transformers import pipeline

In [None]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files="https://raw.githubusercontent.com/vietanh00/research_paper_query/main/data/arxiv_papers.csv", split="train")
#only keep the title and abstract columns; remove rows with abstract < 30 since those likely won't help
dataset = dataset.filter(lambda x: len(x["abstract"]) > 30)
dataset_columns = dataset.column_names
col_to_keep = ["title", "abstract"]
col_to_delete = ["published","authors","url"]
dataset = dataset.remove_columns(col_to_delete)
dataset

In [None]:
from transformers import AutoTokenizer, TFAutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)

In [14]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    encoded_input = {k: v for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [15]:
embedding = get_embeddings(dataset["abstract"][0])
embedding.shape

TensorShape([1, 768])

In [16]:
embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["abstract"]).numpy()[0]}
)



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
import faiss
# faiss fails for some reasons
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
question = "What is regression?"
question_embedding = get_embeddings([question]).numpy()
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [52]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)
relevant_title = []
relevant_abstract = []
for _, row in samples_df.iterrows():
    relevant_title.append(row.title)
    relevant_abstract.append(row.abstract)

# Question-answer: distilBERT from created context

In [62]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering, pipeline
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

context_list = relevant_abstract

inputs = tokenizer(question, relevant_abstract[0], return_tensors="tf")
outputs = model(**inputs)

answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

q_a = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad')
print("="*50)
print(f">>Possible answers for \"{question}\" include: ")
answer_list = []
for ra in relevant_abstract:
  result = q_a(question=question, context=ra)
  answer_list.append(result['answer'])
print(" -- ".join(answer_list))
print(">>Works regarding this subject include: ")
for rt in relevant_title:
  print(">>>>", rt)

Some layers from the model checkpoint at distilbert-base-uncased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased-distilled-squad and are newly initialized: ['dropout_433']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>>Possible answers for "What is regression?" include: 
a gap -- Logic artificial intelligence -- significant discontent -- it has been
debated whether humans are able to create intelligence using technology -- exponential growth
>>Works regarding this subject include: 
>>> Independent Ethical Assessment of Text Classification Models: A Hate Speech Detection Case Study
>>> Design of quantum optical experiments with logic artificial intelligence
>>> Symbols as a Lingua Franca for Bridging Human-AI Chasm for Explainable and Advisable AI Systems
>>> An argument for the impossibility of machine intelligence
>>> A brief history of AI: how to prevent another winter (a critical review)


# Put gradio

In [46]:
import gradio as gr
def answer(question):
  question_embedding = get_embeddings([question]).numpy()
  scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5)
  
  samples_df = pd.DataFrame.from_dict(samples)
  samples_df["scores"] = scores
  samples_df.sort_values("scores", ascending=False, inplace=True)
  relevant_title = []
  relevant_abstract = []
  for _, row in samples_df.iterrows():
      relevant_title.append(row.title)
      relevant_abstract.append(row.abstract)

  q_a = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad')
  output = ""
  output += ">>Possible answers for \"" + question + "\" include: \n"
  answer_list = []
  for ct in relevant_abstract:
    result = q_a(question=question, context=ct)
    answer_list.append(result['answer'])
  output += " -- ".join(answer_list)
  output += "\n>>Works regarding this subject include: ", ", ".join(relevant_title)
  return output

In [None]:
demo = gr.Interface(fn=answer, inputs="text", outputs="text")

demo.launch(share=True, debug=True)  

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://d9e3558ca2bd9cd9e8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces
