## Load required packages

In [3]:


# Load required packages

!pip install sentence-transformers
!pip install -qU \
  pinecone-client \
  pinecone-datasets \
  sentence-transformers \
  PyPDF2 \



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Pdf link for pdf which i used to feed as document for this task
pdf is stored in the same folder  with the name ct.pdf
- https://www.cancer.org/content/dam/CRC/PDF/Public/6800.00.pdf

In [15]:

# importing required modules
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import torch
from pinecone import Pinecone , ServerlessSpec , PodSpec


def prepare_model(model_name):
  '''
      Load the pretrained model

      params:
        model_name : pretrained model name

      return: Pretrained model object

  '''

  # load the model and return it
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  pretrained_model = SentenceTransformer(model_name, device=device)
  return pretrained_model

def get_embeddings(model , sentece_list):
  # create the query vector
  encoded_array = model.encode(sentece_list)
  encoded_list = encoded_array.tolist()
  return encoded_list

def pdf_to_embeddings(pdf_path , pretrained_model ,min_page_no, max_page_no):
  '''
      convert pdf text sentences to embeddings

      params:
        pdf_path : path of pdf
        pretrained_model: pretrained model object
        min_page_no: initial page no
        max_page_no: final page no

      return: embedding list with required metadata

  '''
  reader = PdfReader(pdf_path)
  records_to_search = []
  sentece_list_meta = []
  sentence_num = 0

  for page_ind , page in  enumerate(reader.pages[min_page_no-1:max_page_no]):
    # extracting text from page
    text = page.extract_text()
    paragraph_list = text.split('.\n')

    # create sentece embeddings and respected meta data
    for index_para, text_para in enumerate(paragraph_list):
      para_id = f'Paragraph_{page_ind}_{index_para}'
      # print(f'Paragraph_{page_ind}_{index_para}' ,text_para )
      sentece_text_list = text_para.replace('\n','').split('. ')
      sentece_embedding_list = get_embeddings(pretrained_model , sentece_text_list)
      for sentence_ind ,  sentence_val in enumerate(sentece_text_list):
        metadata_dict = {}
        sentence_value_dict = {}
        id = f'sentence_{sentence_num}'

        sentence_value_dict['id']=id
        sentence_value_dict['values']=sentece_embedding_list[sentence_ind]

        metadata_dict['page_index'] =min_page_no + page_ind
        metadata_dict['sentence_text'] = sentece_text_list[sentence_ind]
        metadata_dict['paragraph_id'] = para_id
        sentence_value_dict['metadata'] = metadata_dict

        sentece_list_meta.append(sentence_value_dict)
        sentence_num += 1

  print(f'embeddings and metadata for text in page {min_page_no} to page {max_page_no} are generated')
  return sentece_list_meta

def create_pinecone_index(api_key , index_name , embedding_dim , similarity_metric):
  '''
      create pinecone vector database

      params:
        api_key : account api key
        index_name: database index name
        embedding_dim: each vector size
        similarity_metric: similarity metric for search

      return: database instance

  '''
  # create the pinecone instance with provided vector size and similarity metric
  pc = Pinecone(api_key=api_key)
  active_indexes = [i['name'] for  i  in  pc.list_indexes()]
  if index_name in active_indexes:
    pc.delete_index(index_name)

  pc.create_index(
    name=index_name,
    dimension=embedding_dim,
    metric=similarity_metric, # cosine
    spec=PodSpec(
      environment="gcp-starter"
    )
  )

  index = pc.Index(index_name)
  return index

def generate_question_embedding(pretrained_model , prompt):
    '''
      create embeddings for question text

      params:
        pretrained_model : pretrained model object
        prompt: question text

      return: qustion embeddings

    '''
    #create question embedding from pretrained model
    que_emb = get_embeddings(pretrained_model , [prompt])
    que_emb = que_emb[0]
    return que_emb

def search_topk_similar_records(query_vector ,  pinecone_index , top_k):
    '''
      fetch the top results with respect schemantic similarity

      params:
        query_vector : question embedding vector
        pinecone_index: vector database where document text embeddings are stored

      return: top k results with metadata

    '''
    # search topk result with the type of score set in vector dataset
    results = pinecone_index.query( vector = query_vector , top_k = top_k , include_values=True ,  include_metadata=True)
    return results

def store_document_to_dataset(document_path , pretrained_model , pinecone_index , min_page_no , max_page_no ):
    '''
      create the embeddings for document and store in pinecone blank dataset

      params:
        document_path : path for pdf file
        pretrained_model : pretrained model object
        pinecone_index: vector database where document text embeddings are stored
        min_page_no: initial page no
        max_page_no: final page no

      return: top k results with metadata

    '''
    # convert pdf to embeddings
    vectors_metadata = pdf_to_embeddings(pdf_path , pretrained_model ,min_page_no , max_page_no  )

    # fit document embeddings in vector database
    pinecone_index.upsert(
      vectors=vectors_metadata
    )


# Define a function to generate a response to the user's query
def q_and_a_chatbot(question_text, pretrained_model , database_loaded_index , top_k , similarity_threshould = 0.3):
    '''
      provide the top k answers according to similarity metric
      provide message if not found any sentence above certain threshould

      params:
        question_text : text of the question
        pretrained_model : pretrained model object
        database_loaded_index: vector database instance where document text embeddings are stored
        top_k: no of top responses

      return: top k results with required structure

    '''
    result_dict_list =[]

    # create the question embbeding
    query_vector = generate_question_embedding(pretrained_model , question_text)

    # get the most similar vectors to the user's query vector
    top_similar_results = search_topk_similar_records(query_vector ,  pinecone_index , top_k)

    # instruction if blank array produced
    if top_similar_results['matches'] == []:
      print('Rerun the cell and reduce the top k number, as pinecode free version sometimes give blank result')
      return result_dict_list


    top_similar_answers = [{'score':i['score'],'metadata':i['metadata']} for i in top_similar_results['matches']]
    score_arr = [i['score'] for i in top_similar_results['matches']]

    # if no schemantic similarity present in the sentence likely there is no answer in it
    if max(score_arr) < similarity_threshould:
      print("I don't know the answer.")
      return result_dict_list

    # return top k results and thier scores

    for result_ind , result_dict in enumerate(top_similar_results['matches']):
        result_dict_list.append({'rank':int(result_ind+1) , 'score': result_dict['score'] , 'response_text':result_dict['metadata']['sentence_text'] })

    return result_dict_list

## Load the document devide it into sentences and store document embeddings in pinecone database

In [24]:


# context level info
pdf_path = 'ct.pdf'
min_page_no = 5
max_page_no = 7

# database level info
index_name = 'closeddomainqanda'
api_key='01d05f63-7f48-4f3a-b1be-92cab459a713'

#ebedding size of pretrained model
embedding_dim= 384

# model which used to create embeddings , i choose multi-qa-MiniLM-L6-cos-v1 because it is trained for q and a task, so it will produce better embeddings with respect to our task objective
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'

# choose dotproduct to select most relative text , available scores in pinecone are ['cosine', 'euclidean', 'dotproduct']
similarity_metric="dotproduct"

# choose the minimum score threshould for valid answers
similarity_threshould = 0.3

# load the pretrained model
pretrained_model = prepare_model(model_name)

#create blank vector database
pinecone_index = create_pinecone_index(api_key , index_name , embedding_dim , similarity_metric)

#create and store document embeddings to search the answers of questions in created vector database
store_document_to_dataset(pdf_path , pretrained_model , pinecone_index , min_page_no , max_page_no )

embeddings and metadata for text in page 5 to page 7 are generated


## Questions and Answers

### Example questions and top k responses

#### Good example where answers are present

In [25]:
# ask questions for which text is availabe in document which having similarity score above certain threshould
question_text = 'how many patients with the same type of cancer get the new treatment in a phase II study ?'
top_k_searches = 5
# get the top responses from the document wich is above the defined threshould
# you can also tune the top k search number and if you dont get any answer you can lower the similarity_threshould
q_and_a_chatbot(question_text, pretrained_model , pinecone_index , top_k_searches , similarity_threshould)

[{'rank': 1,
  'score': 0.730267704,
  'response_text': 'American Cancer Society cancer.org | 1.800.227.2345 ____________________________________________________________________________________A group of 25 to 100 patients with the same type of cancer get the new treatment ina phase II study'},
 {'rank': 2,
  'score': 0.631570339,
  'response_text': 'Phase II clinical trials: Does the treatment work?\xa0\xa0If a new treatment is found to be safe in phase I clinical trials, a phase II clinical trial isdone to see if it works in certain types of cancer'},
 {'rank': 3,
  'score': 0.627096653,
  'response_text': 'These groups mayget different doses or get the treatment in different ways to see which provides thebest balance of safety and response.●Placebos (inactive treatments) are not\xa0used in phase II trials.●Phase II studies may be done at major cancer centers, community hospitals oreven doctors’ offices.●Larger numbers of patients get the treatment in phase II trials, so less common 

### bad exambple where answer is not present in document

In [27]:
# ask questions which are out of context
question_text = 'how many states are present in india'
top_k_searches = 5
q_and_a_chatbot(question_text, pretrained_model , pinecone_index , top_k_searches)

I don't know the answer.


[]