In [None]:
# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Installing libraries


In [None]:
!pip install faiss-cpu
!pip install transformers
!pip install torch
!pip install datasets
!pip install --upgrade transformers
!pip install nltk
!pip install -U sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

## Importing dependencies

In [None]:
# Libraries used by the loading, retrieval, generation and scoring cells below.
import faiss
import datasets
import transformers
import torch
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, RagRetriever, TFRagSequenceForGeneration
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# NOTE(review): `stemmer` and the stopwords corpus are set up here but are not
# referenced by any other cell visible in this notebook.
stemmer=PorterStemmer()
nltk.download('stopwords')
from sentence_transformers import SentenceTransformer
import datetime
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
# Sentence-embedding model; it is passed to check_similarity() in the last cell.
model_sentence_transformer = SentenceTransformer("nli-distilroberta-base-v2")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Downloading (…)7023f/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)433037023f/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)3037023f/config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)33037023f/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7023f/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)33037023f/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)037023f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## Reading the dataset

In [None]:
def read_dataset(file_path):
  """Load the spreadsheet at ``file_path`` into a pandas DataFrame.

  The path is handed to ``pd.read_excel`` unchanged, and whatever that call
  returns is given back to the caller.
  """
  return pd.read_excel(file_path)

### Input the question

In [None]:
def input_question():
    """Read one line from standard input and return it as the question text."""
    return input()

## Data Preprocessing

In [None]:
def preprocessing_data(dataframe,question):
    """Build a single-string answer corpus and lowercase the question.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an "Ideal Answer" column.
    question : str
        The question text.

    Returns
    -------
    tuple
        ``(corpus, ques)`` where ``corpus`` is a one-element list holding a
        single string: one leading space followed by every cleaned answer,
        each preceded by a space. ``ques`` is ``question`` lowercased.

    Notes
    -----
    Cleaning replaces every character that is not an ASCII letter with a
    space and lowercases the result. Missing cells (None/NaN, e.g. blank
    spreadsheet rows) are skipped instead of raising ``TypeError`` inside
    ``re.sub``; any other non-string cell is converted with ``str`` first.
    """
    cleaned_answers = []
    for answer in dataframe["Ideal Answer"]:
        if not isinstance(answer, str):
            if pd.isna(answer):
                # Nothing to add to the corpus for an empty cell.
                continue
            answer = str(answer)

        # Keep letters only, then normalise case.
        cleaned_answers.append(re.sub("[^a-zA-Z]", " ", answer).lower())

    # Same layout as repeated `corpus[0] += " " + answer`, but assembled once
    # rather than by growing a string inside the loop.
    corpus = [" " + "".join(" " + cleaned for cleaned in cleaned_answers)]

    ques = question.lower()
    return corpus, ques

## Model Building

#### RAG module

In [None]:
def training_model(model_name):
    """Load the tokenizer, retriever and TF RAG sequence model for ``model_name``.

    Despite the name, nothing is trained here: all three objects are created
    with ``from_pretrained``. The retriever is created with the "exact" index
    and ``use_dummy_dataset=True``; the model is created with that retriever
    attached and ``from_pt=True``.

    Returns
    -------
    tuple
        ``(tokenizer, retriever, model)`` in that order.
    """
    rag_tokenizer = AutoTokenizer.from_pretrained(model_name)

    retriever_options = {"index_name": "exact", "use_dummy_dataset": True}
    rag_retriever = RagRetriever.from_pretrained(model_name, **retriever_options)

    # Attaching the retriever lets the model retrieve and generate in a
    # single forward call.
    rag_model = TFRagSequenceForGeneration.from_pretrained(
        model_name,
        retriever=rag_retriever,
        from_pt=True,
    )
    return rag_tokenizer, rag_retriever, rag_model

In [None]:
def creating_input_dict_and_outputs(tokenizer,question,answers_all,model):
    """Tokenise the question/answer texts and run one forward pass.

    ``question`` and ``answers_all`` are passed, in that order, to
    ``tokenizer.prepare_seq2seq_batch`` with TensorFlow tensors, truncation
    and padding enabled. The resulting batch is then fed to ``model`` with
    ``output_retrieved=True``.

    Returns
    -------
    tuple
        ``(outputs, input_dict)`` - the model's forward result followed by
        the tokenised batch.

    NOTE(review): ``prepare_seq2seq_batch`` is reported as deprecated in newer
    transformers releases - confirm against the installed version.
    """
    batch_options = dict(return_tensors="tf", truncation=True, padding=True)
    encoded_batch = tokenizer.prepare_seq2seq_batch(
        question, answers_all, **batch_options
    )
    forward_result = model(encoded_batch, output_retrieved=True)
    return forward_result, encoded_batch

#### Retrieval Phase

In [None]:
def retrievers(input_dict,model,tokenizer,retriever):
    """Encode the question, retrieve documents and score them.

    Steps:
      1. Take ``input_dict["input_ids"]`` and run it through
         ``model.question_encoder``, keeping the first element of its output.
      2. Call ``retriever`` with the NumPy versions of the ids and of the
         encoder output, asking for TensorFlow tensors back.
      3. Score the documents with a matrix product between the encoder output
         (with an extra axis inserted at position 1) and the transposed
         ``retrieved_doc_embeds``, then squeeze axis 1 away again.

    ``tokenizer`` is accepted but not used by this function.

    Returns
    -------
    tuple
        ``(input_ids, question_hidden_states, docs_dict, doc_scores)``.
    """
    input_ids = input_dict["input_ids"]

    # 1. Encode the question.
    question_hidden_states = model.question_encoder(input_ids)[0]

    # 2. Retrieve documents for it.
    docs_dict = retriever(
        input_ids.numpy(),
        question_hidden_states.numpy(),
        return_tensors="tf",
    )

    # 3. Score each retrieved document against the question encoding.
    query_with_axis = tf.expand_dims(question_hidden_states, axis=1)
    raw_scores = tf.matmul(
        query_with_axis, docs_dict["retrieved_doc_embeds"], transpose_b=True
    )
    doc_scores = tf.squeeze(raw_scores, axis=1)

    return input_ids, question_hidden_states, docs_dict, doc_scores

#### Generation phase

In [None]:
def generator(input_ids,question_hidden_states,docs_dict,doc_scores,model,input_dict,tokenizer):
    """Generate answer text from documents that were already retrieved.

    Parameters
    ----------
    input_ids, question_hidden_states, input_dict
        Accepted so existing callers keep working; not used for generation.
    docs_dict : mapping
        Must provide "context_input_ids" and "context_attention_mask".
    doc_scores
        Scores of the retrieved documents, forwarded to ``model.generate``.
    model
        Object exposing ``generate(context_input_ids=..., context_attention_mask=...,
        doc_scores=...)``.
    tokenizer
        Object exposing ``batch_decode``.

    Returns
    -------
    The value of ``tokenizer.batch_decode(generated, skip_special_tokens=True)``
    (a list of decoded strings in this notebook).

    Notes
    -----
    An earlier version also ran a full forward pass with
    ``decoder_input_ids=input_dict["labels"]`` and threw the result away.
    That call did not influence the returned text, cost an extra model
    evaluation, and made generation fail when ``input_dict`` had no "labels"
    entry, so it has been removed.
    """
    generated = model.generate(
        context_input_ids=docs_dict["context_input_ids"],
        context_attention_mask=docs_dict["context_attention_mask"],
        doc_scores=doc_scores,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

#### Adding the similarity check

In [None]:
def check_similarity(generated_string,dataframe,question,model):
    """Score the generated answer against the dataset answer of the closest question.

    The dataset question with the highest cosine similarity to ``question``
    is located (the first one wins on ties), and the generated answer is then
    compared with that row's "Ideal Answer".

    Parameters
    ----------
    generated_string : str or list/tuple of str
        The generated answer. A list or tuple (such as the output of
        ``batch_decode``) is joined with single spaces into one string before
        it is embedded.
    dataframe : pandas.DataFrame
        Must contain "Question" and "Ideal Answer" columns.
    question : str
        The user's question.
    model
        Object with an ``encode(list_of_sentences)`` method that returns one
        embedding per sentence (a SentenceTransformer in this notebook).

    Returns
    -------
    float
        Cosine similarity between the two answers, multiplied by 100.

    Raises
    ------
    ValueError
        If ``dataframe`` has no rows.
    """
    # Hand `encode` a flat list of strings rather than a list nested in a list.
    if isinstance(generated_string, (list, tuple)):
        generated_string = " ".join(str(part) for part in generated_string)

    # Work on positional lists: `dataframe[col][i]` looks rows up by index
    # label, which fails (or picks the wrong row) when the frame does not
    # carry the default 0..n-1 index.
    known_questions = dataframe["Question"].tolist()
    known_answers = dataframe["Ideal Answer"].tolist()
    if not known_questions:
        raise ValueError("dataframe contains no questions to compare against")

    # Embed the user's question once, together with every dataset question,
    # instead of re-encoding it for each row.
    question_embeddings = model.encode([question] + known_questions)
    query_embedding = question_embeddings[0]
    question_similarity_score = [
        1 - distance.cosine(candidate_embedding, query_embedding)
        for candidate_embedding in question_embeddings[1:]
    ]

    # Position of the most similar dataset question.
    best_position = question_similarity_score.index(max(question_similarity_score))
    answer_from_data = known_answers[best_position]

    # Compare the generated answer with the dataset answer for that question.
    answer_embeddings = model.encode([generated_string, answer_from_data])
    answer_similarity = 1 - distance.cosine(answer_embeddings[0], answer_embeddings[1])
    return answer_similarity * 100

## Running the pipeline

In [None]:
# Inputs for this run - edit these to point at a different spreadsheet or
# RAG checkpoint instead of changing the calls below.
DATASET_PATH = "/content/drive/MyDrive/colab/Stealth/SampleQuestions.xlsx"
RAG_MODEL_NAME = "facebook/rag-sequence-nq"

# Question/answer spreadsheet as a DataFrame.
df = read_dataset(DATASET_PATH)
# Tokenizer, retriever and RAG model, all loaded with from_pretrained.
tokenizer, retriever, model_train = training_model(RAG_MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may res

Downloading builder script:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Downloading and preparing dataset wiki_dpr/dummy.psgs_w100.nq.no_index to /root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.no_index-dummy=True,with_index=False/0.0.0/74d4bff38a7c18a9498fafef864a8ba7129e27cb8d71b22f5e14d84cb17edd54...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.69G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset wiki_dpr downloaded and prepared to /root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.no_index-dummy=True,with_index=False/0.0.0/74d4bff38a7c18a9498fafef864a8ba7129e27cb8d71b22f5e14d84cb17edd54. Subsequent calls will reuse this data.
Downloading and preparing dataset wiki_dpr/dummy.psgs_w100.nq.exact to /root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.exact-ce970d5f816ae529/0.0.0/74d4bff38a7c18a9498fafef864a8ba7129e27cb8d71b22f5e14d84cb17edd54...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset wiki_dpr downloaded and prepared to /root/.cache/huggingface/datasets/wiki_dpr/dummy.psgs_w100.nq.exact-ce970d5f816ae529/0.0.0/74d4bff38a7c18a9498fafef864a8ba7129e27cb8d71b22f5e14d84cb17edd54. Subsequent calls will reuse this data.


  0%|          | 0/10 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRagSequenceForGeneration: ['rag.generator.final_logits_bias', 'rag.generator.model.decoder.embed_tokens.weight', 'rag.generator.model.encoder.embed_tokens.weight', 'rag.question_encoder.question_encoder.bert_model.embeddings.position_ids', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing TFRagSequenceForGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRagSequenceForGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRagSequence

In [None]:
# Read the question typed by the user.
question = input_question()
# `answer` receives the one-element answer corpus built from df["Ideal Answer"];
# `question` is rebound to its lowercased form.
answer, question = preprocessing_data(df,question)

What is the cost/fees of a PAN card?


In [None]:
# Tokenise the question (the answer corpus is passed as the second text argument)
# and run a forward pass with output_retrieved=True.
outputs, input_dict = creating_input_dict_and_outputs(tokenizer,question,answer,model_train)
# Encode the question, retrieve documents and compute their scores.
# NOTE(review): training_model() loads the retriever with use_dummy_dataset=True and
# `df` is never handed to it, so the retrieved documents do not come from the
# spreadsheet - confirm this is intended.
input_ids,question_hidden_states,docs_dict,doc_scores = retrievers(input_dict,model_train,tokenizer,retriever)
# Generate from the retrieved documents and decode the result to text.
generated_string = generator(input_ids,question_hidden_states,docs_dict,doc_scores,model_train,input_dict,tokenizer)
# Display the decoded output as the cell result.
generated_string

[' us $ 299.99']

### Verifying the similarity score

In [None]:
# Cosine similarity (x100) between the generated answer and the "Ideal Answer" of the
# dataset question most similar to the user's question.
check_similarity(generated_string,df,question,model_sentence_transformer)

34.39426124095917