### DocuTalk - AI: Advanced AI App for Retrieval-Augmented Generation (RAG) over Documents

In [2]:
# libraries

# Huggingface 
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# langchain - RAG
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

from langchain_community.document_loaders import Docx2txtLoader

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

# Gradio App
import gradio as gr

from docx import Document 
import json
import os


#### Step by Step

In [3]:
# Step 1:
# Load the raw text file into `documents`; chunking happens in Step 2.
loader = TextLoader("sample.txt")
documents = loader.load()


In [4]:
# Step 2:
# Split the loaded documents into chunks on newlines, targeting 500
# characters per chunk with a 20-character overlap.
# NOTE(review): the "Created a chunk of size ..." warnings in the output are
# presumably caused by single lines longer than chunk_size - confirm.
text_splitter = CharacterTextSplitter(separator="\n",
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False)

# `docs` is the chunk list that gets embedded and indexed in Step 3.
docs = text_splitter.split_documents(documents)

Created a chunk of size 675, which is longer than the specified 500
Created a chunk of size 559, which is longer than the specified 500


In [5]:
# Number of chunks produced by the splitter.
len(docs)

19

In [6]:
# Step 3: Embed the chunks and store the vectors in a vector DB
# Create the open-source embedding function (sentence-transformers model
# "all-MiniLM-L6-v2"); it is reused by final_ner() when indexing uploads.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [7]:
# Embed the chunks with `embedding_function` and load them into Chroma.
# `db` is the notebook-global index that final_rag() searches.
db = Chroma.from_documents(docs, embedding_function)

In [None]:
# Sample query: retrieve the chunks most similar to a question.
query = "Who is Yeshwanth Sai?"

# Bind the hits to their own name. Rebinding `docs` here would overwrite the
# chunk list from Step 2, so re-running the Chroma cell afterwards would index
# the search hits instead of the document chunks.
matches = db.similarity_search(query)

# Print the best hit, or a notice when the search returned nothing.
print(matches[0].page_content if matches else "No matching chunk found.")

### Gradio App

In [8]:
# NER Model
# Hugging Face "ner" pipeline backed by dslim/bert-base-NER; every
# entity-extraction helper below calls it through `get_completion`.
get_completion = pipeline("ner", model="dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Helper functions
def merge_tokens(tokens):
    """Merge consecutive NER tokens that belong to the same entity.

    A token whose tag starts with ``I-`` is folded into the previous merged
    token when that token's tag ends with the same entity type (for example
    ``I-PER`` after ``B-PER`` or ``I-PER``). Any other token starts a new
    entry.

    Args:
        tokens: Iterable of dicts with at least the keys ``entity``,
            ``word``, ``end`` and ``score``; ``start`` is used when present.
            The input dicts are left unmodified.

    Returns:
        A list of new dicts, one per merged entity, with every value
        converted via ``str()`` (presumably so the result can be serialised
        as JSON - confirm against the caller).
    """
    merged = []
    for token in tokens:
        entity = token['entity']
        previous = merged[-1] if merged else None
        continues_previous = (
            previous is not None
            and entity.startswith('I-')
            and previous['entity'].endswith(entity[2:])
        )

        if not continues_previous:
            # Copy so that later merges never write into the caller's dicts.
            merged.append(dict(token))
            continue

        word = token['word']
        if word.startswith('##'):
            # "##" marks a sub-word piece: append it without a separator.
            glue, piece = '', word[2:]
        else:
            # A whole-word continuation ("New" + "York") needs a space unless
            # the character offsets show the pieces touch (e.g. "O" + "'").
            previous_end = previous.get('end')
            touching = previous_end is not None and previous_end == token.get('start')
            glue, piece = ('' if touching else ' '), word

        previous['word'] += glue + piece
        previous['end'] = token['end']
        # Running pairwise average, not the arithmetic mean of all pieces.
        previous['score'] = (previous['score'] + token['score']) / 2

    return [{key: str(value) for key, value in entry.items()} for entry in merged]

def ner(input):
    """Run the NER pipeline on a string and return it with its entities.

    Args:
        input: Text passed unchanged to ``get_completion``.

    Returns:
        A dict with the original text under ``"text"`` and the output of
        ``merge_tokens`` under ``"entities"``.
    """
    raw_entities = get_completion(input)
    return {"text": input, "entities": merge_tokens(raw_entities)}



In [10]:
def ner_doc(f):
    """Extract named entities from a ``.docx`` file.

    Args:
        f: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` yields an empty
            result.

    Returns:
        The merged entity list from ``merge_tokens``; ``[]`` when no file is
        given or the document contains no text.
    """
    if f is None:
        return []

    # Accept both file wrappers (with .name) and plain path strings. The path
    # is used as-is: doubling backslashes is unnecessary and corrupts UNC
    # paths such as \\server\share.
    path = getattr(f, "name", f)

    # The context manager closes the handle even if parsing raises.
    with open(path, 'rb') as handle:
        document = Document(handle)

    # Join paragraphs with a newline so the last word of one paragraph is not
    # glued to the first word of the next before tokenisation.
    content = "\n".join(para.text for para in document.paragraphs)
    if not content.strip():
        return []

    # Run NER over the whole text and merge sub-token predictions.
    return merge_tokens(get_completion(content))

In [11]:
def final_ner_(f):
    """Extract named entities from a ``.docx`` file, keeping the parsed doc.

    Side effect: the parsed document is stored in the notebook-global
    ``document``.

    Args:
        f: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` yields an empty
            result and leaves ``document`` untouched.

    Returns:
        The merged entity list from ``merge_tokens``; ``[]`` when no file is
        given or the document contains no text.
    """
    global document

    if f is None:
        return []

    # Accept both file wrappers (with .name) and plain path strings. The path
    # is used as-is: doubling backslashes is unnecessary and corrupts UNC
    # paths such as \\server\share.
    path = getattr(f, "name", f)

    # The context manager closes the handle even if parsing raises.
    with open(path, 'rb') as handle:
        document = Document(handle)

    # Join paragraphs with a newline so the last word of one paragraph is not
    # glued to the first word of the next before tokenisation.
    content = "\n".join(para.text for para in document.paragraphs)
    if not content.strip():
        return []

    # Run NER over the whole text and merge sub-token predictions.
    return merge_tokens(get_completion(content))

In [12]:
def final_ner(f=None):
    """Index an uploaded ``.docx`` for RAG and return its named entities.

    Side effect: rebinds the notebook-global ``db`` to a Chroma index built
    from the uploaded document, which is the index ``final_rag`` searches.

    Args:
        f: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            yields an empty result and leaves ``db`` untouched.

    Returns:
        The merged entity list from ``merge_tokens``; ``[]`` when no file is
        given or the document contains no text.
    """
    # Without this declaration the new index would be a local variable and
    # final_rag() would keep answering from the previous one.
    global db

    if f is None:
        return []

    # Accept both file wrappers (with .name) and plain path strings. The path
    # is used as-is: doubling backslashes is unnecessary and corrupts UNC
    # paths such as \\server\share.
    path = getattr(f, "name", f)

    # The context manager closes the handle even if parsing raises.
    with open(path, 'rb') as handle:
        document = Document(handle)

    # Join paragraphs with a newline so the last word of one paragraph is not
    # glued to the first word of the next before tokenisation.
    content = "\n".join(para.text for para in document.paragraphs)

    # Build the vector index from the uploaded file.
    data = Docx2txtLoader(path).load()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )
    # Split the uploaded document, not the notebook-global `documents`
    # (sample.txt) that the original code indexed by mistake.
    chunks = text_splitter.split_documents(data)
    if chunks:
        # NOTE(review): no collection_name is passed; if Chroma's in-memory
        # client is shared between instances, chunks indexed earlier may stay
        # searchable - confirm, and pass a unique collection_name if so.
        db = Chroma.from_documents(chunks, embedding_function)

    if not content.strip():
        return []

    # Run NER over the whole text and merge sub-token predictions.
    return merge_tokens(get_completion(content))

        

In [13]:
# Step 6: LLM model
# Load the seq2seq model and its tokenizer; final_rag() uses both to
# generate answers.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [14]:
# RAG function
def final_rag(question):
    """Answer a question from the best-matching chunk in the vector store.

    Reads the notebook-globals ``db``, ``model`` and ``tokenizer``.

    Args:
        question: The user's question. ``None`` or a blank string returns a
            prompt to enter a question, without touching the index or model.

    Returns:
        The decoded model output (at most 50 new tokens), or a short notice
        when the question is blank or the search returns no chunk.
    """
    if not question or not question.strip():
        return "Please enter a question."

    docs = db.similarity_search(question)
    if not docs:
        # Guard the docs[0] access below against an empty result list.
        return "No relevant context was found for this question."

    # Only the top-ranked chunk is used as context.
    context = docs[0].page_content

    prompt = f"""Role: You are an expert in answering questions. You have given a context below. Carefully analyze the context.  
        #### Context: {context}
        
        Now answer the following question based on the above context:
        #### Question: {question}
        #### Answer:
        """
    inputs = tokenizer(prompt, return_tensors="pt")

    generated = model.generate(inputs["input_ids"], max_new_tokens=50)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [17]:
# Close any Gradio apps still running in this kernel before launching again
# (presumably so that port 3001 is free for the launch below).
gr.close_all()

In [18]:
# Tab 1: upload a file and list the named entities found in it.
demo_ner = gr.Interface(
    fn=final_ner,
    inputs=[gr.File(label="Upload file to find entities")],
    outputs=[gr.JSON(label="Entities")],
    title="DocuTalk - AI",
    description="Find key phrase information entities using the `dslim/bert-base-NER` model under the hood!",
    allow_flagging="never",
)

# Tab 2: ask a question that final_rag() answers from the vector index.
demo_rag = gr.Interface(
    fn=final_rag,
    inputs=[gr.Textbox(label="Ask question over uploaded document", lines=6)],
    outputs=[gr.Textbox(label="Result", lines=3)],
    title="DocuTalk - AI",
    description="QA over documents : ",
    allow_flagging="never",
)

# Combine both interfaces into a single tabbed app.
app = gr.TabbedInterface(
    interface_list=[demo_ner, demo_rag],
    tab_names=["Key Phrase information", "RAG over Docs"],
)

# share=True also publishes a temporary public URL in addition to the local one.
app.launch(share=True, server_port=3001)

Running on local URL:  http://127.0.0.1:3001
Running on public URL: https://30f94a86ed1a265ee6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Created a chunk of size 675, which is longer than the specified 500
Created a chunk of size 559, which is longer than the specified 500
