In [1]:
# Importing everything

In [1]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import BSHTMLLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

import os
from tqdm import tqdm

from app.chains import get_chunk_classification_chain
from app.consts import LLM, EMBEDDING_MODEL
from app.consts import PINCONE_VECTORSTORE_INDEX_NAME

from dotenv import load_dotenv
load_dotenv()


True

### Defining Functions

In [11]:
def load_docs(folder, d_loader):
    """Load every file that sits directly inside ``folder`` into documents.

    Args:
        folder: Path of the directory to scan. Sub-directories are skipped,
            not recursed into.
        d_loader: Mapping from file extension (text after the last ``.`` in
            the file name, without the dot) to a loader class. The class is
            called with the file path and must expose ``load()`` returning a
            list of documents that each carry a ``metadata`` dict.

    Returns:
        A flat list with the documents of all loaded files. Every document
        gets ``metadata['person_name']`` set to the part of the file name
        that precedes the first space.

    Raises:
        KeyError: If a file's extension is not a key of ``d_loader``
            (assuming ``d_loader`` is a plain dict).
    """
    l_docs_whole = []
    for file_name in tqdm(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)

        # Process only files directly in the folder. A sub-folder must be
        # skipped with `continue`; `break` would stop the whole scan and drop
        # every file that happens to be listed after the first folder.
        if os.path.isdir(file_path):
            continue

        file_format = file_name.split('.')[-1]
        user_name = file_name.split(' ')[0]

        docs_whole = d_loader[file_format](file_path).load()

        # Add the necessary metadata to each document of this file.
        for doc in docs_whole:
            doc.metadata['person_name'] = user_name

        l_docs_whole.extend(docs_whole)
    return l_docs_whole

def split_docs(l_docs_whole, splitter):
    """Split whole documents into chunks.

    Delegates to ``splitter.split_documents`` and returns its result
    unchanged.
    """
    return splitter.split_documents(l_docs_whole)


def examine_docs(l_docs):
    """Print each document's text, then its metadata, then a dashed divider."""
    divider = '---'*20
    for doc in l_docs:
        print(doc.page_content, doc.metadata, divider, sep='\n')

def tag_docs(l_docs, clf_chain):
    """Attach a classification tag to every chunk, in place.

    Args:
        l_docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict.
        clf_chain: Object whose ``invoke(text)`` returns a mapping with a
            ``'classification'`` entry.

    Returns:
        The same ``l_docs`` object that was passed in; each document now has
        ``metadata['clf']`` set to the chain's classification.
    """
    for doc in l_docs:
        clf_obj = clf_chain.invoke(doc.page_content)
        # The tag is written into the document's own metadata dict (no copy).
        # NOTE(review): this assumes chunks do not share one metadata dict —
        # if they did, every chunk would end up with the last tag. Confirm
        # that the splitter gives each chunk its own metadata.
        doc.metadata['clf'] = clf_obj['classification']
    return l_docs

def add_to_pinecone(l_docs_tagged):
    """Add the tagged chunks to the Pinecone index.

    The index name and embedding model come from ``app.consts``
    (``PINCONE_VECTORSTORE_INDEX_NAME`` and ``EMBEDDING_MODEL``).

    Per the notebook's own warning, run this only once per set of chunks:
    adding the same chunks again is expected to leave duplicates in the index.

    Args:
        l_docs_tagged: Documents to add to the index.

    Returns:
        The object returned by ``PineconeVectorStore.from_documents``, so the
        caller can inspect or query the store instead of losing the handle.
    """
    vectorstore = PineconeVectorStore.from_documents(
        l_docs_tagged,
        index_name=PINCONE_VECTORSTORE_INDEX_NAME,
        embedding=EMBEDDING_MODEL
    )
    return vectorstore

     

## Ingesting the documents
*We will go through the following steps, analysing the output after each step and making any changes as necessary.* 
- Loading all the documents
- Splitting the documents into chunks
- Tagging the documents; these metadata tags will be used during retrieval later in the RAG process
- If the results of all above steps seem fine, loading the documents into the vectorstore 


#### Step 1: Loading the documents - 
- May need to change the loader for file types, if ingestion is not proper 

In [10]:
# Testing the loader: map each file extension to the loader class used for it.
d_loader = {
    'txt': TextLoader,
    'pdf': PyPDFLoader,
    'htm': BSHTMLLoader,
    'html': BSHTMLLoader,
}
folder = './rag_data'
l_docs_whole = load_docs(folder=folder, d_loader=d_loader)
examine_docs(l_docs_whole)

100%|██████████| 3/3 [00:00<00:00, 18.50it/s]

  
Rashi Jain  
Manager , Strategy and M&A   
 
Email: jainrashi1202@gmail.com  | www. linkedin.com/in/rashisrcc |  Contact: +91-9718428432  
 Work Experience                                                                                                                                                                          
 
Deloitte Consulting USI  
Manager , Strategy and Mergers & Acquisitions   June '19 – Present ( 5 Years ) 
Conducted 5+ growth strategy projects, 5+ cost optimization, 10+ target identification, commercial and operational diligence,  3 post 
merger integration . Utilized financial modelling to drive data -driven decision -making  
Growth Strategy and Market Assessment  
• Conducted market assessment of chartering and fractional  aircraft market to assess the commercial and operational feasibility  
for an OEM to enter the business; identified  acquisition targets and crafted a GTM strategy including high -level S&M tactics  
• Developed growth pathways for a glo




#### Step 2: Splitting the documents
- May need to change the splitter type or its arguments if not happy with the splits

In [12]:

# Chunking parameters; tune these if the printed splits look wrong.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
l_docs_split = split_docs(l_docs_whole, splitter=splitter)
examine_docs(l_docs_split)

Rashi Jain  
Manager , Strategy and M&A   
 
Email: jainrashi1202@gmail.com  | www. linkedin.com/in/rashisrcc |  Contact: +91-9718428432  
 Work Experience                                                                                                                                                                          
 
Deloitte Consulting USI  
Manager , Strategy and Mergers & Acquisitions   June '19 – Present ( 5 Years )
{'source': './rag_data\\Rashi-Jain CV.pdf', 'page': 0, 'person_name': 'Rashi-Jain'}
------------------------------------------------------------
Manager , Strategy and Mergers & Acquisitions   June '19 – Present ( 5 Years ) 
Conducted 5+ growth strategy projects, 5+ cost optimization, 10+ target identification, commercial and operational diligence,  3 post 
merger integration . Utilized financial modelling to drive data -driven decision -making  
Growth Strategy and Market Assessment  
• Conducted market assessment of chartering and fractional  aircraft market 

#### Step 3: Tagging the documents
- Using the document classifier to identify which topics are discussed in each chunk. 
- In case of unsatisfactory results, may need to change the prompt (language, bucket descriptions), or LLM
- Started by using a small LLM (gpt-3.5 turbo), may need to change if results are not satisfactory   

In [13]:

# Build the chunk classifier from the configured LLM, then tag every chunk.
clf_chain = get_chunk_classification_chain(LLM)
l_docs_tagged = tag_docs(l_docs_split, clf_chain=clf_chain)
examine_docs(l_docs_tagged)

  llm_st = llm.with_structured_output(InputClassification)


Rashi Jain  
Manager , Strategy and M&A   
 
Email: jainrashi1202@gmail.com  | www. linkedin.com/in/rashisrcc |  Contact: +91-9718428432  
 Work Experience                                                                                                                                                                          
 
Deloitte Consulting USI  
Manager , Strategy and Mergers & Acquisitions   June '19 – Present ( 5 Years )
{'source': './rag_data\\Rashi-Jain CV.pdf', 'page': 0, 'person_name': 'Rashi-Jain', 'clf': ['Workex', 'Contact']}
------------------------------------------------------------
Manager , Strategy and Mergers & Acquisitions   June '19 – Present ( 5 Years ) 
Conducted 5+ growth strategy projects, 5+ cost optimization, 10+ target identification, commercial and operational diligence,  3 post 
merger integration . Utilized financial modelling to drive data -driven decision -making  
Growth Strategy and Market Assessment  
• Conducted market assessment of chartering an

#### Step 4: Adding them to the VectorStore
If the results of all above steps seem fine, the chunks will be added to the vectorstore

*VERY IMPORTANT NOTE: Only run this code once. Adding multiple instances of the same chunk can be detrimental to the RAG model*


*Note: To maintain uniformity across the app, the index name and the embedding model are specified in the app.consts file.*



In [14]:


# add_to_pinecone(l_docs_tagged)