In [1]:
# ! pip install unstructured
# ! pip install sentence-transformers
# ! pip install markdown

In [5]:
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader
import os
from langchain.chat_models import ChatOllama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOllama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama

In [11]:
# Step 1: Load the markdown files for Mermaid Knowledge Base
# The directory can be overridden with the MERMAID_DOCS_PATH environment variable so the
# notebook is not tied to one machine; the original location remains the default.
path = os.environ.get(
    "MERMAID_DOCS_PATH",
    "/Users/nmg/Desktop/Lamp/mermaid-develop/docs/syntax/",
)
loader = NotionDirectoryLoader(path)
docs = loader.load()

# Fail early with an explicit message instead of an IndexError on docs[0] below.
if not docs:
    raise FileNotFoundError(f"No markdown documents were loaded from {path!r}")

# First document's raw text, kept for quick inspection.
md_file = docs[0].page_content

In [12]:
# Pair each markdown heading marker ("#", "##", "###") with the metadata key
# under which the splitter records that heading's text.
headers_to_split_on = [
    ("#" * depth, label)
    for depth, label in enumerate(("Main Header", "Section", "Sub Section"), start=1)
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [13]:
# Split every loaded document on its markdown headers and tag each resulting chunk
# with the diagram type, taken from the source file name up to its first ".".
md_header_splits_list = []  # one list of tagged chunks per source document
meta_data_types = []        # diagram type of each source document, in load order
for source_doc in docs:
    header_chunks = markdown_splitter.split_text(source_doc.page_content)
    diagram_type = os.path.basename(source_doc.metadata["source"]).split(".")[0]
    meta_data_types.append(diagram_type)

    tagged_chunks = []
    for header_chunk in header_chunks:
        # Rebuild the metadata dict rather than mutating the splitter's dict in place.
        header_chunk.metadata = {**header_chunk.metadata, "DiagramType": diagram_type}
        tagged_chunks.append(header_chunk)
    md_header_splits_list.append(tagged_chunks)

In [49]:
# md_header_splits_list[16][3]

In [50]:
# Length in characters of the longest header-level chunk across all documents
# (0 when there are no chunks at all).
max_length = max(
    (
        len(header_chunk.page_content)
        for doc_chunks in md_header_splits_list
        for header_chunk in doc_chunks
    ),
    default=0,
)

# print("Maximum length of md_header_splits:", max_length)

In [16]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Upper bound, in characters, for any chunk handed to the embedding step.
max_chunk_size = 512

# Character-based splitter with no overlap between consecutive pieces.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=0)

# Flatten the per-document header chunks into one list of Documents. Chunks longer than
# max_chunk_size are cut into smaller pieces; shorter ones are passed through unchanged.
# Every resulting Document gets its own copy of the parent chunk's metadata.
split_chunks = []
for doc_chunks in md_header_splits_list:
    for header_chunk in doc_chunks:
        if len(header_chunk.page_content) > max_chunk_size:
            pieces = text_splitter.split_text(header_chunk.page_content)
        else:
            pieces = [header_chunk.page_content]
        split_chunks.extend(
            Document(page_content=piece, metadata=header_chunk.metadata.copy())
            for piece in pieces
        )

In [51]:
# print(split_chunks[15].page_content)
# print(split_chunks[15].metadata)

In [18]:


import torch  # local import: only needed to pick the compute device for this cell

# Embedding model used for the text corpus
modelPath = "BAAI/bge-large-en-v1.5"

# Model configuration: run on the Apple-silicon GPU (MPS) when PyTorch reports it as
# available, otherwise fall back to the CPU so the notebook also runs on other machines.
_mps_backend = getattr(torch.backends, "mps", None)
_device = "mps" if _mps_backend is not None and _mps_backend.is_available() else "cpu"
model_kwargs = {'device': _device}

# Encoding options: 'normalize_embeddings' is left at False here;
# this is an experiment setting to tune based on usage.
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,       # Provide the pre-trained model's path
    model_kwargs=model_kwargs,  # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [19]:
# Embed every chunk and index it in a Chroma vector store.
db = Chroma.from_documents(documents=split_chunks, embedding=embeddings)

In [20]:
# Retriever returning the 8 most similar chunks for a query.
# Other search types worth trying: "mmr", "similarity_score_threshold".
retriever = db.as_retriever(search_type="similarity", search_kwargs=dict(k=8))

In [52]:

# Prompt for the diagram-type detection step: the model is shown the question and the
# comma-separated list of known diagram types and is asked for a single-word reply.
diagram_type_options = ",".join(meta_data_types)
template = "\n".join([
    "Instruction: Use the following pieces of context to answer the question at the end. ",
    "Provided a question {question}, Detect the type of diagram to be generated from the list "
    + diagram_type_options,
    "and output the detected type. ",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. ",
    "Output: Provide a single word reply.",
    "Question: {question}",
    "Helpful Answer:",
])
QA_CHAIN_PROMPT = PromptTemplate(template=template, input_variables=["question"])

In [53]:
# Chat model for the diagram-type detection step: temperature 0, with generated
# tokens streamed to stdout through the callback handler.
chat_model1 = ChatOllama(
    model="codellama:34b-instruct",
    temperature=0,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [54]:
# Chain: ask the model which diagram type the question is about.
question = "How do I draw a top down flow chart. Write an example for the same?"
llm_chain = LLMChain(
    llm=chat_model1,
    prompt=QA_CHAIN_PROMPT
)

# The raw reply can carry surrounding whitespace (the streamed output starts with a
# space), so strip it before the value is used as a metadata value downstream.
meta_data_type = llm_chain({"question": question}, return_only_outputs=True)['text'].strip()


 Flowchart

In [55]:
# Prompt for the answering step: retrieved context is inserted at {context} and the
# model is asked for a concise answer of at most three sentences.
template = "\n".join([
    "Instruction: Use the following pieces of context to answer the question at the end. ",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. ",
    "Output: Use three sentences maximum and keep the answer as concise as possible. ",
    "{context}",
    "Question: {question}",
    "Helpful Answer:",
])
QA_CHAIN_PROMPT2 = PromptTemplate(template=template, input_variables=["context", "question"])

In [56]:

# Chat model for the answering step: temperature 0, with generated tokens streamed
# to stdout through the callback handler.
chat_model2 = ChatOllama(
    model="codellama:34b-instruct",
    temperature=0,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [57]:

# Restrict retrieval to chunks of the detected diagram type.
# The model's reply may differ from the stored DiagramType values in whitespace or
# casing (e.g. " Flowchart"), so map it back onto one of the known types first.
_known_types = {type_name.lower(): type_name for type_name in meta_data_types}
_detected_key = meta_data_type.strip().lower()
metadata_filter = (
    {"DiagramType": _known_types[_detected_key]} if _detected_key in _known_types else None
)

# The filter has to be part of the vector store's search_kwargs: an extra keyword
# argument passed to get_relevant_documents is not forwarded to the Chroma query,
# so it would be silently ignored.
_search_kwargs = dict(retriever.search_kwargs)
if metadata_filter is not None:
    _search_kwargs["filter"] = metadata_filter
# When the reply matches no known type, this falls back to an unfiltered search.
filtered_retriever = db.as_retriever(
    search_type=retriever.search_type,
    search_kwargs=_search_kwargs,
)

docs = filtered_retriever.get_relevant_documents(question)


In [58]:
# Chain: "stuff" QA chain that places the retrieved documents into the prompt's context.
chain = load_qa_chain(llm=chat_model2, chain_type="stuff", prompt=QA_CHAIN_PROMPT2)

# Run the chain on the retrieved documents and keep only the generated answer text.
chain_inputs = {"input_documents": docs, "question": question}
new_output = chain(chain_inputs, return_only_outputs=True)["output_text"]

 To draw a top-down flowchart, you can use the `flowchart TD` diagram type in Mermaid. Here's an example:
```mermaid
flowchart TD
    A[Start] --> B[Do something]
    B --> C[Do something else]
    C --> D[End]
```
This will create a top-down flowchart with four nodes (A, B, C, and D) connected by arrows. The `TD` in the diagram type stands for "top-down," which indicates that the chart should be drawn from top to bottom.