## Libraries

In [None]:
!pip install langchain langchain_core langchain_community langchain-huggingface torch accelerate bitsandbytes docarray unstructured jq

## Requirements

Install the requirements and import relevant modules

In [90]:
import os
import pandas as pd
from pandas import DataFrame
import json
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import BitsAndBytesConfig
import torch
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import JSONLoader

## Environment Variables and Constants

Set the API keys and environment variables required for running the app

In [71]:
# Credentials / API keys. Fill these in before running the notebook; each
# entry overwrites whatever value the variable currently holds.
os.environ.update(
    {
        "HUGGINGFACEHUB_API_TOKEN": "",
        "LANGCHAIN_API_KEY": "",
        "OPENAI_API_KEY": "",
    }
)

# Hugging Face cache locations, i.e. where the models will be downloaded.
# It is recommended to point them at a location which has sufficient
# storage space.
# NOTE(review): these are assigned after the Hugging Face libraries were
# imported in the imports cell -- confirm the libraries still pick them up.
os.environ.update(
    {
        "HF_HUB_CACHE": "",
        "HF_HOME": "",
    }
)

# Prompt template handed to `set_prompt`; `{question}` is its only placeholder.
TEMPLATE = """
You are an assistant for question-answering tasks.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Question: {question}
Answer:
"""

## Utility Functions

Utility functions for operations such as loading data, formatting data, etc.

In [100]:
def load_excel_to_dataframe(file_path: str, header=0, index_col=0, reset_index=True) -> DataFrame:
    """
    Loads an excel sheet into a pandas dataframe.

    Args:
        * file_path (str): The path to the excel file
        * header (int): the row to set as header row
        * index_col: column id(s) to use as the index. Provide a list if
          there are rows with multi level sub rows
        * reset_index (bool): when True (the default) the index built from
          `index_col` is moved back into regular columns. The function
          previously ignored this flag and always reset the index, so the
          default keeps that behaviour; pass False to keep the index.

    Returns:
        A pandas `DataFrame` object with the loaded data
    """

    df = pd.read_excel(file_path, header=header, index_col=index_col)
    if reset_index:
        # turn the (possibly multi level) index back into ordinary columns
        df = df.reset_index()
    return df

def convert_df_to_json(df: DataFrame, save_path=None) -> str:
    """
    Converts a pandas `DataFrame` to a JSON string (one object per row)

    Args:
        * df (DataFrame): A pandas `DataFrame` object with the required data
        * save_path (str): optional path of a file to also save the json
          data to. Nothing is written when it is None or empty
    Returns:
        The JSON text produced by `df.to_json(orient="records")`
    """

    json_data = df.to_json(orient="records")

    if save_path:
        with open(save_path, 'w') as f:
            # `json_data` is already serialized, so `json.dump` stores it as
            # a single JSON string literal (the records end up double-encoded
            # on disk). This is kept as is because the files written here are
            # read back by the notebook's JSONLoader cell.
            # NOTE(review): confirm how that loader expects the file to look
            # before switching to a plain `f.write(json_data)`.
            json.dump(json_data, f)

    return json_data

def set_prompt(template) -> ChatPromptTemplate:
    """
    Build the chat prompt that is used with the model

    Args:
        * template (str): The prompt template text

    Returns:
        The `ChatPromptTemplate` created by `ChatPromptTemplate.from_template`
    """

    return ChatPromptTemplate.from_template(template)

def load_lrm_model_from_hf(model_id="O1-OPEN/OpenO1-LLama-8B-v0.1") -> ChatHuggingFace:
    """
    Loads lrm model from hugging face to a `ChatHuggingFace` model

    The model is loaded through a `HuggingFacePipeline` text-generation
    pipeline configured with 4-bit quantization and wrapped in a
    `ChatHuggingFace` chat model.

    Args:
        model_id (str): the hugging face id of the model. Defaults to the
            model that used to be hard-coded in this function, so calls
            without an argument load the same model as before.

    Returns:
        A `ChatHuggingFace` model
    """

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
    )

    llm = HuggingFacePipeline.from_model_id(
        # honour the caller's choice instead of a fixed model id
        model_id=model_id,
        task="text-generation",
        pipeline_kwargs=dict(
            max_new_tokens=1024,
            do_sample=False,
            repetition_penalty=1.03,
            return_full_text=False,
        ),
        model_kwargs={"quantization_config": quantization_config},
        device_map="auto",
    )
    model = ChatHuggingFace(llm=llm)
    return model

def create_retrievar_from_vector_store(docs: list):
    """
    Build a retriever on top of an in-memory vector store.

    The documents are embedded with the "all-MiniLM-L6-v2" embedding model,
    stored in a `DocArrayInMemorySearch` vector store, and that store is
    exposed as a retriever for fetching relevant documents.

    Args:
        docs (list): A list of `Documents` which need to be embedded.

    Returns:
        The retriever returned by the vector store's `as_retriever()`.
    """

    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    store = DocArrayInMemorySearch.from_documents(docs, embedding_model)
    return store.as_retriever()
    

## RAG Workflow

The RAG workflow starts here.

In [101]:
# Load the excel sheets into dataframes.
# The remediations sheet uses row 2 as its header row and its first five
# columns as a multi level index; the scenarios sheet uses the defaults.
remediations_df = load_excel_to_dataframe("./data/Remediations.xlsx", header=2, index_col=[0,1,2,3,4])
scenarios_df = load_excel_to_dataframe("./data/Scenarios.xlsx")

# Convert the dataframes to json. Passing `save_path` also writes the json
# to disk; the loaders below read these files, so this step must run first.
scenarios_json = convert_df_to_json(scenarios_df, save_path="./data/scen.json")
remediations_json = convert_df_to_json(remediations_df, save_path="./data/rem.json")

# Convert the saved json data to the langchain Document format.
# jq_schema='.' is the jq identity filter, i.e. the whole file content is selected.
loader_scen = JSONLoader(file_path="./data/scen.json",jq_schema='.')
loader_rem = JSONLoader(file_path="./data/rem.json",jq_schema='.')

scenarios_doc = loader_scen.load()
remediations_doc = loader_rem.load()

# Embed the Documents into a vector store and get a retriever for each
# data set (each call builds its own embedding model and vector store).
scenarios = create_retrievar_from_vector_store(scenarios_doc)
remediations = create_retrievar_from_vector_store(remediations_doc)


In [102]:
# Set up the chat prompt from the TEMPLATE constant defined in the
# configuration cell.
prompt = set_prompt(TEMPLATE)



In [None]:
# Bare expression: the notebook displays the prompt object as the cell output.
prompt