# In [None]:
import uuid
from PyPDF2 import PdfReader
import openai
import pinecone

"""

GenAI assistant for documenting a business process.

The documentation serves multiple purposes, including ensuring transparency, facilitating communication, and providing a reference for auditing, improvement, or compliance purposes. 

Here's how an expert auditor might document a business process:

Describe Each Step:
For each step in the process, provide a detailed description. Include information such as:

Activity Name: Clearly label each step.
Inputs: Identify the inputs required for each activity.
Outputs: Specify the expected outputs produced by each activity.

Create a Process Flowchart:
Develop a visual representation of the process flowchart. Use dash symbols to represent activities, decision points, inputs, outputs, and flow direction. The flowchart provides a high-level overview of the process and helps in understanding the sequence of steps.


Here is a POC of how a GenAI application can assist in documenting a business process:

Text Extraction from PDF:
There's a function (extract_text_from_pdf) that takes a PDF file path as input and extracts text from each page, cleaning and concatenating it into a single string.

Text Summarization:
Another function (summarize_process_data) uses the OpenAI API to generate a human-readable summary of the given text using the "davinci" engine.

Saving Summaries to Files:
The script has a function (save_summary_to_file) that saves the generated summary to a text file.

Processing PDFs and Summarizing Data:
A function (load_pinecone_with_process_summary) loads a PDF file, extracts text, generates a summary, and saves the summary to multiple files.

Text Chunking and Embedding:
There are functions (split_text_into_chunks and create_embedding) that break down text into chunks and create embeddings using the OpenAI Embedding API.

Loading Data into Pinecone:
The script includes a function (load_pinecone_with_process_data) that loads text data into Pinecone by converting chunks into embeddings and indexing them.

Querying Pinecone for Similar Documents:
There's a function (rag_query_pinecone) that queries Pinecone for similar documents based on a given text input.

Processing Queries Using ChatCompletion:
Functions (process_query_with_docs and process_query) use ChatCompletion from the OpenAI API to process queries and generate responses.

Develop a visual representation of the process flowchart. 
Use dash symbols to represent activities, decision points, inputs, outputs, and flow direction. The flowchart provides a high-level overview of the process and helps in understanding the sequence of steps.




"""


# Name of the Pinecone index; used as the default ``index_name`` argument of
# the loader and query helpers in this file.
# NOTE: the identifier is spelled "PIINECONE" (double "I"); other definitions
# in this file reference that exact spelling, so do not rename it in isolation.
PIINECONE_INDEX = "your_index_name"  # Replace with your actual Pinecone index name
# API key handed to ``pinecone.init``. The value below is a placeholder.
PINECONE_API_KEY = "your_pinecone_api_key"  # Replace with your actual Pinecone API key

def extract_text_from_pdf(file_path):
    """Read a PDF and return its text as a single ASCII-only string.

    The document metadata is printed as a side effect. Every page's extracted
    text has its non-ASCII characters dropped, and the pages are concatenated
    in order with no separator between them.

    Args:
        file_path: Path passed to ``PdfReader``.

    Returns:
        The concatenated page text with leading and trailing whitespace
        removed.
    """
    pdf = PdfReader(file_path)
    print(pdf.metadata)

    def to_ascii(raw_text):
        # Characters that cannot be encoded as ASCII are silently discarded.
        return raw_text.encode("ascii", "ignore").decode()

    cleaned_pages = [to_ascii(page.extract_text()) for page in pdf.pages]
    return "".join(cleaned_pages).strip()

def summarize_process_data(process_data):
    """Ask the OpenAI Completion endpoint for a prose summary of a text.

    Args:
        process_data: Text to summarize; it is interpolated into the prompt
            verbatim.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    summary_prompt = f'Summarize the key process topics from given text in human readable paragraphs: {process_data}'
    completion = openai.Completion.create(
        engine="davinci",
        prompt=summary_prompt,
        max_tokens=500,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

def save_summary_to_file(summary, state, page_num):
    """Write ``summary`` to ``apps/chats/data/raw_data/{state}_{page_num}.txt``.

    The path is relative to the current working directory. Missing parent
    directories are created, and the file is written as UTF-8 so the output
    does not depend on the platform's default encoding. An existing file with
    the same name is overwritten.

    Args:
        summary: Text to write.
        state: First component of the file name.
        page_num: Second component of the file name.
    """
    # Local import keeps this function self-contained.
    import os

    file_txt = f"apps/chats/data/raw_data/{state}_{page_num}.txt"
    # open(..., 'w') does not create directories, so make sure they exist.
    os.makedirs(os.path.dirname(file_txt), exist_ok=True)
    with open(file_txt, 'w', encoding="utf-8") as f:
        f.write(summary)

def load_pinecone_with_process_summary(process_summary_doc_file_name, state, num_pages=10):
    """Summarize a PDF and write the summary to numbered text files.

    The PDF text is extracted once and summarized once; the same summary is
    then saved ``num_pages`` times under the file numbers
    ``0 .. num_pages - 1``. Despite the function name, nothing in this
    function talks to Pinecone.

    Args:
        process_summary_doc_file_name: Path of the PDF to read.
        state: Passed to ``save_summary_to_file`` as the file-name prefix.
        num_pages: How many numbered files to write. Defaults to 10, the value
            that used to be hard-coded.
    """
    full_text = extract_text_from_pdf(process_summary_doc_file_name)
    process_data_summary = summarize_process_data(full_text)

    for next_page_num in range(num_pages):
        save_summary_to_file(process_data_summary, state, next_page_num)

def split_text_into_chunks(text, chunk_size=200):
    """Split ``text`` into chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace is a
    separator and is normalised to a single space inside each chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. The original code read
            an undefined ``chunk_size`` name and raised ``NameError``; it is
            now a parameter with a default of 200 words.

    Returns:
        A list of chunk strings in document order; empty when ``text`` holds
        no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def create_embedding(chunk, model="text-embedding-ada-002"):
    """Return the embedding vector for ``chunk`` from the OpenAI Embedding API.

    Args:
        chunk: The input passed to ``openai.Embedding.create``.
        model: Embedding model name. Defaults to the model that used to be
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=chunk, model=model)
    return response["data"][0]["embedding"]


def load_pinecone_with_process_data(process_name, model="text-embedding-ada-002", index_name=PIINECONE_INDEX):
    """Embed the text of a process PDF and upsert it into a Pinecone index.

    Reads ``./process_history_data/{process_name}.pdf``, splits its text into
    chunks, embeds each chunk and upserts one vector per chunk under a fresh
    random id.

    Args:
        process_name: Base name (without ``.pdf``) of the file to load.
        model: Embedding model used for every chunk. It was previously
            accepted but never used.
        index_name: Target Pinecone index. It was previously overwritten with
            the module default, so a caller-supplied value had no effect.
    """
    file_pdf = f"./process_history_data/{process_name}.pdf"
    process_data = extract_text_from_pdf(file_pdf)
    text_chunks = split_text_into_chunks(process_data)

    pinecone.init(api_key=PINECONE_API_KEY, environment="us-east1-gcp")
    pinecone_index = pinecone.Index(index_name=index_name)

    for chunk in text_chunks:
        # Named vector_id rather than id to avoid shadowing the builtin.
        vector_id = uuid.uuid4().hex
        embedding_response = openai.Embedding.create(input=chunk, model=model)
        embedding = embedding_response["data"][0]["embedding"]
        embedding_sample_doc = [
            {
                "id": vector_id,
                "values": embedding,
                "metadata": {
                    "id": vector_id,
                    "text": chunk,
                    # The query helper in this file reads metadata["source"],
                    # so record where the chunk came from.
                    "source": file_pdf,
                },
            }
        ]
        pinecone_index.upsert(embedding_sample_doc)

def rag_query_pinecone(text, model="text-embedding-ada-002", index_name=PIINECONE_INDEX, top_k=2, score_threshold=0.8):
    """Return context text for ``text`` from its closest Pinecone matches.

    The query text is embedded with the OpenAI Embedding API, the index is
    queried for the ``top_k`` nearest vectors, and every match scoring at
    least ``score_threshold`` is rendered as a
    ``source: ...`` / ``text: ...`` paragraph.

    Args:
        text: Query text to embed.
        model: Embedding model name.
        index_name: Pinecone index to query.
        top_k: Number of matches requested from Pinecone.
        score_threshold: Matches scoring below this value are dropped.
            Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        The concatenated paragraphs, or an empty string when no match passes
        the threshold.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment="us-east1-gcp")
    index = pinecone.Index(index_name)

    # The original called an undefined get_embedding(); embed the query the
    # same way the rest of this file creates embeddings.
    embedding_response = openai.Embedding.create(input=text, model=model)
    query_vector = embedding_response["data"][0]["embedding"]

    # NOTE(review): the `queries=` / `includeMetadata=` keywords and the
    # 'results' response layout depend on the installed pinecone client
    # version; confirm against the version in use.
    response = index.query(queries=[query_vector], top_k=top_k, includeMetadata=True)

    snippets = []
    for query_result in response['results']:
        for match in query_result['matches']:
            if match['score'] < score_threshold:
                continue
            metadata = match['metadata']
            # Vectors stored without a 'source' entry would otherwise raise
            # KeyError here.
            source = metadata.get('source', 'unknown')
            snippets.append(f"source: {source}\ntext: {metadata['text']}\n\n")

    return ''.join(snippets)

def process_query_with_docs(process_summary, openai_api_key):
    """Generate step-by-step documentation for a process summary with GPT-4.

    Two ChatCompletion calls are made. The first asks for step headings drawn
    with dash boxes. The second asks for the full documentation, with those
    headings appended to the system prompt and the summary sent as the user
    message.

    The original body appended to ``process_prompt`` before assigning it and
    sent an undefined ``text_message``, so it could never run. Both are now
    defined from ``process_summary``.

    Args:
        process_summary: Summary of the process to document; converted with
            ``str()`` before being sent.
        openai_api_key: Kept for interface compatibility; this function does
            not read it. Configure ``openai.api_key`` before calling.

    Returns:
        The content of the final ChatCompletion message. It is also printed,
        as before.
    """
    text_message = str(process_summary)

    # Generate step headings with dash boxes using ChatCompletion
    step_headings_prompt = f'Generate step headings with dash boxes for the given process steps: {process_summary}'
    step_headings_response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {'role': 'system', 'content': step_headings_prompt},
            {'role': 'user', 'content': ''}
        ],
        temperature=0
    )
    step_headings = step_headings_response["choices"][0]["message"]["content"]

    # Base instruction for the documentation pass, followed by the generated
    # headings.
    process_prompt = (
        'as an expert auditor document the business process summarized in the '
        'user message: for each step give the activity name, inputs and '
        'outputs, then draw a process flowchart using dash symbols'
    )
    process_prompt += f"\n\nGenerated Step Headings:\n\n{step_headings}"

    # Make the final ChatCompletion call
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {'role': 'system', 'content': process_prompt},
            {'role': 'user', 'content': text_message}
        ],
        temperature=0
    )

    prompt_response = response["choices"][0]["message"]['content']
    print(prompt_response)
    return prompt_response

def _steps_to_text(mysteps):
    """Render ``mysteps`` as plain text, one step per line when it is a steps dict."""
    if isinstance(mysteps, dict) and "process_steps" in mysteps:
        return "\n".join(str(step) for step in mysteps["process_steps"])
    return str(mysteps)


def process_query(mysteps, openai_api_key):
    """Ask GPT-4 to elaborate process steps, enriched with Pinecone context.

    The steps are rendered as text, used to retrieve similar stored chunks
    through ``rag_query_pinecone``, and then sent to ChatCompletion together
    with that retrieved context.

    The original body sent an undefined ``text_message`` and dropped the
    model's answer; the steps text is now the user message and the answer is
    returned.

    Args:
        mysteps: Either a dict with a ``"process_steps"`` list or any value
            that can be rendered with ``str()`` (for example a previous
            summary).
        openai_api_key: Kept for interface compatibility; this function does
            not read it. Configure ``openai.api_key`` before calling.

    Returns:
        The content of the ChatCompletion response message.
    """
    # The embedding call needs text, not a dict.
    steps_text = _steps_to_text(mysteps)

    rag_search_pinecone_response = rag_query_pinecone(steps_text)
    print('rag_search_pinecone_response ..............................')
    print(rag_search_pinecone_response)

    process_prompt = f'as an expert auditor describe the details of the steps needed to successfully implement the process in the following step by step process: {steps_text} and use the following process expert data to enhance each current step: {rag_search_pinecone_response} '

    response = openai.ChatCompletion.create(model="gpt-4", messages=[
        {'role': 'system', 'content': process_prompt},
        {'role': 'user', 'content': steps_text}
    ], temperature=0)

    return response["choices"][0]["message"]['content']

def main():
    """Run the two demo processes through ``process_query`` and print the results.

    Each process is handled in two passes: the raw step list is sent first,
    then the resulting summary is sent again to produce the step-by-step
    documentation.

    Fixes over the original: the unterminated "Pair programming" string
    literal is closed, the prints refer to the variables that are actually
    assigned, and the value passed as ``openai_api_key`` is the configured
    OpenAI key rather than the Pinecone key.
    """
    # Uses the key already configured on the openai module (may be None).
    openai_api_key = openai.api_key

    ############################################################################################################################################
    #    Bank Deposit Operations Process
    ############################################################################################################################################

    mysteps = {
        "process_steps": [
            "Step 1: Customer Initiation of Bank Deposit Request of $AMOUNT",
            "Step 2: Document Verification",
            "Step 3: Amount Verification",
            "Step 4: Fee Assessment",
            "Step 5: Fraud Detection",
            "Step 6: Customer Identification",
            "Step 7: Transaction Recording",
            "Step 8: Endorsement Verification",
            "Step 9: Cash Handling Procedures",
            "Step 10: Error Detection",
            "Step 11: Record Retention",
            "Step 12: Reporting",
            "Step 13: Audit Trail Documentation",
            "Step 14: Compliance Check",
            "Step 15: Continuous Monitoring",
        ]
    }

    bank_process_summary = process_query(mysteps, openai_api_key)
    print(bank_process_summary)
    bank_process_step_by_step_documentation = process_query(bank_process_summary, openai_api_key)
    print(bank_process_step_by_step_documentation)

    ############################################################################################################################################
    #    Agile Software Development Process
    ############################################################################################################################################

    mysteps_for_dev = {
        "process_steps": [
            "Step 1: Continuous integration",
            "Step 2: Automated unit testing",
            "Step 3: Pair programming",
            "Step 4: Test-driven development",
            "Step 5: Design patterns",
        ]
    }

    development_process_summary = process_query(mysteps_for_dev, openai_api_key)
    print(development_process_summary)
    development_process_step_by_step_documentation = process_query(development_process_summary, openai_api_key)
    print(development_process_step_by_step_documentation)


if __name__ == "__main__":
    main()