In [1]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from typing import List, Dict, Any
import json
from dotenv import load_dotenv

load_dotenv()

# Now you can access the OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Set the API key in the environment (if required by the library)
os.environ["OPENAI_API_KEY"] = openai_api_key

In [2]:
def load_processed_files(directory: str) -> List[Document]:
    if not os.path.exists(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    documents = []
    files = [f for f in os.listdir(directory) if f.endswith(".vcon.json")]

    print(f"Found {len(files)} VCon JSON files in directory")

    for filename in files:
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                vcon_data = json.load(f)

            processed_docs = process_vcon_data(vcon_data, file_path)
            if processed_docs:
                documents.extend(processed_docs)
                # print(f"Successfully processed {filename}")
            else:
                print(f"No valid content found in {filename}")

        except json.JSONDecodeError as e:
            print(f"Invalid JSON in file {filename}: {e}")
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    print(f"Successfully processed {len(documents)} documents")
    return documents


from langchain.schema import Document

def process_vcon_data(vcon_data: Dict[str, Any], file_path: str) -> List[Document]:
    """
    Processes a VCon JSON structure and extracts individual messages as separate documents.
    """
    documents = []

    # Extract base metadata (flatten structure)
    base_metadata = {
        "source": file_path,
        "uuid": vcon_data.get("uuid", ""),
        "created_at": vcon_data.get("created_at", ""),
        "updated_at": vcon_data.get("updated_at", ""),
    }

    # Process analysis sections for individual messages
    for analysis in vcon_data.get("analysis", []):
        if analysis.get("type") == "transcript":
            transcript_body = analysis.get("body", [])

            for entry in transcript_body:
                if isinstance(entry, dict):
                    speaker = entry.get("speaker", "")
                    message = entry.get("message", "")

                    if message:
                        # Attach metadata to each message
                        message_metadata = {
                            **base_metadata,
                            "speaker": speaker,
                            "speaker_role": (
                                "agent" if speaker == "Agent" else "customer"
                            ),
                            "analysis_type": "transcript",
                        }

                        # Add the message as a separate document
                        documents.append(
                            Document(page_content=message, metadata=message_metadata)
                        )

    return documents


def create_vectorstore(documents: List[Document]):
    """
    Creates a Chroma vectorstore from the processed message-level documents.
    
    Args:
        documents: List of processed Document objects (where each document represents one message).
    
    Returns:
        Chroma retriever object
    """
    if not documents:
        raise ValueError("No valid documents to create embeddings for.")

    try:
        # Create vectorstore using each message as a document
        vectorstore = Chroma.from_documents(
            documents=documents, embedding=OpenAIEmbeddings()
        )
        return vectorstore.as_retriever()
    except Exception as e:
        print(f"Error creating vectorstore: {e}")
        raise


def search_documents(question: str, directory: str) -> List[Dict[str, Any]]:
    """
    Searches for relevant messages and returns metadata and the first line of the message.
    """
    # Load and process documents (messages)
    processed_docs = load_processed_files(directory)

    if not processed_docs:
        print("No valid documents found to search through")
        return []

    # Create vectorstore and retrieve relevant messages
    try:
        retriever = create_vectorstore(processed_docs)
        relevant_docs = retriever.get_relevant_documents(question, n_results=5)
        uuids = [f'../Conversations/vCon/{doc.metadata['uuid']}.vcon.json' for doc in relevant_docs]
        return uuids

        

        # Return simplified format with just UUID, speaker, and content
        # return [
        #     {
        #         "uuid": doc.metadata["uuid"],
        #         "speaker": doc.metadata["speaker"],
        #         "content": doc.page_content,  # Get full message content
        #     }
        #     for doc in relevant_docs
        # ]
    except Exception as e:
        print(f"Error during search: {e}")
        raise


def search_documents_with_llm(question: str, directory: str):
    processed_docs = load_processed_files(directory)
    print("Processing docs done!")
    retriever = create_vectorstore(processed_docs)
    

    llm = ChatOpenAI(model_name="gpt-3.5-turbo")

    relevant_docs = retriever.get_relevant_documents(question, n_results=5)

    combined_docs = "\n\n".join([doc.page_content for doc in relevant_docs])
    refined_question = f"Here are the most relevant sections from the documents. Please answer the question: {question}\n\n{combined_docs}"

    response = llm.generate(refined_question)

    return response


In [5]:
directory = "../Conversations/vCon"
question = "staff"
processed_docs = load_processed_files(directory)
retriever = create_vectorstore(processed_docs)
relevant_docs = retriever.get_relevant_documents(question, n_results=4)
# combined_docs = "\n\n".join([doc.page_content for doc in relevant_docs])

Found 88 VCon JSON files in directory
Successfully processed 927 documents


In [6]:
relevant_docs

[Document(metadata={'analysis_type': 'transcript', 'created_at': '2024-05-07T17:00:56.102095', 'source': '../Conversations/vCon/36b78788-6296-49ec-bbcd-0bfc3b6f686e.vcon.json', 'speaker': 'Customer', 'speaker_role': 'customer', 'updated_at': '2024-05-07T17:00:56.102095', 'uuid': '36b78788-6296-49ec-bbcd-0bfc3b6f686e'}, page_content='Hello, this is Martha Roberts.'),
 Document(metadata={'analysis_type': 'transcript', 'created_at': '2024-05-07T17:00:56.102095', 'source': '../Conversations/vCon/36b78788-6296-49ec-bbcd-0bfc3b6f686e.vcon.json', 'speaker': 'Customer', 'speaker_role': 'customer', 'updated_at': '2024-05-07T17:00:56.102095', 'uuid': '36b78788-6296-49ec-bbcd-0bfc3b6f686e'}, page_content='Hello, this is Martha Roberts.'),
 Document(metadata={'analysis_type': 'transcript', 'created_at': '2024-05-07T17:04:31.760217', 'source': '../Conversations/vCon/0dc1e73a-38f6-4760-8b36-f80b7ad888e0.vcon.json', 'speaker': 'Customer', 'speaker_role': 'customer', 'updated_at': '2024-05-07T17:04:31

In [9]:

uuids

['../Conversations/vCon/95a81d06-5a12-4654-8736-5ab79258ca29.vcon.json',
 '../Conversations/vCon/07a1cf5c-c68c-4955-9271-40e1c7f4470e.vcon.json',
 '../Conversations/vCon/07a1cf5c-c68c-4955-9271-40e1c7f4470e.vcon.json',
 '../Conversations/vCon/95a81d06-5a12-4654-8736-5ab79258ca29.vcon.json']

In [10]:
relevant_convos = []
for file in uuids:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)        
    relevant_convos.append([file,data['analysis'][0]['body']])


In [11]:
relevant_convos
#abcdef12-3456-7890-abcd-ef1234567890

[['../Conversations/vCon/95a81d06-5a12-4654-8736-5ab79258ca29.vcon.json',
  [{'speaker': 'Agent',
    'message': 'Hello, thank you for contacting CleanStays! My name is Joe Miller. May I have your name, please?'},
   {'speaker': 'Customer', 'message': "Hi, I'm Joan Peterson."},
   {'speaker': 'Agent',
    'message': 'Nice to meet you, Joan! How are you doing today?'},
   {'speaker': 'Customer', 'message': "I'm feeling happy, thank you."},
   {'speaker': 'Agent',
    'message': "That's wonderful to hear! To assist you better, could you please provide me with your room number?"},
   {'speaker': 'Customer', 'message': "I'm staying in room two zero four."},
   {'speaker': 'Agent',
    'message': 'Great! And for security purposes, can you confirm the last four digits of the credit card used for booking your reservation?'},
   {'speaker': 'Customer',
    'message': 'The last four digits are eight seven six five.'},
   {'speaker': 'Agent',
    'message': "Thank you for verifying that, Joan. I

In [3]:
question = "late order"
directory = "../Conversations/vCon"
relevant_documents = search_documents_with_llm(question, directory)

        # Convert the response to a dictionary
response_content = {"documents": relevant_documents.generations[0][0].text}

Found 88 VCon JSON files in directory
Successfully processed ffba043b-d1aa-4691-8739-ac3ddd030594.vcon.json
Successfully processed 56700e4c-42cf-465f-ae8e-3cebec95934e.vcon.json
Successfully processed 4f710435-9ae8-44c7-9fb6-9a2a65793c43.vcon.json
Successfully processed 545bbb07-cabb-4d29-98a5-548d11fa8ae1.vcon.json
Successfully processed 7606114a-c79b-43a3-aed8-35b979412d80.vcon.json
Successfully processed 7d0f3cf0-85f0-4813-b2af-c9160e5518ad.vcon.json
Successfully processed 7c2c8801-5b7c-4f17-886e-a12e2278e7e4.vcon.json
Successfully processed 2ed79657-f80e-45b0-ac5f-9a09ab909f64.vcon.json
Successfully processed f6e8b7c6-21c8-4a1a-8c6c-654c5429ff41.vcon.json
Successfully processed f18af675-a755-4c2c-b57a-aec0b97151d9.vcon.json
Successfully processed 1deaa521-91a4-42a6-89b4-c4e5d2569ee0.vcon.json
Successfully processed 90859417-605e-48ea-b017-abbb2c509d34.vcon.json
Successfully processed 8115d39e-fb64-481c-9694-9369d9f2aef0.vcon.json
Successfully processed 7fe499cc-99ed-43f3-b025-bb0bc

  relevant_docs = retriever.get_relevant_documents(question, n_results=5)


KeyboardInterrupt: 