In [18]:
import os
import dotenv
from pathlib import Path

from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import (
    CSVLoader,
    WebBaseLoader, 
    PyPDFLoader, 
    Docx2txtLoader,
)
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI / Anthropic API keys used
# by the clients below come from — confirm against the project's .env.
dotenv.load_dotenv()

True

In [19]:
# Load docs

doc_paths = [
    "docs/data_forecast_results_extended.csv",
    "docs/about_company.pdf",
]

# Supported file extensions and the loader class that reads each of them.
loaders_by_suffix = {
    ".csv": CSVLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
    ".pdf": PyPDFLoader,
}

docs = []
for doc_file in doc_paths:
    file_path = Path(doc_file)

    # Dispatch on the (case-insensitive) extension. The unsupported case is
    # handled outside the try block so it is reported as what it is, rather
    # than surfacing as a generic loading error.
    loader_cls = loaders_by_suffix.get(file_path.suffix.lower())
    if loader_cls is None:
        print(f"Document type {file_path.suffix} not supported.")
        continue

    try:
        loader = loader_cls(file_path)
        docs.extend(loader.load())
    except Exception as e:
        # Best effort: report the failure and keep loading the remaining files.
        print(f"Error loading document {doc_file}: {e}")


In [20]:
# Preview only the first few loaded documents; displaying the whole list
# floods the cell output (one entry per CSV row / PDF page).
docs[:3]

[Document(metadata={'source': 'docs\\data_forecast_results_extended.csv', 'row': 0}, page_content=': 0\nCountry: Angola\nCoffee type: Mixta\n1990/91: 1200000.0\n1991/92: 1800000.0\n1992/93: 2100000.0\n1993/94: 1200000.0\n1994/95: 1500000.0\n1995/96: 600000.0\n1996/97: 1200000.0\n1997/98: 2400000.0\n1998/99: 1800000.0\n1999/00: 1200000.0\n2000/01: 1200000.0\n2001/02: 1200000.0\n2002/03: 1200000.0\n2003/04: 900000.0\n2004/05: 900000.0\n2005/06: 900000.0\n2006/07: 1800000.0\n2007/08: 1800000.0\n2008/09: 1800000.0\n2009/10: 1800000.0\n2010/11: 1800000.0\n2011/12: 1800000.0\n2012/13: 1800000.0\n2013/14: 1800000.0\n2014/15: 1800000.0\n2015/16: 1800000.0\n2016/17: 1800000.0\n2017/18: 1800000.0\n2018/19: 1800000.0\n2019/20: 1800000.0\n2020/21: 1800000.0\n2021/22: 1809221.4466235603\n2022/23: 1809221.4466235603\n2023/24: 1809221.4466235603\n2024/25: 1809221.4466235603\nTotal_domestic_consumption: 55536885.786494255'),
 Document(metadata={'source': 'docs\\data_forecast_results_extended.csv', 'ro

In [21]:
# Split docs

# Chunking parameters, kept together so they are easy to tune.
splitter_settings = {"chunk_size": 5000, "chunk_overlap": 1000}

text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
document_chunks = text_splitter.split_documents(docs)

In [22]:
# Embed the document chunks and load them into the vector store

embedding_model = OpenAIEmbeddings()
vector_db = Chroma.from_documents(documents=document_chunks, embedding=embedding_model)

In [23]:
# Retrieve

def _get_context_retriever_chain(vector_db, llm):
    """Build a history-aware retriever chain on top of ``vector_db``.

    The prompt handed to ``create_history_aware_retriever`` consists of the
    ``messages`` placeholder, the user's ``{input}``, and a final instruction
    asking the model to produce a search query for the conversation.

    Args:
        vector_db: Vector store; its ``as_retriever()`` supplies the retriever.
        llm: Chat model passed to ``create_history_aware_retriever``.

    Returns:
        The chain returned by ``create_history_aware_retriever``.
    """
    base_retriever = vector_db.as_retriever()

    query_prompt_messages = [
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
        ("user", "Given the above conversation, generate a search query to look up in order to get inforamtion relevant to the conversation, focusing on the most recent messages."),
    ]
    query_prompt = ChatPromptTemplate.from_messages(query_prompt_messages)

    return create_history_aware_retriever(llm, base_retriever, query_prompt)

In [24]:
def get_conversational_rag_chain(llm):
    """Assemble the conversational retrieval-augmented generation chain.

    Combines the retriever chain from ``_get_context_retriever_chain`` with a
    stuff-documents chain whose prompt is: a system message (assistant role
    description ending in the ``{context}`` placeholder), the ``messages``
    placeholder, and the user's ``{input}``.

    Note: the vector store is not a parameter; the function reads the
    module-level ``vector_db``, which must be defined before this is called.

    Args:
        llm: Chat model used both for the retriever chain and for the
            stuff-documents chain.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # Relies on the notebook-global `vector_db` built in an earlier cell.
    retriever_chain = _get_context_retriever_chain(vector_db, llm)

    # The system prompt text below is sent to the model verbatim; `{context}`
    # at its end is the only template variable inside it.
    prompt = ChatPromptTemplate.from_messages([
        ("system",
        """You are an expert Data Analytics Assistant for "High Garden Coffee,"
         an international coffee exporter. Your primary role is to empower internal teams (Sales, Marketing, Logistics) 
         with agile, data-driven decision-making by providing interactive access to insights derived from our advanced analytical solutions.
        **Your Core Function:**
        Leverage the outputs from our developed machine learning and analytical models to 
        answer specific business queries related to domestic coffee consumption. 
        These models have analyzed historical data from 1990-2020 across various countries and coffee types.

        **Data and Analytical Capabilities at Your Disposal:**
        *   **Predictive Analysis of Consumption Trends:** You have access to **forecasts for future coffee consumption volumes** for specific coffee types and countries, enabling anticipation of demand changes and optimization of export volumes.
        *   **Supply Chain Optimization & Demand Forecasting:** You can provide **precise demand forecasts and insights into consumption ranges** by coffee type and destination country, aiding in inventory planning and product availability.
        *   **Market Segmentation:** You can describe **characteristics of identified international market segments**, based on consumption patterns, preferred coffee types, and volumes, to personalize export and marketing strategies.
        *   **Innovation & Product Development Opportunities:** You can identify **key insights, emerging niches, or incipient trends** in coffee consumption that suggest new product development needs.

        **Guidelines for Responding:**
        1.  **Accuracy & Data-Driven:** Always provide answers that are **directly supported by the analytical solutions' findings**. Do not hallucinate, speculate, or extrapolate beyond the scope of the pre-computed insights. 
        Your responses must directly reflect the "valor a la compañía" generado por la analítica.
        2.  **Clarity & Conciseness:** Deliver information in a clear, easy-to-understand manner suitable for business teams.
        3.  **Completeness:** Provide comprehensive answers to the user's query, drawing on all relevant information from the models and insights.
        4.  **Unit Consistency:** When referring to consumption volumes, use appropriate units (e.g., kg, tons) as provided by the analytical models.
        5.  **Handling Missing Information:** If a user's query cannot be answered with the available data or insights from the models, **explicitly state that the information is not available** in the current analytical outputs. Do not invent answers.
        6.  **Focus:** Concentrate exclusively on queries related to coffee consumption trends, preferences, volumes, market segments, and potential innovation areas within the context of "High Garden Coffee's" operations.
        7.  **Examples of Supported Queries (as per business case objectives):** You should be able to answer questions similar to:
            *   "¿Cuál es el tipo de café de mayor crecimiento en el mercado brasileño para el próximo año?" (e.g., "What is the fastest-growing coffee type in the German market for the next year?")
            *   "Muéstrame los rangos de consumo de café arábica en Japón." (e.g., "Show me the consumption ranges for Arabica coffee in Japan.")
            *   "Describe the characteristics of Market Segment 3 based on preferred coffee types and volumes."
            *   "Are there any emerging trends in organic coffee consumption in Scandinavian countries based on our data?"

        **Constraint:** Do not engage in general conversation outside the scope of "High Garden Coffee" data analytics. Do not provide opinions, recommendations, or speculate on future events not covered by the predictive models. Maintain a professional, objective, and helpful tone.\n
        {context}"""),
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
    ])
    stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever_chain, stuff_documents_chain)

In [25]:
# Augmented Generation

llm_stream_openai = ChatOpenAI(
    model="gpt-4o",  # Here you could use "o1-preview" or "o1-mini" if you already have access to them
    temperature=0.3,
    streaming=True,
)

llm_stream_anthropic = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    temperature=0.3,
    streaming=True,
)

llm_stream = llm_stream_openai  # Select between OpenAI and Anthropic models for the response

# Conversation so far as plain role/content records; the last entry is the
# question to answer.
messages_raw = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hola! Cómo puedo ayudarte hoy?"},
    {"role": "user", "content": "¿Qué tipo de café comercializamos en Brazil?"},
]

# Convert the records into LangChain message objects; `messages` holds only
# message objects from here on.
messages = [
    HumanMessage(content=m["content"]) if m["role"] == "user" else AIMessage(content=m["content"])
    for m in messages_raw
]

conversation_rag_chain = get_conversational_rag_chain(llm_stream)
response_message = "*(RAG Response)*\n"

# Everything but the last message is passed as history; the last message's
# text is the input. Chunks of the "answer" field are printed as they arrive.
for chunk in conversation_rag_chain.pick("answer").stream({"messages": messages[:-1], "input": messages[-1].content}):
    response_message += chunk
    print(chunk, end="", flush=True)

# Record the reply as an AIMessage so the history stays a homogeneous list of
# message objects (appending a raw dict here would mix types).
messages.append(AIMessage(content=response_message))

En Brasil, "High Garden Coffee" comercializa el tipo de café "Mixta."