## Set up credentials

gcloud config set billing/quota_project PROJECT_ID  
gcloud auth application-default login

## Set up required packages

In [None]:
%pip install -q --upgrade langchain-text-splitters langchain-community langgraph

In [None]:
import os
import getpass

# Set the LangSmith tracing flag for this kernel session.
os.environ['LANGSMITH_TRACING'] = "true"
# Prompt explicitly so it is clear which secret is being requested
# (a bare getpass() only shows a generic "Password: " prompt).
os.environ['LANGSMITH_API_KEY'] = getpass.getpass("Enter LangSmith api key: ")

In [None]:
# Read the project id without echoing it, then export it for the session.
project_id = getpass.getpass("Enter project id: ")
os.environ["GOOGLE_PROJECT"] = project_id

In [None]:
# Export the API key for the session; the value goes straight into the
# environment so it is never bound to a notebook variable.
os.environ.update(GOOGLE_API_KEY=getpass.getpass("Enter api key: "))

## Setting up RAG

### Set up LLM

In [None]:
%pip install -q --upgrade "langchain[google-genai]"

In [None]:
import getpass
import os

from langchain.chat_models import init_chat_model
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages.base import BaseMessage


# Chat model used for answer generation; the provider is named explicitly.
llm = init_chat_model(
    model='gemini-2.0-flash',
    model_provider="google_genai",
)

### Set up model for embedding

Vectorize the input for efficient retrieval.

In [None]:
%pip install -q --upgrade "langchain-google-vertexai"

In [None]:
import vertexai
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model handed to the vector store.
EMBEDDING_MODEL = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

### Setting up vector store

This prototype uses an in-memory vector store.

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory store backed by the embedding model configured above.
vector_store = InMemoryVectorStore(embedding=embeddings)

## Indexing

### Loader

Scrape the table from the HTML page and convert each row into `Document` objects.

In [None]:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
from langchain_core.documents import Document

TRUSTED_SOURCE = 'https://nutritionsource.hsph.harvard.edu/vitamins/'
REQUEST_TIMEOUT_SECONDS = 30

# Fail fast on network trouble or an HTTP error instead of hanging
# indefinitely or silently parsing an error page.
resp = requests.get(TRUSTED_SOURCE, timeout=REQUEST_TIMEOUT_SECONDS)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

# Column positions inside each table row.
NAME_IDX = 0
AMOUNT_MEN_IDX = 1
AMOUNT_WOMEN_IDX = 2
LIMIT_IDX = 3

# Rows whose first cell starts with one of these prefixes are skipped.
# NOTE(review): 'Vitamin' also matches any data row whose name begins with
# "Vitamin" - confirm that skipping those rows is intended.
SKIPPED_ROW_PREFIXES = ('Vitamin', 'Mineral', 'Women', 'Men', '*')

table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {TRUSTED_SOURCE}")

docs = []

for tr in table.find_all('tr'):
    row = tr.find_all('td')

    # Rows that lack any of the expected columns cannot be indexed.
    if len(row) <= LIMIT_IDX:
        continue
    if row[NAME_IDX].text.startswith(SKIPPED_ROW_PREFIXES):
        continue

    # Keep the link target as a plain absolute URL string (not a bs4 Tag, and
    # never None) so the metadata stays serializable; rows without a link
    # fall back to the page they were scraped from.
    link = tr.find('a')
    href = link.get('href') if link is not None else None
    source = urljoin(TRUSTED_SOURCE, href) if href else TRUSTED_SOURCE

    # One document per gender, men first, sharing the same name and upper limit.
    for gender, amount_idx in (("men", AMOUNT_MEN_IDX), ("women", AMOUNT_WOMEN_IDX)):
        docs.append(
            Document(
                page_content=f"common_names={row[NAME_IDX].text} ; daily_dietry_amount={row[amount_idx].text} ; upper_limit={row[LIMIT_IDX].text} ; gender={gender}",
                metadata={"source": source}))

print(docs)

### Splitting

No splitting required: each document is already a single table row.

### Storing

In [None]:
# Index the scraped documents and keep whatever add_documents returns.
# NOTE(review): re-running this cell presumably indexes the same documents
# a second time - confirm before relying on repeated runs.
document_ids = vector_store.add_documents(docs)

## Try it out!

In [None]:
import json

# Line items whose description starts with this prefix are left out of the
# food list ('レジ袋' is a shopping bag, not food).
SKIPPED_ITEM_PREFIX = 'レジ袋'

items = []

# json.load accepts a binary stream and detects the UTF encoding itself.
with open('P_20250525_212034.jpg.json', 'rb') as fd:
    j = json.load(fd)

# The file only needs to stay open while parsing; walk the data afterwards.
receipt_fields = j['analyzeResult']['documents'][0]['fields']

for item in receipt_fields.get('Items', {}).get('valueArray', []):
    # Skip line items that carry no description instead of raising KeyError.
    description = item.get('valueObject', {}).get('Description', {}).get('valueString')
    if not description:
        continue
    if description.startswith(SKIPPED_ITEM_PREFIX):
        continue
    items.append(description)

In [None]:
# Instruction first, then a blank line, then one food item per line.
instruction = "Given the food list below, show me the amount of vitamin and minerals included and does it meet the daily requirement."
food_list = '\n'.join(items)
question = '\n\n'.join([instruction, food_list])

print(question)

In [None]:
from langchain import hub

# Pull the RAG prompt template from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to preview the resulting messages.
placeholder_inputs = {"context": "(context goes here)", "question": "(question goes here)"}
example_messages = prompt.invoke(placeholder_inputs).to_messages()

# The preview below indexes the first message, so at least one must exist.
assert len(example_messages) > 0

print(example_messages[0].content)

In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict

class State(TypedDict):
    """State dictionary read and updated by the `retrieve` and `generate` steps."""

    # The question text; read by both steps.
    question: str
    # Documents returned by the similarity search in `retrieve`.
    context: List[Document]
    # Content of the model response produced in `generate`.
    answer: str

def retrieve(state: State):
    """Search the vector store for documents similar to the question.

    Returns a partial state update with the matches under ``"context"``.
    """
    matches = vector_store.similarity_search(state["question"])
    return dict(context=matches)

def generate(state: State):
    """Answer the question using the retrieved documents as prompt context.

    Reads ``state["question"]`` and ``state["context"]`` and returns a
    partial state update with the model's reply under ``"answer"``.
    """
    # Give the prompt the documents' text joined by blank lines, not the list
    # of Document objects (whose repr would otherwise be pasted in).
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

### The control flow

In [None]:
from langgraph.graph import START, StateGraph

# Build the two-step pipeline: retrieve feeds generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
# Enter the graph at the retrieval step.
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
graph_png = graph.get_graph().draw_mermaid_png()
display(Image(graph_png))

In [None]:
# Run the graph with the question as its only initial state entry.
initial_state = {"question": question}
result = graph.invoke(initial_state)

In [None]:
# Show the retrieved documents first, then the generated answer.
print('Context:', result["context"])
print('Answer:', result["answer"])
