In [None]:
pip install langchain faiss-cpu transformers sentence-transformers pdfplumber

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m455.9 kB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl

### a) Procesamiento del Documento
Extraer texto del documento PDF.

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path, page_separator="\n\n"):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path to the PDF file; passed unchanged to ``pdfplumber.open``.
        page_separator: String placed between the texts of consecutive pages.
            Without a separator the last line of one page is fused with the
            first line of the next.

    Returns:
        str: The page texts joined by ``page_separator``. Pages whose
        ``extract_text()`` result is ``None`` or empty are skipped, so they
        neither raise nor leave doubled separators. Returns ``""`` when no
        page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against extract_text() returning None for a page
        # without extractable text (e.g. a scanned image).
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Single join instead of repeated `+=` on a growing string.
    return page_separator.join(text for text in page_texts if text)

# Location of the Food Code 2022 PDF (a relative path)
pdf_path = "Food_Code_2022.pdf"
# Read the whole document into a single string
document_text = extract_text_from_pdf(pdf_path=pdf_path)

### b) División del Texto en Fragmentos
Dividir el texto en fragmentos más pequeños para indexación.

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the text into fragments (about 1000 characters each, with 200 characters of overlap).
# CharacterTextSplitter only cuts at its separator, which defaults to "\n\n". Text extracted
# from a PDF typically has single newlines only, so with the default the whole document would
# stay as one oversized chunk. Splitting on "\n" lets chunk_size / chunk_overlap take effect.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(document_text)

### c) Creación del Índice
Construir un índice utilizando FAISS para la recuperación de fragmentos relevantes.

In [None]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadat

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to vectorise the chunks
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk and build the FAISS index from the result
vector_store = FAISS.from_texts(texts=chunks, embedding=embeddings)

# Persist the index to a local folder
vector_store.save_local(folder_path="food_code_faiss_index")

### d) Búsqueda de Fragmentos Relevantes
Recuperar fragmentos relevantes para una consulta dada.

In [None]:
def retrieve_relevant_chunks(query, vector_store, top_k=5):
    """Look up the entries of ``vector_store`` most similar to ``query``.

    Args:
        query: The search text.
        vector_store: Any object exposing ``similarity_search(query, k=...)``.
        top_k: Number of results to request (default 5).

    Returns:
        Whatever ``vector_store.similarity_search`` returns, unchanged.
    """
    matches = vector_store.similarity_search(query, k=top_k)
    return matches

# Example query
query = "What are the requirements for food temperature control?"
relevant_chunks = retrieve_relevant_chunks(query, vector_store)
# Print each hit, numbered from 1
for rank, chunk in enumerate(relevant_chunks, start=1):
    print(f"Chunk {rank}:\n{chunk.page_content}\n")

Chunk 1:
January 18, 2023 Version
Food Code
2022 Recommendations of the
United States Public Health Service
Food and Drug Administration
The Food Code is a model for safeguarding public health and ensuring food is unadulterated
and honestly presented when offered to the consumer. It represents FDA's best advice for a
uniform system of provisions that address the safety and protection of food offered at retail
and in food service.
This model is offered for adoption by local, state, and federal governmental jurisdictions for
administration by the various departments, agencies, bureaus, divisions, and other units
within each jurisdiction that have been delegated compliance responsibilities for food
service, retail food stores, or food vending operations. Alternatives that offer an equivalent
level of public health protection to ensure that food at retail and foodservice is safe are
recognized in this model.
iPrevious Editions of Codes
Recommended by The
United States Public Health Service

### e) Generación de Respuestas
Usar un modelo generativo para responder preguntas basándonos en los fragmentos recuperados.

In [None]:
from transformers import pipeline

# Generation model
qa_pipeline = pipeline("text2text-generation", model="t5-small")

# Build the context from the retrieved chunks.
context = " ".join(chunk.page_content for chunk in relevant_chunks)
# The question goes first, in the "question: ... context: ..." layout of T5's
# question-answering task. The input is truncated below, and truncation drops
# the END of the prompt, so a trailing question would be cut off.
prompt = f"question: {query} context: {context}"
# truncation=True clips the input to the model's maximum sequence length (512 tokens
# for t5-small). Without it an over-long prompt is passed through unclipped and the
# tokenizer warns that this "will result in indexing errors".
response = qa_pipeline(prompt, max_length=200, num_return_sequences=1, truncation=True)
print("Generated Answer:", response[0]["generated_text"])

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (379334 > 512). Running this sequence through the model will result in indexing errors
