In [None]:
!pip install pypdf sentence-transformers faiss-cpu transformers accelerate

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [33]:
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import pipeline

def ler_pdf(path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        str: The text of each page that yielded any text, each followed by
        a newline. Pages whose extraction is empty or ``None`` are skipped.
    """
    reader = PdfReader(path)
    partes = []
    for page in reader.pages:
        # Call extract_text() only once per page and reuse the result.
        conteudo = page.extract_text()
        if conteudo:
            partes.append(conteudo + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(partes)

# Split into chunks
def dividir_texto(texto, tamanho=300):
    """Split text into consecutive chunks of at most ``tamanho`` words.

    Words are obtained with ``str.split()`` (any whitespace run is a
    separator) and re-joined with single spaces, so original spacing and
    line breaks are not preserved.

    Args:
        texto: The text to split.
        tamanho: Maximum number of words per chunk.

    Returns:
        list[str]: The chunks in order; the last one may be shorter.
        An empty or whitespace-only text yields an empty list.
    """
    tokens = texto.split()
    pedacos = []
    for inicio in range(0, len(tokens), tamanho):
        janela = tokens[inicio:inicio + tamanho]
        pedacos.append(" ".join(janela))
    return pedacos

# Build embeddings + FAISS index
def criar_index(chunks, modelo_embed="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Embed the chunks and store them in a FAISS inner-product index.

    The embeddings are L2-normalized before being added, so the inner
    product computed by ``IndexFlatIP`` equals cosine similarity.

    Args:
        chunks: Text chunks to index.
        modelo_embed: Name of the SentenceTransformer model to load.

    Returns:
        tuple: ``(index, embedder, embeddings)`` - the populated FAISS index,
        the loaded embedding model, and the normalized embedding matrix.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message, before loading the model.
    if not chunks:
        raise ValueError("chunks is empty: nothing to index")

    embedder = SentenceTransformer(modelo_embed)
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS operates on C-contiguous float32 arrays; this is a no-op when the
    # encoder already returns that layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)  # normalizes in place
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index, embedder, embeddings

# Retrieve context
def buscar_contexto(pergunta, index, embedder, chunks, top_k=3):
    """Return the chunks most similar to the question.

    Args:
        pergunta: The question text.
        index: Index exposing ``search(vectors, k)``, as built by ``criar_index``.
        embedder: Model exposing ``encode``, used to embed the question.
        chunks: The chunk list, in the same order used to build the index.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks, in the order returned by the index.
    """
    emb_pergunta = embedder.encode([pergunta], convert_to_numpy=True)
    faiss.normalize_L2(emb_pergunta)
    _scores, ids = index.search(emb_pergunta, top_k)
    # FAISS pads the result with id -1 when fewer than top_k vectors exist.
    # Without this filter, chunks[-1] would silently return the last chunk.
    return [chunks[int(i)] for i in ids[0] if i >= 0]

# Local QA: question-answering pipeline, created once at module level and
# reused by responder() on every call.
qa_pipeline = pipeline("question-answering", model="timpal0l/mdeberta-v3-base-squad2")

def responder(pergunta, index, embedder, chunks):
    """Answer a question using the retrieved chunks as context.

    The chunks returned by ``buscar_contexto`` are joined with newlines and
    passed, together with the question, to the module-level ``qa_pipeline``.

    Args:
        pergunta: The question text.
        index: Search index forwarded to ``buscar_contexto``.
        embedder: Embedding model forwarded to ``buscar_contexto``.
        chunks: Chunk list forwarded to ``buscar_contexto``.

    Returns:
        The ``"answer"`` field of the pipeline result.
    """
    contexto = "\n".join(buscar_contexto(pergunta, index, embedder, chunks))
    # Use the keyword form: passing a positional dict is the deprecated
    # calling convention for the question-answering pipeline.
    resposta = qa_pipeline(question=pergunta, context=contexto)
    return resposta["answer"]

# -------------------------------
# Driver: read the PDF, split it into 300-word chunks and build the index.
# The path is relative, so the PDF must be in the kernel's working directory.
texto = ler_pdf("doencas_respiratorias_cronicas.pdf")
chunks = dividir_texto(texto, tamanho=300)
index, embedder, embeddings = criar_index(chunks)

# Question: ask it against the indexed document and print the answer.
pergunta = "Como a asma é classificada de acordo com a gravidade?"
print("Resposta:", responder(pergunta, index, embedder, chunks))


Device set to use cpu


Resposta:  intermitente e persistente
