In [1]:
# ===========================
# Importación de librerías
# ===========================
import os  # Read API keys from environment variables

import fitz  # Para manipular archivos PDF
from pinecone import Pinecone, ServerlessSpec  # Para gestión de vectores en Pinecone
from openai import OpenAI  # Cliente OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Para dividir texto en chunks
from langchain.docstore.document import Document  # Para manejar documentos
from pdfminer.high_level import extract_text  # Para extraer texto de PDFs
from sentence_transformers import SentenceTransformer  # Para obtener embeddings con modelos locales

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ===========================
# Configuración y claves API
# ===========================
# Secrets come from environment variables so they are never stored in the notebook.
# An unset variable falls back to "", the value that was previously hard-coded here.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENV = "gcp-starter"
PINECONE_INDEX = "cv-index"  # name of the Pinecone index used by the cells below
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

In [3]:
# ===========================
# Inicialización de clientes
# ===========================

# Pinecone client, used to manage the vector index
pc = Pinecone(api_key=PINECONE_API_KEY)

# OpenAI client, used for API calls
client = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# ===========================
# Configuración del índice en Pinecone
# ===========================

# Create the index only when the account does not list it yet
existing_index_names = pc.list_indexes().names()
if PINECONE_INDEX not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=384,  # vector dimension; determined by the model used to generate the embeddings
        metric='euclidean',  # metric used to compare vectors
        spec=serverless_spec,
    )

# Connect to the index
index = pc.Index(PINECONE_INDEX)

In [5]:
# ===========================
# Función para extraer texto de PDF usando 'fitz'
# ===========================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Note: a later cell of this notebook defines another function with the
    same name (based on pdfminer); once that cell runs it replaces this one.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The concatenated page text with leading and trailing whitespace removed.
    """
    # The context manager closes the document even if a page fails to extract,
    # so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc).strip()


In [6]:
# ===========================
# Función para obtener embeddings usando OpenAI
# ===========================
def get_embedding(texto, model="text-embedding-3-small", dimensions=None):
    """Return the OpenAI embedding of a single text.

    Note: a later cell of this notebook redefines ``get_embedding`` using a
    local SentenceTransformer model; once that cell runs it replaces this one.

    Args:
        texto: Text to embed; sent as a one-element input list.
        model: Name of the OpenAI embedding model.
        dimensions: Optional output vector length. When ``None`` the parameter
            is omitted from the request and the model's default length is
            used. The Pinecone index in this notebook is created with
            ``dimension=384``, so vectors meant for that index must have that
            length (NOTE(review): confirm the chosen model accepts
            ``dimensions``).

    Returns:
        The embedding of ``texto`` as returned in ``response.data[0].embedding``.
    """
    request = {"input": [texto], "model": model}
    # Only send `dimensions` when explicitly requested, so the default call is
    # identical to the original one.
    if dimensions is not None:
        request["dimensions"] = dimensions
    response = client.embeddings.create(**request)
    return response.data[0].embedding

In [7]:
# ===========================
# Función para cargar y extraer texto de PDF con 'pdfminer'
# ===========================
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF with pdfminer's ``extract_text``.

    This definition replaces the earlier fitz-based function of the same name.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the file, unmodified.
    """
    pdf_text = extract_text(file_path)
    return pdf_text

In [8]:
# ===========================
# Función para crear un objeto Document de LangChain
# ===========================
def create_document(text, metadata=None):
    """Wrap a text in a LangChain ``Document``.

    Args:
        text: Content stored as the document's ``page_content``.
        metadata: Optional metadata; a new empty dict is used when ``None``.

    Returns:
        A ``Document`` built from ``text`` and the resolved metadata.
    """
    # A fresh dict per call avoids sharing one mutable default between documents
    resolved_metadata = {} if metadata is None else metadata
    return Document(page_content=text, metadata=resolved_metadata)

In [9]:
# ===========================
# Función para dividir texto en chunks
# ===========================
def chunk_text(document, chunk_size=1000, chunk_overlap=200):
    """Split one document into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        document: The document to split; passed to the splitter as a one-element list.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents([document])


In [10]:
# ===========================
# Uso del proceso completo
# ===========================

# PDF processed by this run
pdf_path = "CV_VictoriaTeran.pdf"

# Extract the full text and wrap it in a Document that records its source file
full_text = extract_text_from_pdf(pdf_path)
doc = create_document(full_text, metadata={"source": pdf_path})

# Split the document into chunks
chunks = chunk_text(doc)

# Preview: a numbered header followed by the first 100 characters of each chunk
for i, chunk in enumerate(chunks):
    print(f"\n--- Chunk {i+1} ---", chunk.page_content[:100], sep="\n")


--- Chunk 1 ---
Victoria Terán

DATA SCIENTIST

Contacto

Experiencia

mvictoriateran@gmail.com

Analista Sr de Ries

--- Chunk 2 ---
Registros  contables  y  procesos  afines  a  la  concilicación

bancaria;  control  y  reposición  


In [11]:
# ===========================
# Obtenemos los embeddings con modelo local
# ===========================

# Load the local embedding model
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_embedding(text):
    """Embed a text with the local ``embedding_model``.

    This definition replaces the earlier OpenAI-based ``get_embedding``.

    Args:
        text: Input passed unchanged to ``embedding_model.encode``.

    Returns:
        The encoded result converted with ``.tolist()``.
    """
    encoded = embedding_model.encode(text)
    return encoded.tolist()

# Plain-text content of every chunk, in chunk order
text_chunks = [piece.page_content for piece in chunks]

# One record per chunk: positional id, embedding values, and the chunk text as metadata
vectors = []
for i, chunk in enumerate(text_chunks):
    vector = dict(
        id=f"chunk-{i}",
        values=get_embedding(chunk),
        metadata=dict(text=chunk),
    )
    vectors.append(vector)

In [12]:
# ===========================
# Se envían los embeddings a Pinecone
# ===========================

# Write every chunk vector to the index, then print the index statistics
index.upsert(vectors=vectors)
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'euclidean',
 'namespaces': {'': {'vector_count': 2}},
 'total_vector_count': 2,
 'vector_type': 'dense'}
