In [1]:
pip install -U sentence-transformers torch



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:

# Importações chunk
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Importações embedding
from typing import List, Tuple, Optional
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def split_text_with_overlap(file_path: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """
    Read a UTF-8 text file and split its content into overlapping character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one.

    Args:
        file_path (str): Path of the text file to read.
        chunk_size (int): Maximum length of each chunk, in characters. Must be > 0.
        overlap (int): Number of characters shared by consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in file order (the last one may be shorter than
        ``chunk_size``). An empty file yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``overlap`` is negative,
            or ``overlap`` is not smaller than ``chunk_size``.
    """
    # Reject sizes that would produce empty chunks or silently skip text.
    if chunk_size <= 0:
        raise ValueError("O valor de chunk_size deve ser maior que zero")
    if overlap < 0:
        raise ValueError("O valor de overlap não pode ser negativo")
    if overlap >= chunk_size:
        raise ValueError("O valor de overlap deve ser menor que o chunk_size")

    # Read the whole file into memory
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    step = chunk_size - overlap  # distance between consecutive chunk starts
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would only
        # repeat characters already contained in this one, so stop here.
        if end >= len(text):
            break

    return chunks

In [4]:
def embed_chunks(
    chunks: List[str],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    batch_size: int = 64,
    device: Optional[str] = None,
    normalize: bool = True,
) -> Tuple[np.ndarray, List[str]]:
    """
    Encode every chunk into an embedding vector with Sentence-Transformers.

    Args:
        chunks (List[str]): Texts (chunks) to encode.
        model_name (str): Model name passed to ``SentenceTransformer``
            (e.g. 'all-MiniLM-L6-v2').
        batch_size (int): Batch size forwarded to ``model.encode``.
        device (str | None): 'cuda', 'mps', 'cpu', or None to auto-detect.
        normalize (bool): Forwarded as ``normalize_embeddings``; when True the
            returned embeddings are L2-normalised.

    Returns:
        Tuple[np.ndarray, List[str]]:
            - embeddings: (N x D) matrix holding one vector per chunk.
            - chunk_ids: one ID string per row, used to map rows back to chunks.
        For an empty ``chunks`` the result is an empty (0 x 0) array and an
        empty list, and no model is loaded.
    """
    # Nothing to encode: return before loading any model.
    if not chunks:
        return np.empty((0, 0)), []

    # Auto-detect the device when the caller did not choose one:
    # CUDA first, then Apple Silicon (MPS), otherwise CPU.
    target_device = device
    if target_device is None:
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend and torch.backends.mps.is_available():
                target_device = "mps"
            else:
                target_device = "cpu"

    encoder = SentenceTransformer(model_name, device=target_device)

    # encode() performs the batching itself
    vectors = encoder.encode(
        chunks,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=normalize,
        show_progress_bar=True,
    )

    # Simple zero-padded IDs so every row can be traced back to its chunk
    row_ids = ["chunk_{:04d}".format(position) for position, _ in enumerate(chunks)]
    return vectors, row_ids


In [5]:
# Run the pipeline: split the knowledge base into chunks, then embed each chunk.
file_path = "knowledge_base.txt"  # replace with the path to your own .txt file
chunks = split_text_with_overlap(file_path, 500, 50)
embeddings, chunk_ids = embed_chunks(chunks)
(embeddings.shape, len(chunk_ids))

Batches: 100%|██████████| 2/2 [00:00<00:00,  2.34it/s]


((98, 384), 98)

In [None]:
# ---- PCA: project the embeddings onto 3 principal components ----
# NOTE(review): the original comment said this ran "before t-SNE", but no t-SNE
# call is visible in this notebook; the 3 components are used for the 3D plot.
# random_state is fixed to match the PCA fitted later in the notebook.
pca = PCA(n_components=3, random_state=42)
embeddings_pca = pca.fit_transform(embeddings)
embeddings_pca.shape

(98, 3)

In [52]:
# ---- Build the DataFrame consumed by Plotly ----
import pandas as pd
df = pd.DataFrame({
    "PC1": embeddings_pca[:, 0],
    "PC2": embeddings_pca[:, 1],
    "PC3": embeddings_pca[:, 2],
    "id": chunk_ids
})

# ---- Interactive 3D visualisation ----
fig = px.scatter_3d(
    df, x="PC1", y="PC2", z="PC3",
    # hover_name shows the ID only when hovering a point; text= would instead
    # draw a permanent label on every marker and clutter the plot.
    hover_name="id",
    opacity=0.7,
    title="Visualização de Embeddings em 3D (PCA + Plotly)"
)

fig.update_traces(marker=dict(size=4))
fig.update_layout(
    width=700,
    height=700,
    scene=dict(
        aspectmode="cube"  # forces a 1:1:1 aspect ratio across x, y, z
    )
)
fig.show()

In [8]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
chunks[:3]


['Advanced Knowledge Base: Interpreting EI Mass Spectra of Steroids and Organic Compounds Introduction Electron Ionization (EI) mass spectrometry produces a complex "fingerprint" spectrum for organic molecules, rich in fragment ions. Interpreting these spectra requires understanding the m/z values (mass-to-charge, typically equal to molecular mass for singly-charged ions), identifying the molecular ion and base peak, and analyzing relative intensities of fragment ions. In this guide, we focus on E',
 'ies of fragment ions. In this guide, we focus on EI spectra of steroids (e.g. cholesterol, testosterone, progesterone) as representative complex organic compounds. We will outline common fragmentation pathways and characteristic ions, explain structural features influencing fragmentation, and present step-by-step strategies for elucidating structures from spectra. Several worked examples of steroid mass spectra are provided with annotated fragments and reasoning. Finally, we offer guideli

In [9]:
# Tabulate each chunk ID next to its text, then preview the first rows.
df_chunks = pd.DataFrame(dict(id=chunk_ids, chunk=chunks))
df_chunks.head()

Unnamed: 0,id,chunk
0,chunk_0000,Advanced Knowledge Base: Interpreting EI Mass ...
1,chunk_0001,"ies of fragment ions. In this guide, we focus ..."
2,chunk_0002,"ts and reasoning. Finally, we offer guidelines..."
3,chunk_0003,s presence and intensity vary: some compounds ...
4,chunk_0004,"oids have no nitrogen, so an even-mass M⁺· is ..."


In [10]:
# Persist the (id, chunk) table; index=False keeps the row index out of the file.
df_chunks.to_csv(path_or_buf="chunks.csv", index=False)

In [11]:
import numpy as np
from typing import List, Tuple

def cosine_similarity_matrix(a: np.ndarray, b: np.ndarray) -> float:
    """
    Compute the cosine similarity between two vectors.

    Despite its name, this function works on a single pair of vectors and
    returns one scalar, not a matrix.

    Args:
        a (np.ndarray): First vector.
        b (np.ndarray): Second vector, same length as ``a``.

    Returns:
        float: ``dot(a, b) / (||a|| * ||b||)``, or 0.0 when either vector has
        zero norm (the cosine is undefined there; 0.0 avoids a NaN result).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm vector would cause a division by zero and produce NaN.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator


def compare_chunk_similarity(
    embeddings: np.ndarray,
    chunks: List[str],
    reference: int | str,
    chunk_ids: List[str] | None = None,
    top_k: int = 5,
) -> List[Tuple[str, float]]:
    """
    Calcula similaridade do cosseno entre um chunk de referência e todos os demais.

    Args:
        embeddings (np.ndarray): Matriz (N x D) de embeddings.
        chunks (List[str]): Lista dos textos originais (chunks).
        reference (int | str): Índice ou ID do chunk de referência.
        chunk_ids (List[str] | None): IDs dos chunks (se None, usa índices).
        top_k (int): Quantos resultados mais semelhantes retornar.

    Returns:
        List[Tuple[str, float]]: Lista com (id, score) ordenada pela similaridade.
    """
    if chunk_ids is None:
        chunk_ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Descobre índice do chunk de referência
    if isinstance(reference, str):
        try:
            ref_idx = chunk_ids.index(reference)
        except ValueError:
            raise ValueError(f"ID '{reference}' não encontrado em chunk_ids")
    else:
        ref_idx = reference

    ref_vec = embeddings[ref_idx]
    results = []

    for i, vec in enumerate(embeddings):
        if i == ref_idx:
            continue
        score = cosine_similarity_matrix(ref_vec, vec)
        results.append((chunk_ids[i], score))

    # Ordena do mais parecido para o menos parecido
    results = sorted(results, key=lambda x: x[1], reverse=True)

    return results[:top_k]


In [12]:
# Usage example: rank the other chunks against chunk 0.
# `reference` accepts a row index (0) or one of the strings in `chunk_ids`
# (the IDs passed here are zero-padded, e.g. "chunk_0000").
# NOTE(review): `embeddings_pca` is the 3-component PCA projection, so these
# scores are cosine similarities in the reduced space, not the full embeddings.
similares = compare_chunk_similarity(
    embeddings_pca,
    chunks,
    reference=0,
    top_k=5,
    chunk_ids=chunk_ids,
)

print("Mais semelhantes ao chunk_0:")
for chunk_id, similarity in similares:
    print(f"{chunk_id}: {similarity:.4f}")

Mais semelhantes ao chunk_0:
chunk_0095: 0.9778
chunk_0042: 0.9747
chunk_0006: 0.9551
chunk_0062: 0.9487
chunk_0035: 0.9202


In [13]:
import pandas as pd
from typing import List, Tuple

def get_chunks_from_csv(
    similar_chunks: List[Tuple[str, float]],
    csv_path: str = "chunks.csv"
) -> List[Tuple[str, str, float]]:
    """
    Look up the text of each chunk ID in a CSV file with columns (id, chunk).

    Args:
        similar_chunks (List[Tuple[str, float]]): (id, score) pairs of the
            chunks to look up.
        csv_path (str): Path of the CSV file (columns: id, chunk).

    Returns:
        List[Tuple[str, str, float]]: (id, text, score) tuples in the same
        order as ``similar_chunks``. IDs missing from the CSV get the
        placeholder text "[ID não encontrado]".
    """
    # Read every cell verbatim as a string: with pandas' default parsing a chunk
    # whose text is empty or looks like a missing-value marker (e.g. "NA",
    # "null") would come back as float NaN instead of text.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    # id -> text mapping for fast lookup
    chunk_dict = dict(zip(df["id"], df["chunk"]))

    return [
        (cid, chunk_dict.get(cid, "[ID não encontrado]"), score)
        for cid, score in similar_chunks
    ]

In [14]:
# Fetch the text of each similar chunk from the saved CSV and print it with its score.
resultados = get_chunks_from_csv(similares, "chunks.csv")

for chunk_id, chunk_text, similarity in resultados:
    print(f"\n=== {chunk_id} (score={similarity:.4f}) ===\n{chunk_text}\n")


=== chunk_0095 (score=0.9778) ===
 . While EI alone may not definitively identify stereochemistry, it gives hints that can be confirmed with other methods. By mastering these principles and patterns, one develops an intuition for mass spectral interpretation that is crucial for advanced organic analysis, whether it's identifying an unknown metabolite or confirming the structure of a synthetic steroid. With practice, the "language" of fragment ions becomes an indispensable tool for the organic chemist or mass spectrometrist. Refe


=== chunk_0042 (score=0.9747) ===
, but these represent the most diagnostically useful ions.) Step-by-Step Strategy for EI Spectrum Elucidation Interpreting a complex EI spectrum can be approached systematically. Below is a step-by-step strategy, applied especially to steroid spectra: Step 1: Identify the Molecular Ion (M⁺·) – Look at the highest m/z values in the spectrum. If a peak stands out near the expected molecular weight (based on formula or known MW

### Similaridade de cosseno entre vetores com base na query

In [42]:
import numpy as np
from typing import List, Tuple, Optional
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

def search_similar_chunks_pca(
    query: str,
    embeddings: np.ndarray,          # may be the original (N, D) matrix or an already-reduced one
    chunks: List[str],
    model: SentenceTransformer,
    chunk_ids: Optional[List[str]] = None,
    top_k: int = 5,
    pca_model: Optional[PCA] = None, # pass the SAME PCA used to reduce the embeddings
    embeddings_already_reduced: bool = True
) -> List[Tuple[str, str, float]]:
    """
    Compare a query with chunk embeddings (optionally PCA-reduced) and return the closest chunks.

    If the embeddings were reduced with PCA (e.g. to 3-D), pass the same fitted
    ``pca_model`` so the query is projected into the same space.

    Args:
        query: Query text.
        embeddings: (N x D) matrix; D may be the original or the reduced dimension.
        chunks: Chunk texts, aligned with the rows of ``embeddings``.
        model: Object whose ``encode`` method embeds the query.
        chunk_ids: Optional chunk IDs; defaults to ``chunk_0000``, ``chunk_0001``, ...
        top_k: Maximum number of results; values <= 0 yield an empty list.
        pca_model: The SAME fitted PCA that produced ``embeddings`` (required
            when ``embeddings_already_reduced`` is True).
        embeddings_already_reduced: True if ``embeddings`` already live in PCA space.

    Returns:
        List of (id, chunk, score) sorted by decreasing cosine similarity.
        Empty when ``embeddings`` is empty or ``top_k`` <= 0.

    Raises:
        ValueError: If ``embeddings_already_reduced`` is True but no
            ``pca_model`` was given, or if the embedding matrix is not 2-D or
            its width differs from the query vector's dimension.
    """
    if chunk_ids is None:
        chunk_ids = [f"chunk_{i:04d}" for i in range(len(chunks))]

    # Fail fast, before spending time encoding the query.
    if embeddings_already_reduced and pca_model is None:
        raise ValueError("Você marcou embeddings_already_reduced=True, mas não passou pca_model.")

    embeddings = np.asarray(embeddings)
    # Nothing to rank. A negative top_k would otherwise slice off the *worst*
    # results instead of returning none.
    if top_k <= 0 or embeddings.size == 0:
        return []

    # 1) Embed the query in the model's original space (normalised later)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=False)[0]

    # 2) Project the query with the SAME PCA (transform, never fit_transform)
    if embeddings_already_reduced:
        q = pca_model.transform(q.reshape(1, -1))[0]

    # Give a readable error instead of an opaque matmul shape failure.
    if embeddings.ndim != 2 or embeddings.shape[1] != q.shape[0]:
        raise ValueError(
            f"Dimensões incompatíveis: embeddings {embeddings.shape} vs query {q.shape}"
        )

    # 3) L2-normalise both sides so the dot product is the cosine similarity;
    #    the small epsilon guards against division by zero for all-zero vectors.
    E = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)  # (N, D)
    q = q / (np.linalg.norm(q) + 1e-12)                                           # (D,)

    # 4) Vectorised cosine similarity
    scores = E @ q             # (N,)

    # 5) Sort by decreasing score and return the best top_k
    order = np.argsort(-scores)[:top_k]
    return [(chunk_ids[i], chunks[i], float(scores[i])) for i in order]

In [50]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# 1) Full-dimensional chunk embeddings straight from the model (e.g. 384 dims), not normalised
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings_full = model.encode(chunks, convert_to_numpy=True, normalize_embeddings=False)

# 2) Fit a 3-component PCA on those embeddings (e.g. for visualisation) and project them
pca = PCA(n_components=3, random_state=42)
pca.fit(embeddings_full)
embeddings_pca = pca.transform(embeddings_full)

# 3) Search
# NOTE(review): input() makes this cell interactive; it waits for the user to type a query.
query = input("Digite sua pergunta: ")
resultados = search_similar_chunks_pca(
    query,
    embeddings_pca,                      # already reduced
    chunks,
    model,
    top_k=3,
    chunk_ids=chunk_ids,                 # optional
    embeddings_already_reduced=True,
    pca_model=pca,                       # the SAME PCA fitted above
)

for chunk_id, chunk_text, similarity in resultados:
    print(f"\nID: {chunk_id} | score={similarity:.4f}\n{chunk_text}")



ID: chunk_0083 | score=0.9434
 rings) can show differences in fragmentation. A famous example is 5α-androstan-3-one vs 5β-androstan-3-one; the cis (5β) junction steroid tends to have slightly different [M–CH_3] to [M–H₂O] ratios than the trans (5α) isomer russchemrev.org russchemrev.org . Stereochemical effects in fragmentation are subtle, but certain cleavage may be sterically favored in one isomer. In the literature, differences in the intensity of peaks due to ions formed by ejection of a methyl group were noted between a

ID: chunk_0023 | score=0.8434
tc. High-resolution or MS/MS experiments (beyond scope here) can connect which fragments derive from which precursors link.springer.com , but even in a basic EI spectrum, seeing logical combinations of losses (e.g. M–15, M–18, M–33 (15+18), M–42, M–60 (42+18), etc.) helps confirm which functional groups are present and fragmenting. Characteristic Fragment Ions in Steroids Steroid molecules share a tetracyclic carbon skeleton, and the