# 02 - Gera√ß√£o de Embeddings

Este notebook gera todos os embeddings (TF-IDF+SVD, SBERT, GTE, BGE) e os salva em cache.


In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

# Adicionar src ao path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

from src.config import (
    EMBEDDINGS_DIR, TFIDF_CONFIG, SVD_CONFIG, EMBEDDING_MODELS,
    EMBEDDING_BATCH_SIZE, EMBEDDING_DEVICE, RANDOM_STATE
)
from src.utils import save_embedding, load_embedding

# Recarregar m√≥dulo config para garantir que est√° usando a vers√£o mais recente
import importlib
import src.config
importlib.reload(src.config)
from src.config import EMBEDDING_MODELS

# Verificar disponibilidade de GPU
if torch.cuda.is_available():
    print(f"‚úÖ GPU detectada: {torch.cuda.get_device_name(0)}")
    print(f"   Mem√≥ria total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("‚ö†Ô∏è GPU n√£o detectada. Usando CPU.")

# Verificar modelos carregados
print(f"\nüìã Modelos configurados:")
print(f"   SBERT: {EMBEDDING_MODELS['sbert']}")
print(f"   GTE: {EMBEDDING_MODELS['gte']}")
print(f"   BGE: {EMBEDDING_MODELS['bge']}")


‚úÖ GPU detectada: NVIDIA GeForce RTX 3060 Ti
   Mem√≥ria total: 8.00 GB

üìã Modelos configurados:
   SBERT: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
   GTE: thenlper/gte-base
   BGE: BAAI/bge-m3


In [2]:
# Login no Hugging Face usando vari√°veis de ambiente do arquivo .env
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Carregar vari√°veis de ambiente do arquivo .env
load_dotenv()

# Obter token do arquivo .env
hf_token = os.getenv('HUGGING_FACE_TOKEN')

if hf_token:
    print("üîë Token do Hugging Face encontrado no arquivo .env")
    login(token=hf_token)
    print("‚úÖ Login realizado com sucesso!")
else:
    print("‚ö†Ô∏è Token HUGGING_FACE_TOKEN n√£o encontrado no arquivo .env")
    print("üí° Adicione seu token no arquivo .env:")
    print("   HUGGING_FACE_TOKEN=seu_token_aqui")
    print("\n   Ou fa√ßa login manualmente:")
    print("   login()  # Descomente esta linha e execute")


üîë Token do Hugging Face encontrado no arquivo .env
‚úÖ Login realizado com sucesso!


## 1. Carregar Dados Preparados

Execute o notebook 01_data_prep.ipynb primeiro para ter os dados dispon√≠veis!


In [3]:
# Carregar dados (recarrega automaticamente se necess√°rio)
from sklearn.datasets import fetch_20newsgroups
from src.config import RAW_DATA_DIR, TWENTY_NG_CATEGORIES, PT6_CLASS_COLUMN_CANDIDATES
from src.utils import detect_class_column

# Verificar se os DataFrames j√° est√£o dispon√≠veis
if 'df_20ng' not in globals() or 'df_pt6' not in globals():
    print("üì• Carregando dados...")
    
    # Carregar 20NG-6
    print("   Carregando 20NG-6...")
    newsgroups = fetch_20newsgroups(
        subset='all',
        categories=TWENTY_NG_CATEGORIES,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=42
    )
    
    df_20ng = pd.DataFrame({
        'text': newsgroups.data,
        'class': newsgroups.target,
        'class_name': [newsgroups.target_names[i] for i in newsgroups.target]
    })
    print(f"   ‚úÖ 20NG-6: {len(df_20ng)} documentos")
    
    # Carregar PT-6 pr√©-processado
    print("   Carregando PT-6 pr√©-processado...")
    pt6_preprocessed_file = RAW_DATA_DIR / "pt6_preprocessed.csv"
    
    if pt6_preprocessed_file.exists():
        df_pt6 = pd.read_csv(pt6_preprocessed_file, encoding='utf-8-sig')
        class_col = detect_class_column(df_pt6, PT6_CLASS_COLUMN_CANDIDATES)
        
        # Verifica√ß√£o de seguran√ßa: garantir que n√£o h√° NaN na coluna de texto
        text_col_check = 'Texto Expandido' if 'Texto Expandido' in df_pt6.columns else 'Texto Original'
        nan_count = df_pt6[text_col_check].isna().sum()
        if nan_count > 0:
            print(f"   ‚ö†Ô∏è Encontrados {nan_count} valores NaN. Removendo...")
            df_pt6 = df_pt6[df_pt6[text_col_check].notna()].reset_index(drop=True)
            print(f"   ‚úÖ Ap√≥s limpeza: {len(df_pt6)} documentos")
        
        print(f"   ‚úÖ PT-6: {len(df_pt6)} documentos (coluna: {class_col})")
    else:
        raise FileNotFoundError(
            f"Arquivo pr√©-processado n√£o encontrado: {pt6_preprocessed_file}\\n"
            "Execute o notebook 01_data_prep.ipynb primeiro para gerar o CSV pr√©-processado!"
        )
    
    print("‚úÖ Todos os dados foram carregados!")
else:
    print("‚úÖ Dados j√° est√£o dispon√≠veis na sess√£o!")
    # Usar globals().get() para evitar avisos do linter
    df_20ng_check = globals().get('df_20ng')
    df_pt6_check = globals().get('df_pt6')
    if df_20ng_check is not None:
        print(f"   20NG-6: {len(df_20ng_check)} documentos")
    if df_pt6_check is not None:
        print(f"   PT-6: {len(df_pt6_check)} documentos")

# Garantir que as vari√°veis est√£o no escopo local
if 'df_20ng' not in locals():
    df_20ng = globals().get('df_20ng')
if 'df_pt6' not in locals():
    df_pt6 = globals().get('df_pt6')

# Definir coluna de texto para PT-6 (usar 'Texto Expandido' se dispon√≠vel)
if df_pt6 is not None:
    text_col_pt6 = 'Texto Expandido' if 'Texto Expandido' in df_pt6.columns else 'Texto Original'
    print(f"\\nüìù Coluna de texto usada para PT-6: '{text_col_pt6}'")
else:
    raise RuntimeError("df_pt6 n√£o est√° dispon√≠vel. Execute a c√©lula anterior primeiro.")


üì• Carregando dados...
   Carregando 20NG-6...
   ‚úÖ 20NG-6: 5906 documentos
   Carregando PT-6 pr√©-processado...
   ‚úÖ PT-6: 315 documentos (coluna: Categoria)
‚úÖ Todos os dados foram carregados!
\nüìù Coluna de texto usada para PT-6: 'Texto Expandido'


## 2. TF-IDF + SVD (Baseline Lexical)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def build_tfidf_svd_embeddings(texts, dataset_name):
    """Gera embeddings TF-IDF + SVD."""
    # Verificar cache
    cached = load_embedding(dataset_name, 'tfidf_svd', EMBEDDINGS_DIR)
    if cached is not None:
        print(f"‚úÖ Embedding TF-IDF+SVD j√° existe para {dataset_name}")
        return cached
    
    # TF-IDF
    print(f"\\nüî® Gerando TF-IDF para {dataset_name}...")
    vectorizer = TfidfVectorizer(**TFIDF_CONFIG)
    tfidf_matrix = vectorizer.fit_transform(texts)
    print(f"   TF-IDF shape: {tfidf_matrix.shape}")
    
    # SVD
    print(f"   Aplicando SVD para reduzir a {SVD_CONFIG['n_components']} dimens√µes...")
    svd = TruncatedSVD(**SVD_CONFIG)
    embeddings = svd.fit_transform(tfidf_matrix)
    print(f"   ‚úÖ Embeddings finais shape: {embeddings.shape}")
    
    # Salvar
    save_embedding(embeddings, dataset_name, 'tfidf_svd', EMBEDDINGS_DIR)
    return embeddings

# Garantir que text_col_pt6 est√° definida
if 'text_col_pt6' not in globals():
    text_col_pt6 = 'Texto Expandido' if 'Texto Expandido' in df_pt6.columns else 'Texto Original'
    print(f"üìù Coluna de texto para PT-6: '{text_col_pt6}'")

# Gerar para 20NG-6
print("=" * 60)
print("GERANDO EMBEDDINGS TF-IDF+SVD")
print("=" * 60)
embeddings_20ng_tfidf = build_tfidf_svd_embeddings(df_20ng['text'].tolist(), '20ng6')

# Gerar para PT-6 (dados j√° est√£o pr√©-processados)
# Verifica√ß√£o final antes de gerar embeddings
texts_pt6 = df_pt6[text_col_pt6].tolist()
# Garantir que n√£o h√° NaN
texts_pt6 = [str(t).strip() for t in texts_pt6 if pd.notna(t) and str(t).strip() != '' and str(t) != 'nan']
print(f"üìä PT-6: {len(texts_pt6)} textos v√°lidos para processar")
embeddings_pt6_tfidf = build_tfidf_svd_embeddings(texts_pt6, 'pt6')


## 3. SBERT (Sentence Transformer)


In [None]:
from sentence_transformers import SentenceTransformer
import torch

def build_sbert_embeddings(texts, dataset_name):
    """Gera embeddings SBERT."""
    # Verificar cache
    cached = load_embedding(dataset_name, 'sbert', EMBEDDINGS_DIR)
    if cached is not None:
        print(f"‚úÖ Embedding SBERT j√° existe para {dataset_name}")
        return cached
    
    print(f"\\nüî® Carregando modelo SBERT para {dataset_name}...")
    model = SentenceTransformer(EMBEDDING_MODELS['sbert'])
    
    # Usar GPU se dispon√≠vel
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Aumentar batch_size para GPU (RTX 3060 Ti tem 8GB VRAM)
    batch_size = 128 if device == 'cuda' else EMBEDDING_BATCH_SIZE
    print(f"   Gerando embeddings (batch_size={batch_size}, device={device})...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        device=device,
        convert_to_numpy=True
    )
    
    print(f"   ‚úÖ Embeddings shape: {embeddings.shape}")
    
    # Salvar
    save_embedding(embeddings, dataset_name, 'sbert', EMBEDDINGS_DIR)
    return embeddings

# Gerar para 20NG-6
print("=" * 60)
print("GERANDO EMBEDDINGS SBERT")
print("=" * 60)
embeddings_20ng_sbert = build_sbert_embeddings(df_20ng['text'].tolist(), '20ng6')

# Gerar para PT-6 (dados j√° est√£o pr√©-processados)
embeddings_pt6_sbert = build_sbert_embeddings(df_pt6[text_col_pt6].tolist(), 'pt6')


## 4. GTE (General Text Embeddings)


In [None]:
def build_gte_embeddings(texts, dataset_name):
    """Gera embeddings GTE."""
    # Verificar cache
    cached = load_embedding(dataset_name, 'gte', EMBEDDINGS_DIR)
    if cached is not None:
        print(f"‚úÖ Embedding GTE j√° existe para {dataset_name}")
        return cached
    
    print(f"\\nüî® Carregando modelo GTE para {dataset_name}...")
    model = SentenceTransformer(EMBEDDING_MODELS['gte'])
    
    # Usar GPU se dispon√≠vel
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Aumentar batch_size para GPU (RTX 3060 Ti tem 8GB VRAM)
    batch_size = 128 if device == 'cuda' else EMBEDDING_BATCH_SIZE
    print(f"   Gerando embeddings (batch_size={batch_size}, device={device})...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        device=device,
        convert_to_numpy=True
    )
    
    print(f"   ‚úÖ Embeddings shape: {embeddings.shape}")
    
    # Salvar
    save_embedding(embeddings, dataset_name, 'gte', EMBEDDINGS_DIR)
    return embeddings

# Gerar para 20NG-6
print("=" * 60)
print("GERANDO EMBEDDINGS GTE")
print("=" * 60)
embeddings_20ng_gte = build_gte_embeddings(df_20ng['text'].tolist(), '20ng6')

# Gerar para PT-6 (dados j√° est√£o pr√©-processados)
embeddings_pt6_gte = build_gte_embeddings(df_pt6[text_col_pt6].tolist(), 'pt6')


## 5. BGE (BAAI General Embedding)


In [8]:
def build_bge_embeddings(texts, dataset_name):
    """Gera embeddings BGE usando apenas safetensors."""
    from sentence_transformers import SentenceTransformer
    from transformers import AutoModel, AutoTokenizer
    from sentence_transformers import models
    import os
    from pathlib import Path
    
    # Verificar cache
    cached = load_embedding(dataset_name, 'bge', EMBEDDINGS_DIR)
    if cached is not None:
        print(f"‚úÖ Embedding BGE j√° existe para {dataset_name}")
        return cached
    
    print(f"\\nüî® Carregando modelo BGE para {dataset_name}...")
    print("   üì• Baixando APENAS safetensors (evitando .bin)...")
    
    cache_dir = Path.home() / '.cache' / 'huggingface' / 'hub'
    bge_cache = cache_dir / 'models--BAAI--bge-m3'
    
    # DELETAR cache completo do BGE para come√ßar limpo
    if bge_cache.exists():
        import shutil
        try:
            shutil.rmtree(bge_cache)
            print("   üóëÔ∏è Cache BGE removido (ser√° baixado apenas safetensors)")
        except Exception as e:
            print(f"   ‚ö†Ô∏è N√£o foi poss√≠vel remover cache completo: {e}")
            # Tentar remover apenas .bin
            for bin_file in bge_cache.rglob('pytorch_model.bin'):
                try:
                    bin_file.unlink()
                except:
                    pass
    
    # Configurar ambiente
    os.environ['SAFETENSORS_FAST_GPU'] = '1'
    os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
    
    model_name = EMBEDDING_MODELS['bge']
    
    # Usar snapshot_download com ignore_patterns para N√ÉO baixar .bin
    from huggingface_hub import snapshot_download
    
    # Baixar modelo ignorando .bin files
    try:
        model_path = snapshot_download(
            repo_id=model_name,
            ignore_patterns=["*.bin", "pytorch_model.bin"],
            cache_dir=str(cache_dir)
        )
        print("   ‚úÖ Modelo baixado (apenas safetensors)")
    except Exception as e:
        print(f"   ‚ö†Ô∏è Erro: {e}")
        # Continuar mesmo assim
    
    # Carregar usando transformers - vai usar apenas safetensors (sem .bin dispon√≠vel)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_safetensors=True,
        local_files_only=False
    )
    
    model_auto = AutoModel.from_pretrained(
        model_name,
        use_safetensors=True,
        trust_remote_code=True,
        local_files_only=False
    )
    
    # Verificar e remover .bin se foi baixado (n√£o deveria, mas por seguran√ßa)
    if bge_cache.exists():
        for bin_file in bge_cache.rglob('pytorch_model.bin'):
            try:
                bin_file.unlink()
                print(f"   üóëÔ∏è Removido {bin_file.name} (n√£o deveria existir)")
            except:
                pass
    
    # Criar SentenceTransformer usando o modelo carregado
    word_embedding = models.Transformer(
        model_name,
        max_seq_length=512,
        model_args={'use_safetensors': True}
    )
    # Substituir o modelo interno pelo que carregamos
    word_embedding.auto_model = model_auto
    word_embedding.tokenizer = tokenizer
    
    pooling = models.Pooling(
        word_embedding.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True
    )
    
    model = SentenceTransformer(modules=[word_embedding, pooling])
    
    # Usar GPU se dispon√≠vel
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Aumentar batch_size para GPU (RTX 3060 Ti tem 8GB VRAM)
    batch_size = 128 if device == 'cuda' else EMBEDDING_BATCH_SIZE
    print(f"   Gerando embeddings (batch_size={batch_size}, device={device})...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        device=device,
        convert_to_numpy=True
    )
    
    print(f"   ‚úÖ Embeddings shape: {embeddings.shape}")
    
    # Salvar
    save_embedding(embeddings, dataset_name, 'bge', EMBEDDINGS_DIR)
    return embeddings

# Gerar para 20NG-6
print("=" * 60)
print("GERANDO EMBEDDINGS BGE")
print("=" * 60)
embeddings_20ng_bge = build_bge_embeddings(df_20ng['text'].tolist(), '20ng6')

# Gerar para PT-6 (dados j√° est√£o pr√©-processados)
embeddings_pt6_bge = build_bge_embeddings(df_pt6[text_col_pt6].tolist(), 'pt6')


GERANDO EMBEDDINGS BGE
Embedding n√£o encontrado: C:\nlp-clustering-benchmark\data\embeddings\20ng6_bge.npy
\nüî® Carregando modelo BGE para 20ng6...
   üì• Baixando APENAS safetensors (evitando .bin)...
   üóëÔ∏è Cache BGE removido (ser√° baixado apenas safetensors)


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

   ‚úÖ Modelo baixado (apenas safetensors)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

   Gerando embeddings (batch_size=128, device=cuda)...


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

   ‚úÖ Embeddings shape: (5906, 1024)
Embedding salvo em: C:\nlp-clustering-benchmark\data\embeddings\20ng6_bge.npy
Embedding n√£o encontrado: C:\nlp-clustering-benchmark\data\embeddings\pt6_bge.npy
\nüî® Carregando modelo BGE para pt6...
   üì• Baixando APENAS safetensors (evitando .bin)...
   üóëÔ∏è Cache BGE removido (ser√° baixado apenas safetensors)


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

   ‚úÖ Modelo baixado (apenas safetensors)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

   Gerando embeddings (batch_size=128, device=cuda)...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

   ‚úÖ Embeddings shape: (315, 1024)
Embedding salvo em: C:\nlp-clustering-benchmark\data\embeddings\pt6_bge.npy


## 6. Resumo e Verifica√ß√£o

Verificar se todos os embeddings foram gerados e salvos corretamente.


In [9]:
import os

print("=" * 60)
print("RESUMO DOS EMBEDDINGS GERADOS")
print("=" * 60)

datasets = ['20ng6', 'pt6']
embedding_types = ['tfidf_svd', 'sbert', 'gte', 'bge']

for dataset in datasets:
    print(f"\nüìä {dataset.upper()}:")
    for emb_type in embedding_types:
        filename = f"{dataset}_{emb_type}.npy"
        filepath = EMBEDDINGS_DIR / filename
        if filepath.exists():
            emb = np.load(filepath)
            print(f"   ‚úÖ {emb_type:12s} - Shape: {emb.shape}")
        else:
            print(f"   ‚ùå {emb_type:12s} - N√£o encontrado")

print("\n" + "=" * 60)
print("‚úÖ Todos os embeddings foram gerados e salvos em cache!")
print("=" * 60)

RESUMO DOS EMBEDDINGS GERADOS

üìä 20NG6:
   ‚úÖ tfidf_svd    - Shape: (5906, 300)
   ‚úÖ sbert        - Shape: (5906, 768)
   ‚úÖ gte          - Shape: (5906, 768)
   ‚úÖ bge          - Shape: (5906, 1024)

üìä PT6:
   ‚úÖ tfidf_svd    - Shape: (315, 300)
   ‚úÖ sbert        - Shape: (315, 768)
   ‚úÖ gte          - Shape: (315, 768)
   ‚úÖ bge          - Shape: (315, 1024)

‚úÖ Todos os embeddings foram gerados e salvos em cache!
