# Classificação com Embeddings LLM (Google Gemini)

**Objetivo:** Gerar embeddings usando a API do Google Gemini com input dinâmico do usuário e executar a classificação em tempo real.

**Nota:** Este notebook permite configurar a chave de API dinamicamente e gerar embeddings + classificação em uma única execução.


In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import time
import google.generativeai as genai
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from google.api_core import exceptions as google_exceptions

# Pull optional settings (e.g. the API key) from a local .env file, if present.
load_dotenv()

# Global plotting defaults shared by every figure in this notebook.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


## 1. Input da Chave de API


In [None]:
# ============================================================================
# API CONFIGURATION - PROVIDE YOUR KEY HERE
# ============================================================================

# Option 1: hard-coded key (local experiments only - never commit a real key).
# Uncomment the second line below and replace the placeholder.
api_key = None
# api_key = "SUA_CHAVE_AQUI"

# Option 2: environment variable
# export GOOGLE_API_KEY="sua_chave_aqui" (Linux/Mac)
# $env:GOOGLE_API_KEY="sua_chave_aqui" (Windows PowerShell)

# The environment variable takes precedence; the hard-coded value above is only
# a fallback. An empty value is normalised to None so that every later
# `if api_key` / `if not api_key` check sees a single "missing" state.
api_key = os.getenv('GOOGLE_API_KEY') or api_key or None

# Defined unconditionally so later cells can reference it even when the key is
# missing (they still refuse to call the API without a key).
model_name = "models/embedding-001"

if api_key:
    print("‚úÖ Chave de API carregada com sucesso!")
    # Configure the Gemini client.
    genai.configure(api_key=api_key)
    print(f"Modelo selecionado: {model_name}")
else:
    print("‚ö†Ô∏è ATEN√á√ÉO: Chave de API n√£o configurada!")
    print("Configure uma das op√ß√µes abaixo:")
    print("1. Descomente e edite a linha 'api_key = SUA_CHAVE_AQUI' acima")
    print("2. Configure vari√°vel de ambiente GOOGLE_API_KEY")
    print("3. Obtenha uma chave gratuita em: https://makersuite.google.com/app/apikey")
    print("‚ùå N√£o √© poss√≠vel continuar sem chave de API!")


## 2. Carregar dataset pré-processado


In [None]:
# Load the preprocessed dataset.
# NOTE: pickle executes arbitrary code on load - only open files you produced yourself.
dataset_path = '../data/processed/20news_preprocessed.pkl'
with open(dataset_path, 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Unpack the three entries this notebook reads from the pickled dict.
X_text, y, target_names = (data[key] for key in ('text', 'target', 'target_names'))

# Per-class document counts, ordered by class id.
class_counts = pd.Series(y).value_counts().sort_index().to_dict()

print(f"Total de documentos: {len(X_text)}")
print(f"Classes: {target_names}")
print(f"Distribui√ß√£o: {class_counts}")


## 3. Função para gerar embeddings via API


In [None]:
def _extract_embeddings(result):
    """Normalize an ``embed_content`` response into a list of embedding vectors."""
    if isinstance(result, dict):
        # Use the 'embedding' entry when present; otherwise fall back to the
        # first value stored in the dict.
        if 'embedding' in result:
            payload = result['embedding']
        elif result:
            payload = next(iter(result.values()))
        else:
            payload = []

        if not isinstance(payload, list):
            return [payload]
        if not payload:
            return []
        # A list of lists is one vector per input; a flat list is a single vector.
        return payload if isinstance(payload[0], list) else [payload]

    if isinstance(result, list):
        return list(result)
    return [result]


def generate_embeddings_batch(texts, model_name, batch_size=1, delay=2.0, use_tqdm=True):
    """
    Generate embeddings through the Gemini API with strict rate limiting.

    IMPORTANT: the free tier of the Gemini API is heavily rate limited.
    On a 429 error you need to either:
    1. wait for the daily quota to reset, OR
    2. move to a paid plan, OR
    3. process one text at a time with a longer delay (2+ seconds).

    Args:
        texts: Sequence of texts (must support ``len`` and slicing).
        model_name: Model name (e.g. "models/embedding-001").
        batch_size: Texts sent per request (recommended: 1 on the free tier).
        delay: Seconds to wait between requests; also the base of the
            exponential back-off used on rate-limit errors.
        use_tqdm: If True, show a tqdm progress bar instead of periodic prints.

    Returns:
        numpy array built from the collected embeddings, one row per text.

    Raises:
        ValueError: If ``batch_size`` < 1, ``delay`` < 0, or a response does
            not contain exactly one embedding per text in the batch.
        RuntimeError: If the free-tier quota is exhausted, or a batch still
            fails after all retries.
        Exception: Any other API error is re-raised unchanged.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if delay < 0:
        raise ValueError(f"delay must be >= 0, got {delay}")

    max_retries = 3
    embeddings = []
    n_texts = len(texts)
    n_batches = (n_texts + batch_size - 1) // batch_size

    print(f"Gerando embeddings para {len(texts)} textos em {n_batches} lotes (batch_size={batch_size}, delay={delay}s)...")
    print("‚ö†Ô∏è Free tier tem limites restritivos. Processando lentamente...")

    pbar = tqdm(total=n_texts, desc="Gerando embeddings", unit="text") if use_tqdm else None

    def _backoff(label, batch_num, attempt):
        # Exponential back-off; skipped after the final attempt because no
        # retry follows it.
        if attempt + 1 >= max_retries:
            return
        wait_time = delay * (2 ** attempt)
        if pbar is not None:
            pbar.set_description(f"{label} - aguardando {wait_time:.1f}s...")
        else:
            print(f"{label} no lote {batch_num}. Aguardando {wait_time:.1f}s...")
        time.sleep(wait_time)

    try:
        for i in range(0, n_texts, batch_size):
            batch = texts[i:i + batch_size]
            batch_num = i // batch_size + 1
            success = False

            for attempt in range(max_retries):
                try:
                    result = genai.embed_content(
                        model=model_name,
                        content=batch,
                        task_type="RETRIEVAL_DOCUMENT"
                    )
                except google_exceptions.ResourceExhausted as e:
                    error_msg = str(e)
                    # An exhausted free-tier quota will not recover by retrying.
                    if "free_tier" in error_msg.lower() or "limit: 0" in error_msg:
                        if pbar is not None:
                            pbar.close()  # close first so the bar does not garble the message
                        print(f"\n{'='*60}")
                        print("‚ùå ERRO: Quota da API gratuita excedida!")
                        print("Solu√ß√µes:")
                        print("1. Aguardar 24h para reset da quota di√°ria")
                        print("2. Atualizar para plano pago no Google Cloud")
                        print("3. Verificar quota em: https://ai.dev/usage?tab=rate-limit")
                        print(f"{'='*60}")
                        raise RuntimeError(
                            "Quota da API gratuita excedida. Consulte https://ai.google.dev/gemini-api/docs/rate-limits"
                        ) from e
                    _backoff("Rate limit", batch_num, attempt)
                except Exception as e:
                    error_msg = str(e)
                    # Some rate-limit errors surface as generic exceptions.
                    if "429" in error_msg or "quota" in error_msg.lower():
                        _backoff("Erro 429", batch_num, attempt)
                    else:
                        raise
                else:
                    vectors = _extract_embeddings(result)
                    # Rows must stay aligned with the input texts (and their labels).
                    if len(vectors) != len(batch):
                        raise ValueError(
                            f"Expected {len(batch)} embedding(s) for batch {batch_num}, "
                            f"got {len(vectors)}"
                        )
                    embeddings.extend(vectors)
                    success = True
                    if pbar is not None:
                        pbar.update(len(batch))
                    elif batch_num % 10 == 0 or batch_num == n_batches:
                        print(f"Lote {batch_num}/{n_batches} conclu√≠do ({len(batch)} embedding(s))")
                    break

            if not success:
                raise RuntimeError(f"Erro persistente no lote {batch_num} ap√≥s {max_retries} tentativas")

            # Pause between batches, but not after the last one.
            if i + batch_size < n_texts:
                time.sleep(delay)
    finally:
        # Always release the progress bar, including on errors.
        if pbar is not None:
            pbar.close()

    return np.array(embeddings)


## 4. Gerar embeddings via API


In [None]:
# Defined up front so every later cell can test `X_emb is None`, even when the
# API key is missing or generation fails.
X_emb = None
model_name_used = None

# Request settings, kept in one place so the messages below always match the
# values actually passed to generate_embeddings_batch.
embed_batch_size = 1
embed_delay = 2.0

if not api_key:
    print("‚ùå Interrompendo: chave de API necess√°ria para continuar")
else:
    # Convert every document to a plain string.
    texts_list = [str(text) for text in X_text]

    print("\n" + "="*60)
    print("GERANDO EMBEDDINGS COM GOOGLE GEMINI API")
    print("="*60)
    print(f"Total de textos: {len(texts_list)}")
    print(f"Batch size: {embed_batch_size} (texto por texto)")
    print(f"Delay entre requisi√ß√µes: {embed_delay} segundos")
    # One delay per request; retries and API latency are not included.
    n_requests = (len(texts_list) + embed_batch_size - 1) // embed_batch_size
    tempo_estimado = (n_requests * embed_delay) / 60
    print(f"Tempo estimado: ~{tempo_estimado:.1f} minutos")
    print("="*60 + "\n")

    try:
        X_emb = generate_embeddings_batch(
            texts_list,
            model_name,
            batch_size=embed_batch_size,
            delay=embed_delay,
            use_tqdm=True
        )
        model_name_used = model_name
        print(f"\n{'='*60}")
        print("‚úÖ Embeddings gerados com sucesso!")
        print(f"Shape: {X_emb.shape}")
        print(f"Dimens√£o do embedding: {X_emb.shape[1]}")
        print(f"Modelo: {model_name_used}")
        print(f"{'='*60}")
    except Exception as e:
        # Notebook-level boundary: report the failure and leave X_emb as None
        # so the following cells skip their work instead of crashing.
        print(f"\n{'='*60}")
        print("‚ùå ERRO ao gerar embeddings:")
        print(str(e)[:300])
        print(f"{'='*60}")
        X_emb = None


## 5. Divisão Treino/Teste (80/20)


In [None]:
# Hold out 20% of the embeddings for testing, stratified by class label.
if X_emb is not None:
    split_parts = train_test_split(X_emb, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split_parts

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print(f"Treino: {n_train} documentos")
    print(f"Teste: {n_test} documentos")
else:
    print("‚ùå Imposs√≠vel continuar sem embeddings")


## 6. Treinar e avaliar modelos


In [None]:
# Fit each classifier on the training split and score it on the test split.
if X_emb is not None:
    banner = '=' * 60

    # Candidate classifiers, keyed by the label used in reports and plots.
    models = dict([
        ('GaussianNB', GaussianNB()),
        ('KNN (k=5)', KNeighborsClassifier(n_neighbors=5)),
        ('DecisionTree', DecisionTreeClassifier(random_state=42, max_depth=20)),
        ('LogisticRegression', LogisticRegression(random_state=42, max_iter=1000)),
    ])

    results, predictions = {}, {}

    for name, model in models.items():
        print(f"\n{banner}")
        print(f"Treinando {name}...")
        print(banner)

        # Train, then predict the held-out documents.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions[name] = y_pred

        # Accuracy and macro-averaged F1 on the test split.
        accuracy = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average='macro')
        results[name] = dict(accuracy=accuracy, f1_macro=f1_macro)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Macro F1: {f1_macro:.4f}")

    # One row per model, one column per metric.
    df_results = pd.DataFrame(results).transpose()
    print(f"\n{banner}")
    print("üìä Resumo dos Resultados")
    print(banner)
    print(df_results)
else:
    print("‚ùå Imposs√≠vel continuar sem embeddings")


## 7. Validação Cruzada (k=5)


In [None]:
# 5-fold cross-validation (macro F1) of every model on the training split.
if X_emb is None:
    print("‚ùå Imposs√≠vel continuar sem embeddings")
else:
    cv_results = {}

    for name, model in models.items():
        print(f"\nExecutando valida√ß√£o cruzada para {name}...")
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
        mean_f1, std_f1 = cv_scores.mean(), cv_scores.std()
        cv_results[name] = {'mean': mean_f1, 'std': std_f1, 'scores': cv_scores}
        # The reported spread is two standard deviations.
        print(f"F1 Macro (CV): {mean_f1:.4f} (+/- {std_f1 * 2:.4f})")

    # Summary table: one row per model with the mean and std of its fold scores.
    df_cv = pd.DataFrame.from_dict(
        {label: [stats['mean'], stats['std']] for label, stats in cv_results.items()},
        orient='index',
        columns=['Mean', 'Std'],
    )

    rule = '=' * 60
    print(f"\n{rule}")
    print("üìä Resultados da Valida√ß√£o Cruzada (F1 Macro)")
    print(rule)
    print(df_cv)


## 8. Matrizes de Confusão


In [None]:
# Draw one row-normalized confusion matrix per model on a 2x2 grid.
if X_emb is None:
    print("‚ùå Imposs√≠vel continuar sem embeddings")
else:
    # Make sure the report directories exist before anything is written.
    for out_dir in ('../reports/figures', '../reports/metrics'):
        os.makedirs(out_dir, exist_ok=True)

    fig, axes = plt.subplots(2, 2, figsize=(16, 14))
    axes = axes.ravel()

    for idx, (name, y_pred) in enumerate(predictions.items()):
        ax = axes[idx]

        # Divide each row by its total so cells show the share of each true class.
        cm = confusion_matrix(y_test, y_pred)
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm_normalized = cm.astype('float') / row_totals

        sns.heatmap(
            cm_normalized,
            annot=True,
            fmt='.2f',
            cmap='Blues',
            xticklabels=target_names,
            yticklabels=target_names,
            ax=ax,
            cbar_kws={'label': 'Propor√ß√£o'},
        )

        acc = results[name]["accuracy"]
        f1 = results[name]["f1_macro"]
        ax.set_title(f'{name}\nAccuracy: {acc:.3f}, F1: {f1:.3f}')
        ax.set_xlabel('Predito')
        ax.set_ylabel('Real')
        ax.tick_params(axis='x', rotation=45)
        ax.tick_params(axis='y', rotation=0)

    plt.tight_layout()
    plt.savefig('../reports/figures/confusion_matrices_llm_embeddings.png', dpi=300, bbox_inches='tight')
    plt.show()


## 9. Salvar resultados


In [None]:
# Persist metrics, per-model reports and the embeddings themselves.
if X_emb is not None:
    # Create the metrics directory here so this cell does not depend on the
    # plotting cell having been run first.
    os.makedirs('../reports/metrics', exist_ok=True)

    # Summary tables as CSV.
    df_results.to_csv('../reports/metrics/classification_llm_embeddings_results.csv')
    df_cv.to_csv('../reports/metrics/classification_llm_embeddings_cv.csv')

    print("‚úÖ Resultados salvos em:")
    print("  - ../reports/metrics/classification_llm_embeddings_results.csv")
    print("  - ../reports/metrics/classification_llm_embeddings_cv.csv")

    # Detailed per-class report for every model.
    for name, y_pred in predictions.items():
        report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
        df_report = pd.DataFrame(report).transpose()
        # Model labels contain spaces; replace them to build the file name.
        filename = f'classification_llm_embeddings_{name.lower().replace(" ", "_")}_report.csv'
        df_report.to_csv(f'../reports/metrics/{filename}')
        print(f"  - ../reports/metrics/{filename}")

    # Store the embeddings with their labels so the API calls need not be repeated.
    data_to_save = {
        'X_emb': X_emb,
        'y': y,
        'target_names': target_names,
        'model_name': model_name_used
    }
    output_path = '../data/processed/embeddings_llm_api.pkl'
    with open(output_path, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\n‚úÖ Embeddings salvos em: {output_path}")
else:
    print("‚ùå Imposs√≠vel salvar resultados sem embeddings")
