In [2]:
%pip install google-genai bertopic

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os, numpy as np
from google import genai                      # google-generativeai ≥0.5
from  bertopic.backend._base import BaseEmbedder # bertopic ≥0.14.0
from typing import List

# Load variables from a local .env file (if one exists) *before* building the
# client. On a fresh kernel this cell runs ahead of the notebook's later
# load_dotenv() cell, so without this a GOOGLE_API_KEY stored only in .env
# would not be visible yet. load_dotenv() does not override variables that are
# already set, so calling it again later is harmless.
from dotenv import load_dotenv
load_dotenv()

# Shared Gemini client used as the default by GeminiEmbeddingBackend.
genai_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

class GeminiEmbeddingBackend(BaseEmbedder):
    """BERTopic-ready backend that sends batched requests to Gemini.

    Documents are split into chunks of ``batch_size`` and each chunk is sent
    through ``client.models.embed_content``. Requests that fail with a
    rate-limit (429) or transient server (5xx) status are retried with
    exponential backoff before the error is re-raised.

    Args:
        model: Embedding model name passed to ``embed_content``
            (e.g. ``"models/embedding-001"`` or ``"gemini-embedding-exp-03-07"``).
        batch_size: Number of documents per request; must be >= 1.
            100 was the SDK maximum as of May 2025.
        task_type: Forwarded as ``EmbedContentConfig.task_type``
            (e.g. ``"clustering"``, ``"retrieval_document"``); ``None`` leaves
            it unset.
        title: Forwarded as ``EmbedContentConfig.title``; only meaningful for
            ``task_type="retrieval_document"``.
        client: Optional client exposing ``models.embed_content``. Defaults to
            the module-level ``genai_client``.
        max_retries: How many times a retryable failure is retried per batch.
        retry_base_delay: Seconds slept before the first retry; doubled on
            every subsequent retry.

    Raises:
        ValueError: If ``batch_size`` < 1, ``max_retries`` < 0 or
            ``retry_base_delay`` < 0.
    """

    # HTTP status codes worth retrying: quota/rate limiting and transient
    # server-side failures.
    _RETRYABLE_CODES = frozenset({429, 500, 502, 503, 504})

    def __init__(
        self,
        model: str = "models/embedding-001",      # or gemini-embedding-exp-03-07
        batch_size: int = 100,                    # SDK max as of May 2025
        task_type: str | None = "clustering",     # or retrieval_document …
        title: str | None = None,                 # used only for task_type=retrieval_document
        client: "genai.Client | None" = None,
        max_retries: int = 5,
        retry_base_delay: float = 2.0,
    ):
        # Let BaseEmbedder initialise the attributes it defines for itself.
        super().__init__()

        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if max_retries < 0:
            raise ValueError("max_retries must be >= 0")
        if retry_base_delay < 0:
            raise ValueError("retry_base_delay must be >= 0")

        self.client           = client if client is not None else genai_client
        self.model            = model
        self.batch_size       = batch_size
        self.task_type        = task_type
        self.title            = title
        self.max_retries      = max_retries
        self.retry_base_delay = retry_base_delay

    def _embed_batch(self, batch: List[str], verbose: bool = False):
        """Send one batch to the API, retrying on rate-limit/transient errors."""
        # Local imports keep this class self-contained within its cell.
        import time
        from google.genai import errors as genai_errors

        config = genai.types.EmbedContentConfig(
            task_type=self.task_type,
            title=self.title,
        )
        for attempt in range(self.max_retries + 1):
            try:
                return self.client.models.embed_content(
                    model=self.model,
                    contents=batch,
                    config=config,
                )
            except genai_errors.APIError as exc:
                retryable = getattr(exc, "code", None) in self._RETRYABLE_CODES
                if not retryable or attempt == self.max_retries:
                    raise
                delay = self.retry_base_delay * (2 ** attempt)
                if verbose:
                    print(
                        f"Request failed with status {exc.code}; "
                        f"retry {attempt + 1}/{self.max_retries} in {delay:.1f}s"
                    )
                time.sleep(delay)

    # -------- required by BERTopic -------- #
    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Return a 2-D float32 NumPy array of shape (n_docs, dim).

        Args:
            documents: Texts to embed, in order.
            verbose: When true, print the document range of every batch and
                any retry that is attempted.

        Returns:
            One row per input document, in input order. For empty input the
            result has shape ``(0, 0)`` so it is still 2-D.
        """
        # Materialise to a plain list so slicing is positional and each batch
        # handed to the SDK is a regular Python list.
        documents = list(documents)

        vectors = []
        for start in range(0, len(documents), self.batch_size):
            batch = documents[start : start + self.batch_size]

            if verbose:
                print(f"Embedding docs {start}–{start+len(batch)-1} / {len(documents)}")

            resp = self._embed_batch(batch, verbose=verbose)
            # Each returned embedding carries its vector in `.values`.
            vectors.extend(e.values for e in resp.embeddings)

        if not vectors:
            # np.asarray([]) would be 1-D; keep the documented 2-D contract.
            return np.empty((0, 0), dtype="float32")
        return np.asarray(vectors, dtype="float32")


In [24]:
from dotenv import load_dotenv
# Read key=value pairs from a local .env file into the process environment so
# that os.getenv(...) lookups (e.g. GOOGLE_API_KEY) can resolve them.
load_dotenv()  # Load environment variables from .env file

True

In [46]:
import pandas as pd
from bertopic import BERTopic


# Datasets to model, keyed by the label that is printed while processing.
# NOTE(review): only the first key carries a ".csv" suffix; the keys are left
# unchanged in case other cells look frames up by these exact names.
dfs1 = {
    'trump_gaza_1000.csv': pd.read_csv('../data/cleaned/trump_gaza_1000.csv'),
    'JH_new_wars_1000': pd.read_csv('../data/cleaned/JH_new_wars_1000.csv'),
    'jack_vs_calley_1000': pd.read_csv('../data/cleaned/jack_vs_calley_1000.csv'),
}

# Keep a handle on the backend so documents can be embedded exactly once per
# dataset and the vectors shared between fitting and plotting.
embedding_001_backend = GeminiEmbeddingBackend()
embeddeding_001_model = BERTopic(
    embedding_model=embedding_001_backend,
)

for df_name, df in dfs1.items():
    print(f"Processing {df_name}...")
    cleaned_docs = df['cleaned_text'].tolist()

    # Without precomputed embeddings, visualize_documents re-embeds every
    # document, doubling the number of (rate-limited) API requests.
    doc_embeddings = embedding_001_backend.embed(cleaned_docs)

    # The same model instance is re-fitted for every dataset, so after the
    # loop it only reflects the last one.
    topics, probs = embeddeding_001_model.fit_transform(
        cleaned_docs, embeddings=doc_embeddings
    )

    # With embeddings supplied, `docs` only feeds the hover text, so the
    # original (uncleaned) text is shown there. `custom_labels` expects a
    # bool/str (labels set via set_topic_labels), not per-document strings,
    # and `topics` is a selection of topic ids rather than the per-document
    # assignments returned by fit_transform, so neither is passed.
    fig = embeddeding_001_model.visualize_documents(
        docs=df['text'].tolist(),
        embeddings=doc_embeddings,
    )
    fig.show()

Processing trump_gaza_1000.csv...


Processing JH_new_wars_1000...


KeyboardInterrupt: 

## Text Embedding 004 (Gecko)
Gecko, also known as `text-embedding-004`, is a text embedding model distilled from Google's Gemini LLM.

In [53]:
# Keep a handle on the backend so documents can be embedded exactly once per
# dataset and the vectors shared between fitting and plotting.
text_embedding_004_backend = GeminiEmbeddingBackend(
    model="text-embedding-004", task_type="clustering"
)
text_embedding_004_model = BERTopic(
    embedding_model=text_embedding_004_backend,
)

for df_name, df in dfs1.items():
    print(f"Processing {df_name}...")
    cleaned_docs = df['cleaned_text'].tolist()

    # Without precomputed embeddings, visualize_documents re-embeds every
    # document, doubling the number of (rate-limited) API requests.
    doc_embeddings = text_embedding_004_backend.embed(cleaned_docs)

    # The same model instance is re-fitted for every dataset, so after the
    # loop it only reflects the last one.
    topics, probs = text_embedding_004_model.fit_transform(
        cleaned_docs, embeddings=doc_embeddings
    )

    # With embeddings supplied, `docs` only feeds the hover text, so the
    # original (uncleaned) text is shown there. `custom_labels` expects a
    # bool/str (labels set via set_topic_labels), not per-document strings,
    # and `topics` is a selection of topic ids rather than the per-document
    # assignments returned by fit_transform, so neither is passed.
    fig = text_embedding_004_model.visualize_documents(
        docs=df['text'].tolist(),
        embeddings=doc_embeddings,
    )
    fig.show()

Processing trump_gaza_1000.csv...


Processing JH_new_wars_1000...


Processing jack_vs_calley_1000...


In [54]:
# Keep a handle on the backend so documents can be embedded exactly once per
# dataset. The saved run of this cell ended in 429 RESOURCE_EXHAUSTED, so
# avoiding the second embedding pass inside visualize_documents matters most
# for this model.
gemini_embedding_exp_03_07_backend = GeminiEmbeddingBackend(
    model="gemini-embedding-exp-03-07", task_type=None
)
gemini_embedding_exp_03_07_model = BERTopic(
    embedding_model=gemini_embedding_exp_03_07_backend,
)

for df_name, df in dfs1.items():
    print(f"Processing {df_name}...")
    cleaned_docs = df['cleaned_text'].tolist()

    # One embedding pass per dataset, reused for both fitting and plotting.
    doc_embeddings = gemini_embedding_exp_03_07_backend.embed(cleaned_docs)

    # The same model instance is re-fitted for every dataset, so after the
    # loop it only reflects the last one.
    topics, probs = gemini_embedding_exp_03_07_model.fit_transform(
        cleaned_docs, embeddings=doc_embeddings
    )

    # With embeddings supplied, `docs` only feeds the hover text, so the
    # original (uncleaned) text is shown there. `custom_labels` expects a
    # bool/str (labels set via set_topic_labels), not per-document strings,
    # and `topics` is a selection of topic ids rather than the per-document
    # assignments returned by fit_transform, so neither is passed.
    fig = gemini_embedding_exp_03_07_model.visualize_documents(
        docs=df['text'].tolist(),
        embeddings=doc_embeddings,
    )
    fig.show()

Processing trump_gaza_1000.csv...


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}