# Vectorization with Ollama (EmbeddingGemma)

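This notebook installs Ollama, pulls the `embeddinggemma` model, vectorizes one text column of an uploaded CSV file, and downloads the result as a JSON file. It is written for Google Colab: the upload and download steps use the `google.colab` helpers.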

In [None]:
# 1. Install dependencies
!pip install colab-xterm ollama

In [None]:
# 2. Install and start Ollama
import subprocess

# Run the official install script in the foreground and wait for it to finish.
# `pipefail` makes a failed download (curl -f) fail the whole pipeline instead
# of silently feeding `sh` an empty script.
install_result = subprocess.run(
    ["bash", "-c", "set -o pipefail; curl -fsSL https://ollama.com/install.sh | sh"],
    capture_output=True,
    text=True,
)
if install_result.returncode != 0:
    # Surface the installer's own error output instead of ignoring the failure.
    raise RuntimeError(
        f"Ollama installation failed (exit code {install_result.returncode}):\n"
        f"{install_result.stderr}"
    )

# Start the server as a background process. Its output is appended to
# ollama_serve.log so startup problems can be inspected afterwards.
# The child process keeps its own handle on the log file, so closing ours is safe.
with open("ollama_serve.log", "ab") as server_log:
    ollama_server = subprocess.Popen(
        ["ollama", "serve"],
        stdout=server_log,
        stderr=subprocess.STDOUT,
    )

In [None]:
# 3. Pull the model
# Wait a few seconds for the server to start before running this.
import time
time.sleep(5)
!ollama pull embeddinggemma:latest

In [None]:
# 4. Upload CSV file
from google.colab import files
import pandas as pd
import io

# Opens the browser file picker; the result maps each uploaded file name
# to that file's raw bytes.
uploaded = files.upload()

# Fail with a readable message when nothing was uploaded (e.g. the dialog was
# cancelled) instead of a bare StopIteration from next() below.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a CSV file.")

# Only the first uploaded file is loaded; say so if more were provided.
filename = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Multiple files uploaded; only {filename} will be used.")

df = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f"Loaded {len(df)} rows from {filename}")
df.head()

In [None]:
# 5. Vectorize
import ollama
# tqdm.auto selects the notebook widget progress bar when running in a
# notebook and falls back to the plain text bar elsewhere.
from tqdm.auto import tqdm

# Registers `progress_apply` on pandas objects so the apply step can show progress.
tqdm.pandas()

# Name of the CSV column whose text is vectorized. Change '問題名' if necessary.
TARGET_COLUMN = '問題名'

def get_embedding(text, model="embeddinggemma:latest"):
    """Return the embedding vector for one value, or [] if none can be produced.

    Args:
        text: The value to embed. Non-string values are converted with str().
            Missing values (None or a float NaN, which is how pandas represents
            an empty CSV cell) are not sent to the model.
        model: Name of the Ollama model to use. Defaults to the model this
            notebook pulls.

    Returns:
        The "embedding" entry of the Ollama response, or an empty list when the
        value is missing or the request fails. Failures are printed, not raised,
        so one bad row does not abort the whole run.
    """
    import math

    # Without this guard a missing cell would be embedded as the literal
    # string "nan" / "None", producing a meaningless vector.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    try:
        # Ensure text is a string
        if not isinstance(text, str):
            text = str(text)
        response = ollama.embeddings(model=model, prompt=text)
        return response["embedding"]
    except Exception as e:
        # Deliberate best-effort: report the failing row and keep going.
        print(f"Error processing '{text}': {e}")
        return []

# Fail early with a readable message if the configured column is not in the CSV.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Column '{TARGET_COLUMN}' not found. Available columns: {list(df.columns)}"
    )

print(f"Vectorizing column: {TARGET_COLUMN}...")
# Apply the function row by row with a progress bar
df['embedding'] = df[TARGET_COLUMN].progress_apply(get_embedding)

# get_embedding returns an empty list when a row could not be embedded;
# count those rows so failures are visible in one place.
failed_count = int(df['embedding'].map(len).eq(0).sum())

print("Vectorization complete.")
if failed_count:
    print(f"Warning: {failed_count} of {len(df)} rows have an empty embedding.")

In [None]:
# 6. Save to JSON
# The whole dataframe (original columns plus the 'embedding' column) is written
# as a JSON array with one object per row.
output_filename = "gemma_embeddings.json"

df.to_json(
    output_filename,
    orient='records',    # list of row objects
    force_ascii=False,   # keep non-ASCII text readable instead of \u escapes
    indent=2,
)
print(f"Saved to {output_filename}")

# Trigger a browser download of the generated file (Colab helper).
files.download(output_filename)