## Day 2: classifying embeddings with Gemini API
Use embeddings produced by the Gemini API to train a classifier that sorts newsgroup posts into categories based on their content.

### Step 1: Install packages and set up API key

In [None]:
!pip install -U -q "google-genai==1.7.0"

In [None]:
from google import genai
from google.genai import types

# Show the installed google-genai version as the cell output (the install cell pins 1.7.0).
genai.__version__

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the API key from Kaggle's secret store instead of hard-coding it in the notebook.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Gemini client bound to that key; kept as a notebook-level name for later cells.
client = genai.Client(api_key=GOOGLE_API_KEY)

### Step 2: Load and Preprocess Data

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the "train" and "test" subsets of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

# View list of class names for dataset
newsgroups_train.target_names

In [None]:
import email
import re

import pandas as pd

def preprocess_newsgroup_row(data):
    """Reduce one raw newsgroup post to its subject and body text.

    Args:
        data: Raw message text, including its headers.

    Returns:
        The subject and the payload joined by a blank line, with e-mail
        addresses removed and the result capped at 5,000 characters.
    """
    # Parse the message so only the subject and body are kept
    msg = email.message_from_string(data)
    text = f"{msg['Subject']}\n\n{msg.get_payload()}"

    # Strip any remaining email addresses
    text = re.sub(r"[\w\.-]+@[\w\.-]+", "", text)
    # Truncate each entry to 5,000 characters
    text = text[:5000]

    return text

## Day 2: classifying embeddings with Gemini API
Use embeddings produced by the Gemini API to train a classifier that sorts newsgroup posts into categories based on their content.

### Step 1: Install packages and set up API key

In [None]:
!pip install -U -q "google-genai==1.7.0"

In [None]:
from google import genai
from google.genai import types

# Show the installed google-genai version as the cell output (the install cell pins 1.7.0).
genai.__version__

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the API key from Kaggle's secret store instead of hard-coding it in the notebook.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Gemini client bound to that key; the embedding helper below calls it.
client = genai.Client(api_key=GOOGLE_API_KEY)

### Step 2: Load and Preprocess Data

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the "train" and "test" subsets of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

# View list of class names for dataset
newsgroups_train.target_names

In [None]:
import email
import re

import pandas as pd

## Remove sensitive information like names and emails. Keep only the subject and body

def preprocess_newsgroup_row(data):
    """Turn one raw newsgroup message into cleaned subject-plus-body text.

    Args:
        data: Raw message text, including its headers.

    Returns:
        The subject and the payload separated by a blank line, with e-mail
        addresses removed and the result limited to 5,000 characters.
    """
    parsed = email.message_from_string(data)
    subject, body = parsed["Subject"], parsed.get_payload()
    # Drop anything shaped like an e-mail address, then cap the length.
    without_addresses = re.sub(r"[\w\.-]+@[\w\.-]+", "", f"{subject}\n\n{body}")
    return without_addresses[:5000]

def preprocess_newsgroup_data(newsgroup_dataset):
    """Build a cleaned DataFrame from a newsgroup dataset object.

    Args:
        newsgroup_dataset: Object exposing ``data`` (raw posts), ``target``
            (integer labels) and ``target_names`` (label index -> class name).

    Returns:
        A DataFrame with the columns "Text" (cleaned post), "Label"
        (integer label) and "Class Name" (name looked up from the label).
    """
    frame = pd.DataFrame(
        {"Text": newsgroup_dataset.data, "Label": newsgroup_dataset.target}
    )
    # Reduce every raw post to its cleaned subject and body.
    frame["Text"] = frame["Text"].apply(preprocess_newsgroup_row)

    def class_name_for(label):
        # The label doubles as an index into the dataset's list of class names.
        return newsgroup_dataset.target_names[label]

    frame["Class Name"] = frame["Label"].map(class_name_for)
    return frame

In [None]:
# Apply preprocessing function to training and test datasets
df_train = preprocess_newsgroup_data(newsgroups_train)
df_test = preprocess_newsgroup_data(newsgroups_test)

# Preview the first rows of the cleaned training frame as the cell output.
df_train.head()

Sample the data by taking 100 data points per category from the training dataset (and 25 per category from the test dataset), then keep only a subset of the categories so the tutorial runs quickly. The science categories are chosen for comparison.


In [None]:
def sample_data(df, num_samples, classes_to_keep, random_state=None):
    """Sample a fixed number of rows per label and keep only matching classes.

    Args:
        df: DataFrame with "Label" and "Class Name" columns.
        num_samples: Number of rows to draw from each "Label" group.
        classes_to_keep: Pattern passed to ``Series.str.contains``; only rows
            whose "Class Name" matches it are kept.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. The default (None) keeps it unseeded.

    Returns:
        A new DataFrame holding the sampled rows of the kept classes, with
        "Class Name" converted to a categorical and an extra "Encoded Label"
        column containing the category codes.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a label group has
            fewer than ``num_samples`` rows.
    """
    # Sample rows, selecting num_samples of each Label.
    sampled = (
        df.groupby("Label")[df.columns]
        .apply(lambda group: group.sample(num_samples, random_state=random_state))
        .reset_index(drop=True)
    )

    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a slice of `sampled`.
    kept = sampled[sampled["Class Name"].str.contains(classes_to_keep)].copy()

    # We have fewer categories now, so re-calibrate the label encoding.
    kept["Class Name"] = kept["Class Name"].astype("category")
    kept["Encoded Label"] = kept["Class Name"].cat.codes

    return kept

In [None]:
# Rows to draw per label from the training and test frames, respectively.
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
# Class name should contain 'sci' to keep science categories.
# Try different labels from the data - see newsgroups_train.target_names
CLASSES_TO_KEEP = "sci"

# NOTE: these lines rebind df_train/df_test to the sampled subsets, so re-running
# this cell on its own samples from the already-sampled frames. Re-run the
# preprocessing cell first to start again from the full data.
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

### Step 3: Create embeddings

In [None]:
from google.api_core import retry
import tqdm
from tqdm.rich import tqdm as tqdmr
import warnings

# Add tqdm to Pandas (this is what provides the progress_apply used by create_embeddings below)
tqdmr.pandas()

# ...but suppress tqdm's experimental warning (presumably triggered by the tqdm.rich integration).
warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)

# Define a helper to retry when per-minute quota is reached.
def is_retriable(e):
    """Return True when `e` is a Gemini API error whose code is 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}

@retry.Retry(predicate=is_retriable, timeout=300.0)
def embed_fn(text: str) -> list[float]:
    """Embed one piece of text with the Gemini API and return its vector.

    The decorator retries the call while `is_retriable` accepts the raised
    error, giving up after the 300-second timeout.

    Args:
        text: The text to embed.

    Returns:
        The values of the first embedding in the API response.
    """
    # You will be performing classification, so set task_type accordingly.
    classification_config = types.EmbedContentConfig(task_type="classification")
    response = client.models.embed_content(
        model="models/text-embedding-004",
        contents=text,
        config=classification_config,
    )
    # A single text was sent, so only the first embedding is read.
    first_embedding = response.embeddings[0]
    return first_embedding.values

def create_embeddings(df):
    """Add an "Embeddings" column holding the embedding of each row's text.

    Args:
        df: DataFrame with a "Text" column. It is modified in place.

    Returns:
        The same DataFrame, now carrying the "Embeddings" column.
    """
    # progress_apply comes from the tqdm pandas hook registered earlier in this
    # cell; it behaves like apply but reports progress. The helper defined
    # above is named embed_fn.
    df["Embeddings"] = df["Text"].progress_apply(embed_fn)
    return df

In [None]:
# Embed both subsets; create_embeddings adds an "Embeddings" column and returns the frame it was given.
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

### Step 4: Build a classification model

In [None]:
import keras
from keras import layers

def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
    """Assemble a small feed-forward classifier over embedding vectors.

    Args:
        input_size: Length of each input embedding; also used as the width of
            the hidden layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A `keras.Sequential` model: input -> ReLU dense layer -> softmax dense layer.
    """
    embedding_inputs = layers.Input([input_size], name="embedding_inputs")
    hidden = layers.Dense(input_size, activation="relu", name="hidden")
    output_probs = layers.Dense(num_classes, activation="softmax", name="output_probs")
    return keras.Sequential([embedding_inputs, hidden, output_probs])

In [None]:
# Derive the embedding size from observing the data. The embedding size can also be
# specified with the "output_dimensionality" parameter to "embed_content" if you
# need to reduce it.
embedding_size = len(df_train["Embeddings"].iloc[0])

# One output unit per distinct class name present in the sampled training frame.
classifier = build_classification_model(
    embedding_size, len(df_train['Class Name'].unique())
)
classifier.summary()

# The sparse loss is used because the training targets are the integer "Encoded Label" codes.
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

: 

### Step 5: Train the model

In [None]:
import numpy as np

# Upper bound on training epochs (the early-stopping callback may end training sooner) and batch size.
NUM_EPOCHS = 20
BATCH_SIZE = 32

# Split the x and y components of the train and validation subsets.
# np.stack combines the per-row embedding lists into one array per subset.
# Note that the "validation" data here is the sampled test frame (df_test).
y_train = df_train["Encoded Label"]
x_train = np.stack(df_train["Embeddings"])
y_val = df_test["Encoded Label"]
x_val = np.stack(df_test["Embeddings"])

# NOTE(review): this monitors the *training* "accuracy" metric, not a validation
# metric such as "val_accuracy" — confirm that is intended.
early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

# train the model
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
)

### Step 6: Evaluate model performance

In [None]:
# Evaluate on the same x_val/y_val arrays that were passed as validation data during training;
# return_dict makes the result a dict rather than a list.
classifier.evaluate(x=x_val, y=y_val, return_dict = True)

: 