#Task - Classify Embeddings using keras and Gemini API.


In [2]:
%pip install -U -q "google-generativeai>=0.8.3"

In [3]:
import google.generativeai as genai

from google.colab import userdata
api_key = userdata.get('GOOGLE_API_KEY')

genai.configure(api_key = api_key)

In [4]:
# Dataset 20 Newsgroups Text Dataset

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset = "train")
newgroups_test = fetch_20newsgroups(subset = "test")


In [24]:
# DataPoint from the training set
print(newsgroups_train.data[0])
print(newsgroups_train.target[0])
# Looks like it is a mail.

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





7


In [32]:
newsgroups_train.target_names[5]

'comp.windows.x'

#Data Preprocessing

In [33]:
# We will only use body text and subject of the email for training.
import email
import re
import pandas as pd

def preprocess_row(data):
  # Extract only the subject and body
  message = email.message_from_string(data)
  # Extracting subject and body of email form the message object
  text = f"{message['Subject']} \n\n {message.get_payload()}"
  # Strip email addresses
  text = re.sub(r"[\w\.-]+@[\w\.-]+", "", text)
  # Truncate the text to 5000 characters
  text = text[: 500]

  return text

def preprocess_newsgroup_data(newsgroup_dataset):
  # Firstly creating a DataFrame
  df = pd.DataFrame(
      {"Text": newsgroup_dataset.data,
       "Label": newsgroup_dataset.target}
  )
  # Cleaning the Text Columns
  df["Text"] = df["Text"].apply(preprocess_row)

  df["Class Name"] = df["Label"].map(lambda l: newsgroup_dataset.target_names[l] )

  return df


In [34]:
# Applying Functions on the datasets
df_train = preprocess_newsgroup_data(newsgroups_train)
df_test = preprocess_newsgroup_data(newgroups_test)

df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!? \n\n I was wondering if an...,7,rec.autos
1,SI Clock Poll - Final Call \n\n A fair number ...,4,comp.sys.mac.hardware
2,"PB questions... \n\n well folks, my mac plus f...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ? \n\n Robert J.C. Kyanko () ...,1,comp.graphics
4,Re: Shuttle Launch Question \n\n From article ...,14,sci.space


In [None]:
df2 =

In [35]:
# Creating a SubSampling that should contain Science Category.
# for exapmple - Each Category Specified have say 100 examples
# Small dataset can do good work with LLm

def SampleData(df, num_samples, class_to_keep):
  df = (
      df.groupby("Label")[df.columns]
      .apply(lambda x: x.sample(num_samples))
      .reset_index(drop = True)

  )
  df = df[df["Class Name"].str.contains(class_to_keep)]

  df["Class Name"] = df["Class Name"].astype("category")
  df["Encoded Label"] = df["Class Name"].cat.codes

  return df


In [36]:
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
CLASSES_TO_KEEP = "sci"

df_train = SampleData(df_train, TRAIN_NUM_SAMPLES , CLASSES_TO_KEEP)
df_test = SampleData(df_test, TEST_NUM_SAMPLES,CLASSES_TO_KEEP)

In [37]:
df_train

Unnamed: 0,Text,Label,Class Name,Encoded Label
1100,Cryptology in the world \n\n What is the statu...,11,sci.crypt,0
1101,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0
1102,Re: Fighting the Clipper Initiative \n\n In ar...,11,sci.crypt,0
1103,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0
1104,Re: disk safety measure? \n\n (Tim Cuffel) wr...,11,sci.crypt,0
...,...,...,...,...
1495,"Re: TRUE ""GLOBE"", Who makes it? \n\n In articl...",14,sci.space,3
1496,Re: Why not give $1 billion to first year-long...,14,sci.space,3
1497,Re: Proton/Centaur? \n\n In article <1r54to$> ...,14,sci.space,3
1498,Life on Mars??? \n\n What is the deal with lif...,14,sci.space,3


In [40]:
df_train.value_counts("Class Name")

Unnamed: 0_level_0,count
Class Name,Unnamed: 1_level_1
sci.crypt,100
sci.electronics,100
sci.med,100
sci.space,100


In [41]:
df_test.value_counts("Class Name")

Unnamed: 0_level_0,count
Class Name,Unnamed: 1_level_1
sci.crypt,25
sci.electronics,25
sci.med,25
sci.space,25


# Creating the Embeddings

In [42]:
from google.api_core import retry
from tqdm.rich import tqdm

tqdm.pandas()

@retry.Retry(timeout= 300.0)
def embed_fn(text: str) -> list[float]:

  response = genai.embed_content(
      model = "models/text-embedding-004",
      content = text,
      task_type = "classification"
  )
  return response["embedding"]

def create_embeddings(df):
  df["Embeddings"] = df["Text"].progress_apply(embed_fn)
  return df

In [43]:
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

Output()

  t = cls(total=total, **tqdm_kwargs)


Output()

  t = cls(total=total, **tqdm_kwargs)


In [45]:
df_train.head()

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,Cryptology in the world \n\n What is the statu...,11,sci.crypt,0,"[-0.01100536, 0.022500366, -0.04057735, 0.0180..."
1101,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[-0.011343043, 0.015254265, -0.04219753, 0.035..."
1102,Re: Fighting the Clipper Initiative \n\n In ar...,11,sci.crypt,0,"[-0.010537983, 0.034328923, -0.030458104, 0.03..."
1103,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[-0.00558206, 0.02050471, -0.028905282, 0.0385..."
1104,Re: disk safety measure? \n\n (Tim Cuffel) wr...,11,sci.crypt,0,"[0.0018483903, 0.03105813, -0.04033552, 0.0111..."


#Building a Classification Model Using Keras

In [56]:
import keras
from keras import layers

def build_classification_model(input_size: int, num_class: int) -> keras.Model:
  return keras.Sequential(
      [
          layers.Input([input_size], name = "embedding_inputs"),
          layers.Dense(input_size, activation = 'relu', name = "hidden_layer"),
          layers.Dense(num_class, activation = "softmax", name = "output_probs"),

      ]
  )



In [57]:
embedding_size = len(df_train["Embeddings"].iloc[0])
number_of_classes = len(df_train["Class Name"].unique())
classifier = build_classification_model(
    embedding_size, number_of_classes
)
classifier.summary()

In [58]:
classifier.compile(
    loss = keras.losses.SparseCategoricalCrossentropy(),
    optimizer = keras.optimizers.Adam(learning_rate = 0.001),
    metrics = ["accuracy"],
)

Model Training

In [59]:
import numpy as np

NUM_EPOCHS = 20
BATCH_SIZE = 32

x_train = np.stack(df_train["Embeddings"])
y_train = df_train["Encoded Label"]
x_test = np.stack(df_test["Embeddings"])
y_test = df_test["Encoded Label"]


# Early stop if error stabilises
early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)


history = classifier.fit(
    x= x_train,
    y = y_train,
    validation_data= (x_test, y_test),
   callbacks = [early_stop],
    batch_size = BATCH_SIZE,
    epochs = NUM_EPOCHS,
)


Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.3310 - loss: 1.3491 - val_accuracy: 0.6300 - val_loss: 1.2733
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6980 - loss: 1.1992 - val_accuracy: 0.6600 - val_loss: 1.1479
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7024 - loss: 1.0481 - val_accuracy: 0.7100 - val_loss: 1.0436
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8257 - loss: 0.8772 - val_accuracy: 0.8600 - val_loss: 0.8864
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9254 - loss: 0.7182 - val_accuracy: 0.8700 - val_loss: 0.7611
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9315 - loss: 0.5597 - val_accuracy: 0.8500 - val_loss: 0.6698
Epoch 7/20
[1m13/13[0m [32m━━━━

In [61]:
classifier.evaluate(x = x_test, y= y_test, return_dict = True)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8490 - loss: 0.4312 


{'accuracy': 0.8700000047683716, 'loss': 0.4050367474555969}

Our Model did good job with 87prcnt accuracy.

TESTING

In [62]:
test_text = """
Hii, I got very much exited watching night sky. Yesterday was full moon,
i got to know we can see mars from earch using good telescope. Can you
guide me to purchase one.
Thanks!  """

embed_test = embed_fn(test_text)

In [65]:
embed_test[:10]

[0.007853086,
 0.002002389,
 -0.011999561,
 0.03037862,
 0.016715974,
 0.029042153,
 0.06818176,
 0.010535904,
 -0.009475479,
 -0.027762512]

In [66]:
input_test_example= np.array([embed_test])

[result] = classifier.predict(input_test_example)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


In [69]:
result

array([0.00194749, 0.37111375, 0.03603706, 0.5909017 ], dtype=float32)

In [70]:
for idx, category in enumerate(df_test["Class Name"].cat.categories):
  print(f"{category}: {result[idx]*100: 0.2f}%")

sci.crypt:  0.19%
sci.electronics:  37.11%
sci.med:  3.60%
sci.space:  59.09%
