## 1. Load required libraries

In [1]:
import os
import time
import random
import joblib
import umap
import hdbscan
import pandas as pd
import google.generativeai as genai

from dotenv import load_dotenv
from google.generativeai.types import HarmCategory, HarmBlockThreshold


# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# The Gemini API key comes from the environment so it is never hard-coded here.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Shared Gemini client used by get_cluster_descriptions. Every harm category
# listed below has its block threshold set to BLOCK_NONE.
# Alternative model name: gemini-1.0-pro
model = genai.GenerativeModel(
    "gemini-1.0-pro-001",
    safety_settings={
        harm_category: HarmBlockThreshold.BLOCK_NONE
        for harm_category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    },
)


## 2. Define training functions

In [2]:
def get_file_list(directory: str) -> list[str]:
    """Return the paths of the regular files directly inside ``directory``.

    Sub-directories are skipped and the listing is not recursive. Entries are
    returned sorted by file name: ``os.listdir`` yields them in arbitrary
    order, which would otherwise make the order of the loaded training data
    depend on the file system.

    Args:
        directory: Directory to list.

    Returns:
        ``directory`` joined with each file name, sorted by name. Empty when
        the directory holds no regular files.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory does not
            exist).
    """
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    return [path for path in candidates if os.path.isfile(path)]


def get_training_data(data_dir: str = "../data") -> pd.DataFrame:
    """Load every JSON file in ``data_dir`` into a single DataFrame.

    Each file is parsed with ``pd.read_json(..., orient="records")`` and the
    resulting frames are concatenated once, rather than growing a frame inside
    the loop (which re-copies all earlier rows on every iteration).

    The row index of each file is kept as-is, so index labels may repeat
    across files. Other functions in this notebook read the ``embedding`` and
    ``title`` columns of the result.

    Args:
        data_dir: Directory holding the JSON files. Defaults to ``"../data"``.

    Returns:
        The concatenated rows of all files, or an empty DataFrame when the
        directory contains no files.
    """
    frames = [pd.read_json(path, orient="records") for path in get_file_list(data_dir)]
    if not frames:
        # pd.concat raises on an empty list; an empty frame is what the
        # loop-based version produced for an empty directory.
        return pd.DataFrame()
    return pd.concat(frames)


def random_select_or_all(lst, num_elements=1000):
    """Cap ``lst`` at ``num_elements`` items by random sampling.

    Args:
        lst: Sequence to draw from.
        num_elements: Maximum number of items to return.

    Returns:
        ``lst`` itself (the same object, not a copy) when it holds at most
        ``num_elements`` items; otherwise a new list of ``num_elements`` items
        drawn without replacement by ``random.sample``.
    """
    needs_sampling = len(lst) > num_elements
    return random.sample(lst, num_elements) if needs_sampling else lst


def umap_reducer(n_neighbors=30, min_dist=0.0, n_components=20, metric='cosine',
                 random_state=None) -> umap.UMAP:
    """Build an unfitted UMAP reducer; by default it projects to 20 components.

    In this notebook the reducer's output is what ``train_models`` passes to
    HDBSCAN for clustering.

    Args:
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of output dimensions.
        metric: Distance metric applied to the input vectors.
        random_state: Optional seed forwarded to ``umap.UMAP`` so a fit can be
            reproduced. ``None`` (the default) leaves UMAP unseeded, which is
            the behaviour this function had before the parameter existed.

    Returns:
        A new, unfitted ``umap.UMAP`` instance.
    """
    return umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=random_state,
    )


def umap_reducer_2d(n_neighbors=15, min_dist=0.1, n_components=2, metric='cosine',
                    random_state=None) -> umap.UMAP:
    """Build an unfitted UMAP reducer; by default it projects to 2 components.

    Presumably intended for plotting the embeddings in 2-D — confirm against
    the code that consumes the saved model.

    Args:
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of output dimensions.
        metric: Distance metric applied to the input vectors.
        random_state: Optional seed forwarded to ``umap.UMAP`` so a fit can be
            reproduced. ``None`` (the default) leaves UMAP unseeded, which is
            the behaviour this function had before the parameter existed.

    Returns:
        A new, unfitted ``umap.UMAP`` instance.
    """
    return umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=random_state,
    )


def hdbscan_clusterer(min_cluster_size=250, min_samples=5, cluster_selection_epsilon=0.0,
                      metric="euclidean", cluster_selection_method="eom", prediction_data=True) -> hdbscan.HDBSCAN:
    """Build an unfitted HDBSCAN clusterer from the given settings.

    Every argument is forwarded unchanged to ``hdbscan.HDBSCAN`` under the
    same keyword name.

    Returns:
        A new, unfitted ``hdbscan.HDBSCAN`` instance.
    """
    # NOTE(review): prediction_data=True is presumably needed so the saved
    # model can label unseen points later — confirm against the inference code.
    settings = dict(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        prediction_data=prediction_data,
    )
    return hdbscan.HDBSCAN(**settings)


def get_cluster_descriptions(df: pd.DataFrame) -> dict[int, str]:
    """Ask the Gemini model for a short description of every cluster.

    For each distinct value in ``df["labels"]`` (ascending, including ``-1``
    when present) up to 1000 titles of that cluster are sampled, appended to a
    fixed instruction prompt and sent to the module-level ``model``. Newlines
    in the reply are replaced by ``", "``.

    Args:
        df: Frame with a ``labels`` column (cluster id per row) and a ``title``
            column.

    Returns:
        Mapping of cluster label to the model's reply text.
    """
    instructions = """Abstracts from Arxiv were clustered. All of the following articles are from one cluster.
Provide three single words that describe the cluster. Do not use phrases. Do not use hyphenated words.
Do not use compound words."""

    cluster_descriptions = {}
    for label in sorted(df["labels"].unique().tolist()):
        titles = random_select_or_all(df.loc[df["labels"] == label, "title"].to_list())
        # Join once instead of re-formatting the whole prompt for every title.
        article_lines = [f"\n Article {i}: {title}" for i, title in enumerate(titles)]
        prompt = instructions + "".join(article_lines)

        # Log the numeric count rather than the repr of the response object.
        token_count = model.count_tokens(prompt).total_tokens
        print(f"Label: {label} - Num of titles: {len(titles)} - Token count: {token_count}")

        response = model.generate_content(prompt)
        cluster_descriptions[label] = response.text.replace("\n", ", ")
        # Pause between requests — presumably to stay under the API rate
        # limit; confirm the quota before changing the delay.
        time.sleep(3)
    return cluster_descriptions


def train_models():
    """Fit the UMAP and HDBSCAN models on the training data and save them.

    Steps:
      1. Load the training frame and take its ``embedding`` column.
      2. Fit the clustering UMAP reducer and cluster its output with HDBSCAN.
      3. Fit the 2-D UMAP reducer on the original embeddings.
      4. Dump the three fitted models under ``../models`` with joblib.
      5. Ask the LLM for a description of each cluster and print the result.

    Returns:
        None. Results are reported via ``print`` and the saved model files.
    """
    df = get_training_data()
    embeddings = df["embedding"].to_list()

    umap_model = umap_reducer()
    umap_2d_model = umap_reducer_2d()
    hdbscan_model = hdbscan_clusterer()

    reduced_embeddings = umap_model.fit_transform(embeddings)
    labels = hdbscan_model.fit_predict(reduced_embeddings)
    df["labels"] = labels
    print("Number of observations:", len(labels))
    print("Number of noise entries:", sum(1 for label in labels if label == -1))
    # Count distinct labels (noise label -1 included when present).
    # max(labels) + 2 overcounts by one when no point is classified as noise.
    print("Number of labels:", len(set(labels)))

    umap_2d_model.fit(embeddings)

    # Persist the models before the LLM calls: those are slow and can fail,
    # and a failure there should not throw away the fitted models.
    os.makedirs("../models", exist_ok=True)
    joblib.dump(umap_model, "../models/umap_model.pkl")
    joblib.dump(umap_2d_model, "../models/umap_2d_model.pkl")
    joblib.dump(hdbscan_model, "../models/hdbscan_model.pkl")

    cluster_descriptions = get_cluster_descriptions(df)
    print(cluster_descriptions)

## 3. Train models

In [3]:
# Run the full training pipeline only when this code is executed as the main
# module (not when it is imported).
if __name__ == "__main__":
    train_models()

Number of observations: 10896
Number of noise entries: 3181
Number of labels: 11
Label: -1 - Num of titles: 1000 - Token count: total_tokens: 20776

Label: 0 - Num of titles: 324 - Token count: total_tokens: 7230

Label: 1 - Num of titles: 1000 - Token count: total_tokens: 21327

Label: 2 - Num of titles: 332 - Token count: total_tokens: 6708

Label: 3 - Num of titles: 830 - Token count: total_tokens: 18521

Label: 4 - Num of titles: 443 - Token count: total_tokens: 10087

Label: 5 - Num of titles: 555 - Token count: total_tokens: 11738

Label: 6 - Num of titles: 578 - Token count: total_tokens: 12429

Label: 7 - Num of titles: 1000 - Token count: total_tokens: 19576

Label: 8 - Num of titles: 355 - Token count: total_tokens: 6779

Label: 9 - Num of titles: 669 - Token count: total_tokens: 14767

{-1: '- Diffusion, - Generative models, - Time series', 0: 'Wireless, Artificial Intelligence, Computing', 1: 'Language, Model, Logic', 2: 'Federated learning, Privacy, Security', 3: '- Neural