How would I subjectively say one set of clusters are better than another?
* the cluster label/phrase accurately describes the cluster
* the cluster labels/phrases are distinct
* the noise is minimized
* 

In [53]:
import umap
import hdbscan
import optuna
import random
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA



def pca_preprocessor(data):
    n_components_max = len(data)
    pca = PCA(n_components=n_components_max)
    pca.fit(data)
   
    explained_variance_ratio_ = pca.explained_variance_ratio_
    cumulative_explained_variance_ratio_ = explained_variance_ratio_.cumsum()
   
    # Calculate the second derivative
    second_derivative = np.gradient(np.gradient(cumulative_explained_variance_ratio_))
    # Find the index of the maximum of the second derivative
    recommended_components_elbow = np.argmax(second_derivative)
   
    # Alternative recommendation based on 90% explained variance
    threshold = 0.9
    recommended_components_90_pct = None  # Initialize outside the loop
    for i, ratio in enumerate(cumulative_explained_variance_ratio_):
        if ratio >= threshold:
            recommended_components_90_pct = i + 1
            break
      
    selected_components = min(recommended_components_elbow, recommended_components_90_pct)
    pca = PCA(n_components=selected_components)

    return pca.fit_transform(data)


def umap_reduce(data, n_neighbors=30, min_dist=0.0, n_components=10, metric='cosine', preprocess_pca=True):
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
        )
    
    if preprocess_pca:
       data = pca_preprocessor(data)
    return reducer.fit_transform(data)


def get_abstracts(cluster_id, probability, abstract, use_top_three=True):
    # Convert lists to numpy arrays
    cluster_id = np.array(cluster_id)
    probability = np.array(probability)
    abstract = np.array(abstract)
    
    # Get unique cluster ids
    unique_clusters = np.unique(cluster_id)
    
    # Create a dictionary to hold the selected abstracts
    selected_abstracts = {}
    
    for cluster in unique_clusters:
        # Get indices of cluster entries
        cluster_indices = np.where(cluster_id == cluster)[0]
        
        # Sort probabilities for this cluster
        sorted_indices = np.argsort(probability[cluster_indices])[::-1]
        
        if use_top_three:
            selected_indices = sorted_indices[:3]
        else:
            remaining_indices = sorted_indices[3:]
            selected_indices = random.sample(remaining_indices.tolist(), min(3, len(remaining_indices)))
        
        selected_abstracts[cluster] = abstract[cluster_indices[selected_indices]].tolist()
    
    return selected_abstracts


def format_initial_prompt(selected_abstracts):
    return f"Use the following data which is in the format <cluster_id>: [articles] to generate \
a single word and single sentence summary of the type of information the cluster represents. \
The single world and single sentence should uniquely identify a given cluster. \
--- Abstracts --- \
 {selected_abstracts} \
 -------- \
The returned values should only be in the format <cluster_id>: (word, sentence). "


def format_evaluation_prompt(selected_abstracts, generated_summaries):
    return f"You are provided with grouped article abstracts in the format <cluster_id>: [list of abstracts]. \
You are also provided with a single word and single sentence proposed summary of the clusters in the format \
<cluster_id>: (single word, single sentence). On a scale of 1 to 100 with 1 being worst and 100 being best \
evaluate the semantic accuracy of the summary and uniqueness of the summary per cluster. \
--- Abstracts --- \
{selected_abstracts} \
--- Summaries --- \
{generated_summaries} \
------ \
Your answer should only be in the form: <cluster_id>: rating. "


def retrieve_llm_score(cluster_ids, probabilities, abstracts):
    # Retreive 3 articles from each cluster closest to the center
    articles = get_abstracts(cluster_ids, probabilities, abstracts, use_top_three=True)
    # Stuff articles in prompt
    initial_prompt = format_initial_prompt(articles)
    # Retreive summaries from the LLM
    example_generated_summaries = "-1: ('Quantum', 'This cluster focuses on the generation of realistic-looking quantum circuits to enhance benchmarking and advance the development of quantum compilers and hardware.')\
0: ('Reasoning', 'This cluster revolves around evaluating large language models' abilities in critique and rectify reasoning across various domains using the CriticBench benchmark.')\
1: ('Graph', 'The topics in this cluster include the expansion of polytopes, finding Hamiltonian cycles in graphs, and embedding Poincar\'e halfspace into discrete metric spaces.')\
2: ('Vision', 'This cluster discusses novel approaches and applications in computer vision, including oriented object detection with Vision Transformers and controllable video editing for seamless object insertion in videos.')"
    # Retreive 3 other articles randomly from cluster
    articles = get_abstracts(cluster_ids, probabilities, abstracts, use_top_three=False)
    # Stuff the articles in a prompt
    evaluation_prompt = format_evaluation_prompt(articles, example_generated_summaries)
    # Retrive a score from the LLM
    example_llm_output = {-1: 90,
                          0: 85,
                          1: 80,
                          2: 85}
    return np.mean(list(example_llm_output.values()))


def objective(trial):
    
    n_neighbors = trial.suggest_int("n_neighbors", 20, 40)
    min_dist = trial.suggest_float("min_dist", 0.0, 1.0)
    n_components = trial.suggest_int("n_components", 10, 20)
    preprocess_pca = trial.suggest_categorical("preprocess_pca", ["True", "False"])

    min_cluster_size = trial.suggest_int("min_cluster_size", 10, 30)
    min_samples = trial.suggest_int("min_samples", 5, 30)
    cluster_selection_epsilon = trial.suggest_float("cluster_selection_epsilon", 0.0, 1.0)
    cluster_selection_method = trial.suggest_categorical("cluster_selection_method", ["eom", "leaf"])

    data = df["embeddings"].to_list()
    data = umap_reduce(data, 
                    n_neighbors=n_neighbors, 
                    min_dist=min_dist, 
                    n_components=n_components, 
                    preprocess_pca=preprocess_pca)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                            min_samples=min_samples,
                            cluster_selection_epsilon=cluster_selection_epsilon,
                            metric="euclidean",
                            cluster_selection_method=cluster_selection_method,
                            prediction_data=True)
    
    clusterer.fit(data)

    cluster_ids = clusterer.labels_
    print(cluster_ids.max()+2)
    probabilities = clusterer.probabilities_
    abstracts = df["Abstract"].to_list()

    llm_score = retrieve_llm_score(cluster_ids, probabilities,abstracts)
    print(llm_score)

    noise_count = sum(clusterer.labels_ == -1)

    threshold = 0.05
    low_prob_count = probabilities[probabilities <= threshold].sum()

    value = noise_count + low_prob_count
    return value


df = pd.read_pickle("./archive/arxiv_step2.pkl")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

study.best_params

[I 2024-03-03 14:25:17,913] A new study created in memory with name: no-name-0a519b4f-5fe5-4f55-b317-af47b53f4049
[I 2024-03-03 14:25:21,624] Trial 0 finished with value: 413.0 and parameters: {'n_neighbors': 30, 'min_dist': 0.9673649701713906, 'n_components': 11, 'preprocess_pca': 'False', 'min_cluster_size': 15, 'min_samples': 17, 'cluster_selection_epsilon': 0.38672734734204406, 'cluster_selection_method': 'eom'}. Best is trial 0 with value: 413.0.


5
85.0


{'n_neighbors': 30,
 'min_dist': 0.9673649701713906,
 'n_components': 11,
 'preprocess_pca': 'False',
 'min_cluster_size': 15,
 'min_samples': 17,
 'cluster_selection_epsilon': 0.38672734734204406,
 'cluster_selection_method': 'eom'}