In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter

def compute_coherence_measurements(article_embeddings, cluster_assignments):
    """
    Computes coherence measurements for each cluster based on cosine similarity between article embeddings.

    A cluster's coherence is the mean cosine similarity over all distinct pairs of
    articles assigned to it. Clusters are the integer ids 0 .. max(cluster_assignments);
    negative labels are never visited, so articles carrying them do not
    contribute to any cluster.

    Parameters:
    article_embeddings (numpy.ndarray): Array of article embeddings (each row represents an article).
        Plain lists/tuples of rows are converted to an array.
    cluster_assignments (numpy.ndarray): Array of cluster assignments for each article.
        Any array-like of labels is accepted.

    Returns:
    numpy.ndarray: Array of coherence measurements for each cluster, indexed by cluster id.
        Clusters with fewer than two articles get 0.0. The array is empty when
        there are no assignments or no non-negative labels.
    """
    # A Python list compared with `==` yields a single bool rather than an
    # element-wise mask, which would make every cluster look empty.
    cluster_assignments = np.asarray(cluster_assignments)
    if isinstance(article_embeddings, (list, tuple)):
        article_embeddings = np.asarray(article_embeddings)

    # np.max raises on empty input; there are simply no clusters to score.
    if cluster_assignments.size == 0:
        return np.array([])

    # int() keeps range() working when labels arrive with a float dtype.
    num_clusters = int(cluster_assignments.max()) + 1
    coherence_measurements = []

    for cluster_id in range(num_clusters):
        cluster_indices = np.where(cluster_assignments == cluster_id)[0]

        # Fewer than two articles means there is no pair to compare.
        if len(cluster_indices) < 2:
            coherence_measurements.append(0.0)
            continue

        cluster_embeddings = article_embeddings[cluster_indices]
        similarity_matrix = cosine_similarity(cluster_embeddings)

        # Strict upper triangle: each unordered pair once, no self-similarity.
        pair_indices = np.triu_indices(len(cluster_indices), k=1)
        coherence_measurements.append(np.mean(similarity_matrix[pair_indices]))

    return np.array(coherence_measurements)


def compute_diversity_measurements(cluster_terms):
    """
    Computes diversity measurements for each cluster based on the diversity of terms.

    Diversity is the entropy (natural log) of the term frequency distribution
    within a cluster: 0 when every term is the same, larger as the terms are
    spread more evenly.

    Parameters:
    cluster_terms (list): List of lists, where each inner list contains the terms for a cluster.

    Returns:
    numpy.ndarray: Array of diversity measurements for each cluster.
    """
    entropies = []

    for terms in cluster_terms:
        frequencies = Counter(terms)
        n_terms = sum(frequencies.values())

        # Relative frequency of every distinct term in this cluster
        probabilities = []
        for occurrences in frequencies.values():
            probabilities.append(occurrences / n_terms)

        # Accumulate the p * log(p) contributions, skipping zero-probability terms
        weighted_logs = []
        for prob in probabilities:
            if prob > 0:
                weighted_logs.append(prob * np.log(prob))

        entropies.append(-np.sum(weighted_logs))

    return np.array(entropies)


def custom_loss(coherence_measurements, diversity_measurements, alpha=0.5):
    """
    Combines per-cluster coherence and diversity into a single scalar loss.

    The loss is alpha * (-mean coherence) + (1 - alpha) * (mean diversity), so
    it decreases as mean coherence rises and as mean diversity falls.

    Parameters:
    coherence_measurements (numpy.ndarray): Array of coherence measurements for each cluster.
    diversity_measurements (numpy.ndarray): Array of diversity measurements for each cluster.
    alpha (float): Weight parameter controlling the balance between coherence and diversity.

    Returns:
    float: Combined loss value.
    """
    # Accept any array-like by converting up front
    coherence = np.asarray(coherence_measurements)
    diversity = np.asarray(diversity_measurements)

    # Higher coherence should lower the loss, hence the negation
    coherence_term = -coherence.mean()
    diversity_term = diversity.mean()

    diversity_weight = 1 - alpha
    return alpha * coherence_term + diversity_weight * diversity_term


Davies-Bouldin Index (DBI)
Silhouette Score
Calinski-Harabasz Index
Adjusted Rand Index (ARI)