In [None]:
import numpy as np
from typing import Callable,DefaultDict
from gensim.models.ldamodel import LdaModel
from operator import itemgetter
from sklearn.cluster import KMeans

# A cluster is a plain Python list of points, each point being a numpy vector.
Cluster = list[np.ndarray]
# A cluster metric receives the full list of clusters and returns either a single
# score (float) or a list of scores (e.g. one value per cluster).
ClusterMetric = Callable[[list[Cluster]],float|list[float]] #inputs list of clusters, outputs value per cluster

def CH(clusters:list[Cluster])->float:
  """Calinski-Harabasz index of a clustering.

  Computes the between-cluster dispersion (squared distance of every cluster
  centroid to the overall centroid, weighted by cluster size) divided by the
  within-cluster dispersion (squared distance of every point to its own
  centroid), scaled by (n_points - n_clusters) / (n_clusters - 1).

  Args:
    clusters: list of clusters, each a list of numpy vectors.

  Returns:
    The index value; larger means denser, better separated clusters.

  Raises:
    AssertionError: if the input is not a list of lists of numpy arrays
      (only the first cluster and its first point are inspected).
    ZeroDivisionError: if only one cluster is given.
  """
  assert isinstance(clusters,list) and isinstance(clusters[0],list) and isinstance(clusters[0][0],np.ndarray)
  k = len(clusters)
  centroids = [np.average(members,axis=0) for members in clusters]
  grand_centroid = np.average([point for members in clusters for point in members],axis=0)

  within:float = 0   # sum of squared distances point -> own centroid
  between:float = 0  # size-weighted squared distances centroid -> overall centroid
  for members,centroid in zip(clusters,centroids):
    for point in members:
      within += np.sum((centroid - point)**2)
    between += len(members)*(np.sum((centroid-grand_centroid)**2))

  n_points = sum(map(len,clusters))
  return between/within * (n_points-k)/(k - 1)

def SC(clusters:list[Cluster])->list[float]:
  """Mean silhouette coefficient of every cluster.

  For each point v the silhouette value is (b - a) / max(a, b), where
    a = mean distance from v to the *other* points of its own cluster, and
    b = mean distance from v to the points of the neighbouring cluster.
  The neighbouring cluster is the one whose centroid is closest to the
  centroid of v's cluster (a centroid-based shortcut; the cluster itself is
  excluded from the search). The per-point values are averaged per cluster.

  Conventions for degenerate cases:
    * a cluster with a single point scores 0.0 (a is undefined),
    * a point with a == b == 0 contributes 0.0,
    * if only one cluster is given, every score is 0.0 (no neighbour exists).

  Args:
    clusters: list of clusters, each a non-empty list of numpy vectors of
      equal length.

  Returns:
    One silhouette score per cluster, in the order of `clusters`.

  Raises:
    TypeError: if `clusters` is not a list of lists of numpy arrays (as in
      the original check, only the first cluster/point is inspected).
    ValueError: if `clusters` is empty or contains an empty cluster.
  """
  if not isinstance(clusters,list):
    raise TypeError("clusters must be a list of lists of numpy arrays")
  if not clusters or any(len(c) == 0 for c in clusters):
    raise ValueError("clusters must be non-empty and must not contain empty clusters")
  if not (isinstance(clusters[0],list) and isinstance(clusters[0][0],np.ndarray)):
    raise TypeError("clusters must be a list of lists of numpy arrays")

  # One (n_points, dim) matrix per cluster, plus a (n_clusters, dim) centroid matrix.
  points = [np.asarray(c,dtype=float) for c in clusters]
  centers = np.array([p.mean(axis=0) for p in points])

  res:list[float] = []
  for k,(members,center) in enumerate(zip(points,centers)):
    if len(points) < 2 or len(members) < 2:
      res.append(0.0)
      continue

    # Exclude the cluster itself, otherwise its own centroid (distance 0) always wins.
    center_dists = np.linalg.norm(centers-center,axis=1)
    center_dists[k] = np.inf
    nearest = points[int(np.argmin(center_dists))]

    total = 0.0
    for v in members:
      # v's distance to itself is 0, hence the division by len - 1.
      a = np.linalg.norm(members-v,axis=1).sum()/(len(members)-1)
      b = np.linalg.norm(nearest-v,axis=1).mean()
      denom = max(a,b)
      if denom > 0:
        total += (b-a)/denom
    res.append(float(total/len(members)))
  return res

# One document in sparse form: (word id, weight) pairs.
BagOfWords = list[tuple[int,float]]
# A corpus: one bag of words per document.
BagsOfWords = list[BagOfWords]
def vecsToBows(vecs:list[np.ndarray])->BagsOfWords:
  """Convert dense vectors into sparse bags of words.

  Every vector becomes a list of (index, value) tuples covering only its
  non-zero entries, in increasing index order. Indices and values are the
  numpy scalars taken from the vector.
  """
  return [
    [(idx,vec[idx]) for idx in np.nonzero(vec)[0]]
    for vec in vecs
  ]

def bowsToVecs(bow:BagsOfWords,n_words:int)->list[np.ndarray]:
  """Convert sparse bags of words into dense vectors of length `n_words`.

  Each document yields a zero-initialised vector in which position `word`
  holds the associated value; if a word id occurs more than once in a
  document, the last occurrence wins.
  """
  def densify(doc):
    dense = np.zeros((n_words))
    for idx,value in doc:
      dense[idx] = value
    return dense

  return [densify(doc) for doc in bow]

def do_LDA(bag_of_words:BagsOfWords,
           num_topics:int|None=None,
           cluster_metrics:dict[str,ClusterMetric]={"CH":CH,"SC":SC}
           )->tuple[list[Cluster],dict[str,float|list[float]]]:
  """Cluster documents by their dominant LDA topic and score the clustering.

  An LdaModel is trained on the corpus (4 passes, 200 iterations). Every
  document is assigned to the topic with the highest probability; all
  documents sharing a topic form one cluster. Documents are converted to
  dense vectors of length (largest word id + 1) before the metrics are run.

  Args:
    bag_of_words: corpus, one list of (word_id, weight) tuples per document.
    num_topics: number of LDA topics. None (or 0) keeps gensim's default.
    cluster_metrics: name -> metric function applied to the resulting
      clusters. The default dict is shared between calls and is only read.

  Returns:
    (clusters, metrics): the clusters in order of first appearance of their
    topic (topics that win no document produce no cluster), and a dict
    mapping each metric name to the value it returned.

  Raises:
    TypeError: if `bag_of_words` is not a list of lists of tuples.
    ValueError: if the corpus contains no words at all.
  """
  if not isinstance(bag_of_words,list) or not all(isinstance(doc,list) for doc in bag_of_words):
    raise TypeError("bag_of_words must be a list of documents (lists of (word_id, weight) tuples)")
  if not all(isinstance(entry,tuple) for doc in bag_of_words for entry in doc):
    raise TypeError("every document entry must be a (word_id, weight) tuple")
  if not any(bag_of_words):
    raise ValueError("bag_of_words contains no words")

  # Only forward num_topics when the caller actually supplied one.
  lda_kwargs = {"passes":4,"iterations":200}
  if num_topics:
    lda_kwargs["num_topics"] = num_topics
  model = LdaModel(bag_of_words,**lda_kwargs)

  topic_dict:dict[int,BagsOfWords] = {}
  for doc in bag_of_words:
    # minimum_probability=0.0: model[doc] drops topics below the model's threshold
    # and may return an empty list, which would make max() fail.
    topics:list[tuple[int,float]] = model.get_document_topics(doc,minimum_probability=0.0)
    topic:int = max(topics,key=itemgetter(1))[0]
    topic_dict.setdefault(topic,[]).append(doc)

  # Dense vectors must be long enough for the largest word id in the corpus.
  veclength:int = int(max(word for doc in bag_of_words for word,_ in doc))+1
  clusters:list[Cluster] = [bowsToVecs(bows,veclength) for bows in topic_dict.values()]

  metrics:dict[str,float|list[float]] = {name:metric(clusters) for name,metric in cluster_metrics.items()}
  return clusters,metrics

def do_kmeans(vectors:list[np.ndarray],
              num_means:int=8,
              cluster_metrics:dict[str,ClusterMetric]={"CH":CH,"SC":SC}
              )->tuple[list[Cluster],dict[str,float|list[float]]]:
  """Cluster vectors with k-means and score the clustering.

  Args:
    vectors: the points to cluster, one numpy vector per point.
    num_means: number of clusters passed to KMeans.
    cluster_metrics: name -> metric function applied to the resulting
      clusters. The default dict is shared between calls and is only read.

  Returns:
    (clusters, metrics): the clusters ordered by ascending k-means label,
    each holding all input vectors that received that label (in input
    order), and a dict mapping each metric name to the value it returned.

  Raises:
    TypeError: if `vectors` is not a list of numpy arrays (only the first
      element is inspected).
    ValueError: if `vectors` is empty.
  """
  if not isinstance(vectors,list):
    raise TypeError("vectors must be a list of numpy arrays")
  if not vectors:
    raise ValueError("vectors must not be empty")
  if not isinstance(vectors[0],np.ndarray):
    raise TypeError("vectors must be a list of numpy arrays")

  model = KMeans(num_means)
  labels = model.fit_predict(vectors)

  # Collect every vector under its label; the previous np.argwhere(...)[0]
  # kept only the first member of each cluster.
  members_by_label:dict[int,Cluster] = {}
  for vector,label in zip(vectors,labels):
    members_by_label.setdefault(int(label),[]).append(vector)
  clusters:list[Cluster] = [members_by_label[label] for label in sorted(members_by_label)]

  metrics:dict[str,float|list[float]] = {name:metric(clusters) for name,metric in cluster_metrics.items()}
  return clusters,metrics