In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Make the parent directory the working directory; the relative paths and
# project imports used below presumably resolve from there — TODO confirm.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
%cd '..'

In [None]:
import random

from load.constants import SEED

import numpy as np
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
from tqdm import tqdm

from preprocessing.utils import (
    load_event_comments,
)


In [None]:
import logging
import sys
from pathlib import Path

# Log file, relative to the current working directory.
# NOTE(review): the file name says "knn" although this notebook runs k-means —
# confirm the name is intended.
LOG_PATH = Path("data/logs/embeddings_knn.log")

# logging.FileHandler opens the file immediately and raises FileNotFoundError
# when the parent directory is missing, so create it first (no-op if present).
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

# Send INFO-and-above records both to the log file and to the notebook's stdout.
# basicConfig does nothing when the root logger already has handlers, so
# re-running this cell does not attach duplicate handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_PATH),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
# Key of the event whose comments this notebook analyses.
EVENT_KEY = "us_elections_2016"

# Load the data set named "<EVENT_KEY>_with_embeddings" via the project helper.
# presumably a pandas DataFrame with one row per comment and an "embedding"
# column (later cells use .shape and ["embedding"]) — TODO confirm
event_comments = load_event_comments(f"{EVENT_KEY}_with_embeddings")

In [None]:
# Emit two INFO records: a confirmation that loading finished, followed by the
# shape of the loaded data as a quick sanity check.
for message in ("Loaded event comments", event_comments.shape):
    logging.info(message)

### Clustering

In [None]:
# Candidate numbers of clusters (k) to evaluate, one k-means run per value.
NUM_CLUSTERS_LIST = [3, 5, 7, 10]

# Seeded random generator handed to the clusterer below so that runs are
# repeatable; SEED comes from load.constants.
RNG = random.Random(SEED)

In [None]:
# Re-seed the shared generator so that re-running this cell reproduces the same
# clusterings. Immediately after RNG is created with SEED this changes nothing.
RNG.seed(SEED)

# Materialise the embeddings once, in row order. The original loop referenced
# an undefined `comment_embeddings`; building it here as a plain list also makes
# the lookups below positional, independent of the table's index labels.
# assumes each entry of the "embedding" column is a 1-D numeric vector — TODO confirm
comment_embeddings = [
    np.asarray(vector) for vector in event_comments["embedding"]
]

# For every candidate k: fit k-means with cosine distance, then log the mean
# distance between each comment and the centre of its assigned cluster.
for num_clusters in tqdm(NUM_CLUSTERS_LIST):
    kclusterer = KMeansClusterer(
        num_means=num_clusters,
        distance=cosine_distance,
        rng=RNG,
    )

    # With assign_clusters=True the call returns one cluster index per input
    # vector, in the same order as the input.
    assigned_clusters = kclusterer.cluster(
        comment_embeddings,
        assign_clusters=True,
    )

    means = np.array(kclusterer.means())

    # Cosine distance from every comment to the centre of its own cluster.
    distances_to_center = [
        cosine_distance(means[cluster_index], embedding)
        for cluster_index, embedding in zip(assigned_clusters, comment_embeddings)
    ]

    logging.info(f"Number of clusters: {num_clusters}")
    logging.info(
        f"Average distance to cluster center: {np.mean(distances_to_center)}"
    )  # the smaller, the better
