In [10]:
import glob
import pandas as pd
import numpy as np
import pytz
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from collections import defaultdict
import seaborn as sns
import os
import hdbscan

In [12]:
# LOAD embeddings
X_all=np.load("./data/embeddings_United States_01.npy")

# LOAD ran tSNE result
X_2d_all=np.load("./data/tsne_United States_01.npy")

# LOAD submissions from US in Jan
subset_country=pd.read_csv("./data/USsub_January.csv")

In [13]:
# from sklearn.manifold import TSNE
# # ----------------------------
# # 6. t-SNE reduction
# # ----------------------------
# print("Fitting t-SNE (this may take a while)...")
# tsne = TSNE(n_components=2, random_state=42, metric='cosine', n_jobs=-1, init='pca', verbose=2)
# X_2d_all = tsne.fit_transform(X_all)
# print("t-SNE complete.")


# Save t-SNE results
# tsne_save_file = f"tsne_{target_country}_{target_month}.npy"
# os.makedirs("tsne_results", exist_ok=True)
# np.save(os.path.join("tsne_results", tsne_save_file), X_2d_all)
# print(f"t-SNE coordinates saved to tsne_results/{tsne_save_file}")

In [14]:
subset_country['embedding_2d'] = list(X_2d_all)

# ----------------------------
# 7. HDBSCAN clustering
# ----------------------------
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean')
cluster_labels = clusterer.fit_predict(X_2d_all)
subset_country["cluster"] = cluster_labels
print(f"Number of clusters found: {len(np.unique(cluster_labels))}")

Number of clusters found: 2863


In [15]:
# Assuming subset_country already has columns:
# 'local_hour', 'cluster', 'embedding_2d' (t-SNE 2D)
hours = range(24)
clusters = np.unique(subset_country['cluster'])
clusters = clusters[clusters != -1]  # ignore noise

# Track cumulative size of each cluster
cumulative_sizes = {c: 0 for c in clusters}
# Track hourly additions
hourly_additions = {c: [] for c in clusters}

for hour in hours:
    df_hour = subset_country[subset_country['local_hour'] == hour]
    for c in clusters:
        n_points = np.sum(df_hour['cluster'] == c)
        hourly_additions[c].append(n_points)
        cumulative_sizes[c] += n_points

# Convert to DataFrame for plotting
hourly_df = pd.DataFrame(hourly_additions, index=hours)
hourly_df.index.name = 'hour'

In [None]:
# ============================================================
# Parameters
# ============================================================
TIME_POINTS = np.arange(0, 24, 4)   # 6 time points
ALPHA_CLUSTER = 0.6
ALPHA_LINE = 0.15

# ============================================================
# Load embeddings + clustering
# ============================================================

clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
subset_country["cluster"] = clusterer.fit_predict(X_2d_all)

clusters = np.unique(subset_country["cluster"])
clusters = clusters[clusters != -1]

# Global axis limits for t-SNE
x_min, x_max = X_2d_all[:, 0].min(), X_2d_all[:, 0].max()
y_min, y_max = X_2d_all[:, 1].min(), X_2d_all[:, 1].max()

# ============================================================
# Hourly cluster counts
# ============================================================
hourly_additions = {c: [] for c in clusters}

for h in range(24):
    df_h = subset_country[subset_country["local_hour"] == h]
    for c in clusters:
        hourly_additions[c].append(np.sum(df_h["cluster"] == c))

hourly_df = pd.DataFrame(hourly_additions, index=range(24))
hourly_df_6 = hourly_df.loc[TIME_POINTS]

total_posts_per_cluster = hourly_df.sum(axis=0)
ranked_clusters = total_posts_per_cluster.sort_values(ascending=False).index

# ============================================================
# Figure layout
# ============================================================
plt.rcParams.update({
    "font.size": 11,
    "axes.spines.top": False,
    "axes.spines.right": False
})

fig = plt.figure(figsize=(20, 5))
gs = GridSpec(1, 3, figure=fig, width_ratios=[1.6, 1, 1], wspace=0.2)

# ============================================================
# (A) t-SNE cluster snapshots (6 time points)
# ============================================================
gsA = gs[0, 0].subgridspec(2, 3, wspace=0.05, hspace=0.15)

for i, hour in enumerate(TIME_POINTS):
    ax = fig.add_subplot(gsA[i // 3, i % 3])
    df_h = subset_country[subset_country["local_hour"] == hour]

    if len(df_h) > 0:
        X_h = np.vstack(df_h["embedding_2d"].values)
        labels = df_h["cluster"].values

        mask = labels != -1
        ax.scatter(
            X_h[mask, 0], X_h[mask, 1],
            c=labels[mask] % 20,
            cmap="tab20",
            s=12,
            alpha=ALPHA_CLUSTER
        )

    ax.set_title(f"h = {hour}", fontsize=10)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks([])
    ax.set_yticks([])



# ============================================================
# (B) Power-law scaling of cluster sizes
# ============================================================
axB = fig.add_subplot(gs[0, 1])

bins = np.logspace(
    np.log10(total_posts_per_cluster.min()),
    np.log10(total_posts_per_cluster.max()),
    40
)
hist, edges = np.histogram(total_posts_per_cluster, bins=bins)
centers = (edges[:-1] + edges[1:]) / 2

axB.loglog(centers, hist, "o")
axB.set_xlabel("Cluster size (total posts)")
axB.set_ylabel("Number of clusters")
# axB.set_title("Cluster size distribution")

mask = hist > 0
slope, _ = np.polyfit(
    np.log10(centers[mask]),
    np.log10(hist[mask]),
    1
)
axB.text(
    0.05, 0.05,
    rf"$P(s)\sim s^{{-{abs(slope):.2f}}}$",
    transform=axB.transAxes
)

# ============================================================
# (C) Cumulative cluster growth (6 time points)
# ============================================================
axC = fig.add_subplot(gs[0, 2])

for c in ranked_clusters:
    axC.plot(
        np.arange(24),
        np.cumsum(hourly_df[c]),
        color="black",
        alpha=ALPHA_LINE
    )

axC.set_xlabel("Local hour")
axC.set_ylabel("Cumulative submissions")
# axC.set_title("Cumulative growth")

# ============================================================
# Final layout
# ============================================================
# fig.suptitle(
#     f"Temporal organization and scaling of semantic clusters\n"
#     f"{target_country}, {target_month}",
#     y=1.05,
#     fontsize=14
# )

plt.tight_layout()
fig.text(0.12, 0.95, "(A)", fontsize=20, weight="bold")
fig.text(0.44, 0.95, "(B)", fontsize=20, weight="bold")
fig.text(0.69, 0.95, "(C)", fontsize=20, weight="bold")
plt.savefig("Figure6.png", dpi=300)