Please ensure that you are using the CMSE495-TwoSix conda environment (see INSTALL.md) and that a MongoDB server is running locally.

Prior to running this demo code, please collect some data. To collect a small sample, run the code located in `data_collection/API-Scraping.ipynb`.



In [None]:
import numpy as np
import pandas as pd

In [None]:
from pymongo import MongoClient

# Connect with pymongo's default client settings and select the testing database.
client = MongoClient()
db = client["regulationsgov_test"]

# Handles to the two collections this notebook works with.
comments_collection = db["comments"]
details_collection = db["details"]

In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Two separate sentence encoders: one is used below for the "topic" embeddings,
# the other for the "perspective" embeddings.
topic_embedder = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-mpnet-base-v2",
)
perspective_model = SentenceTransformer(
    model_name_or_path="nli-roberta-base-v2",
)

# Alternative that is currently disabled (text2text generation via a pipeline):
# comment_perspectives = pipeline(task="text2text-generation", model="google/flan-t5-small")

In [None]:
# Collect the plaintext body of every document in the details collection,
# in the order the cursor returns them.
comment_data = []
for doc in details_collection.find():
    comment_data.append(doc["comment"]["plaintext"])

In [None]:
import os


def _load_or_encode(model, texts, cache_path):
    """Return embeddings for *texts*, reusing the ``.npy`` cache when it is still valid.

    Args:
        model: Object exposing ``encode(texts, show_progress_bar=...)``.
        texts: List of strings to embed.
        cache_path: Path of the ``.npy`` file used to cache the embeddings,
            resolved relative to the current working directory.

    Returns:
        The embeddings array, either loaded from ``cache_path`` or freshly
        computed (in which case it is also written to ``cache_path``).
    """
    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        # A cache written for a different number of comments no longer lines up
        # row-for-row with the current data, so only reuse it on a row-count match.
        if cached.shape[0] == len(texts):
            return cached
    embeddings = model.encode(texts, show_progress_bar=True)
    np.save(cache_path, embeddings)
    return embeddings


# Encoding is slow, so each set of embeddings is cached in the working directory.
topic_embeddings = _load_or_encode(topic_embedder, comment_data, "topic_embeddings.npy")
perspective_embeddings = _load_or_encode(perspective_model, comment_data, "perspective_embeddings.npy")

# Concatenate topic + perspective embeddings column-wise (one row per comment).
final_embeddings = np.hstack((topic_embeddings, perspective_embeddings))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE: `comments` is not defined in this notebook. It is the comments table produced by
# data_collection/API-Scraping.ipynb (presumably a pandas DataFrame) and must be loaded
# into the session before running this cell.
# NOTE(review): its rows are assumed to be in the same order as `comment_data` - confirm.

# Convert posting dates to numeric epoch values. The cast is spelled "int64" explicitly
# because plain `int` is a 32-bit type on some platforms, which cannot represent
# nanosecond epoch values.
timestamps = pd.to_datetime(comments["Posted Date"]).astype("int64") / 10**9
timestamps = timestamps.to_numpy()

# Normalize timestamps to the [0, 1] range; the scaler expects a 2-D column.
scaler = MinMaxScaler()
normalized_timestamps = scaler.fit_transform(timestamps.reshape(-1, 1))

# Rebuild the feature matrix from its parts instead of appending to `final_embeddings`,
# so that re-running this cell does not stack an extra time column each time.
final_embeddings = np.hstack((topic_embeddings, perspective_embeddings, normalized_timestamps))


In [None]:

from sklearn.preprocessing import StandardScaler
import hdbscan

# Standardize the topic embeddings before clustering.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(topic_embeddings)

# Cluster the comments into topics with HDBSCAN and store each comment's label.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
)
comments["topic_cluster"] = clusterer.fit_predict(scaled_embeddings)

# Count the distinct labels, leaving out -1 so it is not reported as a topic.
num_topics = int((comments["topic_cluster"].unique() != -1).sum())
print(f"Number of Topics Identified: {num_topics}")


In [None]:
import matplotlib.pyplot as plt
import phate

# Reduce the combined feature matrix with PHATE (default settings).
phate_operator = phate.PHATE()
phate_embedding = phate_operator.fit_transform(final_embeddings)

# Scatter the first two PHATE coordinates, colored by normalized posting time.
plt.scatter(
    x=phate_embedding[:, 0],
    y=phate_embedding[:, 1],
    c=normalized_timestamps,
    cmap="viridis",
)
plt.colorbar(label="Time Progression")
plt.title("Comment Evolution Over Time")
plt.show()