In [1]:
from pymongo import MongoClient

# MongoClient is constructed with no arguments, so the driver's defaults apply.
client = MongoClient()
# Testing DB
db = client["regulationsgov_test"]
# Collections later handed to the comment scraper and the detail scraper.
comments_collection = db["comments"]
details_collection = db["details"]

In [3]:
from copy import deepcopy
import time
from data_collection.CommentScraper import exists, insert

## Custom getAllComments breaks after 2 pages
def getAllComments(apibasereq, collection, maxPages=2):
    """Page through an API request and store every document not yet in ``collection``.

    Results are requested sorted by ``lastModifiedDate`` one page at a time.
    When the API reports that the current window has no next page, paging
    restarts at page 1 with a ``lastmodified`` filter set to the
    ``lastModifiedDate`` of the last document received. The loop ends when a
    page comes back empty or the page cap is reached.

    Args:
        apibasereq: Request builder exposing ``sort``, ``page``, ``get``,
            ``lastmodified`` and ``ratelimit``. Every fetch works on a deep
            copy, so the object passed in is not modified by this function.
        collection: Destination handed to ``exists`` / ``insert``.
        maxPages: Stop once this page number has been processed within the
            current window. Defaults to 2 (the testing cap); pass ``None`` to
            remove the cap.

    Returns:
        None. Documents are written through ``insert``.
    """
    # Seconds to wait before retrying after a connection failure, so a flaky
    # network does not turn into a tight retry loop.
    connectionRetryDelay = 5

    # Current base request. It is replaced (never mutated) on each window
    # rollover so the lastmodified filter survives the per-iteration deepcopy.
    basereq = apibasereq
    pageNum = 1
    metaPageNum = 1
    while True:
        apireq = deepcopy(basereq)
        try:
            documents = apireq.sort("lastModifiedDate").page(pageNum).get()
            print(f"[{metaPageNum}](pg {pageNum}/20) ratelimit={apireq.ratelimit}", end="")
            print(" "*100, end="\r")
        except RuntimeError:
            print("Rate Limit exceeded, retrying in 1 minute")
            time.sleep(60)
            continue
        except ConnectionError as e:
            print(e)
            time.sleep(connectionRetryDelay)
            continue

        data = documents["data"]
        if len(data) == 0:
            break

        for doc in data:
            if not exists(doc, collection):
                insert(doc, collection)

        if maxPages is not None and pageNum >= maxPages:  ## TESTING ONLY by default
            break

        if documents["meta"]["hasNextPage"]:
            pageNum += 1
        else:
            # Window exhausted: continue from the last document's timestamp.
            # The filter must be stored on the base request; applying it to
            # `apireq` alone would be lost when the next iteration re-copies.
            # NOTE(review): if lastmodified() is inclusive, the final window
            # may never come back empty -- confirm how the real API terminates.
            date = data[-1]["attributes"]["lastModifiedDate"]
            basereq = deepcopy(basereq).lastmodified(date)
            pageNum = 1
            metaPageNum += 1

In [None]:
from data_collection.RegAPI import RegAPI
from data_collection.CommentScraper import getCommentDetails, APICommentDetailScraper

api = RegAPI(250)

# Step 1: scrape the /comments endpoint into the comments collection.
comments_request = api.endpoint("/comments")
getAllComments(comments_request, comments_collection)

# Step 2: read back everything stored and fetch the details for each comment.
comments = list(comments_collection.find())
detail_scraper = APICommentDetailScraper(api)
getCommentDetails(detail_scraper, comments, details_collection)

Failed to parse filetype 'doc' for attachment https://downloads.regulations.gov/NARA-05-0005-0002/attachment_2.doc            
The command `antiword /tmp/tmpvvq0nnq8` failed with exit code 1
------------- stdout -------------
b''------------- stderr -------------
b'/tmp/tmpvvq0nnq8 is not a Word Document. It is probably a Word Perfect file\n'
Failed to parse filetype 'doc' for attachment https://downloads.regulations.gov/EPA-HQ-OA-2003-0006-0003/attachment_1.doc
The command `antiword /tmp/tmpmuot01u_` failed with exit code 1
------------- stdout -------------
b''------------- stderr -------------
b'/tmp/tmpmuot01u_ is not a Word Document. It is probably a Rich Text Format file\n/tmp/tmpmuot01u_ is not a Word Document.\n'


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Model identifiers are named up front so each encoder's source is easy to spot.
TOPIC_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
PERSPECTIVE_MODEL_NAME = "nli-roberta-base-v2"

# Two separate sentence encoders: one feeds the topic embeddings, the other
# the "perspective" embeddings computed further down.
topic_embedder = SentenceTransformer(TOPIC_MODEL_NAME)
perspective_model = SentenceTransformer(PERSPECTIVE_MODEL_NAME)
# comment_perspectives = pipeline(task="text2text-generation", model="google/flan-t5-small")

In [None]:
# Gather the text stored under comment -> plaintext of every details document.
comment_data = []
for detail_doc in details_collection.find():
    comment_data.append(detail_doc["comment"]["plaintext"])

In [None]:
from pathlib import Path

import numpy as np


def _load_or_encode(model, texts, cache_path):
    """Return embeddings for ``texts``, reusing the ``.npy`` cache when it still fits.

    Args:
        model: Object exposing ``encode(texts, show_progress_bar=...)``.
        texts: Sequence of strings to embed.
        cache_path: Location of the ``.npy`` cache file.

    Returns:
        The cached array when the file exists and has one row per text;
        otherwise freshly encoded embeddings, which are also written to
        ``cache_path``.
    """
    cache_path = Path(cache_path)
    if cache_path.exists():
        cached = np.load(cache_path)
        if len(cached) == len(texts):
            return cached
        # A cache built from a different set of comments cannot be combined
        # with the current data, so fall through and rebuild it.
        print(f"{cache_path} has {len(cached)} rows but {len(texts)} comments are loaded; re-encoding")

    embeddings = model.encode(texts, show_progress_bar=True)
    np.save(cache_path, embeddings)
    return embeddings


# Encoding is expensive, so each embedding matrix is cached on disk.
topic_embeddings = _load_or_encode(topic_embedder, comment_data, "topic_embeddings.npy")
perspective_embeddings = _load_or_encode(perspective_model, comment_data, "perspective_embeddings.npy")

# Concatenate topic + perspective embeddings
final_embeddings = np.hstack((topic_embeddings, perspective_embeddings))

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): `comments` is built as a plain list in the scraping cell, but
# this lookup needs a DataFrame-like object with a "Posted Date" column --
# confirm where that frame is supposed to be loaded.
# Example timestamps (YYYY-MM-DD format)
posted_dates = pd.to_datetime(comments["Posted Date"])
# Explicit int64 keeps the conversion independent of the platform's default
# integer width. The division assumes nanosecond resolution.
timestamps = posted_dates.astype("int64") / 10**9  # Convert to Unix timestamps
timestamps = timestamps.to_numpy()

# Normalize timestamps to [0,1] range
scaler = MinMaxScaler()
normalized_timestamps = scaler.fit_transform(timestamps.reshape(-1, 1))

# Append normalized time to embeddings.
# The matrix is rebuilt from its parts rather than extending final_embeddings
# in place, so re-running this cell does not add a second time column.
final_embeddings = np.hstack((topic_embeddings, perspective_embeddings, normalized_timestamps))


In [None]:
import hdbscan
from sklearn.preprocessing import StandardScaler

# Standardise the topic embeddings before clustering.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(topic_embeddings)

# Apply HDBSCAN to cluster comments into topics
hdbscan_options = {
    "min_cluster_size": 15,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
}
clusterer = hdbscan.HDBSCAN(**hdbscan_options)
comments["topic_cluster"] = clusterer.fit_predict(scaled_embeddings)

# Count the number of topics; the label -1 is not counted as a topic.
found_labels = set(comments["topic_cluster"])
if -1 in comments["topic_cluster"].values:
    num_topics = len(found_labels) - 1
else:
    num_topics = len(found_labels)
print(f"Number of Topics Identified: {num_topics}")


In [None]:
import phate
import matplotlib.pyplot as plt

# Fixed seed so the embedding (and therefore the figure) is reproducible
# after a kernel restart.
PHATE_SEED = 42

# Run PHATE
phate_operator = phate.PHATE(random_state=PHATE_SEED)
phate_embedding = phate_operator.fit_transform(final_embeddings)

# Visualize
fig, ax = plt.subplots()
# normalized_timestamps is an (n, 1) column; scatter wants one flat value per point.
points = ax.scatter(
    phate_embedding[:, 0],
    phate_embedding[:, 1],
    c=normalized_timestamps.ravel(),
    cmap="viridis",
)
fig.colorbar(points, ax=ax, label="Time Progression")
ax.set_title("Comment Evolution Over Time")
ax.set_xlabel("PHATE 1")
ax.set_ylabel("PHATE 2")
plt.show()