Exploratory data analysis (EDA) notebook for the current dataset (`../data/m72-ea4p-pnqp.csv`)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the raw export; the first line of the CSV is used as the column header.
csv_path = "../data/m72-ea4p-pnqp.csv"
data = pd.read_csv(csv_path, header=0)

In [None]:
# Preview the first rows of the freshly loaded table.
data.head()

In [None]:
# List every column name present in the raw file.
data.columns

In [None]:
# Summary statistics for the columns pandas can describe.
data.describe()

In [None]:
# Discard every column in which all values are missing.
data = data.dropna(axis="columns", how="all")

In [None]:
# Column dtypes and non-null counts after dropping the all-empty columns.
data.info()

In [None]:
# Column names that remain after dropping the all-empty columns.
data.columns

In [None]:
# Preview the first rows again, now without the all-empty columns.
data.head()

In [None]:
# Display the DataFrame as the cell output.
data

In [None]:
# Count how many rows do / do not carry a "Tracking Number".
has_tracking_number = data["Tracking Number"].notna()
print(has_tracking_number.sum())
print((~has_tracking_number).sum())

In [None]:
# Create a mask for rows with a non-null "Government Agency" and show only those rows.
data[data["Government Agency"].notna()]

In [None]:
# Show only the rows that have a non-null "Attachment Files" value.
data[data["Attachment Files"].notna()]


# Observations from the data above:
1) Many columns contain no data at all.
2) Records fall into several scenarios:
    i) Document records (identifiable by a non-null Federal Register Number)
    ii) Comments from members of the public
    iii) Comments from government agencies (identifiable by non-null government agency and government agency type fields)
    iv) File attachments - any commenter can include attachments, usually a document containing comments or a letter
Approach:
1) First, clean the data so that each row corresponds to one comment:
    - take each document-info row, remove it, and copy its details into each of its comment rows
    - clean the data - convert the date columns to pandas datetimes (`pd.to_datetime`)
    - use SBERT, BERTopic, and sentence transformers, primarily for text embedding
    - store the embeddings in ChromaDB
    - identify the theme/content of each document
    - identify the sentiment of each comment toward its document
    - PHATE should cluster similar sentiments for a given topic together (in a branch)
    - hence the visualization should show numerous tree structures, with branches corresponding to sentiments
    - moving further along a branch should indicate the passing of time

In [None]:
# Convert "Posted Date" and "Received Date" to datetimes.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
for date_column in ("Posted Date", "Received Date"):
    data[date_column] = pd.to_datetime(data[date_column], errors="coerce")


In [None]:
# Preview the rows after the date columns were converted.
data.head()

In [None]:
# Split the table by whether "Federal Register Number" is populated:
# rows that have one go to `documents`, all remaining rows go to `comments`.
has_register_number = data["Federal Register Number"].notna()

# Take explicit copies so that columns added to either frame later are written
# to an independent DataFrame and do not trigger SettingWithCopyWarning.
documents = data[has_register_number].copy()
comments = data[~has_register_number].copy()

In [None]:
# (rows, columns) of the document subset.
documents.shape

In [None]:
# (rows, columns) of the comment subset.
comments.shape

In [None]:
# Preview the first comment rows.
comments.head()

In [None]:
# Preview the first document rows.
documents.head()

In [None]:
# Flag rows whose "Government Agency" field is populated.
# NOTE(review): `documents` and `comments` are split from `data` in an earlier
# cell, so they will not carry this column unless that split is re-run.
data["is_govt_agency"] = data["Government Agency"].notna()

In [None]:
# Candidate embedding models:
# SBERT, BERTopic, Llama, Hugging Face R1

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Sentence-embedding model used below to compute the topic embeddings.
topic_embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Disabled experiment: a text2text-generation pipeline for comment perspectives.
# comment_perspectives = pipeline(task="text2text-generation", model="google/flan-t5-small")

In [None]:
# Second sentence-embedding model, used below to compute the "perspective" embeddings.
perspective_model = SentenceTransformer("nli-roberta-base-v2")

In [None]:
# Text of every comment, in row order, as input for the embedding models.
# Missing comments are float NaN in pandas, which the text encoders cannot
# tokenize, so they are replaced with empty strings and everything is coerced to str.
comment_data = comments["Comment"].fillna("").astype(str).to_numpy()

In [None]:
import os
# Names of the entries in the current working directory; the cells below look
# in this list for cached embedding files.
# NOTE: this is a one-time snapshot - re-run this cell after files are created or removed.
files = os.listdir()

In [None]:
# Quick sanity check: report whether a cached topic-embedding file appears in `files`.
if "topic_embeddings.npy" in files:
    print("True!")

In [None]:
# Load the cached topic embeddings when they exist and still line up with the
# current comments; otherwise compute them and refresh the cache.
import os

topic_cache_path = "topic_embeddings.npy"
topic_embeddings = None
# Check the file system directly instead of an earlier directory listing,
# which goes stale as soon as a cache file is written or deleted.
if os.path.exists(topic_cache_path):
    topic_embeddings = np.load(topic_cache_path)
    if len(topic_embeddings) != len(comment_data):
        # Cache was built from a different set of comments: its rows would be
        # misaligned with `comments`, so discard it and recompute.
        topic_embeddings = None
if topic_embeddings is None:
    topic_embeddings = topic_embedder.encode(comment_data, show_progress_bar=True)
    np.save(topic_cache_path, topic_embeddings)


In [None]:
# Load the cached perspective embeddings when they exist and still line up with
# the current comments; otherwise compute them and refresh the cache.
import os

perspective_cache_path = "perspective_embeddings.npy"
perspective_embeddings = None
# Check the file system directly instead of an earlier directory listing,
# which goes stale as soon as a cache file is written or deleted.
if os.path.exists(perspective_cache_path):
    perspective_embeddings = np.load(perspective_cache_path)
    if len(perspective_embeddings) != len(comment_data):
        # Cache was built from a different set of comments: its rows would be
        # misaligned with `comments`, so discard it and recompute.
        perspective_embeddings = None
if perspective_embeddings is None:
    perspective_embeddings = perspective_model.encode(comment_data, show_progress_bar=True)
    np.save(perspective_cache_path, perspective_embeddings)


In [None]:
# Join the two embedding matrices column-wise: topic features first, then perspective features.
embedding_parts = [topic_embeddings, perspective_embeddings]
final_embeddings = np.hstack(embedding_parts)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Posted time of each comment as seconds since the Unix epoch.
# Subtracting the epoch (instead of casting the datetimes to int) keeps
# unparseable dates as NaN rather than turning NaT into a huge negative
# integer, and does not depend on the platform's default integer width.
posted = pd.to_datetime(comments["Posted Date"], errors="coerce", utc=True)
timestamps = (posted - pd.Timestamp("1970-01-01", tz="UTC")) / pd.Timedelta(seconds=1)

# A NaN here would end up in final_embeddings, so comments without a usable
# date get the median posted time (0.0 if no date is usable at all).
missing_dates = timestamps.isna()
if missing_dates.any():
    print(f"Posted Date missing/unparseable for {missing_dates.sum()} comments; using the median date")
    timestamps = timestamps.fillna(timestamps.median()).fillna(0.0)
timestamps = timestamps.to_numpy()

# Normalize timestamps to [0,1] range
scaler = MinMaxScaler()
normalized_timestamps = scaler.fit_transform(timestamps.reshape(-1, 1))

# Rebuild the combined matrix from its parts (topic, perspective, time) so that
# re-running this cell does not keep appending extra time columns.
final_embeddings = np.hstack((topic_embeddings, perspective_embeddings, normalized_timestamps))


In [None]:
import hdbscan
from sklearn.preprocessing import StandardScaler

# Standardize every embedding dimension before clustering.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(topic_embeddings)

# Apply HDBSCAN to cluster comments into topics
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, metric="euclidean", cluster_selection_method="eom")
topic_labels = clusterer.fit_predict(scaled_embeddings)

# `comments` is a filtered selection of `data`; assign() builds a new frame
# instead of writing into that selection in place, which avoids
# SettingWithCopyWarning.
comments = comments.assign(topic_cluster=topic_labels)

# Count the number of topics; the label -1 is excluded from the count.
num_topics = np.unique(topic_labels[topic_labels != -1]).size
print(f"Number of Topics Identified: {num_topics}")


In [None]:
import phate
import matplotlib.pyplot as plt

# Fit PHATE on the combined comment features and get the embedded coordinates.
phate_operator = phate.PHATE()
phate_embedding = phate_operator.fit_transform(final_embeddings)

# Scatter the first two PHATE coordinates, coloring each comment by its
# normalized posted time.
phate_x = phate_embedding[:, 0]
phate_y = phate_embedding[:, 1]
plt.scatter(phate_x, phate_y, c=normalized_timestamps, cmap="viridis")
plt.colorbar(label="Time Progression")
plt.title("Comment Evolution Over Time")
plt.show()