# In [None]:
# Read the Data in
import pandas as pd
import re

# Load the narratives CSV and discard every row that has a missing value in
# any column (the original note attributes the blank cells to the Excel export).
file_path = 'data/fake_narratives.csv'
df = pd.read_csv(file_path).dropna()

# Trim leading/trailing whitespace from each narrative.
stripped_narratives = df['narrative'].str.strip()
df['narrative'] = stripped_narratives
# Compiled once: a standalone token made of exactly two or three capital letters.
_INITIALS_PATTERN = re.compile(r'\b[A-Z]{2,3}\b')


def normalize_initials(text):
    """Return *text* with each standalone 2-3 capital-letter token masked.

    Every whole-word run of two or three uppercase ASCII letters is replaced
    by the literal placeholder ``[CLIENT]``. Any token of that shape matches,
    so short all-caps acronyms are masked as well as initials.
    """
    return _INITIALS_PATTERN.sub('[CLIENT]', text)

# Mask initials in every narrative before embedding.
df['cleaned_narrative'] = df['narrative'].map(normalize_initials)

# Troubleshooting aid: uncomment to work on a 500-row sample.
# df= df.sample(500, random_state=42) #safety sample for troubleshooting

import time
start = time.time()  # wall-clock reference; also read by later timing prints

# Embed the cleaned narratives with a sentence-transformer model.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')
narrative_texts = df['cleaned_narrative'].tolist()
embeddings = model.encode(narrative_texts, show_progress_bar=True)
embedding_seconds = time.time() - start
print("Embeddings took", embedding_seconds, "seconds")


from sklearn.manifold import TSNE

# Reduce the embeddings to two dimensions; the 2-D coordinates are used both
# for clustering and for the scatter plots.
print("starting TSNE")
tsne_start = time.time()
# t-SNE requires perplexity < n_samples, so clamp it for small inputs
# (e.g. a troubleshooting sample); larger inputs keep the value 30.
tsne_perplexity = min(30, len(embeddings) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_result = tsne.fit_transform(embeddings)
print("t-SNE took", time.time() - tsne_start, "seconds")

# Cluster the Embedded Narratives
import hdbscan

print("Starting Clustering")
# Time this stage on its own: the shared `start` timer was set before the
# embedding step, so using it here would report cumulative time.
clustering_start = time.time()
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean')
cluster_labels = clusterer.fit_predict(tsne_result)
df['cluster'] = cluster_labels
print("\nClustering took", time.time() - clustering_start, "seconds")


# Super-cluster the clusters: represent each cluster by the mean of its
# members' embeddings, then group those mean vectors with KMeans.
from sklearn.cluster import KMeans
import numpy as np

# Plain boolean arrays index `embeddings` positionally, independent of the
# DataFrame's index labels.
row_cluster_labels = df['cluster'].to_numpy()

# Label -1 is skipped here (the later export loops treat it as noise).
cluster_ids = [cid for cid in sorted(df['cluster'].unique()) if cid != -1]
cluster_vectors = [
    np.mean(embeddings[row_cluster_labels == cid], axis=0)
    for cid in cluster_ids
]

if cluster_ids:
    # KMeans cannot form more groups than there are points to group.
    n_superclusters = min(10, len(cluster_ids))
    kmeans = KMeans(n_clusters=n_superclusters, random_state=42)
    superclusters = kmeans.fit_predict(np.vstack(cluster_vectors))
else:
    # No clusters at all: nothing to group, every row stays unmapped.
    kmeans = None
    superclusters = np.array([], dtype=int)

# Rows whose cluster has no entry in the mapping (label -1) become NaN.
cluster_to_super = dict(zip(cluster_ids, superclusters))
df['supercluster'] = df['cluster'].map(cluster_to_super)


# # Develop the visual
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

# Plotting frame: one row per narrative with its 2-D t-SNE coordinates,
# its IndexScore and its supercluster label.
viz_df = pd.DataFrame(tsne_result[:, :2], columns=['x', 'y'])
viz_df['IndexScore'] = df['IndexScore'].values
viz_df['supercluster'] = df['supercluster'].values

# Mean IndexScore and row count per supercluster, highest mean first.
scores_by_supercluster = viz_df.groupby('supercluster')['IndexScore']
supercluster_stats = scores_by_supercluster.agg(['mean', 'count'])
supercluster_stats = supercluster_stats.sort_values(by='mean', ascending=False)
print(supercluster_stats.head(10))

# Superclusters whose mean IndexScore is above 5.
above_threshold = supercluster_stats['mean'] > 5
high_risk_clusters = supercluster_stats.loc[above_threshold].index.tolist()


# Color scale for the IndexScore plot, spanning the observed score range.
norm = colors.Normalize(vmin=viz_df['IndexScore'].min(), vmax=viz_df['IndexScore'].max())
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
cmap = plt.get_cmap('magma')

# --- Figure 1: narratives colored by IndexScore -----------------------------
# Each figure is built completely (scatter, colorbar, labels, layout) before it
# is shown, so the display never contains a half-decorated plot.
fig, ax = plt.subplots(figsize=(12, 10))

# Scatter with Matplotlib directly (bypasses Seaborn's color limitations)
sc = ax.scatter(
    viz_df['x'],
    viz_df['y'],
    c=viz_df['IndexScore'],
    cmap=cmap,
    norm=norm,
    s=20,
    alpha=0.8,
)

# Attach the colorbar to this figure's own Axes.
cbar = fig.colorbar(sc, ax=ax)
cbar.set_label("Index Score")

ax.set_title("Narrative Embeddings Colored by Index Score")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
fig.tight_layout()
plt.show()

# --- Figure 2: narratives colored by supercluster ---------------------------
supercluster_fig, supercluster_ax = plt.subplots(figsize=(10, 8))
scatter = supercluster_ax.scatter(
    tsne_result[:, 0],
    tsne_result[:, 1],
    c=df['supercluster'],
    cmap='nipy_spectral',
    s=20,
    alpha=0.8,
)
supercluster_ax.set_title('Narrative Embedding Superclusters')
supercluster_ax.set_xlabel('t-SNE-1')
supercluster_ax.set_ylabel('t-SNE-2')
supercluster_fig.colorbar(scatter, ax=supercluster_ax, label='Cluster')
plt.show()

# Dump up to five full narratives per cluster to a text file for manual review.
output_path = "cluster_samples.txt"
with open(output_path, "w", encoding="utf-8") as f:
    for cluster_id in sorted(df['cluster'].unique()):
        if cluster_id != -1:  # label -1 is noise and gets no section
            members = df.loc[df['cluster'] == cluster_id]
            member_count = len(members)

            # Fixed seed so the same narratives are picked on every run.
            picked = members.sample(n=min(5, member_count), random_state=42)

            f.write(f"\n=== Cluster {cluster_id} ({member_count} narratives) ===\n")
            f.writelines(f"- {text}\n" for text in picked['narrative'])

# LLM generated summaries
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer

model_name = "facebook/bart-large-cnn"  # or 't5-small' for performance

# NOTE: `model` is rebound here; after this point it is the summarization
# model, no longer the sentence-embedding model used earlier in the script.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1)

# For every supercluster write: size, mean IndexScore, top TF-IDF keywords,
# a generated summary, and the sampled narratives themselves.
output_path = "supercluster_summaries.txt"
with open(output_path, "w", encoding="utf-8") as f:
    # Rows without a supercluster are NaN; drop them so they are neither
    # sorted nor iterated.
    for super_id in sorted(df['supercluster'].dropna().unique()):
        if super_id == -1:
            continue  # Skip noise

        subset = df[df['supercluster'] == super_id]
        narratives = subset['narrative'].dropna()
        # Size the sample from the series actually being sampled.
        sample_texts = narratives.sample(min(1000, len(narratives)), random_state=42).tolist()
        joined = " ".join(sample_texts)

        # Too little text to summarize (this also covers an empty sample).
        if len(joined.split()) < 10:
            continue

        prompt = joined[:1024]  # truncate if needed for small models

        # TF-IDF keyword extraction over the sampled narratives.
        try:
            tfidf = TfidfVectorizer(stop_words='english', max_features=10)
            tfidf.fit(sample_texts)
            keywords = tfidf.get_feature_names_out()
        except ValueError:
            keywords = ["[No keywords found]"]

        # Best effort: a summarization failure is recorded in the output
        # instead of aborting the whole export.
        try:
            summary = summarizer(prompt, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
        except Exception as e:
            summary = f"[ERROR summarizing cluster {super_id}: {e}]"

        # Row of supercluster_stats for this supercluster (columns: mean, count).
        metrics = supercluster_stats.loc[super_id]
        f.write(f"\n=== Cluster {super_id} ({len(subset)} narratives) ===\n")
        f.write(f"Avg IndexScore: {metrics['mean']:.2f}\n")
        f.write(f"Top Keywords: {', '.join(keywords)}\n")
        f.write(f"Summary: {summary}\n")
        f.write("- Sample:\n")
        for text in sample_texts:
            f.write(f"  • {text}\n")