In [49]:
import json

records_with_embeddings = json.load(open('sample/affb/records_with_embeddings.json'))

In [50]:
from openai import OpenAI

API_KEY = open('OPENAI_KEY').read().strip()
INSTRUCTIONS = open('NAME-CLUSTER.md').read().strip()

client = OpenAI(api_key=API_KEY)

def find_cluster_title(topics, taglines):
    prompt = f'''{INSTRUCTIONS}

List of submission taglines:
- {"\n- ".join(taglines)}
'''
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": prompt },
                ],
            }
        ],
        temperature=0.0000001
    )
    return completion.choices[0].message.content    

In [51]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.manifold import TSNE

X = np.array([record['embedding'] for record in records_with_embeddings])
print(X.shape)

tsne = TSNE(perplexity=50, n_components=2, init='random', n_iter=5000)
X = tsne.fit_transform(X)

clustering = AgglomerativeClustering(n_clusters=10, metric='cosine', distance_threshold=None, linkage='complete')
clustering.fit(X)
labels = clustering.labels_
num_clusters = len(set(labels))
print(f'num_clusters: {num_clusters}')

(157, 3072)




num_clusters: 10


In [None]:
from collections import Counter

label_counts = []
for i in range(num_clusters):
    cluster_members = [x for x, label in zip(records_with_embeddings, labels) if label == i]
    label_counts.append((i, len(cluster_members), cluster_members))

label_counts.sort(key=lambda x: x[1], reverse=True)
total = 0

for _, label, cluster_members in label_counts:
    print(f'Cluster {label} size: {len(cluster_members)}, {len(cluster_members) / len(records_with_embeddings) * 100:.2f}% of total')
    topics = Counter()
    for member in cluster_members:
        topics.update(member['future_scenario_topics'])
    most_common_topics = [x[0] for x in topics.most_common(7)]
    taglines = [member['future_scenario_description'] for member in cluster_members]

    title = find_cluster_title(most_common_topics, taglines)

    print(f'Cluster {i}', topics.most_common(7))
    print(f'Title: {title}')

    total += len(cluster_members)
    if total > 0.85 * len(records_with_embeddings):
        break


Cluster 22 size: 22, 14.01% of total
Cluster 9 [('politics', 12), ('geopolitics', 8), ('globalization', 4), ('technology', 3), ('environment', 3), ('migration', 3), ('social change', 2)]
Title: גבולות גיאופוליטיים חדשים
Cluster 22 size: 22, 14.01% of total
Cluster 9 [('technology', 9), ('AI', 7), ('education', 3), ('communication', 3), ('news', 2), ('society', 2), ('healthcare', 2)]
Title: בינה מלאכותית
Cluster 20 size: 20, 12.74% of total
Cluster 9 [('technology', 12), ('social media', 7), ('politics', 4), ('society', 3), ('activism', 3), ('privacy', 2), ('digital communication', 2)]
Title: כוח טכנולוגי מוגבר
Cluster 17 size: 17, 10.83% of total
Cluster 9 [('politics', 14), ('technology', 5), ('elections', 3), ('social movements', 3), ('human rights', 2), ('social unrest', 2), ('privacy', 2)]
Title: משטרים סמכותניים
Cluster 16 size: 16, 10.19% of total
Cluster 9 [('environment', 9), ('climate change', 7), ('politics', 7), ('sustainability', 3), ('social media', 3), ('AI', 2), ('commun