In [None]:
import pickle
from tqdm import tqdm
import numpy as np
import cuml
import torch
import sys
import hdbscan
from dataclasses import dataclass, field
import pandas as pd
import json

# Raise the interpreter recursion ceiling well above the default.
# NOTE(review): presumably needed by a deeply recursive step later in the notebook — confirm.
sys.setrecursionlimit(300_000)
print(sys.getrecursionlimit())

300000


In [3]:
# NOTE(review): pickle.load can execute arbitrary code; only open archives you trust.
with open('/exports/eddie/scratch/s1891075/AIP_NLS_data/embeddings/embedding_list_0.pkl', 'rb') as f:
    data0 = pickle.load(f)

# data0[0] is iterated batch by batch; element 0 of each batch holds the metadata rows.
metas = [row for batch in data0[0] for row in batch[0]]

# data0[1] yields the embedding batches, which are stacked along axis 0.
ebd_batches = list(data0[1])
ebd = np.concatenate(ebd_batches, axis=0)

with torch.no_grad():
    embedding = torch.tensor(ebd, device='cuda:1')
    # Optional L2 normalisation, currently disabled:
    # embedding = embedding / torch.sqrt((embedding**2).sum(-1, keepdim=True))


In [4]:
# UMAP hyperparameters, kept in one place so they are easy to inspect and tune.
umap_params = dict(
    n_neighbors=50,    # balance between local & global structure
    n_components=64,   # target dimension for HDBSCAN
    metric='cosine',   # appropriate for embedding similarity
    min_dist=0.0,      # keep clusters tight
    spread=1.0,        # controls overall scale of embedding
    random_state=42,
)
umap = cuml.manifold.UMAP(**umap_params)

# Project the embeddings down to n_components dimensions and show the resulting shape.
reduced_data = umap.fit_transform(embedding)
reduced_data.shape

[2025-11-12 20:39:16.986] [CUML] [info] build_algo set to brute_force_knn because random_state is given


(1272857, 64)

In [5]:
# Density-based clustering of the UMAP-reduced vectors.
clusterer = cuml.cluster.hdbscan.HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    prediction_data=True,
    gen_min_span_tree=True,
)
clusterer.fit(reduced_data)

In [10]:
len(metas)

1272857

In [18]:
# Attach each record's cluster id to its metadata dict (mutates `metas` in place).
topic_labels = clusterer.labels_
assert len(topic_labels) == len(metas)

# Convert the whole label array to plain Python ints in ONE call. Iterating the
# array and calling .tolist() per element yields the same values, but if labels_
# lives on the GPU (the repr of labels_.max() suggests a device array — confirm)
# that costs one device-to-host transfer per record.
for meta, label in zip(metas, topic_labels.tolist()):
    meta['topic_label'] = label


In [20]:
metas[0]

{'title': 'Memoirs of my time, including personal reminiscences of eminent men.',
 'creator': 'Hodder, George.',
 'type': 'text',
 'publisher': 'London, ',
 'date': '1870.',
 'language': 'eng',
 'description': None,
 'subject': None,
 'relation': None,
 'rights': None,
 'identifier': None,
 'coverage': None,
 'format': None,
 'topic_label': 2801}

In [21]:
clusterer.labels_.max()

array(3194, dtype=int32)

In [23]:
table = pd.DataFrame(metas)


In [32]:
# Mean number of records per cluster, ignoring the noise bucket (label -1).
# Derived from `table` instead of hand-copied totals so it stays correct when
# the clustering is re-run.
is_noise = table.topic_label == -1
n_clusters = table.topic_label[~is_noise].nunique()
(len(table) - int(is_noise.sum())) / n_clusters

315.32989045383414

In [31]:
table.topic_label.value_counts()

topic_label
-1       265378
 1523     97154
 1704     76041
 1467     52767
 1595     39217
          ...  
 3066        15
 2941        15
 2668        15
 2209        15
 2152        15
Name: count, Length: 3196, dtype: int64

In [34]:
table.to_parquet('topic_table.parquet')

In [37]:
topic_values = table.topic_label.unique()

In [50]:
table.topic_label.value_counts()

topic_label
-1       265378
 1523     97154
 1704     76041
 1467     52767
 1595     39217
          ...  
 3066        15
 2941        15
 2668        15
 2209        15
 2152        15
Name: count, Length: 3196, dtype: int64

In [80]:
# Top 100 clusters by size. The noise label (-1) is excluded explicitly rather
# than by skipping position 0, which only works while noise is the largest bucket.
topic_counts = table.topic_label.value_counts()
top_topic_counts = topic_counts[topic_counts.index != -1].head(100)
most_common = top_topic_counts.index.tolist()
most_common_count = top_topic_counts.tolist()

In [82]:
most_common_count

[97154,
 76041,
 52767,
 39217,
 21730,
 20201,
 19236,
 17892,
 17408,
 13355,
 9507,
 9184,
 7091,
 6986,
 6282,
 6094,
 5959,
 5777,
 5695,
 5194,
 5099,
 5020,
 4755,
 4477,
 4321,
 4225,
 4185,
 4128,
 4079,
 3654,
 3648,
 3633,
 3603,
 3492,
 3456,
 3404,
 3236,
 3227,
 3163,
 3026,
 3013,
 2969,
 2910,
 2878,
 2839,
 2790,
 2737,
 2703,
 2702,
 2667,
 2648,
 2640,
 2619,
 2593,
 2587,
 2514,
 2478,
 2476,
 2467,
 2408,
 2344,
 2326,
 2240,
 2179,
 2150,
 2144,
 2110,
 2107,
 2056,
 2029,
 2012,
 2010,
 2005,
 2000,
 1976,
 1968,
 1955,
 1954,
 1877,
 1871,
 1864,
 1848,
 1839,
 1788,
 1772,
 1761,
 1750,
 1727,
 1727,
 1714,
 1677,
 1657,
 1639,
 1631,
 1624,
 1607,
 1544,
 1481,
 1472,
 1456]

In [49]:
most_common

[1523,
 1704,
 1467,
 1595,
 1644,
 1538,
 2129,
 1455,
 1573,
 1282,
 2310,
 1158,
 1720,
 2399,
 2570,
 10,
 2088,
 2861,
 1681,
 1811,
 1059,
 1689,
 2036,
 319,
 958,
 1612,
 844,
 2176,
 1576,
 918,
 926,
 1068,
 1679,
 3033,
 2026,
 1162,
 1836,
 1357,
 786,
 2130,
 556,
 1073,
 798,
 3035,
 1500,
 1766,
 1250,
 1840,
 691,
 1516,
 1085,
 553,
 804,
 1956,
 2126,
 2569,
 2433,
 3131,
 3021,
 519,
 1603,
 1255,
 405,
 2801,
 1451,
 1569,
 3024,
 3138,
 1473,
 1764,
 1526,
 2905,
 1714,
 1483,
 1560,
 2315,
 947,
 1654,
 1400,
 951,
 3164,
 2167,
 2932,
 2098,
 715,
 1568,
 1835,
 873,
 374,
 2470,
 843,
 739,
 3085,
 484,
 929,
 2467,
 549,
 1368,
 1136,
 1288]

In [41]:
topic_values[1]

np.int64(1538)

In [54]:
table[table.topic_label == 1523].sample(n=20)

Unnamed: 0,title,creator,type,publisher,date,language,description,subject,relation,rights,identifier,coverage,format,topic_label
751146,Christ and the Blind Man. A symphonic poem for...,"Spelman, Timothy Mather",notated music,,,,MC. ; ORCHESTRA,,,,,,,1523
1086879,Sally Ann at the ballet /,"Beek, Deborah van der.",text,London Hippo,1990,eng,,,,,,,,1523
495301,"Concerto in Fa maggiore per violino, archi e c...","Malipiero, Gian Francesco",notated music,,,,VIOLIN & STRING ORCHESTRA ; Timing: 9' ; [Isti...,,,,,,,1523
134378,Raider's dawn : song for high voice /,"Lewis, Alun,1915-1944.",notated music,"Magor : Mansel Thomas Trust,","[199-], c1986.",eng,Duration: 1:53.,Songs (High voice) with piano.,,,,,,1523
396763,OUR BABY. Sleepy Songs & Lullabies.,,sound recording,,,,Books of BCM 1995 (Ass. Board),,,,,,,1523
1135806,"Psalms, hymns, and anthems; sung in the chapel...","Foundling Hospital (London, England)",text,"London : printed in the year,",1797.,eng,ESTC,,,,,,,1523
367970,The Melodies You Sing. [Song.] Words by Cliffo...,"Bax, Clifford.",notated music,,,,A1,,,,,,,1523
83823,"Second petit melange for the harp, on favorite...","Bochsa, N. C.",notated music,,,,,,,,,,,1523
588629,Team woodwind piano accompaniments & ensemble ...,"Russell-Smith, Geoffry.",notated music,Woodford Green International Music Publications,1992,zxx,Publ. no.,,,,,,,1523
997524,Beul-aithris òigridh : fileantaich/luchd-ionns...,"Mòd Naìseanta Rìoghail(2020 :Inverness, Sco...",text,,,gla,"Prescribed pieces for the Inverness Mod, 9-17 ...","Mòd Naìseanta Rìoghail (2020 : Inverness, S...",,,,,,1523


In [None]:
from elasticsearch import Elasticsearch, helpers                                
import pickle
import numpy as np
from tqdm import tqdm
import re
import hdbscan
import torch
import os
from openai import OpenAI

# Prompt template used to name a cluster. The `{list_of_titles}` placeholder is
# filled via str.format with one description line per sampled record (see get_topic).
TOPIC_PROMPT = """You are a semantic summarizer. 
Given a list of book infomation, infer their common theme, topic, or concept.
Return a concise English phrase or a word (1–3 words) that best summarizes all of them.

Book infos:
{list_of_titles}

Output:"""


In [66]:
type(table)

pandas.core.frame.DataFrame

In [67]:
def generate_text(prompt: str, model: str = "gpt-4o") -> str:
    """
    Send a single-turn user prompt to the OpenAI chat completions API.

    Args:
        prompt (str): The input prompt text, sent as one "user" message.
        model (str): Model name (e.g., "gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo").

    Returns:
        str: The first choice's message content with surrounding whitespace
        stripped, or an empty string when the API returns no text content.
    """
    client = OpenAI()  # assumes OPENAI_API_KEY is set in your environment

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # `content` is nullable in the response schema (e.g. refusals or tool calls);
    # calling .strip() on None would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

In [68]:
def get_topic(book_table: pd.DataFrame) -> str:
    """Ask the language model for a short label summarising the given records.

    One description line is built per row from the 'title', 'type' and 'subject'
    columns; the lines are joined with newlines, substituted into TOPIC_PROMPT
    and sent through generate_text, whose reply is returned unchanged.
    """
    # Materialise the three columns up front so a missing column fails immediately.
    columns = (book_table[name].tolist() for name in ('title', 'type', 'subject'))

    lines = []
    for title, kind, subject in zip(*columns):
        lines.append(f"<title>:{title} <type>:({kind}) <subject>:{subject}")

    return generate_text(TOPIC_PROMPT.format(list_of_titles='\n'.join(lines)))


In [78]:
# Name each of the largest clusters from a small sample of its records.
most_common_topics = []
for label_id in tqdm(most_common):
    cluster_rows = table[table.topic_label == label_id]
    # Cap the sample at the cluster size (sample(n=30) raises on smaller clusters)
    # and fix the seed so the model sees the same records on every re-run.
    sampled_rows = cluster_rows.sample(n=min(30, len(cluster_rows)), random_state=42)
    most_common_topics.append(get_topic(sampled_rows))

100%|██████████| 100/100 [01:35<00:00,  1.04it/s]


In [79]:
most_common_topics

['Music and Song',
 'Religion',
 'Economics and Trade',
 'Medicine',
 'Local History',
 'Poetry',
 'Art and Exhibitions',
 'Drama',
 'Computer Science and Information Technology',
 'Education',
 'Mathematics',
 'Language and Literacy',
 'Cartography',
 'Health and Nursing Services',
 'Government Administration',
 'City Planning Appeals',
 'Maps and Atlases',
 'Politics',
 'Books and Libraries',
 'Botany and Gardening',
 'Cooking',
 'Water and Environmental Management',
 'Sports',
 'Railways',
 'Church Architecture',
 'Chemistry',
 'Urban Planning',
 'Ancient Greece and Rome',
 'Automobiles',
 'Geology',
 'Energy and Resources',
 'Agriculture',
 'Philosophy',
 'Law',
 'World War II',
 'Transportation and Roads',
 'Russian Literature',
 'Archaeology',
 'Accounting',
 'Environmental Sustainability',
 'Housing and Tenancy',
 'Libraries and information services',
 'Aviation',
 'Murder Mystery',
 'Poetry and Satire',
 'Materials Science',
 'Architecture',
 'Law enforcement',
 'Criminal Justi

In [74]:
get_topic(table[table.topic_label == 1538].sample(n=30))

'Poetry'

In [83]:
# One record per top cluster: its label id, the generated topic name, and its size.
most_common_topics_rows = [
    dict(topic_label=label_id, topic=topic, count=count)
    for label_id, topic, count in zip(most_common, most_common_topics, most_common_count)
]

In [85]:
most_common_topic_table = pd.DataFrame(most_common_topics_rows)
most_common_topic_table

Unnamed: 0,topic_label,topic,count
0,1523,Music and Song,97154
1,1704,Religion,76041
2,1467,Economics and Trade,52767
3,1595,Medicine,39217
4,1644,Local History,21730
...,...,...,...
95,2467,Italy,1607
96,549,Substance Abuse,1544
97,1368,Sociology,1481
98,1136,Animals and Pets,1472


In [86]:
most_common_topic_table.to_csv('most_common_topics.csv', index=False)

In [90]:
most_common_topic_table.topic.value_counts()

topic
Sports                 2
Religion               1
Music and Song         1
Economics and Trade    1
Medicine               1
                      ..
Italy                  1
Substance Abuse        1
Sociology              1
Animals and Pets       1
Science                1
Name: count, Length: 99, dtype: int64

In [91]:
import json

def python_dict_to_js_object_string(data: dict) -> str:
    """
    Serialise a Python value into a JavaScript `const data = ...;` statement.

    The value is encoded with json.dumps (JSON is valid JavaScript literal
    syntax), so any JSON-serialisable input works — the notebook passes a list
    of dicts. Because the result is meant to be pasted into a <script> tag, the
    characters `<`, `>` and `&` are emitted as `\\uXXXX` escapes. They can only
    occur inside JSON string literals, the escapes decode to the same characters
    in JavaScript, and this stops a value such as "</script>" from terminating
    the surrounding script element.

    Args:
        data: The JSON-serialisable Python object to convert.

    Returns:
        A string of the form `const data = <json>;`. If the object contains a
        type json cannot serialise, an error is printed and the fallback
        `const data = {}; // Error during conversion` is returned instead.
    """
    try:
        # indent=4 keeps the emitted literal readable.
        js_string = json.dumps(data, indent=4)
    except TypeError as e:
        print(f"Error: The dictionary contains a type that cannot be serialized to JSON: {e}")
        return "const data = {}; // Error during conversion"

    # Make the literal safe inside an HTML <script> element.
    for char, escape in (("<", "\\u003c"), (">", "\\u003e"), ("&", "\\u0026")):
        js_string = js_string.replace(char, escape)

    return f"const data = {js_string};"

In [101]:
# Bubble-chart payload: the label shows "<topic>: <count>", the bubble is sized by count.
js_data = [
    {
        'name': ': '.join((record['topic'], str(record['count']))),
        'size': record['count'],
    }
    for record in most_common_topic_table.to_dict('records')
]

In [98]:
# Render js_data as a `const data = ...;` JavaScript statement and save it to text.txt.
text = python_dict_to_js_object_string(js_data)
with open('text.txt', 'w') as f:
    f.write(text)

In [None]:

def generate_bubble_chart_html(
    topic_data: list[dict], 
    output_filepath: str = "my_new_chart.html",
    root_name: str = "Library Collection",
    title: str = "Library Topic Popularity"
):
    """
    Generates a complete D3 bubble chart HTML file from a list of topic data.

    Topic names and the title are treated as untrusted text (in this notebook
    the names come from a language model). They are HTML-escaped where they are
    written into markup, the JSON embedded in the inline <script> has `<`, `>`
    and `&` emitted as `\\uXXXX` escapes so a value cannot close the script
    element, and the browser-side code sets labels with D3 `.text()` rather
    than `.html()`.

    Args:
        topic_data: A list of dictionaries, where each dict has a "name" (str) 
                    and "size" (int/float) key. 
                    Example: [{"name": "Fiction", "size": 9500}, ...]
                    Missing keys fall back to "N/A" / 0 in the legend.
        output_filepath: The name of the HTML file to create (e.g., "index.html").
        root_name: The name for the central root node (default: "Library Collection").
        title: The main title for the HTML page and header (default: "Library Topic Popularity").

    Returns:
        None. The page is written to `output_filepath`; a success message is
        printed, or an error message if the file cannot be written (IOError is
        caught, not propagated).
    """
    # Local import: `html` is not among the notebook's top-level imports.
    import html

    # --- 1. Process Data ---

    # Calculate total size for the legend
    total_size = sum(item.get("size", 0) for item in topic_data)
    total_size_formatted = f"{total_size:,}"  # Add thousands separators

    # Create the hierarchical data structure D3 needs
    data_for_js = {
        "name": root_name,
        "children": topic_data
    }

    # Serialise with json.dumps (handles quoting/escaping of the values), then
    # neutralise the characters that are significant to the HTML parser inside
    # a <script> element. They only occur inside JSON string literals, and the
    # \uXXXX escapes decode to the same characters in JavaScript.
    js_data_string = json.dumps(data_for_js, indent=4)
    for char, escape in (("<", "\\u003c"), (">", "\\u003e"), ("&", "\\u0026")):
        js_data_string = js_data_string.replace(char, escape)
    js_data_block = f"const data = {js_data_string};"

    # The title is interpolated into <title> and <h1>; escape it once here.
    title_html = html.escape(str(title))

    # Generate the HTML list items for the legend (names escaped for markup)
    legend_items = []
    for item in topic_data:
        name = html.escape(str(item.get("name", "N/A")))
        size = item.get("size", 0)
        size_formatted = f"{size:,}"
        legend_items.append(
            f'                <li><span class="font-medium">{name}:</span> {size_formatted}</li>'
        )
    legend_items_html = "\n".join(legend_items)

    # --- 2. Define HTML Template ---
    # The whole page is one f-string: {name} is substituted by Python, while
    # doubled braces ({{ and }}) produce the literal braces CSS/JavaScript need.

    html_template = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title_html}</title>
    <!-- Load Tailwind CSS -->
    <script src="https://cdn.tailwindcss.com"></script>
    <!-- Load D3.js -->
    <script src="https://d3js.org/d3.v7.min.js"></script>
    <style>
        /* Define the font family for better aesthetics */
        :root {{
            font-family: 'Inter', sans-serif;
        }}

        /* Basic styling for the SVG container */
        #bubble-chart-container {{
            width: 100%;
            height: 80vh; /* Use viewport height for a good size */
            display: flex;
            justify-content: center;
            align-items: center;
            overflow: hidden; /* Hide overflow outside the SVG viewbox */
        }}

        /* Styling for the bubbles */
        .node circle {{
            cursor: pointer;
            transition: fill 0.3s, transform 0.3s;
            stroke: #fff;
            stroke-width: 2px;
        }}

        .node circle:hover {{
            opacity: 0.85;
            /* Note: We remove the CSS scale on hover here because it might conflict with D3 zoom transform */
        }}
    </style>
</head>
<body class="bg-gray-100 min-h-screen p-4 sm:p-8">

    <div class="max-w-4xl mx-auto">
        <!-- Header Card -->
        <header class="text-center mb-8 p-6 bg-white rounded-xl shadow-lg">
            <h1 class="text-3xl font-extrabold text-indigo-700">{title_html}</h1>
            <p class="mt-2 text-gray-600">
                A D3 Bubble Chart visualizing the relative size/frequency of major library topics.
                Bubble size corresponds to the number of books in that category.
            </p>
            <p class="mt-4 text-sm font-semibold text-green-600">
                Use your mouse wheel to **zoom** and click-and-drag to **pan** the chart!
            </p>
        </header>

        <!-- Bubble Chart Container -->
        <div class="bg-white p-4 sm:p-8 rounded-xl shadow-lg border border-gray-200">
            <div id="bubble-chart-container">
                <!-- D3 SVG will be inserted here -->
            </div>
        </div>

        <!-- Data Legend/Key -->
        <div class="mt-8 p-4 bg-white rounded-xl shadow-md text-sm text-gray-700">
            <h2 class="font-bold text-lg mb-2 text-indigo-600">Data Key</h2>
            <p>The total "size" of the collection represented here is {total_size_formatted} units (e.g., books/documents). The topics are:</p>
            <ul class="list-disc list-inside mt-2 grid grid-cols-2 sm:grid-cols-4 gap-x-4">
{legend_items_html}
            </ul>
        </div>
    </div>

    <script>
        // Global variable to hold the initialized D3 chart group
        let chartGroup;

        // Use a function to ensure D3 logic runs only after the DOM is fully loaded.
        function createBubbleChart() {{
            // --- 1. Dynamic Data (Injected by Python) ---
{js_data_block}

            // --- 2. Chart Setup and Responsiveness ---
            const container = document.getElementById('bubble-chart-container');
            let width = container.clientWidth;
            let height = container.clientHeight;

            // Initialize or clear the SVG
            d3.select("#bubble-chart-container svg").remove();

            const svg = d3.select("#bubble-chart-container")
                .append("svg")
                .attr("viewBox", `0 0 ${{width}} ${{height}}`)
                .attr("preserveAspectRatio", "xMinYMin meet")
                .style("display", "block");

            // Append a group element that will hold the zoomable chart content.
            // All bubbles and text will be appended to this group.
            chartGroup = svg.append("g");

            // Define the pack layout function
            const pack = d3.pack()
                .size([width, height])
                .padding(2);

            // Single source of truth for label font size, shared by the initial
            // draw and the resize handler. This is the formula the resize handler
            // applied; since it runs once on load it was already the effective one.
            const labelFontSize = d => `${{Math.min(24, Math.max(1, d.r / 6))}}px`;

            // --- 3. Data Processing ---
            const root = d3.hierarchy(data)
                .sum(d => d.size) // Use the 'size' property to determine bubble area
                .sort((a, b) => b.value - a.value); // Sort larger bubbles first

            // Apply the layout to the root node to calculate positions (x, y) and radius (r)
            const nodes = pack(root).descendants();

            // --- 4. Color Scale ---
            // Define a color scale to differentiate topics
            const color = d3.scaleOrdinal(d3.schemeCategory10);

            // Define the group (node) elements for the bubbles and text, appending to chartGroup
            const node = chartGroup.selectAll(".node")
                .data(nodes.filter(d => d.depth === 1)) // Filter to only include the main topics (depth 1)
                .enter().append("g")
                .attr("class", "node")
                .attr("transform", d => `translate(${{d.x}},${{d.y}})`);

            // --- 5. Draw Bubbles (Circles) ---
            node.append("circle")
                .attr("r", d => d.r) // Use the radius calculated by d3.pack
                .attr("fill", (d, i) => color(i))
                .attr("opacity", 0.9)
                .append("title") // Add tooltips for accessibility
                .text(d => `${{d.data.name}}: ${{d.data.size}} units`);

            // --- 6. Draw Labels (Text) ---
            // We use <foreignObject> to embed HTML <divs> which allows for text wrapping.
            const foreignObject = node.append("foreignObject")
                .attr("width", d => d.r * 2) // Set width to bubble diameter
                .attr("height", d => d.r * 2) // Set height to bubble diameter
                .attr("x", d => -d.r) // Center horizontally (offset by radius)
                .attr("y", d => -d.r) // Center vertically (offset by radius)
                .style("pointer-events", "none"); // Text should not interfere with mouse events

            // Add the HTML div for text content
            foreignObject.append("xhtml:div")
                .style("width", "100%")
                .style("height", "100%")
                .style("display", "flex")
                .style("align-items", "center")
                .style("justify-content", "center")
                .style("text-align", "center")
                .style("font-weight", "600")
                .style("color", "white")
                .style("text-shadow", "0 1px 1px rgba(0, 0, 0, 0.3)")
                .style("font-size", labelFontSize)
                // Allow long words to break and wrap
                .style("word-break", "break-word")
                .style("overflow-wrap", "break-word")
                // Names are data, not markup: set them as text so they are never parsed as HTML.
                .text(d => d.data.name);

            // --- 7. Zoom Functionality ---

            // Handler function for the zoom event
            function zoomed(event) {{
                // Apply the new transform (k, x, y) to the chart group
                chartGroup.attr("transform", event.transform);
            }}

            // Create the zoom behavior
            const zoom = d3.zoom()
                .scaleExtent([0.5, 8]) // Allow scaling between 50% and 800%
                .on("zoom", zoomed); // Attach the zoom handler

            // Apply the zoom behavior to the main SVG element
            svg.call(zoom);


            // --- 8. Responsive handling for resizing (container changes) ---
            function handleResize() {{
                // Recalculate container dimensions
                width = container.clientWidth;
                height = container.clientHeight;

                // Update SVG viewbox and pack layout size
                svg.attr("viewBox", `0 0 ${{width}} ${{height}}`);
                pack.size([width, height]);

                // Recalculate layout and update positions/radii
                const newRoot = pack(root);

                // Store the selection of nodes and bind the new data
                const updatedNodes = chartGroup.selectAll(".node")
                    .data(newRoot.descendants().filter(d => d.depth === 1));

                // Update positions of the <g> node group
                updatedNodes.attr("transform", d => `translate(${{d.x}},${{d.y}})`);

                // Use the updated selection to update the child <circle>
                updatedNodes.select("circle")
                    .attr("r", d => d.r);

                // Use the updated selection to update the child <foreignObject>
                const updatedForeignObject = updatedNodes.select("foreignObject")
                    .attr("width", d => d.r * 2)
                    .attr("height", d => d.r * 2)
                    .attr("x", d => -d.r)
                    .attr("y", d => -d.r);

                // And update the div inside it
                updatedForeignObject.select("div")
                    .style("font-size", labelFontSize)
                    .text(d => d.data.name); // Re-apply text in case it was lost
            }}

            // Debounce the resize event to prevent performance issues
            let resizeTimer;
            window.addEventListener('resize', () => {{
                clearTimeout(resizeTimer);
                resizeTimer = setTimeout(handleResize, 150);
            }});

            // Call resize once on load to ensure initial fit
            handleResize();
        }}

        // Run the chart function when the window loads
        window.onload = createBubbleChart;
    </script>
</body>
</html>
"""

    # --- 3. Write File ---
    # Best effort: report a write failure instead of raising.
    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(html_template)
        print(f"Successfully generated HTML file at: {output_filepath}")
    except IOError as e:
        print(f"Error writing file {output_filepath}: {e}")


In [109]:
generate_bubble_chart_html(
    js_data, 
    output_filepath="most_common_topics.html",
    title="Library Topic Popularity"
)

Successfully generated HTML file at: most_common_topics.html
