In [115]:
# Language code kept when filtering the `language` column of the topic table.
LANG = "wel"
# Human-readable language name, used in the generated chart's title.
LANGUAGE = "Welsh"

In [None]:
import pickle
from tqdm import tqdm
import numpy as np
import cuml
import torch
import sys
import hdbscan
from dataclasses import dataclass, field
import pandas as pd
import json
import re
import os
from openai import OpenAI
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Raise the interpreter's recursion ceiling and echo the value now in effect.
# NOTE(review): nothing in the visible cells recurses deeply; presumably this
# is needed by code outside this view — confirm it is still required.
sys.setrecursionlimit(300_000)
print(sys.getrecursionlimit())

# Prompt template used to name a topic. The `{list_of_titles}` placeholder is
# filled by get_topic with one description line per sampled book.
TOPIC_PROMPT = """You are a semantic summarizer.
Given a list of book information, infer their common theme, topic, or concept.
Return a concise English phrase or a word (1–5 words) that best summarizes all of them.

Book infos:
{list_of_titles}

Output:"""

def generate_text(prompt: str, model: str = "gpt-4o") -> str:
    """
    Send a single-turn user prompt to the OpenAI chat-completions API.

    Args:
        prompt (str): The input prompt text, sent as one "user" message.
        model (str): Model name (e.g., "gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo").

    Returns:
        str: The first choice's message text with surrounding whitespace
        stripped, or "" when the response carries no text content.
    """
    client = OpenAI()  # assumes OPENAI_API_KEY is set in your environment

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # `message.content` is nullable in the API schema (e.g. for refusals), so
    # guard against None instead of letting `.strip()` raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

def get_topic(book_table: pd.DataFrame) -> str:
    """
    Ask the language model for a short topic label describing a set of books.

    Each row is rendered as one "<title>:... <type>:(...) <subject>:..." line
    and the lines are substituted into TOPIC_PROMPT. Missing metadata (None,
    NaN, pd.NA) is rendered as an empty field so that the literal text
    "None"/"nan" is never sent to the model.

    Args:
        book_table: Frame with 'title', 'type' and 'subject' columns.

    Returns:
        The text returned by generate_text for the filled-in prompt.
    """
    def _field(value) -> str:
        # `value != value` is true only for NaN, the one float unequal to itself.
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    book_descriptions = [
        f"<title>:{_field(title)} <type>:({_field(type_)}) <subject>:{_field(subject)}"
        for title, type_, subject in zip(
            book_table['title'].tolist(),
            book_table['type'].tolist(),
            book_table['subject'].tolist(),
        )
    ]

    prompt = TOPIC_PROMPT.format(list_of_titles='\n'.join(book_descriptions))
    return generate_text(prompt)



300000


In [117]:
# Load the per-book metadata table. Later cells rely on its `language`,
# `topic_label`, `title`, `type` and `subject` columns.
table = pd.read_parquet('topic_table.parquet')

In [118]:
# Keep only the rows whose language code equals LANG, then display the result.
table = table.loc[table['language'] == LANG]
table

Unnamed: 0,title,creator,type,publisher,date,language,description,subject,relation,rights,identifier,coverage,format,topic_label
20,Corwynt!,"Tomos, Angharad,1958-",text,Y Lolfa,1988,wel,,,,,,,,2635
1364,Coginio traddodiadol bara ceirch a rhai bwydyd...,Welsh Folk Museum.,text,[Cardiff] Amgueddfa Genedlaethol Cymru (Amgued...,1982,wel,Text on inside front cover,"Cooking, Welsh.",,,,,,1059
1384,"Siwrneu, neu daith Cristiana a'i phlant o ddin...","Bunyan, John,1628-1688.",text,[Caerfyrddin] : Argraphwyd yng Haerfyrddin gan...,[1726?],wel,"Rowlands, W. Cambrian bib.,",,,,http://galenet.galegroup.com/servlet/ECCO?c=1&...,,,1704
1471,Glas ydi'r nefoedd /,"Edwards, Sonia.",text,"Caernarfon : Gwasg Gwynedd,","1995, c1993.",wel,Originally published: 1993.,,,,,,,-1
1495,Y safonau gofynnol cenedlaethol ar gyfer gwarc...,Wales.Welsh Assembly Government.,text,"Caerdydd : Llywodraeth Cynulliad Cymru,",2002.,wel,Includes bibliographical references.,Child care workers,,,,,,1956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270393,"Llyfr gweddi gyffredin, a gweinyddiad y sacram...",,text,"Rhydychen : Argraphedig yn argraphdy y brifysgol,",[1947?],wel,,,,,,,,1704
1270549,Llyfr mawr Culhwch /,Welsh Joint Education Committee.,text,"Aberystwyth : Cymdeithas Lyfrau Ceredigion,",1995.,wel,Published under the auspices of the Welsh Join...,Welsh language,,,,,,1504
1270634,Gofal ac addysg plant. [electronic resource] /,"Tassoni, Penny.",text,"[Aberystwyth] : CAA,",[2010?],wel,Title from container.,,,Available for use on a standalone workstation ...,,,,1242
1271920,Y calendr comedi for,"Roberts, Aled.",text,Llanrwst Gwasg Carreg Gwalch,1988,wel,,,,,,,,1455


In [119]:
# Rank topic labels by book count and keep the 100 largest clusters.
# The -1 label is dropped by value rather than by skipping position 0, so the
# selection stays correct when -1 is not the most frequent label (or absent).
# NOTE(review): assumes -1 is the clustering noise/outlier label — confirm.
common_topic_series = (
    table.topic_label
    .value_counts()              # sorted by count, descending
    .drop(-1, errors="ignore")   # no-op when the label is not present
    .head(100)
)
common_topic_ids = common_topic_series.index.tolist()
common_topic_counts = common_topic_series.tolist()

In [120]:
def process_topic(args):
    """
    Name one topic from a sample of its books and package it for the chart.

    Args:
        args: Tuple ``(topic_id, topic_count, table)``. ``table`` must contain
            a ``topic_label`` column plus the columns get_topic reads; rows of
            other topics are ignored.

    Returns:
        dict with 'name' (the generated label followed by ': <count>') and
        'size' (the topic count).
    """
    topic_id, topic_count, table = args
    topic_books = table[table.topic_label == topic_id]

    # Sample without replacement so no book is repeated in the prompt; topics
    # with fewer than 10 books simply contribute all of their books.
    sample_size = min(10, len(topic_books))
    topic = get_topic(topic_books.sample(n=sample_size))

    return {
        'name': topic + ': ' + str(topic_count),
        'size': topic_count,
    }

# Prepare one argument tuple per topic. Each tuple carries only that topic's
# rows, so the pool serializes a small slice per task instead of the whole
# table; process_topic's own filter on topic_label then keeps every row.
args_list = [
    (tid, tcount, table[table.topic_label == tid])
    for tid, tcount in zip(common_topic_ids, common_topic_counts)
]

# Use all CPU cores, or specify processes=N
with Pool(processes=cpu_count()) as pool:
    # imap yields results in input order; wrapping it in tqdm with an explicit
    # total gives a progress bar, and list() drains the iterator.
    common_topic_info = list(tqdm(pool.imap(process_topic, args_list), total=len(args_list)))

100%|██████████| 100/100 [00:08<00:00, 11.65it/s]


In [121]:
# Sanity check: number of topic entries produced by the pool.
len(common_topic_info)

100

In [122]:

def generate_bubble_chart_html(
    topic_data: list[dict],
    output_filepath: str = "my_new_chart.html",
    root_name: str = "Library Collection",
    title: str = "Library Topic Popularity"
):
    """
    Generates a complete D3 bubble chart HTML file from a list of topic data.

    Topic names and the title are treated as untrusted text: they are
    HTML-escaped wherever they are placed in markup, "<" is escaped inside the
    embedded JSON so a name cannot terminate the surrounding <script> element,
    and bubble labels are set as text rather than as HTML.

    Args:
        topic_data: A list of dictionaries, where each dict has a "name" (str)
                    and "size" (int/float) key.
                    Example: [{"name": "Fiction", "size": 9500}, ...]
        output_filepath: The name of the HTML file to create (e.g., "index.html").
        root_name: The name for the central root node (default: "Library Collection").
        title: The main title for the HTML page and header (default: "Library Topic Popularity").

    Returns:
        None. The page is written to ``output_filepath`` and a success message
        is printed; a write error is printed rather than raised.
    """
    # Function-scope import: only this function needs HTML escaping.
    from html import escape

    # --- 1. Process Data ---

    # Calculate total size for the legend
    total_size = sum(item.get("size", 0) for item in topic_data)
    total_size_formatted = f"{total_size:,}"  # Add commas

    # Create the hierarchical data structure D3 needs
    data_for_js = {
        "name": root_name,
        "children": topic_data
    }

    # Serialize with json.dumps, then emit every "<" as the JSON escape
    # \u003c. "<" can only occur inside JSON string values, and the escape
    # decodes to the same character in JavaScript, but it prevents "</script>"
    # or "<!--" inside a name from being interpreted by the HTML parser.
    js_data_string = json.dumps(data_for_js, indent=4).replace("<", "\\u003c")
    js_data_block = f"const data = {js_data_string};"

    # Generate the HTML list items for the legend. Names land in element text,
    # so escaping &, < and > is sufficient (quote=False leaves quotes as-is).
    legend_items = []
    for item in topic_data:
        name = escape(str(item.get("name", "N/A")), quote=False)
        size = item.get("size", 0)
        size_formatted = f"{size:,}"
        legend_items.append(
            f'                <li><span class="font-medium">{name}:</span> {size_formatted}</li>'
        )
    legend_items_html = "\n".join(legend_items)

    # The title is inserted into <title> and <h1> text.
    safe_title = escape(str(title), quote=False)

    # --- 2. Define HTML Template ---
    # This is the entire index.html file as an f-string template. Literal
    # braces belonging to CSS/JavaScript are doubled ({{ }}); single braces
    # mark the Python values injected above.

    html_template = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <!-- Load Tailwind CSS -->
    <script src="https://cdn.tailwindcss.com"></script>
    <!-- Load D3.js -->
    <script src="https://d3js.org/d3.v7.min.js"></script>
    <style>
        /* Define the font family for better aesthetics */
        :root {{
            font-family: 'Inter', sans-serif;
        }}

        /* Basic styling for the SVG container */
        #bubble-chart-container {{
            width: 100%;
            height: 80vh; /* Use viewport height for a good size */
            display: flex;
            justify-content: center;
            align-items: center;
            overflow: hidden; /* Hide overflow outside the SVG viewbox */
        }}

        /* Styling for the bubbles */
        .node circle {{
            cursor: pointer;
            transition: fill 0.3s, transform 0.3s;
            stroke: #fff;
            stroke-width: 2px;
        }}

        .node circle:hover {{
            opacity: 0.85;
            /* Note: We remove the CSS scale on hover here because it might conflict with D3 zoom transform */
        }}
    </style>
</head>
<body class="bg-gray-100 min-h-screen p-4 sm:p-8">

    <div class="max-w-4xl mx-auto">
        <!-- Header Card -->
        <header class="text-center mb-8 p-6 bg-white rounded-xl shadow-lg">
            <h1 class="text-3xl font-extrabold text-indigo-700">{safe_title}</h1>
            <p class="mt-2 text-gray-600">
                A D3 Bubble Chart visualizing the relative size/frequency of major library topics.
                Bubble size corresponds to the number of books in that category.
            </p>
            <p class="mt-4 text-sm font-semibold text-green-600">
                Use your mouse wheel to <strong>zoom</strong> and click-and-drag to <strong>pan</strong> the chart!
            </p>
        </header>

        <!-- Bubble Chart Container -->
        <div class="bg-white p-4 sm:p-8 rounded-xl shadow-lg border border-gray-200">
            <div id="bubble-chart-container">
                <!-- D3 SVG will be inserted here -->
            </div>
        </div>

        <!-- Data Legend/Key -->
        <div class="mt-8 p-4 bg-white rounded-xl shadow-md text-sm text-gray-700">
            <h2 class="font-bold text-lg mb-2 text-indigo-600">Data Key</h2>
            <p>The total "size" of the collection represented here is {total_size_formatted} units (e.g., books/documents). The topics are:</p>
            <ul class="list-disc list-inside mt-2 grid grid-cols-2 sm:grid-cols-4 gap-x-4">
{legend_items_html}
            </ul>
        </div>
    </div>

    <script>
        // Global variable to hold the initialized D3 chart group
        let chartGroup; 

        // Use a function to ensure D3 logic runs only after the DOM is fully loaded.
        function createBubbleChart() {{
            // --- 1. Dynamic Data (Injected by Python) ---
{js_data_block}

            // --- 2. Chart Setup and Responsiveness ---
            const container = document.getElementById('bubble-chart-container');
            let width = container.clientWidth;
            let height = container.clientHeight;

            // Initialize or clear the SVG
            d3.select("#bubble-chart-container svg").remove();

            const svg = d3.select("#bubble-chart-container")
                .append("svg")
                .attr("viewBox", `0 0 ${{width}} ${{height}}`)
                .attr("preserveAspectRatio", "xMinYMin meet")
                .style("display", "block");

            // Append a group element that will hold the zoomable chart content
            // All bubbles and text will be appended to this group.
            chartGroup = svg.append("g"); 

            // Define the pack layout function
            const pack = d3.pack()
                .size([width, height])
                .padding(2);

            // --- 3. Data Processing ---
            const root = d3.hierarchy(data)
                .sum(d => d.size) // Use the 'size' property to determine bubble area
                .sort((a, b) => b.value - a.value); // Sort larger bubbles first

            // Apply the layout to the root node to calculate positions (x, y) and radius (r)
            const nodes = pack(root).descendants();

            // --- 4. Color Scale ---
            // Define a color scale to differentiate topics
            const color = d3.scaleOrdinal(d3.schemeCategory10);

            // Label font size, shared by the initial draw and the resize handler
            // so both always agree. Scales with the radius, capped at 24px.
            const labelFontSize = d => `${{Math.min(24, Math.max(1, d.r / 6))}}px`;
            
            // Define the group (node) elements for the bubbles and text, appending to chartGroup
            const node = chartGroup.selectAll(".node")
                .data(nodes.filter(d => d.depth === 1)) // Filter to only include the main topics (depth 1)
                .enter().append("g")
                .attr("class", "node")
                .attr("transform", d => `translate(${{d.x}},${{d.y}})`);

            // --- 5. Draw Bubbles (Circles) ---
            node.append("circle")
                .attr("r", d => d.r) // Use the radius calculated by d3.pack
                .attr("fill", (d, i) => color(i))
                .attr("opacity", 0.9)
                .append("title") // Add tooltips for accessibility
                .text(d => `${{d.data.name}}: ${{d.data.size}} units`);

            // --- 6. Draw Labels (Text) ---
            // We use <foreignObject> to embed HTML <divs> which allows for text wrapping.
            const foreignObject = node.append("foreignObject")
                .attr("width", d => d.r * 2) // Set width to bubble diameter
                .attr("height", d => d.r * 2) // Set height to bubble diameter
                .attr("x", d => -d.r) // Center horizontally (offset by radius)
                .attr("y", d => -d.r) // Center vertically (offset by radius)
                .style("pointer-events", "none"); // Text should not interfere with mouse events

            // Add the HTML div for text content
            foreignObject.append("xhtml:div")
                .style("width", "100%")
                .style("height", "100%")
                .style("display", "flex")
                .style("align-items", "center")
                .style("justify-content", "center")
                .style("text-align", "center")
                .style("font-weight", "600")
                .style("color", "white")
                .style("text-shadow", "0 1px 1px rgba(0, 0, 0, 0.3)")
                .style("font-size", labelFontSize)
                // Allow long words to break and wrap
                .style("word-break", "break-word")
                .style("overflow-wrap", "break-word")
                .text(d => d.data.name); // Set the label as plain text, never as markup

            // --- 7. Zoom Functionality ---
            
            // Handler function for the zoom event
            function zoomed(event) {{
                // Apply the new transform (k, x, y) to the chart group
                chartGroup.attr("transform", event.transform);
            }}

            // Create the zoom behavior
            const zoom = d3.zoom()
                .scaleExtent([0.5, 8]) // Allow scaling between 50% and 800%
                .on("zoom", zoomed); // Attach the zoom handler

            // Apply the zoom behavior to the main SVG element
            svg.call(zoom);


            // --- 8. Responsive handling for resizing (container changes) ---
            function handleResize() {{
                // Recalculate container dimensions
                width = container.clientWidth;
                height = container.clientHeight;

                // Update SVG viewbox and pack layout size
                svg.attr("viewBox", `0 0 ${{width}} ${{height}}`);
                pack.size([width, height]);
                
                // Recalculate layout and update positions/radii
                const newRoot = pack(root);

                // Store the selection of nodes and bind the new data
                const updatedNodes = chartGroup.selectAll(".node")
                    .data(newRoot.descendants().filter(d => d.depth === 1));

                // Update positions of the <g> node group
                updatedNodes.attr("transform", d => `translate(${{d.x}},${{d.y}})`);

                // Use the updated selection to update the child <circle>
                updatedNodes.select("circle")
                    .attr("r", d => d.r);

                // Use the updated selection to update the child <foreignObject>
                const updatedForeignObject = updatedNodes.select("foreignObject")
                    .attr("width", d => d.r * 2)
                    .attr("height", d => d.r * 2)
                    .attr("x", d => -d.r)
                    .attr("y", d => -d.r);
                
                // And update the div inside it
                updatedForeignObject.select("div")
                    .style("font-size", labelFontSize)
                    .text(d => d.data.name); // Re-apply text in case it was lost
            }}

            // Debounce the resize event to prevent performance issues
            let resizeTimer;
            window.addEventListener('resize', () => {{
                clearTimeout(resizeTimer);
                resizeTimer = setTimeout(handleResize, 150);
            }});
            
            // Call resize once on load to ensure initial fit
            handleResize();
        }}

        // Run the chart function when the window loads
        window.onload = createBubbleChart;
    </script>
</body>
</html>
"""

    # --- 3. Write File ---
    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(html_template)
        print(f"Successfully generated HTML file at: {output_filepath}")
    except IOError as e:
        print(f"Error writing file {output_filepath}: {e}")


In [123]:
# Write the bubble chart for the selected language to "<LANG>_top100_topics.html".
generate_bubble_chart_html(
    topic_data=common_topic_info,
    output_filepath=f'{LANG}_top100_topics.html',
    title=f'{LANGUAGE} Topic Popularity',
)

Successfully generated HTML file at: wel_top100_topics.html
