In [1]:
import json
import pandas as pd

with open('../data/orgs/uk/govuk_orgs_enriched.json', 'r') as file:
    json_data = json.load(file)


# Read JSON data into a DataFrame
df = pd.read_json('../data/orgs/uk/govuk_orgs_enriched.json')

# Write DataFrame to CSV
df.to_csv('data.csv', index=False)

# break down parent-child relationships to allow graphing

In [2]:
# iot graph parent/child relationships, we want to:
# - break down the nested dict within 'parent' into 'parent_id' and 'parent_url' 
# --- actually we don't need parent_url as they should map
# - so append a new column to the df with parent_id and use that and id to generate the edges


# iterate through each row
# extract parent_id
# create a new df
# zip the two together
df['parent_id'] = df['parent_organisations'].apply(
      lambda x: x[0]['id'] if x and len(x) > 0 else None
)
df


    #print(row.parent_organisations[:].get('id'))
    # print(json_normalize(row.parent_organisations))

Unnamed: 0,id,title,format,updated_at,web_url,details,analytics_identifier,parent_organisations,child_organisations,superseded_organisations,superseding_organisations,non_govuk_domain,parent_id
0,https://www.gov.uk/api/organisations/active-tr...,Active Travel England,Executive agency,2024-10-03T13:15:29.000+01:00,https://www.gov.uk/government/organisations/ac...,"{'slug': 'active-travel-england', 'abbreviatio...",EA1350,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],https://www.activetravelengland.gov.uk,https://www.gov.uk/api/organisations/departmen...
1,https://www.gov.uk/api/organisations/advanced-...,Advanced Research and Invention Agency,Executive non-departmental public body,2022-07-19T09:24:41.000+01:00,https://www.gov.uk/government/organisations/ad...,{'slug': 'advanced-research-and-invention-agen...,PB1364,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],https://www.aria.org.uk,https://www.gov.uk/api/organisations/departmen...
2,https://www.gov.uk/api/organisations/advisory-...,Advisory Committee on Animal Feedingstuffs,Advisory non-departmental public body,2014-10-15T15:36:52.000+01:00,https://www.gov.uk/government/organisations/ad...,{'slug': 'advisory-committee-on-animal-feeding...,PB573,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.food.gov.uk/acaf,https://www.gov.uk/api/organisations/food-stan...
3,https://www.gov.uk/api/organisations/advisory-...,Advisory Committee on Novel Foods and Processes,Advisory non-departmental public body,2014-10-15T15:36:52.000+01:00,https://www.gov.uk/government/organisations/ad...,{'slug': 'advisory-committee-on-novel-foods-an...,PB574,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.food.gov.uk/acnfp,https://www.gov.uk/api/organisations/food-stan...
4,https://www.gov.uk/api/organisations/advisory-...,Advisory Committee on the Microbiological Safe...,Advisory non-departmental public body,2014-10-15T15:36:53.000+01:00,https://www.gov.uk/government/organisations/ad...,{'slug': 'advisory-committee-on-the-microbiolo...,PB575,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.food.gov.uk/acmsf,https://www.gov.uk/api/organisations/food-stan...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
890,https://www.gov.uk/api/organisations/welsh-lan...,Welsh Language Commissioner,Other,2015-02-26T14:26:21.000+00:00,https://www.gov.uk/government/organisations/we...,"{'slug': 'welsh-language-commissioner', 'abbre...",OT1101,[],[],[],[{'id': 'https://www.gov.uk/api/organisations/...,http://www.comisiynyddygymraeg.org/,
891,https://www.gov.uk/api/organisations/westminst...,Westminster Foundation for Democracy,Executive non-departmental public body,2020-09-02T06:02:51.000+01:00,https://www.gov.uk/government/organisations/we...,{'slug': 'westminster-foundation-for-democracy...,OT316,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.wfd.org/,https://www.gov.uk/api/organisations/foreign-c...
892,https://www.gov.uk/api/organisations/wilton-park,Wilton Park,Executive agency,2020-09-02T06:01:01.000+01:00,https://www.gov.uk/government/organisations/wi...,"{'slug': 'wilton-park', 'abbreviation': 'Wilto...",EA62,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.wiltonpark.org.uk,https://www.gov.uk/api/organisations/foreign-c...
893,https://www.gov.uk/api/organisations/yorkshire...,Yorkshire Dales National Park Authority,Other,2014-10-15T15:36:47.000+01:00,https://www.gov.uk/government/organisations/yo...,{'slug': 'yorkshire-dales-national-park-author...,OT547,[{'id': 'https://www.gov.uk/api/organisations/...,[],[],[],http://www.yorkshiredales.org.uk/,https://www.gov.uk/api/organisations/departmen...


In [3]:
# Look at all orgs with a parent
df[df['parent_id'].notna()][['parent_id','id']]

Unnamed: 0,parent_id,id
0,https://www.gov.uk/api/organisations/departmen...,https://www.gov.uk/api/organisations/active-tr...
1,https://www.gov.uk/api/organisations/departmen...,https://www.gov.uk/api/organisations/advanced-...
2,https://www.gov.uk/api/organisations/food-stan...,https://www.gov.uk/api/organisations/advisory-...
3,https://www.gov.uk/api/organisations/food-stan...,https://www.gov.uk/api/organisations/advisory-...
4,https://www.gov.uk/api/organisations/food-stan...,https://www.gov.uk/api/organisations/advisory-...
...,...,...
888,https://www.gov.uk/api/organisations/departmen...,https://www.gov.uk/api/organisations/wallace-c...
891,https://www.gov.uk/api/organisations/foreign-c...,https://www.gov.uk/api/organisations/westminst...
892,https://www.gov.uk/api/organisations/foreign-c...,https://www.gov.uk/api/organisations/wilton-park
893,https://www.gov.uk/api/organisations/departmen...,https://www.gov.uk/api/organisations/yorkshire...


In [5]:
import networkx as nx
import plotly.graph_objects as go

g = nx.DiGraph()
# Populate edges: parent_id:id matching where there IS a parent
edges = df[df['parent_id'].notna()][['parent_id','id']].to_records(index=False)
g.add_edges_from(edges)

# List of isolated nodes: id where there is NO parent
isolated_nodes = df[df['parent_id'].isna()]['id'].tolist()       
g.add_nodes_from(isolated_nodes)

# A dictionary to translate ID to the easily readable title
labels = dict(zip(df['id'], df['title']))


pos = nx.spring_layout(g, k=2, iterations=50)
### Could add in bipartite graph here instead

edge_x = []
edge_y = []
for edge in g.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_text = []
for node in g.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
# If the label exists for 'node' - return the friendly name, else return 'node' 
    node_text.append(labels.get(node, node))

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    textfont=dict(size=8),
    hoverinfo='text',
    marker=dict(
        showscale=False,
        colorscale='Blues',
        size=10,
        color='lightblue',
        line_width=2))

fig = go.Figure(data=[edge_trace, node_trace],
            layout=go.Layout(
                title=f'UK Government Organizations Hierarchy<br>{g.number_of_nodes()} nodes, {g.number_of_edges()} edges',
                showlegend=False,
                hovermode='closest',
                margin=dict(b=0,l=0,r=0,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),                height=800))

fig.show()

In [6]:
import plotly.figure_factory as ff
from collections import defaultdict

# Build the tree structure from parent-child relationships
def build_tree_hierarchy(df):
    """
    Build a hierarchical tree structure from the dataframe.
    Returns a list of roots (nodes with no parents) and a mapping of parent to children.
    """
    # Create parent->children mapping
    children_map = df[df['parent_id'].notna()][['parent_id','id']]
    # Find root nodes (nodes with no parents but with children)
    roots = df[(df['parent_id'].isna())&(df['child_organisations'].apply(len) > 0)]['id'].tolist()
    
    return roots, children_map

def create_dendrogram_data(node_id, children_map, labels, level=0, x_offset=0):
    """
    Recursively create dendrogram coordinates for hierarchical tree.
    Returns list of nodes with their positions and connections.
    """
    nodes = []
    children = children_map.get(node_id, [])
    
    if not children:
        # Leaf node
        nodes.append({
            'id': node_id,
            'label': labels.get(node_id, node_id),
            'x': x_offset,
            'y': level,
            'is_leaf': True
        })
        return nodes, x_offset + 1
    
    # Internal node - process children first
    child_nodes = []
    current_x = x_offset
    
    for child_id in children:
        child_data, current_x = create_dendrogram_data(
            child_id, children_map, labels, level + 1, current_x
        )
        child_nodes.extend(child_data)
    
    # Position parent at midpoint of children
    if child_nodes:
        child_x_positions = [n['x'] for n in child_nodes if n['id'] in children]
        parent_x = sum(child_x_positions) / len(child_x_positions)
    else:
        parent_x = x_offset
    
    nodes.append({
        'id': node_id,
        'label': labels.get(node_id, node_id),
        'x': parent_x,
        'y': level,
        'is_leaf': False
    })
    nodes.extend(child_nodes)
    
    return nodes, current_x

# Build tree hierarchy
roots, children_map = build_tree_hierarchy(df)

# Create dendrogram for each root (there may be multiple disconnected trees)
all_nodes = []
current_x_offset = 0

for root in roots:
    root_nodes, new_x_offset = create_dendrogram_data(
        root, children_map, labels, 0, current_x_offset
    )
    all_nodes.extend(root_nodes)
    current_x_offset = new_x_offset + 2  # Add spacing between trees

# Create plotly figure
fig = go.Figure()

# Draw edges
for _, row in df[df['parent_id'].notna()].iterrows():
    parent_id = row['parent_id']
    child_id = row['id']
    
    parent_node = next((n for n in all_nodes if n['id'] == parent_id), None)
    child_node = next((n for n in all_nodes if n['id'] == child_id), None)
    
    if parent_node and child_node:
        # Draw L-shaped connector (horizontal then vertical)
        fig.add_trace(go.Scatter(
            x=[parent_node['x'], parent_node['x'], child_node['x']],
            y=[parent_node['y'], child_node['y'], child_node['y']],
            mode='lines',
            line=dict(color='lightgray', width=1),
            hoverinfo='skip',
            showlegend=False
        ))

# Draw nodes
node_x = [n['x'] for n in all_nodes]
node_y = [n['y'] for n in all_nodes]
node_text = [n['label'] for n in all_nodes]
node_colors = ['lightcoral' if n['is_leaf'] else 'lightblue' for n in all_nodes]

fig.add_trace(go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    marker=dict(
        size=8,
        color=node_colors,
        line=dict(color='darkgray', width=1)
    ),
    text=node_text,
    textposition='middle right',
    textfont=dict(size=7),
    hovertext=node_text,
    hoverinfo='text',
    showlegend=False
))

fig.update_layout(
    title='Hierarchical Dendrogram of UK Government Organizations<br>Parent-Child Relationships',
    xaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        title='Organizations'
    ),
    yaxis=dict(
        showgrid=True,
        zeroline=False,
        title='Hierarchy Level',
        autorange='reversed'  # Put roots at top
    ),
    height=500,
    width=540,
    hovermode='closest',
    plot_bgcolor='white'
)

fig.show()

print(f"Total nodes: {len(all_nodes)}")
print(f"Root nodes (top-level organizations): {len(roots)}")
print(f"Organizations with parents: {len(df[df['parent_id'].notna()])}")

Total nodes: 14
Root nodes (top-level organizations): 14
Organizations with parents: 471


In [None]:
import pandas as pd
import plotly.graph_objects as go
import json
from collections import defaultdict, deque

def parse_orgs(org_list):
    """Extract organization IDs from the organization list"""
    # Handle None/NaN
    if org_list is None:
        return []
    # If it's already a list, extract IDs
    if isinstance(org_list, list):
        if len(org_list) == 0:
            return []
        # Extract just the 'id' field from each organization dict
        return [org.get('id') for org in org_list if isinstance(org, dict) and 'id' in org]
    # Fallback for unexpected types
    return []

# Extract relationships
df['parent_list'] = df['parent_organisations'].apply(parse_orgs)
df['child_list'] = df['child_organisations'].apply(parse_orgs) 


# Create ID lookup
df['org_id'] = df['id']
id_to_title = dict(zip(df['org_id'], df['title']))
id_to_format = dict(zip(df['org_id'], df['format']))

# Build parent-child mapping
parent_to_children = defaultdict(list)
child_to_parent = {}

for idx, row in df.iterrows():
    org_id = row['org_id']
    
    # Add children relationships - child is already an ID string, not a dict
    for child_id in row['child_list']:
        parent_to_children[org_id].append(child_id)
        child_to_parent[child_id] = org_id

# Find root nodes (organizations with no parents)
roots = [org_id for org_id in df['org_id'] if org_id not in child_to_parent]

print(f"Total organizations: {len(df)}")
print(f"Root organizations (no parents): {len(roots)}")
print(f"Organizations with children: {sum(1 for children in parent_to_children.values() if children)}")

# Print all unique formats to verify them
print(f"\nUnique organization formats:")
for fmt in sorted(df['format'].unique()):
    count = len(df[df['format'] == fmt])
    print(f"  {fmt}: {count}")

# Define format-based size weights, based on a reckoning of importance (i.e. ministerial departments are largest)
FORMAT_WEIGHTS = {
    'Ministerial department': 1000,
    'Non-ministerial department': 800,
    'Executive non-departmental public body': 600,
    'Executive agency': 600,
    'Executive office': 600,
    'Public corporation': 600,
    'Tribunal non-departmental public body': 400,
    'Independent monitoring body': 400,
    'Civil service': 400,
    'Court': 400,
    'Devolved administration': 1000,
    'Other': 400,
    'Advisory non-departmental public body': 400,
    'Sub-organisation': 400,
}

# Build tree structure for plotly treemap
# We need: labels (names), parents (parent names), and values
labels = []
parents = []
ids = []
formats = []
values = []

# Add all organizations
for org_id in df['org_id']:
    labels.append(id_to_title.get(org_id, org_id.split('/')[-1]))
    ids.append(org_id)
    org_format = id_to_format.get(org_id, 'Other')
    formats.append(org_format)
    
    # Set parent
    if org_id in child_to_parent:
        parent_id = child_to_parent[org_id]
        parents.append(id_to_title.get(parent_id, parent_id.split('/')[-1]))
    else:
        # Root organization - set parent to empty string
        parents.append("")
    
    # Count total descendants (BFS)
    descendants = 0
    queue = deque([org_id])
    visited = {org_id}
    while queue:
        current = queue.popleft()
        for child in parent_to_children.get(current, []):
            if child not in visited:
                visited.add(child)
                queue.append(child)
                descendants += 1
    
    # Calculate value based on format weight + number of descendants
    # Base weight from format + bonus for each descendant
    format_weight = FORMAT_WEIGHTS.get(org_format, 50)
    descendant_bonus = descendants * 5  # Each descendant adds 5 to the size
    values.append(format_weight + descendant_bonus)

# Create DataFrame for plotting
plot_df = pd.DataFrame({
    'labels': labels,
    'parents': parents,
    'ids': ids,
    'formats': formats,
    'values': values
})

Total organizations: 895
Root organizations (no parents): 794
Organizations with children: 43

Unique organization formats:
  Advisory non-departmental public body: 46
  Court: 1
  Devolved government: 3
  Executive agency: 115
  Executive non-departmental public body: 145
  Executive office: 1
  Independent monitoring body: 4
  Ministerial department: 30
  Non-ministerial department: 14
  Other: 427
  Public corporation: 22
  Special health authority: 4
  Sub organisation: 73
  Tribunal: 10

Treemap saved: ../org_hierarchy_proper_treemap.html


In [70]:

import plotly.express as px
plot_colors = ['#1D70B8', '#5694CA', '#912B88', '#D53880', '#F499BE',
              '#00703C', '#85994B', '#FFDD00', '#FFB81C', '#F47738']

# Create Treemap
fig_treemap = px.treemap(
    plot_df,
    names='labels',
    parents='parents',
    values='values',
    color='formats',
    hover_data={'ids': True, 'formats': True,},
    title='UK Government Organizational Hierarchy (Treemap)<br><sub>Size reflects org format and descendant count</sub>',
    height=2000,
    width=2800,
    color_discrete_sequence=plot_colors
)

fig_treemap.update_traces(
    textposition='middle center',
    textfont=dict(
        size=11,
        family='Arial, sans-serif',
        color='white'
    ),
    marker=dict(
        line=dict(color='white', width=4),
        pad=dict(t=2, l=2, r=2, b=2)
    ),
    hovertemplate='<b>%{label}</b><br>Type: %{customdata[0]}<extra></extra>'
)


fig_treemap.update_layout(
    font=dict(size=12, family='Arial, sans-serif'),
    margin=dict(t=100, l=10, r=10, b=10),
    paper_bgcolor='#f8f9fa',
    plot_bgcolor='#f8f9fa',
    title=dict(
        text='UK Government Organizational Hierarchy<br><sub style="font-size:13px; color:#505a5e;">Size reflects org format and descendant count</sub>',
        font=dict(size=26, color='#0b0c0c', family='Arial'),
        x=0.5,
        xanchor='center'
    ),
)

fig_treemap.update_layout(
    font=dict(size=10),
    margin=dict(t=80, l=0, r=0, b=20)
)

treemap_path = '../org_hierarchy_proper_treemap.html'
fig_treemap.write_html(treemap_path)
print(f"\nTreemap saved: {treemap_path}")


Treemap saved: ../org_hierarchy_proper_treemap.html


In [None]:
import pandas as pd
import plotly.graph_objects as go
import json
import textwrap
from collections import defaultdict, deque

def parse_orgs(org_list):
    """Extract organization IDs from the organization list"""
    # Handle None/NaN
    if org_list is None:
        return []
    # If it's already a list, extract IDs
    if isinstance(org_list, list):
        if len(org_list) == 0:
            return []
        # Extract just the 'id' field from each organization dict
        return [org.get('id') for org in org_list if isinstance(org, dict) and 'id' in org]
    # Fallback for unexpected types
    return []

def wrap_text_textwrap(text, width=20):
    """Wrap text using Python's textwrap module"""
    return '<br>'.join(textwrap.wrap(text, width=width))

# Extract relationships
df['parent_list'] = df['parent_organisations'].apply(parse_orgs)
df['child_list'] = df['child_organisations'].apply(parse_orgs) 


# Create ID lookup
df['org_id'] = df['id']
id_to_title = dict(zip(df['org_id'], df['title']))
id_to_format = dict(zip(df['org_id'], df['format']))

# Build parent-child mapping
parent_to_children = defaultdict(list)
child_to_parent = {}

for idx, row in df.iterrows():
    org_id = row['org_id']
    
    # Add children relationships - child is already an ID string, not a dict
    for child_id in row['child_list']:
        parent_to_children[org_id].append(child_id)
        child_to_parent[child_id] = org_id

# Find root nodes (organizations with no parents)
roots = [org_id for org_id in df['org_id'] if org_id not in child_to_parent]

print(f"Total organizations: {len(df)}")
print(f"Root organizations (no parents): {len(roots)}")
print(f"Organizations with children: {sum(1 for children in parent_to_children.values() if children)}")

# Print all unique formats to verify them
print(f"\nUnique organization formats:")
for fmt in sorted(df['format'].unique()):
    count = len(df[df['format'] == fmt])
    print(f"  {fmt}: {count}")

# Define format-based size weights, based on a reckoning of importance (i.e. ministerial departments are largest)
FORMAT_WEIGHTS = {
    'Ministerial department': 1000,
    'Non-ministerial department': 800,
    'Executive non-departmental public body': 600,
    'Executive agency': 600,
    'Executive office': 600,
    'Public corporation': 600,
    'Tribunal non-departmental public body': 200,
    'Independent monitoring body': 200,
    'Civil service': 200,
    'Court': 200,
    'Devolved administration': 1000,
    'Other': 200,
    'Advisory non-departmental public body': 200,
    'Sub-organisation': 200,
}

# Build tree structure for plotly treemap
# We need: labels (names), parents (parent names), and values
labels = []
parents = []
ids = []
formats = []
values = []

# Add all organizations
for org_id in df['org_id']:
    # Wrap long titles for better display
    title = id_to_title.get(org_id, org_id.split('/')[-1])
    wrapped_title = wrap_text_textwrap(title, width=25)
    labels.append(wrapped_title)
    
    ids.append(org_id)
    org_format = id_to_format.get(org_id, 'Other')
    formats.append(org_format)
    
    # Set parent
    if org_id in child_to_parent:
        parent_id = child_to_parent[org_id]
        parent_title = id_to_title.get(parent_id, parent_id.split('/')[-1])
        wrapped_parent = wrap_text_textwrap(parent_title, width=25)
        parents.append(wrapped_parent)
    else:
        # Root organization - set parent to empty string
        parents.append("")
    
    # Count total descendants (BFS)
    descendants = 0
    queue = deque([org_id])
    visited = {org_id}
    while queue:
        current = queue.popleft()
        for child in parent_to_children.get(current, []):
            if child not in visited:
                visited.add(child)
                queue.append(child)
                descendants += 1
    
    # Calculate value based on format weight + number of descendants
    # Base weight from format + bonus for each descendant
    format_weight = FORMAT_WEIGHTS.get(org_format, 50)
    descendant_bonus = descendants * 5  # Each descendant adds 5 to the size
    values.append(format_weight + descendant_bonus)

# Create DataFrame for plotting
plot_df = pd.DataFrame({
    'labels': labels,
    'parents': parents,
    'ids': ids,
    'formats': formats,
    'values': values
})

import plotly.express as px

# Professional government color scheme
gov_colors = ['#1D70B8', '#5694CA', '#912B88', '#D53880', '#F499BE',
              '#00703C', '#85994B', '#FFDD00', '#FFB81C', '#F47738']

# Create Treemap
fig_treemap = px.treemap(
    plot_df,
    names='labels',
    parents='parents',
    values='values',
    color='formats',
    color_discrete_sequence=gov_colors,
    hover_data={'ids': False, 'formats': True, 'values': False},
    title='UK Government Organizational Hierarchy',
    height=1200,
    width=1600,
    branchvalues='total',  # Ensures parent size = sum of children, reserves space for parent labels
)

fig_treemap.update_traces(
    textposition='top left',  # Position parent labels at top left so they're visible
    textfont=dict(
        size=12,
        family='Arial, sans-serif',
        color='white'
    ),
    marker=dict(
        line=dict(color='white', width=4),
        pad=dict(t=25, l=5, r=5, b=5)  # Add top padding to create header space for parent labels
    ),
    hovertemplate='<b>%{label}</b><br>Type: %{customdata[0]}<extra></extra>'
)

fig_treemap.update_layout(
    font=dict(size=12, family='Arial, sans-serif'),
    margin=dict(t=100, l=10, r=10, b=10),
    paper_bgcolor='#f8f9fa',
    plot_bgcolor='#f8f9fa',
    title=dict(
        text='UK Government Organizational Hierarchy<br><sub style="font-size:13px; color:#505a5e;">Size reflects org format and descendant count</sub>',
        font=dict(size=26, color='#0b0c0c', family='Arial'),
        x=0.5,
        xanchor='center'
    ),
)

treemap_path = '../org_hierarchy_proper_treemap.html'
fig_treemap.write_html(treemap_path)
print(f"\nTreemap saved: {treemap_path}")

Total organizations: 895
Root organizations (no parents): 794
Organizations with children: 43

Unique organization formats:
  Advisory non-departmental public body: 46
  Court: 1
  Devolved government: 3
  Executive agency: 115
  Executive non-departmental public body: 145
  Executive office: 1
  Independent monitoring body: 4
  Ministerial department: 30
  Non-ministerial department: 14
  Other: 427
  Public corporation: 22
  Special health authority: 4
  Sub organisation: 73
  Tribunal: 10

Treemap saved: ../org_hierarchy_proper_treemap.html


In [95]:
import pandas as pd
import plotly.graph_objects as go
import json
import textwrap
from collections import defaultdict, deque

def parse_orgs(org_list):
    """Extract organization IDs from the organization list"""
    # Handle None/NaN
    if org_list is None:
        return []
    # If it's already a list, extract IDs
    if isinstance(org_list, list):
        if len(org_list) == 0:
            return []
        # Extract just the 'id' field from each organization dict
        return [org.get('id') for org in org_list if isinstance(org, dict) and 'id' in org]
    # Fallback for unexpected types
    return []

def wrap_text_textwrap(text, width=20):
    """Wrap text using Python's textwrap module"""
    return '<br>'.join(textwrap.wrap(text, width=width))

# Extract relationships
df['parent_list'] = df['parent_organisations'].apply(parse_orgs)
df['child_list'] = df['child_organisations'].apply(parse_orgs) 


# Create ID lookup
df['org_id'] = df['id']
id_to_title = dict(zip(df['org_id'], df['title']))
id_to_format = dict(zip(df['org_id'], df['format']))

# Build parent-child mapping
parent_to_children = defaultdict(list)
child_to_parent = {}

for idx, row in df.iterrows():
    org_id = row['org_id']
    
    # Add children relationships - child is already an ID string, not a dict
    for child_id in row['child_list']:
        parent_to_children[org_id].append(child_id)
        child_to_parent[child_id] = org_id

# Find root nodes (organizations with no parents)
roots = [org_id for org_id in df['org_id'] if org_id not in child_to_parent]

print(f"Total organizations: {len(df)}")
print(f"Root organizations (no parents): {len(roots)}")
print(f"Organizations with children: {sum(1 for children in parent_to_children.values() if children)}")

# Print all unique formats to verify them
print(f"\nUnique organization formats:")
for fmt in sorted(df['format'].unique()):
    count = len(df[df['format'] == fmt])
    print(f"  {fmt}: {count}")

# Define format-based size weights, based on a reckoning of importance (i.e. ministerial departments are largest)
FORMAT_WEIGHTS = {
    'Ministerial department': 1000,
    'Non-ministerial department': 800,
    'Executive non-departmental public body': 600,
    'Executive agency': 600,
    'Executive office': 600,
    'Public corporation': 600,
    'Tribunal non-departmental public body': 200,
    'Independent monitoring body': 200,
    'Civil service': 200,
    'Court': 200,
    'Devolved administration': 1000,
    'Other': 200,
    'Advisory non-departmental public body': 200,
    'Sub-organisation': 200,
}

# Build tree structure for plotly treemap
# We need: labels (names), parents (parent names), and values
labels = []
parents = []
ids = []
formats = []
values = []
web_urls = []

# Add all organizations
for org_id in df['org_id']:
    # Wrap long titles for better display
    title = id_to_title.get(org_id, org_id.split('/')[-1])
    wrapped_title = wrap_text_textwrap(title, width=25)
    labels.append(wrapped_title)
    web_urls.append(df[df['org_id'] == org_id]['web_url'].values[0] if not df[df['org_id'] == org_id].empty else "")
    
    ids.append(org_id)
    org_format = id_to_format.get(org_id, 'Other')
    formats.append(org_format)
    
    # Set parent
    if org_id in child_to_parent:
        parent_id = child_to_parent[org_id]
        parent_title = id_to_title.get(parent_id, parent_id.split('/')[-1])
        wrapped_parent = wrap_text_textwrap(parent_title, width=25)
        parents.append(wrapped_parent)
    else:
        # Root organization - set parent to empty string
        parents.append("")
    
    # Count total descendants (BFS)
    descendants = 0
    queue = deque([org_id])
    visited = {org_id}
    while queue:
        current = queue.popleft()
        for child in parent_to_children.get(current, []):
            if child not in visited:
                visited.add(child)
                queue.append(child)
                descendants += 1
    
    # Calculate value based on format weight + bonus for number of descendants
    format_weight = FORMAT_WEIGHTS.get(org_format, 50)
    descendant_bonus = descendants * 5 
    values.append(format_weight + descendant_bonus)

# Create DataFrame for plotting
plot_df = pd.DataFrame({
    'labels': labels,
    'parents': parents,
    'ids': ids,
    'formats': formats,
    'values': values,
    'web_urls': web_urls
})

import plotly.express as px

# Professional 
plot_colors = ['#1D70B8', '#003078', '#5694CA', '#2B8CC4', '#004C8C',
              '#505a5e', '#626a6e', '#b1b4b6', '#dee0e2', '#6f777b']

# Create Treemap
fig_treemap = px.treemap(
    plot_df,
    names='labels',
    parents='parents',
    values='values',
    color='formats',
    color_discrete_sequence=plot_colors,
    hover_data={'formats': True, 'web_urls': True},
    title='UK Government Organizational Hierarchy',
    height=1200,
    width=1600,
)

fig_treemap.update_traces(
    textposition='top left',  # Position labels at top-left to create header effect
    textfont=dict(
        size=11,
        family='Arial, sans-serif',
        color='white'
    ),
    marker=dict(
        line=dict(color='grey', width=1),
        pad=dict(t=20, l=3, r=3, b=3)  # Add top padding for header space
    ),
    hovertemplate='<b>%{label}</b><br>Type: %{customdata[0]}<br>URL: %{customdata[1]}<extra></extra>'
)

fig_treemap.update_layout(
    font=dict(size=12, family='Arial, sans-serif'),
    margin=dict(t=100, l=10, r=10, b=10),
    paper_bgcolor='#f8f9fa',
    plot_bgcolor='#f8f9fa',
    title=dict(
        text='UK Government Organizational Hierarchy<br><sub style="font-size:13px; color:#505a5e;">Size reflects org format and descendant count</sub>',
        font=dict(size=26, color='#0b0c0c', family='Arial'),
        x=0.5,
        xanchor='center'
    ),
)

treemap_path = '../org_hierarchy_proper_treemap.html'
fig_treemap.write_html(treemap_path)
print(f"\nTreemap saved: {treemap_path}")

Total organizations: 895
Root organizations (no parents): 794
Organizations with children: 43

Unique organization formats:
  Advisory non-departmental public body: 46
  Court: 1
  Devolved government: 3
  Executive agency: 115
  Executive non-departmental public body: 145
  Executive office: 1
  Independent monitoring body: 4
  Ministerial department: 30
  Non-ministerial department: 14
  Other: 427
  Public corporation: 22
  Special health authority: 4
  Sub organisation: 73
  Tribunal: 10

Treemap saved: ../org_hierarchy_proper_treemap.html
