# Code for analyzing city formation in embedding space

## Import dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap
from collections import Counter
import matplotlib.patches as mpatches

## Utility functions

In [None]:
def set_plotting_style():
    """
    Apply the shared matplotlib `rcParams` used for the figures.

    All settings are written to the global `plt.rcParams` in one update:
    Arial as the sans-serif font, label/tick/legend/title font sizes,
    line widths, black axis elements on a white background, figure and
    save DPI, and the font-type options for PDF, PS and SVG export.
    The function takes no arguments and returns nothing.
    """
    plt.rcParams.update({
        # Fonts: use the sans-serif family and resolve it to Arial.
        'font.family': 'sans-serif',
        'font.sans-serif': ['Arial'],

        # Font sizes (points).
        'axes.labelsize': 10,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
        'figure.titlesize': 12,

        # Line widths.
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,

        # Colors of axis elements.
        'axes.edgecolor': 'black',
        'axes.labelcolor': 'black',
        'xtick.color': 'black',
        'ytick.color': 'black',

        # Background colors.
        'figure.facecolor': 'white',
        'axes.facecolor': 'white',

        # Resolution for saved files and for on-screen figures.
        'savefig.dpi': 300,
        'figure.dpi': 150,

        # Font type 42 (TrueType) for PDF and PS output.
        'pdf.fonttype': 42,
        'ps.fonttype': 42,

        # Do not restrict PDF output to the 14 core PDF fonts.
        'pdf.use14corefonts': False,

        # Keep SVG text as text objects instead of converting it to paths.
        'svg.fonttype': 'none',
    })

## Load embeddings and city data

In [None]:
# Placeholder locations of the input datasets.
# Replace these with the actual paths to your data files.
path_to_embedding_info = '/path/to/cn_eu/embedding_info.parquet'
path_to_eu_cities = '/path/to/european_cities_attributes.parquet'
path_to_cn_cities = '/path/to/chinese_cities_attributes.parquet'

# Embedding table for the combined EU and China dataset
# (UMAP projections and k-means clusters, per the original description).
embedding_results = pd.read_parquet(path_to_embedding_info)

# European cities: keep only the columns used below and tag the region.
# NOTE(review): an earlier comment here spoke of invalidating 'birth' for grid
# points that are not city-centered, but this cell performs no such step --
# confirm whether it is needed before the Europe branch of the plot is used.
eu_cities_df = (
    pd.read_parquet(path_to_eu_cities)[["city", "city_id", "birth", "id"]]
    .assign(country="Europe")
)

# Chinese prefecture cities: same column subset (different order), tagged as China.
cn_cities_df = (
    pd.read_parquet(path_to_cn_cities)[["id", "city", "city_id", "birth"]]
    .assign(country="China")
)

## Visualize cities in embedding space (Figure 2a,b)
Visualize city locations within the embedding space, colored by historical founding periods.

Select the target country for visualization (either "Europe" or "China") by setting the `country` variable.

In [None]:
# 1. Data Preprocessing
# Select the target country for visualization and merge dataframes
country = "China"  # Options: "Europe" or "China"

# Fail fast on a mistyped value. Without this check any value other than
# "Europe" silently selects the China data, while the masks and legend labels
# further down still use the mistyped name.
if country not in ("Europe", "China"):
    raise ValueError(f'country must be "Europe" or "China", got {country!r}')

if country == "Europe":
    eu_cities_df["country"] = "Europe"
    cities_df = eu_cities_df
else:
    cn_cities_df["country"] = "China"
    cities_df = cn_cities_df

# Left merge: every embedding row is kept; rows without a matching
# (id, country) entry in the city table get NaN in the city columns.
data_to_plot = embedding_results.merge(cities_df, on=["id", "country"], how='left')

# Normalize embedding column names so the rest of the cell can use
# "<method>_0" / "<method>_1" regardless of a "_cosine" suffix.
# The presence of "<method>_0_cosine" decides whether both axes are renamed.
# NOTE(review): "pac" may be a typo for "pca" -- confirm against the parquet columns.
for prefix in ("umap", "pac", "tsne"):
    if f"{prefix}_0_cosine" in data_to_plot.columns:
        data_to_plot = data_to_plot.rename(
            columns={
                f"{prefix}_0_cosine": f"{prefix}_0",
                f"{prefix}_1_cosine": f"{prefix}_1",
            }
        )

# 2. Create Figure and Axes
fig, ax = plt.subplots(figsize=(11, 10), facecolor="none")
plt.rcParams['font.family'] = 'Arial'
cmap = cm.plasma

# 3. Process City Founding Year ('birth') Data
# Prefix of the embedding columns to plot ("<prefix>_0" / "<prefix>_1").
emb_to_show = "umap"

# Build 'birth_float', where NaN marks rows that are not cities.
if country == 'Europe':
    # Europe: cast to float and treat the sentinel value -1 as missing.
    eu_birth = data_to_plot['birth'].astype(float)
    data_to_plot['birth_float'] = eu_birth.mask(eu_birth == -1)
else:
    # Other selection: use 'birth' unchanged; missing values mark non-city rows.
    data_to_plot['birth_float'] = data_to_plot['birth']
mask_non_city = data_to_plot['birth_float'].isna()

# Convert the founding year to a century index: ceil(year / 100).
# Rows without a valid year keep NaN.
valid_birth_mask = data_to_plot['birth_float'].notna()
data_to_plot['century'] = np.nan
data_to_plot.loc[valid_birth_mask, 'century'] = np.ceil(
    data_to_plot.loc[valid_birth_mask, 'birth_float'] / 100
)

# 4. Plot Background Points (Non-City Areas)
# Split the non-city rows into those of the selected country and those of
# the other country so they can be drawn in different background colors.
target_country_mask = (data_to_plot['country'] == country)
mask_target_non_city = mask_non_city & target_country_mask
mask_other_non_city = mask_non_city & ~target_country_mask
other_country = 'Europe' if country == 'China' else 'China'

# Index 0: selected country, index 1: other country.
background_colors = ['#acc8d6', '#CFD2D4']

# Non-city points of the other country are drawn first ...
ax.scatter(
    data_to_plot[f"{emb_to_show}_0"][mask_other_non_city],
    data_to_plot[f"{emb_to_show}_1"][mask_other_non_city],
    s=10, c=[background_colors[1]], alpha=1, edgecolors='none'
)
# ... followed by the non-city points of the selected country.
# (Fix: the comma between `alpha=1` and `edgecolors` was missing, which made
# the whole cell a syntax error.)
ax.scatter(
    data_to_plot[f"{emb_to_show}_0"][mask_target_non_city],
    data_to_plot[f"{emb_to_show}_1"][mask_target_non_city],
    s=10, c=[background_colors[0]], alpha=1, edgecolors='none'
)

# 5. Define Historical Periods for Each Civilization
# Each entry is (century boundaries, legend labels). Period i covers the
# centuries c with bounds[i] < c <= bounds[i + 1], where the century index is
# ceil(year / 100) as computed for the 'century' column.
# NOTE(review): under that rule the Europe bounds begin at the 6th century
# while the first label says 7th, and the China bound 9 places the 10th
# century in the last period although the labels read "4th-10th" / "11th-20th"
# -- confirm that bounds and label text agree.
period_table = {
    'Europe': (
        [5, 10, 13, 16, 20],
        [
            'Early Medieval (7th-10th cent. CE)',
            'High Medieval (11th-13th cent. CE)',
            'Late Medieval (14th-16th cent. CE)',
            'Modern (17th-20th cent. CE)',
        ],
    ),
    'China': (
        [-7, -2, 3, 9, 20],
        [
            # Century index -2 is the 3rd century BCE because of the ceiling.
            'Pre-Qin (Before 3rd cent. BCE)',
            'Qin-Han (2nd cent. BCE - 3rd cent. CE)',
            'Wei-Tang (4th-10th cent. CE)',
            'Song-Qing (11th-20th cent. CE)',
        ],
    ),
}

# Any selection other than 'Europe' uses the China definition.
period_bounds, period_labels = period_table['Europe' if country == 'Europe' else 'China']

# 6. Plot City Points for Each Period
# Walk the periods from latest to earliest so that the earliest period is
# drawn last.
n_periods = len(period_bounds) - 1
for i in range(n_periods - 1, -1, -1):
    lower_century, upper_century = period_bounds[i:i + 2]

    # Colormap position: the earliest period maps to 1.0, the latest to 0.0.
    color = cmap(1 - (i / (n_periods - 1)))

    # Cities whose century falls in (lower_century, upper_century].
    centuries = data_to_plot['century']
    mask_current = (centuries > lower_century) & (centuries <= upper_century)
    if not mask_current.any():
        continue

    # Marker size shrinks by 3 per period, so earlier periods get larger markers.
    ax.scatter(
        data_to_plot.loc[mask_current, f"{emb_to_show}_0"],
        data_to_plot.loc[mask_current, f"{emb_to_show}_1"],
        s=20 - (i * 3), c=[color], alpha=1
    )

# Legend proxies: empty scatter plots that only contribute a legend entry.
# They are added latest-period first; markers are twice the plotted size so
# they stay visible in the legend.
n_legend_periods = len(period_bounds) - 1
for i in reversed(range(n_legend_periods)):
    ax.scatter(
        [], [],
        s=(20 - (i * 3)) * 2,
        c=[cmap(1 - (i / (n_legend_periods - 1)))],
        alpha=1,
        label=period_labels[i],
    )

plt.scatter([], [], s=16, c=[background_colors[0]], alpha=1, label=f'Non-City Areas in {country}')
plt.scatter([], [], s=16, c=[background_colors[1]], alpha=1, label=f'Non-City Areas in {other_country}')

# Order the legend: city periods in chronological order (early to late),
# followed by the two non-city background entries.
handles, labels = ax.get_legend_handles_labels()
city_items, non_city_items = [], []
for handle, label in zip(handles, labels):
    if 'Non-City' in label:
        non_city_items.append((handle, label))
    else:
        city_items.append((handle, label))
handles, labels = zip(*(city_items[::-1] + non_city_items))

legend = ax.legend(
    handles, labels,
    title="  Historical Periods",
    loc="lower right",
    bbox_to_anchor=(0.99, 0.01),
    frameon=False,  # no frame around the legend
    fontsize=15,
    ncol=1,
    alignment='left'
)
legend_title = legend.get_title()
legend_title.set_fontsize(18)
legend_title.set_weight('bold')

# Final styling: hide every spine and remove all ticks.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()

output_path = f'/path/to/your/output/figure_cities_in_embedding_space_{country}.png'
plt.savefig(output_path, dpi=500, transparent=True)

plt.show()

# Counts of cities in each cluster (Figure 2c)

In [None]:
# Load embedding results and city data for Europe and China.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder locations of the input datasets.
path_to_embeddings = "/path/to/your/embeddings_UMAP_eu&cn_clusters.parquet"
path_to_eu_cities = "/path/to/your/european_cities_attributes.parquet"
path_to_cn_cities = "/path/to/your/chinese_cities_attributes.parquet"

# Main embedding table; supplies the 'cluster' column counted below.
embedding_results = pd.read_parquet(path_to_embeddings)

# European cities: mark 'birth' as -1 for grid points that are not
# city-centered, then keep the relevant columns and tag the region.
eu_cities_df = pd.read_parquet(path_to_eu_cities)
eu_cities_df.loc[eu_cities_df['city_centered'] != 1, 'birth'] = -1
eu_cities_df = eu_cities_df[["city", "city_id", "birth", "is_city", "id"]].assign(country="Europe")

# Chinese cities: same invalidation of 'birth', a different column subset,
# and the China region tag.
cn_cities_df = pd.read_parquet(path_to_cn_cities)
cn_cities_df.loc[cn_cities_df['city_centered'] != 1, 'birth'] = -1
cn_cities_df = cn_cities_df[
    ["id", "city", "city_id", "birth", "d_l_river", "d_sea", "mean", "std"]
].assign(country="China")

# Stack both regions and attach the city attributes to the embedding rows.
# The left merge keeps every embedding row.
city_data = embedding_results.merge(
    pd.concat([eu_cities_df, cn_cities_df]),
    on=["id", "country"],
    how='left',
)

# Per-cluster row counts for each region.
# NOTE(review): every merged row of a region is counted, including rows whose
# 'birth' was set to -1 above -- confirm that the embedding table holds
# city rows only if these are meant to be city counts.
china_cluster_counts = Counter(city_data.loc[city_data['country'] == 'China', 'cluster'])
europe_cluster_counts = Counter(city_data.loc[city_data['country'] == 'Europe', 'cluster'])

In [None]:
# Generate a stacked bar chart showing the count of Chinese and European cities per geographic cluster.
# This version replicates Figure 2c (bars only, without per-cluster count labels).

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from matplotlib import rcParams

# Global matplotlib settings for the Figure 2c bar chart, applied in one update.
rcParams.update({
    'font.family': 'Arial',
    'font.size': 8,
    'axes.linewidth': 0.5,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    # 'butt' cap style for solid and dashed lines.
    'lines.solid_capstyle': 'butt',
    'lines.dash_capstyle': 'butt',
    # Font type 42 (TrueType) for PDF and PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Palette with 18 entries, used to color the cluster bars.
colors = [
    '#4DA037', '#4F96A5', '#4E8ABD', '#F6B050', '#77A754', '#E1E18E',
    '#EEC37A', '#D3403C', '#AB5E3D', '#ADADAD', '#7BC277', '#A286BC',
    '#80B7E0', '#EF7E45', '#6D55A1', '#F3CBDF', '#A99698', '#E4D06B',
]

def create_stacked_bar_chart(china_counts, europe_counts, save_path=None):
    """Create a stacked bar chart of per-cluster city counts.

    China's counts form the opaque bottom segment of each bar and Europe's
    counts are stacked on top with partial transparency. Bar colors come
    from the module-level ``colors`` list.

    Args:
        china_counts (dict): Mapping of cluster ID to city count for China.
            Only the IDs 1 to 18 are plotted; missing IDs count as 0.
        europe_counts (dict): Mapping of cluster ID to city count for Europe,
            handled the same way.
        save_path (str, optional): Path of the figure to save. A trailing
            ``.png``, ``.pdf`` or ``.svg`` extension is stripped to obtain a
            base name, and PNG, PDF and SVG versions are written next to each
            other. Defaults to None (nothing is saved).

    Returns:
        tuple: The created ``(fig, ax)`` pair.
    """
    import os  # local import: only needed to derive the output file names

    # Ensure all clusters from 1 to 18 are represented on the x-axis.
    all_clusters = list(range(1, 19))

    # Prepare data arrays based on the complete list of clusters.
    china_data = np.array([china_counts.get(cluster, 0) for cluster in all_clusters])
    europe_data = np.array([europe_counts.get(cluster, 0) for cluster in all_clusters])

    # Small figure at high DPI.
    fig, ax = plt.subplots(figsize=(4, 3), dpi=300)

    # Set up bar positions.
    x = np.arange(len(all_clusters))
    bar_width = 0.9

    # NOTE: Colors are assigned based on the bar's position (index), not its cluster ID.
    bar_colors = [colors[i % len(colors)] for i in range(len(all_clusters))]

    # Plot China bars (bottom layer, fully opaque).
    ax.bar(x, china_data, bar_width,
           color=bar_colors, alpha=1.0, edgecolor='none', linewidth=0, label='China')

    # Plot Europe bars (top layer, semi-transparent).
    ax.bar(x, europe_data, bar_width,
           bottom=china_data,
           color=bar_colors, alpha=0.7, edgecolor='none', linewidth=0, label='Europe')

    # Configure axes labels and ticks.
    ax.set_xlabel('Terrain-water cluster', fontweight='normal', fontsize=7)
    ax.set_ylabel('Number of cities', fontweight='normal', fontsize=7)

    # Set x-axis tick labels (e.g., "01", "02", ...).
    cluster_labels = [f'{str(c).zfill(2)}' for c in all_clusters]
    ax.set_xticks(x)
    ax.set_xticklabels(cluster_labels, ha='center', fontsize=7)
    # Fixed y ticks every 200 up to 800; the visible range is set by set_ylim below.
    ax.set_yticks(range(0, 801, 200))

    # Configure plot spines and grid.
    ax.axhline(y=0, color='black', linewidth=0.5)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', linestyle='-', alpha=0, linewidth=0.5)  # Grid is present but invisible.

    # Set axis limits. The floor of 1 keeps the y-axis at a positive height
    # when every count is zero.
    max_value = max(1, (china_data + europe_data).max())
    ax.set_ylim(0, max_value * 1.05)
    ax.set_xlim(-0.55, len(all_clusters) - 0.45)

    # Fine-tune tick parameters.
    ax.tick_params(axis='both', which='major', labelsize=6, length=3, direction='out')

    fig.tight_layout(pad=0.5)

    if save_path:
        # Derive a base name by splitting off a known image extension instead
        # of using str.replace('.png', ...): replace() leaves the path
        # unchanged when it does not contain '.png' (so all three saves hit
        # the same file) and would also rewrite a '.png' occurring in a
        # directory name.
        base, ext = os.path.splitext(save_path)
        if ext.lower() not in ('.png', '.pdf', '.svg'):
            base = save_path
        # High-resolution raster version.
        fig.savefig(f'{base}.png', dpi=600, bbox_inches='tight', transparent=True)
        # Vector versions.
        fig.savefig(f'{base}.pdf', bbox_inches='tight', transparent=True)
        fig.savefig(f'{base}.svg', bbox_inches='tight', transparent=True)

    return fig, ax

# --- Example Usage ---
# Per-cluster counts for each region, taken from the merged city table.
china_cluster_counts = Counter(city_data.loc[city_data['country'] == 'China', 'cluster'])
europe_cluster_counts = Counter(city_data.loc[city_data['country'] == 'Europe', 'cluster'])

# Output location for the figure.
# NOTE: Please replace with your desired output directory and filename.
output_path = "/path/to/your/output/stacked_cluster_counts.png"

# Build the chart (and save it, since a path is given), then display it.
fig, ax = create_stacked_bar_chart(
    china_counts=china_cluster_counts,
    europe_counts=europe_cluster_counts,
    save_path=output_path,
)
plt.show()

In [None]:
# Generate a stacked bar chart showing the count of Chinese and European cities per geographic cluster.
# This version is not identical to Figure 2c: it adds more information (e.g., the number of cities in each cluster).

# 1. Set Plotting Style
sns.set(style="ticks")
plt.rcParams['font.family'] = 'Arial'

# 2. Define Color Palette
# One color per cluster; a cluster ID c uses entry c - 1, capped at the last entry.
colors = [
    '#4DA037', '#4F96A5', '#4E8ABD', '#F6B050', '#77A754', '#E1E18E',
    '#EEC37A', '#D3403C', '#AB5E3D', '#ADADAD', '#7BC277', '#A286BC',
    '#80B7E0', '#EF7E45', '#6D55A1', '#F3CBDF', '#A99698', '#E4D06B',
]

# 3. Prepare Data for Plotting
# Sorted union of the cluster IDs seen in either region; clusters 10 and 18
# are always included, even when neither region has an entry for them.
all_clusters = sorted(set(china_cluster_counts) | set(europe_cluster_counts) | {10, 18})

# City counts per region, aligned with 'all_clusters' (0 where a cluster is absent).
china_data = [china_cluster_counts.get(c, 0) for c in all_clusters]
europe_data = [europe_cluster_counts.get(c, 0) for c in all_clusters]

# 4. Create the Stacked Bar Chart
fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.7
x = np.arange(len(all_clusters))

# Hatch patterns that tell the two regions apart independently of color.
china_pattern = '/'
europe_pattern = '\\'

# Both layers share the same per-cluster colors, so compute them once.
last_color = len(colors) - 1
cluster_bar_colors = [colors[min(c - 1, last_color)] for c in all_clusters]

# China forms the bottom layer.
china_bars = ax.bar(
    x, china_data, bar_width,
    color=cluster_bar_colors,
    label='China',
    edgecolor='white',
    hatch=china_pattern,
)

# Europe is stacked on top of China.
europe_bars = ax.bar(
    x, europe_data, bar_width,
    bottom=china_data,
    color=cluster_bar_colors,
    label='Europe',
    edgecolor='white',
    hatch=europe_pattern,
)

# 5. Add Data Labels
def add_labels(bars, heights=None):
    """Write each segment's count at the vertical center of the segment.

    Uses the module-level ``ax``. Segments whose height is not greater than
    zero are left unlabeled.

    Args:
        bars: Bar artists providing ``get_height``, ``get_x`` and ``get_width``.
        heights: Optional sequence with the top y-value of each segment, for
            segments that do not start at zero. When omitted, segments are
            assumed to start at zero.
    """
    for idx, rect in enumerate(bars):
        segment = rect.get_height()
        if not segment > 0:
            continue
        # Center of the segment: measured down from its top when tops are
        # given, otherwise up from zero.
        if heights is not None:
            y_center = heights[idx] - segment / 2
        else:
            y_center = segment / 2
        x_center = rect.get_x() + rect.get_width() / 2.
        ax.text(x_center, y_center, f'{int(segment)}',
                ha='center', va='center', fontsize=9, color='white', fontweight='bold')

# Combined height of each stacked bar (China + Europe); reused for the
# Europe segment labels, the total labels and the y-axis limit.
totals = [c + e for c, e in zip(china_data, europe_data)]

# Add the per-segment count labels for each region.
add_labels(china_bars)
add_labels(europe_bars, heights=totals)

# Add total count labels above each bar. The vertical offset is 2% of the
# largest single-region count and does not depend on the bar, so it is
# computed once.
label_offset = max(max(china_data), max(europe_data)) * 0.02
for i, total in enumerate(totals):
    if total > 0:
        ax.text(i, total + label_offset, f'{total}', ha='center', va='bottom', fontsize=9)
    else:
        # Empty clusters get a gray '0' just above the axis.
        ax.text(i, 1, '0', ha='center', va='bottom', fontsize=9, color='gray')

# 6. Configure Axes, Title, and Legend
ax.set_xlabel('Geographic Cluster', fontweight='bold', fontsize=12)
ax.set_ylabel('Number of Cities', fontweight='bold', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(all_clusters)

# Custom legend: gray patches that show only the hatch pattern of each region.
china_patch = mpatches.Patch(facecolor='gray', edgecolor='white', hatch=china_pattern, label='China')
europe_patch = mpatches.Patch(facecolor='gray', edgecolor='white', hatch=europe_pattern, label='Europe')
ax.legend(handles=[china_patch, europe_patch], loc='upper right', frameon=False)

# 7. Finalize Styling
# Leave 15% headroom above the tallest bar for the total labels; fall back to
# a fixed range when every bar is empty.
max_height = max(totals)
ax.set_ylim(0, max_height * 1.15 if max_height > 0 else 5)
ax.grid(axis='y', linestyle='--', alpha=0.7)
sns.despine()  # Remove top and right spines

# 8. Save and Display the Figure
plt.tight_layout()

# Define a generic path for the output figure.
# NOTE: Please replace with your desired output directory.
# The file name differs from the one used for the Figure 2c replica so that
# running both cells does not overwrite that figure's PNG.
output_path = "/path/to/your/output/stacked_cluster_counts_annotated.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')

plt.show()