# Topological data analysis of U.S. city demographics

This notebook uses persistence diagrams of city dual graphs with Black/Hispanic population data to compare cities with one another and across time. Clustering experiments are in the other notebook.


In [None]:
from gerrychain import Graph #see https://gerrychain.readthedocs.io/en/latest/
import networkx as nx
import numpy as np
import gudhi
import random
import gudhi.hera
from sklearn.manifold import MDS
import os
from numpy import inf
import matplotlib.pyplot as plt
import scipy
from scipy.stats import pearsonr
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd
import geopandas as gpd
from matplotlib.pyplot import Circle
from tqdm import tqdm
# Render figure text through LaTeX.
# NOTE(review): presumably requires a working LaTeX installation on the machine - confirm.
plt.rcParams['text.usetex'] = True
# Finite stand-in for an infinite death time: pd_from_graph writes it and
# infinity_is_one recognises it.
INFINITY = 1e6
# Number of cities taken from the head of the city table.
n = 100

In [None]:
# Create one output folder per family of figures; existing folders are left alone.
for figure_dir in (
    'figs/clustering',
    'figs/MDS',
    'figs/pds and maps',
    'figs/time',
    'figs/TP',
):
    os.makedirs(figure_dir, exist_ok=True)

## Choose demographic group

Set `col` to the demographic group to analyze: `"BLACK"` or `"HISP"`.

In [None]:
# Node attribute holding the group's population count ('BLACK' or 'HISP');
# it also selects the hard-coded label layouts in the plotting cells.
col = 'HISP'

## Useful functions

In [None]:
def infinity_is_one(PD):
    """Cap infinite death times of a persistence diagram at 1.

    Every point whose death equals ``inf`` or the ``INFINITY`` sentinel is
    replaced by ``(birth, 1)``.  The list is modified in place and that same
    list object is returned.
    """
    for idx in range(len(PD)):
        birth, death = PD[idx][0], PD[idx][1]
        if death == inf or death == INFINITY:
            PD[idx] = (birth, 1)
    return PD
    
def pd_from_graph(graph1, column, popthreshold=0):
    """Dimension-0 persistence diagram of a graph filtered by group share.

    A node whose ``TOTPOP`` exceeds ``popthreshold`` gets the filtration value
    ``1 - column/TOTPOP``.  Any other node takes ``1 - max`` over the shares of
    its neighbours that do exceed the threshold, or ``1`` when it has no such
    neighbour.  Edges are inserted without an explicit value, and
    ``make_filtration_non_decreasing`` is called once all nodes are assigned.

    Returns the list of ``(birth, death)`` pairs of dimension 0, with an
    infinite death replaced by ``INFINITY``.
    """
    def share(vertex):
        attrs = graph1.nodes[vertex]
        return attrs[column] / attrs['TOTPOP']

    def populated(vertex):
        return graph1.nodes[vertex]['TOTPOP'] > popthreshold

    tree = gudhi.SimplexTree()
    for vertex in graph1.nodes:
        tree.insert([vertex])  # 0-simplex: a list holding a single vertex
    for u, v in graph1.edges:
        tree.insert([u, v])  # 1-simplex for each edge of the graph

    for simplex, _ in tree.get_skeleton(0):
        vertex = simplex[0]
        if populated(vertex):
            tree.assign_filtration(simplex, filtration=1 - share(vertex))
            continue
        # Too few residents for a meaningful share: borrow from the neighbours.
        nearby = [share(m) for m in graph1.neighbors(vertex) if populated(m)]
        value = 1 - max(nearby) if nearby else 1
        tree.assign_filtration(simplex, value)
    tree.make_filtration_non_decreasing()

    return [
        (birth, INFINITY) if death == inf else (birth, death)
        for dim, (birth, death) in tree.persistence()
        if dim == 0
    ]

def wasserstein_between_pds(pd1, pd2, p=1):
    """Wasserstein distance between two diagrams, computed by ``gudhi.hera``.

    ``p`` is passed as both the ``order`` and the ``internal_p`` argument.
    """
    exponent = p
    return gudhi.hera.wasserstein_distance(
        pd1,
        pd2,
        order=exponent,
        internal_p=exponent,
    )

def bottleneck_between_pds(pd1, pd2):
    """Bottleneck distance between two diagrams, computed by ``gudhi``."""
    distance_value = gudhi.bottleneck_distance(pd1, pd2)
    return distance_value

  
def total_persistence(pd1, p=1):
    """Return the ``p``-norm of the lifetimes (death minus birth) in ``pd1``."""
    lifetimes = [point[1] - point[0] for point in pd1]
    return np.linalg.norm(np.array(lifetimes), ord=p)

def minshare(graph1, column1):
    """Share of the graph's summed ``TOTPOP`` that belongs to ``column1``."""
    group_total = 0
    population_total = 0
    for node in graph1.nodes:
        attrs = graph1.nodes[node]
        group_total += attrs[column1]
        population_total += attrs['TOTPOP']
    return group_total / population_total

def moransI(graph, col, pop_col='TOTPOP'):
    """Spatial autocorrelation of the per-node share ``col / pop_col``.

    The adjacency matrix is divided column-wise by its column sums, the shares
    are centred on their mean, and the quadratic form of the centred shares
    under that matrix is divided by their squared norm.  ``1e-9`` is added to
    each population so a zero population does not divide by zero.
    """
    adjacency = nx.adjacency_matrix(graph).toarray()
    weights = adjacency / adjacency.sum(axis=0)
    shares = np.array([
        graph.nodes[node][col] / (graph.nodes[node][pop_col] + 1e-9)
        for node in graph.nodes
    ])
    centered = shares - np.mean(shares)
    numerator = np.dot(np.dot(centered, weights), centered)
    denominator = np.dot(centered, centered)
    return numerator / denominator

def DI(graph, col):
    """Dissimilarity index between the ``col`` group and everyone else.

    Half the sum, over nodes, of the absolute difference between the node's
    fraction of the whole group and its fraction of the remaining population
    (``TOTPOP`` minus the group).
    """
    node_ids = list(graph.nodes)
    group = np.array([graph.nodes[node][col] for node in node_ids])
    everyone = np.array([graph.nodes[node]['TOTPOP'] for node in node_ids])

    group_fraction = group / group.sum()
    rest_fraction = (everyone - group) / (everyone.sum() - group.sum())
    return 0.5 * sum(np.abs(group_fraction - rest_fraction))

def outliers(distance_matrix):
    """Indices of points flagged as outliers by Local Outlier Factor.

    The negative outlier factor is computed on the precomputed distance matrix
    for each ``n_neighbors`` from 10 to 19 and averaged per point.  A point is
    reported when its average is at most -2.
    """
    scores_per_k = []
    for k in range(10, 20):
        detector = LocalOutlierFactor(metric='precomputed', n_neighbors=k)
        scores_per_k.append(detector.fit(distance_matrix).negative_outlier_factor_)
    mean_scores = np.mean(scores_per_k, axis=0)
    return [idx for idx, score in enumerate(mean_scores) if score <= -2]

## Load data

In [None]:
# City table: NAME and ST are joined into the file stem used to locate each
# graph (no separator) and into the display label (single space).
list_of_cities_pd = pd.read_csv('./City_Names_And_Populations.csv')
name_state_pairs = list(zip(list_of_cities_pd.NAME, list_of_cities_pd.ST))[:n]
list_of_cities = [name + state for name, state in name_state_pairs]
city_names = [name + ' ' + state for name, state in name_state_pairs]

# Seven evenly spaced samples of the tab20 colormap.
cmap = plt.get_cmap('tab20')
colors = [cmap(step / 6) for step in range(7)]

In [None]:
# Load the dual graph of each of the first n cities for both census years,
# keeping the order of list_of_cities so that index i means the same city
# in every per-year list.
list_of_graphs = {'2010':[], '2020':[]}
for year in list_of_graphs:
    for i in range(n):
        graph1=Graph.from_json('./cities{}data/{}.json'.format(year, list_of_cities[i]))
        # Disabled alternative: report the fraction of nodes outside the largest
        # connected component, then keep only that component.
#         cc = sorted(nx.connected_components(graph1), key=len, reverse=True)
#         print(list_of_cities[i], (len(graph1) - len(cc[0]))/len(graph1))
#         graph1 = graph1.subgraph(cc[0])
        list_of_graphs[year].append(graph1) 
# Dimension-0 diagrams of the selected column for every city and year. Nodes with
# TOTPOP of 10 or less take their value from neighbours, and infinite deaths are capped at 1.
PDs = {y: [infinity_is_one(pd_from_graph(graph, col, popthreshold=10)) for graph in list_of_graphs[y]] for y in list_of_graphs}


In [None]:
# Record one reference coordinate per city: the C_X / C_Y attributes of the
# first node of its 2010 graph.
# NOTE(review): this is a single node's coordinate, presumably good enough to
# place the city on a national map - confirm it is not meant to be a centroid.
x = []
y = []
for graph in list_of_graphs['2010']:
    node = next(iter(graph.nodes))
    x.append(graph.nodes[node]['C_X'])
    y.append(graph.nodes[node]['C_Y'])

# Build the frame in one step, indexed by the city file stem.
coords = pd.DataFrame(
    {'x': x, 'y': y},
    index=pd.Index(list_of_cities[:n], name='name'),
)
# `index` is a boolean switch; the header of the index column is set with
# `index_label`. The written file is the same as before.
coords.to_csv('city_coordinates.csv', index=True, index_label='name')

# MDS embeddings

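Display a planar MDS embedding of the 100 cities, in which distances between points approximate the pairwise distances between the cities' persistence diagrams. Some hardcoding is required to make the labels appear in the correct places.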

In [None]:
# Name of the diagram distance; in this notebook it is only read when choosing
# the hard-coded label layout for the MDS figure.
distance = 'Wasserstein'
# Census year whose diagrams are embedded.
year = '2020'

In [None]:
# Pairwise Wasserstein distances between the diagrams of the selected year.
# Row r / column c holds the distance from diagram c to diagram r; averaging
# with the transpose makes the matrix exactly symmetric.
year_diagrams = PDs[year]
raw_distances = np.array([
    [wasserstein_between_pds(pd_col, pd_row) for pd_col in year_diagrams]
    for pd_row in year_diagrams
])
distances_pairwise = (raw_distances + raw_distances.T)/2

#hacks for displaying nicely for the plots in the paper
# xadj / yadj are not read anywhere in this cell; kept in case other cells rely on them.
xadj, yadj = np.zeros(n), np.zeros(n)
xlims, ylims = None, None
# Defaults: label no city. Without them every (distance, year, col) combination
# other than the two tuned below would stop with a NameError in the annotation loop.
labelxs = (0, 0)
to_tag, lr_tag, yoffset = [], [], []
# to_tag: indices into city_names to label; lr_tag: which entry of labelxs gives
# the label's x position; yoffset: vertical shift of the label text.
if distance == 'Wasserstein' and year == '2020' and col == 'BLACK':
    labelxs = (-9.5, 6)
    to_tag =  [0,1,2, 3,5,8]
    lr_tag =  [0,1,0, 0,1,0]
    yoffset = [0,0,-0.5,0,0,-0.5]
    xlims = (-10,12)
    ylims = None
elif distance == 'Wasserstein' and year == '2020' and col == 'HISP':
    labelxs = (-20, -5)
    to_tag =  [0,1,2,3,7,8,22,5]
    lr_tag =  [0,0,0,0,0,0,0,0]
    yoffset = [0,0,0.3,-1.5,0,-1,1.2,0]
    xlims = (-21, 4)
    ylims = None


plt.subplots(figsize=(5,10))
# MDS is created without a random_state, so the global NumPy seed set here is
# what makes the embedding (and therefore the hard-coded label positions) repeatable.
np.random.seed(2023)
mds = MDS(dissimilarity='precomputed')
pos = mds.fit_transform(distances_pairwise)
plt.scatter(
    pos[:,0], pos[:,1]
)

outlier_points = outliers(distances_pairwise)

# Redraw the outliers on top in a different colour and list them (1-based index).
for i in outlier_points:
    print(i+1, city_names[i])
    plt.scatter(
        pos[i:i+1,0], pos[i:i+1,1], c=[colors[2]]
    )

# Arrow from a label placed at a fixed x position to the city's point.
for i, side, shift in zip(to_tag, lr_tag, yoffset):
    plt.annotate(
        city_names[i],
        xy=(pos[i,0], pos[i,1]),
        xytext=(labelxs[side], pos[i,1]+shift),
        c='black', fontsize=12, arrowprops={'arrowstyle':'->'}
    )
plt.gca().set_aspect(1)
if xlims is not None:
    plt.xlim(xlims)
if ylims is not None:
    plt.ylim(ylims)
plt.savefig('figs/MDS/MDS_{}cities_{}_{}.png'.format(n, year, col), dpi=150, bbox_inches='tight')

## Total persistence plotting

Plot the total persistence against other stats.

### Against total population

In [None]:
# Census year used by the total-persistence cells that follow.
year = '2020'

In [None]:
def total_pop(graph):
    """Sum of the ``TOTPOP`` attribute over every node of ``graph``."""
    population = 0
    for node in graph.nodes:
        population += graph.nodes[node]['TOTPOP']
    return population
def demo_pop(graph, col):
    """Sum of the ``col`` attribute over every node of ``graph``."""
    group_population = 0
    for node in graph.nodes:
        group_population += graph.nodes[node][col]
    return group_population

In [None]:
# Per-city statistics for the selected year, in the order of city_names.
tps = [total_persistence(PD) for PD in PDs[year]]
totpops = [total_pop(g) for g in list_of_graphs[year]]
demopops = [demo_pop(g, col) for g in list_of_graphs[year]]

# find outliers
# The distance matrix is rebuilt for the current `year` rather than reused from
# the MDS cell, whose year is set separately; otherwise changing `year` here
# would highlight outliers of a different year than the plotted points.
distances_pairwise = np.array([
    [wasserstein_between_pds(pd1, pd2) for pd1 in PDs[year]]
    for pd2 in PDs[year]
])
distances_pairwise = (distances_pairwise + distances_pairwise.T)/2
outlier_points = outliers(distances_pairwise)


#vs total population
plt.subplots(figsize=(3.5,3.5))
plt.scatter(
    totpops, tps,
    s=10
)
# Outliers are drawn again on top, in red.
plt.scatter(
    [totpops[i] for i in outlier_points],
    [tps[i] for i in outlier_points],
    c='red',s =10
)
plt.xlabel('Total population')
plt.ylabel('Total persistence')
# Pearson correlation, only shown if the title line below is re-enabled.
r = pearsonr(totpops, tps)[0]
#plt.title('r = {:.2f}'.format(r))
plt.savefig('figs/TP/tps_vs_totpops_{}_{}_{}.png'.format(n, year, col), dpi=150, bbox_inches='tight')

#vs minority population
# Display names of the group columns, used in the axis label.
realname = {'HISP':'Hispanic', 'BLACK': 'Black'}
plt.subplots(figsize=(3.5,3.5))
plt.scatter(
    demopops, tps,
    s=10
)
# Outliers are drawn again on top, in red.
plt.scatter(
    [demopops[i] for i in outlier_points],
    [tps[i] for i in outlier_points],
    c='red',s =10
)
# Hard-coded labels: i indexes city_names, and the label text is placed at a
# fixed x position with an arrow pointing back to the city's point.
if col=='HISP':
    i=6
    plt.annotate(
        city_names[i],
        xy=(demopops[i], tps[i]),
        xytext=(1.5e6, tps[i]),
        c='black', fontsize=12, arrowprops={'arrowstyle':'->'}
    )
    
if col=='BLACK':
    # Vertical shift of each label; keys 4 and 5 are not visited by the loop below.
    yshift = {2:0, 3:0, 4:-0.6, 5:0, 18:0}
    for i in [2,3,18]:
        plt.annotate(
            city_names[i],
            xy=(demopops[i], tps[i]),
            xytext=(1.2e6, tps[i] + yshift[i]),
            c='black', fontsize=12, arrowprops={'arrowstyle':'->'}
        )
plt.xlabel('Total {} population'.format(realname[col]))
plt.ylabel('Total persistence')
# Pearson correlation, only shown if the title line below is re-enabled.
r = pearsonr(demopops, tps)[0]
#plt.title('r = {:.2f}'.format(r))
plt.savefig('figs/TP/tps_vs_demopops_{}_{}_{}.png'.format(n, year, col), dpi=150, bbox_inches='tight')

### Plotting against dissimilarity


In [None]:
# Per-city statistics for the selected year. Total persistence is recomputed
# here so the cell does not depend on which earlier cell last set `tps`.
DIs = [DI(g, col) for g in list_of_graphs[year]]
tps = [total_persistence(PD) for PD in PDs[year]]

# find outliers
distances_pairwise = np.array([
    [wasserstein_between_pds(pd1, pd2) for pd1 in PDs[year]]
    for pd2 in PDs[year]
])
distances_pairwise = (distances_pairwise + distances_pairwise.T)/2

outlier_points = outliers(distances_pairwise)


#vs dissimilarity index
plt.subplots(figsize=(3.5,7))
plt.scatter(
    DIs, tps,
    s=10
)
# Outliers are drawn again on top, in red.
plt.scatter(
    [DIs[i] for i in outlier_points],
    [tps[i] for i in outlier_points],
    c='red',s =10
)
plt.xlabel('Dissimilarity index')
plt.ylabel('Total persistence')
# Hard-coded indices into city_names of the cities to label, per group.
# Labels sit at x = 0.85 with an arrow pointing back to the city's point.
tagged_cities = {
    'BLACK': [0,1,3,8,18],
    'HISP': [0,1,2,3,22,6],
}
for i in tagged_cities.get(col, []):
    plt.annotate(
        city_names[i],
        xy=(DIs[i], tps[i]),
        xytext=(0.85, tps[i]),
        c='black', fontsize=12, arrowprops={'arrowstyle':'->'}
    )
# Compute the correlation once; r is only shown if the title line is re-enabled.
correlation = pearsonr(DIs, tps)
r = correlation[0]
print(correlation)
# NOTE(review): the x range extends past the label position at 0.85, presumably
# to leave room for the label text - confirm.
plt.xlim(0,1.45)
#plt.title('r = {:.2f}'.format(r))
plt.savefig('figs/TP/tps_vs_DI_{}_{}_{}.png'.format(n, year, col), dpi=150, bbox_inches='tight')

## Change over time

In [None]:
def _diagrams_by_year(column):
    """Capped dimension-0 diagrams of every city for ``column``, keyed by census year."""
    return {
        census_year: [
            infinity_is_one(pd_from_graph(city_graph, column, popthreshold=10))
            for city_graph in list_of_graphs[census_year]
        ]
        for census_year in list_of_graphs
    }


PDs_BLACK = _diagrams_by_year('BLACK')

PDs_HISP = _diagrams_by_year('HISP')


In [None]:
# Per-city Wasserstein distance between the 2010 and the 2020 diagram,
# computed separately for each group.
diffs_BLACK = list(map(
    wasserstein_between_pds, PDs_BLACK['2010'], PDs_BLACK['2020']
))

diffs_HISP = list(map(
    wasserstein_between_pds, PDs_HISP['2010'], PDs_HISP['2020']
))

In [None]:
def _plot_change_panel(first, last, filename, legend=False):
    """Horizontal bar chart of the 2010-2020 change for cities ``first`` to ``last - 1``.

    Each city gets two bars, one from ``diffs_BLACK`` and one from
    ``diffs_HISP``, offset by 0.15 above and below its tick.  The figure is
    saved to ``filename`` and shown; the legend is drawn only when ``legend``
    is true.
    """
    rows = list(range(last - first))
    fig, ax = plt.subplots(figsize=(4,20))
    ax.barh(
        [row+0.15 for row in rows],
        diffs_BLACK[first:last],
        height=0.3,
        label='Black'
    )
    ax.barh(
        [row-0.15 for row in rows],
        diffs_HISP[first:last],
        height=0.3,
        label='Hispanic'
    )
    ax.set_yticks(rows)
    ax.set_yticklabels(labels=city_names[first:last])
    # Same fixed x range on every panel so the bars are comparable across panels.
    ax.set_xlim(0,8)
    if legend:
        ax.legend()
    plt.savefig(filename, dpi=150, bbox_inches='tight')
    plt.show()


# Split the cities into two panels. The split point follows the number of
# cities actually loaded (50 / 50 for 100 cities) instead of a fixed 50, which
# made the second panel fail with a length mismatch for any other city count.
n_cities = len(diffs_BLACK)
half = (n_cities + 1)//2

_plot_change_panel(0, half, 'figs/time/Wasserstein_2010_2020_1.png')
_plot_change_panel(half, n_cities, 'figs/time/Wasserstein_2010_2020_2.png', legend=True)