# The London Underground, Graphed

In [1]:
%pip install folium
import pandas as pd
import networkx as nx
import requests
import folium
from collections import defaultdict



## Constructing Datasets

Using the TfL API we construct tables of the London Underground lines, stations, and connections between those stations.

These datasets are used to build a graph of the transport network for crime analysis.

In [2]:
# Base URL of the TfL API; the request cells append endpoint paths to it.
tfl_api = "https://api.tfl.gov.uk"

### Underground Lines

In [3]:
# Load the cached table of tube lines, or fetch it from the TfL API when no
# usable cache exists yet.
try:
    underground_lines = pd.read_csv("lines.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache triggers a download; any other failure
    # (including a keyboard interrupt) is surfaced instead of being swallowed.
    response = requests.get(tfl_api + "/Line/Mode/tube", timeout=30)
    response.raise_for_status()  # never cache an HTTP error payload as data

    # Keep only the "id" and "name" fields of each returned line record.
    underground_lines = pd.DataFrame(response.json(), columns=["id", "name"])
    underground_lines.to_csv("lines.csv", index=False)

underground_lines

Unnamed: 0,id,name
0,bakerloo,Bakerloo
1,central,Central
2,circle,Circle
3,district,District
4,hammersmith-city,Hammersmith & City
5,jubilee,Jubilee
6,metropolitan,Metropolitan
7,northern,Northern
8,piccadilly,Piccadilly
9,victoria,Victoria


### Stations

In [4]:
# Load the cached station table (indexed by station name), or build it from
# the TfL API when no usable cache exists yet.
try:
    stations = pd.read_csv("stations.csv", index_col="name")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # A set de-duplicates stations that are returned for more than one line.
    station_data = set()

    for line in underground_lines.id:
        response = requests.get(f"{tfl_api}/line/{line}/stoppoints", timeout=30)
        response.raise_for_status()  # never build the table from an error payload

        for point in response.json():
            # `station_id` rather than `id`, so the builtin id() is not shadowed.
            station_id = point["id"]
            name = point["commonName"].removesuffix(" Underground Station").removesuffix("-Underground")
            lat = point["lat"]
            lon = point["lon"]

            # Keep only the tube lines serving this stop point; other transport
            # modes listed for the same stop are ignored.
            line_mode_tube = next(group for group in point["lineModeGroups"] if group["modeName"] == "tube")
            tube_lines = line_mode_tube["lineIdentifier"]

            # Lines are stored as one comma-separated string so the row stays hashable.
            station_data.add((station_id, name, lat, lon, ",".join(tube_lines)))

    # Sorting the set first makes the row order deterministic before the
    # final sort by station name.
    stations = (
        pd.DataFrame(sorted(station_data), columns=["id", "name", "latitude", "longitude", "lines"])
        .set_index("name")
        .sort_values("name")
    )
    stations.to_csv("stations.csv")

stations

Unnamed: 0_level_0,id,latitude,longitude,lines
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acton Town,940GZZLUACT,51.503057,-0.280462,"district,piccadilly"
Aldgate,940GZZLUALD,51.514246,-0.075689,"circle,metropolitan"
Aldgate East,940GZZLUADE,51.515037,-0.072384,"district,hammersmith-city"
Alperton,940GZZLUALP,51.540627,-0.299610,piccadilly
Amersham,940GZZLUAMS,51.674126,-0.607714,metropolitan
...,...,...,...,...
Wimbledon Park,940GZZLUWIP,51.434573,-0.199719,district
Wood Green,940GZZLUWOG,51.597479,-0.109886,piccadilly
Wood Lane,940GZZLUWLA,51.509669,-0.224530,"circle,hammersmith-city"
Woodford,940GZZLUWOF,51.606899,0.033970,central


In [5]:
# Load the cached table of station-to-station connections, or build it from
# the TfL route sequences when no usable cache exists yet.
try:
    connections = pd.read_csv("connections.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Route sequences identify stations by id; map those back to station names.
    station_by_id = dict(zip(stations["id"], stations.index))

    connection_data = []

    for line in underground_lines.id:
        response = requests.get(f"{tfl_api}/line/{line}/Route/Sequence/all", timeout=30)
        response.raise_for_status()  # never build the table from an error payload
        route = response.json()

        for line_route in route["orderedLineRoutes"]:
            station_ids = line_route["naptanIds"]

            # Consecutive stops on a route form one connection. The two names are
            # sorted so both travel directions collapse into the same row.
            for source, dest in zip(station_ids, station_ids[1:]):
                station1, station2 = sorted((station_by_id[source], station_by_id[dest]))
                connection_data.append((station1, station2, line))

    connections = pd.DataFrame(connection_data, columns=["station1", "station2", "line"]).drop_duplicates(ignore_index=True)
    connections.to_csv("connections.csv", index=False)

connections

Unnamed: 0,station1,station2,line
0,Elephant & Castle,Lambeth North,bakerloo
1,Lambeth North,Waterloo,bakerloo
2,Embankment,Waterloo,bakerloo
3,Charing Cross,Embankment,bakerloo
4,Charing Cross,Piccadilly Circus,bakerloo
...,...,...,...
372,Finsbury Park,Seven Sisters,victoria
373,Seven Sisters,Tottenham Hale,victoria
374,Blackhorse Road,Tottenham Hale,victoria
375,Blackhorse Road,Walthamstow Central,victoria


### Dealing with Paddington

Since the crime data does not distinguish between Paddington and Paddington (H&C Line), we merge the two stations into a single "Paddington" station in our graph.

In [6]:
# Fold "Paddington (H&C Line)" into "Paddington".
# The merge is guarded so that re-running this cell is safe: the original
# version appended the line a second time and then failed on the drop.
paddington_hc = "Paddington (H&C Line)"

if paddington_hc in stations.index:
    paddington_lines = stations.loc["Paddington", "lines"].split(",")
    if "hammersmith-city" not in paddington_lines:
        paddington_lines.append("hammersmith-city")
    stations.loc["Paddington", "lines"] = ",".join(paddington_lines)
    stations = stations.drop(paddington_hc)

# Point every connection at the merged station, then remove rows that became
# duplicates as a result of the rename.
connections = (
    connections
    .replace({paddington_hc: "Paddington"})
    .drop_duplicates(ignore_index=True)
)

In [7]:
# Index station names by line id: line id -> set of stations served by it.
# Being a defaultdict, looking up an unknown line yields (and stores) an empty set.
stations_by_line = defaultdict(set)

for station, served_lines in stations["lines"].items():
    for line in served_lines.split(","):
        stations_by_line[line].add(station)

## Graphing the Underground Network

In [8]:
# A multigraph lets two stations be joined by several edges, one per line.
g = nx.MultiGraph()

# Each connection row becomes an edge whose key is the line id.
for station1, station2, line in connections.itertuples(index=False, name=None):
    g.add_edge(station1, station2, key=line)

# Attach each station's line list and coordinates to its node.
for station, details in stations.iterrows():
    g.nodes[station].update(
        lines=details["lines"].split(","),
        latitude=details["latitude"],
        longitude=details["longitude"],
    )

Example: calculating the shortest path between any two stations

In [9]:
# No edge weight is passed, so the result minimises the number of segments travelled.
nx.shortest_path(g, "Victoria", "Camden Town")

['Victoria',
 'Green Park',
 'Oxford Circus',
 'Warren Street',
 'Euston',
 'Camden Town']

## Adding Crime Data

In [10]:
# Download first and only then open the local file, so a failed request can
# no longer leave an empty or truncated crimes.csv behind.
response = requests.get(
    "https://raw.githubusercontent.com/undevised/crime-on-the-underground/main/content/data/crimes.csv",
    timeout=30,
)
response.raise_for_status()

# Write the raw bytes: no dependence on the platform's default text encoding
# or newline translation.
with open("crimes.csv", "wb") as f:
    f.write(response.content)

# keep_default_na=False keeps empty cells as "" instead of NaN; the crime
# processing loop relies on empty locations being falsy strings.
crimes = pd.read_csv("crimes.csv", index_col="Unique ID", keep_default_na=False)

# Turn the operator name(s) into a comma-separated list of line ids:
# " & " becomes "-", " and " becomes "," and the result is lower-cased
# (e.g. "Hammersmith & City" -> "hammersmith-city").
crimes["Lines"] = (
    crimes["Train Operating Company"]
    .str.replace(" & ", "-", regex=False)
    .str.replace(" and ", ",", regex=False)
    .str.lower()
)

crimes

Unnamed: 0_level_0,Category,Start Location,End Location,Reporting Location,Train Operating Company,Lines
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
126452,Station,Acton Town,Acton Town,Acton Town,Piccadilly,piccadilly
126453,Station,Acton Town,Acton Town,Acton Town,Piccadilly,piccadilly
137061,On Train,Alperton,Acton Town,Acton Town,Piccadilly,piccadilly
99258,On Train,Barons Court,Acton Town,Acton Town,Piccadilly,piccadilly
114712,On Train,Barons Court,Acton Town,Acton Town,Piccadilly,piccadilly
...,...,...,...,...,...,...
182258,Station,,,,Central,central
182865,On Train,,,,Central,central
184090,Station,,,,District,district
184078,Station,,,,Northern,northern


In [11]:
# initialise crime records: every node and every edge starts with a "crimes" count of 0
nx.set_node_attributes(g, 0, "crimes")
nx.set_edge_attributes(g, 0, "crimes")

def record_station_crime_at(graph, station):
    """Add one whole crime to the "crimes" count of the node `station`."""
    node_attrs = graph.nodes[station]
    node_attrs["crimes"] += 1

def record_station_crime_on_line(graph, line, weight):
    """Spread `weight` crimes evenly over all stations listed for `line`.

    The stations come from the module-level `stations_by_line` mapping.
    If the line has no stations there, nothing is recorded.
    """
    members = stations_by_line[line]
    if not members:
        return

    per_station = weight / len(members)
    for member in members:
        graph.nodes[member]["crimes"] += per_station

def record_train_crime_between(graph, start_station, end_station):
    """Spread one crime evenly over the edges joining the stations on a shortest path.

    The edges are taken from the subgraph induced by the path's stations, so
    every parallel edge (one per line) between those stations receives a share.
    """
    stops = nx.shortest_path(graph, start_station, end_station)

    # NOTE(review): when both stations are the same the path is a single stop
    # with no segments, so nothing is recorded for that crime - confirm intended.
    segments = list(graph.subgraph(stops).edges(keys=True))

    for segment in segments:
        graph.edges[segment]["crimes"] += 1 / len(segments)

def record_train_crime_on_line(graph, line, weight):
    """Spread `weight` crimes evenly over every segment (edge) of `line`.

    Parameters:
        graph:  multigraph whose edge keys are line ids and whose edges carry
                a "crimes" attribute.
        line:   id of the line to distribute the crime over.
        weight: amount of crime to distribute.

    A segment belongs to the line when its edge key equals `line` and both
    of its stations are listed for the line in `stations_by_line`. If no
    segment matches, nothing is recorded.
    """
    # Build the view from the `graph` argument; the previous version read the
    # global `g`, silently ignoring whichever graph the caller passed in.
    line_view = nx.subgraph_view(
        graph,
        filter_node=lambda node: node in stations_by_line[line],
        filter_edge=lambda a, b, key: key == line,
    )

    # Materialise once: the view is lazy, so len() on it would re-filter the
    # whole graph for every edge.
    segments = list(line_view.edges(keys=True))

    for segment in segments:
        graph.edges[segment]["crimes"] += weight / len(segments)

# Attribute every crime record to the graph, one row at a time.
for _, crime in crimes.iterrows():
    category = crime["Category"].strip()
    start = crime["Start Location"]
    reporting = crime["Reporting Location"]
    lines = crime["Lines"].split(",")

    # A crime without a usable location is shared equally between its lines.
    per_line_weight = 1 / len(lines)

    if category == "Station":
        if not reporting:
            # unknown station: spread over all stations on the listed line(s)
            for line in lines:
                record_station_crime_on_line(g, line, per_line_weight)
        else:
            # known station: the whole crime is recorded there
            record_station_crime_at(g, reporting)

    elif category == "On Train":
        if not (start and reporting):
            # missing endpoint: spread over all segments of the listed line(s)
            for line in lines:
                record_train_crime_on_line(g, line, per_line_weight)
        else:
            # both endpoints known: spread along the path between them
            record_train_crime_between(g, start, reporting)

    else:
        print(f"WARNING: unexpected category '{category}'")

## Adding Passenger Data

In [12]:
# Download first and only then open the local file, so a failed request can
# no longer leave an empty or truncated passengers.csv behind.
response = requests.get(
    "https://raw.githubusercontent.com/undevised/crime-on-the-underground/main/content/data/passengers.csv",
    timeout=30,
)
response.raise_for_status()

# Write the raw bytes: no dependence on the platform's default text encoding
# or newline translation.
with open("passengers.csv", "wb") as f:
    f.write(response.content)

# One row per station, indexed by station name.
passengers = pd.read_csv("passengers.csv", index_col="Station")

passengers

Unnamed: 0_level_0,Entry/Exit
Station,Unnamed: 1_level_1
Acton Town,13701833
Aldgate,17129121
Aldgate East,24881038
Alperton,6564904
Amersham,4446831
...,...
Wimbledon Park,4838778
Wood Green,25259999
Wood Lane,11289998
Woodford,11980768


In [13]:
# initialise passenger numbers: every node and every edge starts with 0 "passengers"
nx.set_node_attributes(g, 0, "passengers")
nx.set_edge_attributes(g, 0, "passengers")

def add_passengers(graph, station, volume):
    """Record a station's passenger volume on its node and its adjoining edges.

    Parameters:
        graph:   graph whose nodes and edges carry a "passengers" attribute.
        station: name of the station node.
        volume:  passenger volume attributed to the station.

    The node's "passengers" value is set to `volume`. The same volume is then
    split equally between all edges incident to the station and added to each
    edge's running "passengers" total.
    """
    # record station passenger volume
    graph.nodes[station]["passengers"] = volume

    # Read the incident edges from the `graph` argument; the previous version
    # used the global `g`, ignoring whichever graph the caller passed in.
    incident_edges = list(graph.edges(station, keys=True))

    # distribute passenger volume across all edges at this station
    for edge in incident_edges:
        graph.edges[edge]["passengers"] += volume / len(incident_edges)

# add passenger data to stations & line segments
# The "Entry/Exit" figure is halved before use - presumably because it counts
# entries and exits together, so half approximates distinct journeys (TODO confirm).
for station, row in passengers.iterrows():
    add_passengers(g, station, row["Entry/Exit"] / 2)

### Normalising Results

In [14]:
# normalise station crimes by passenger figures
# (the attribute dicts yielded by the graph are live, so writing to them
# updates the node directly)
for _, node_attrs in g.nodes(data=True):
    node_attrs["crimes_per_million_passengers"] = 1_000_000 * node_attrs["crimes"] / node_attrs["passengers"]

# normalise segment crimes by passenger figures
for *_, edge_attrs in g.edges(keys=True, data=True):
    edge_attrs["crimes_per_million_passengers"] = 1_000_000 * edge_attrs["crimes"] / edge_attrs["passengers"]

## Analysis

### Station Crime

In [15]:
# Per-station table: one row per graph node, filled from the node attributes.
station_crime = pd.DataFrame(index=g.nodes).assign(
    crimes=nx.get_node_attributes(g, "crimes"),
    passengers=nx.get_node_attributes(g, "passengers"),
    crimes_per_million_passengers=lambda frame: 1_000_000 * frame["crimes"] / frame["passengers"],
)

In [16]:
# Stations ranked by absolute crime count, highest first.
station_crime.sort_values("crimes", ascending=False)

Unnamed: 0,crimes,passengers,crimes_per_million_passengers
Oxford Circus,605.141939,70276328.0,8.610893
King's Cross St. Pancras,529.606796,88994137.5,5.951030
Leicester Square,493.229681,38267555.0,12.888978
Tottenham Court Road,470.520016,57321470.0,8.208443
Finsbury Park,427.931604,23862703.0,17.933073
...,...,...,...
Northwood Hills,0.470588,1850867.0,0.254253
South Woodford,0.346939,5361054.5,0.064715
Grange Hill,0.346939,510934.5,0.679028
Battersea Power Station,0.173077,7368741.0,0.023488


In [17]:
# Stations ranked by crimes per million passengers, highest first.
station_crime.sort_values("crimes_per_million_passengers", ascending=False)

Unnamed: 0,crimes,passengers,crimes_per_million_passengers
West Ham,182.426501,6672711.5,27.339186
Finsbury Park,427.931604,23862703.0,17.933073
Upton Park,174.167241,11241611.5,15.493085
East Ham,188.167241,14026321.0,13.415296
Leicester Square,493.229681,38267555.0,12.888978
...,...,...,...
Richmond,4.150000,13835305.5,0.299957
Northwood Hills,0.470588,1850867.0,0.254253
Heathrow Terminal 4,0.056604,820937.0,0.068950
South Woodford,0.346939,5361054.5,0.064715


### Train Crime

In [18]:
# Per-segment table: one row per (station, station, line) edge, filled from the edge attributes.
train_crime = pd.DataFrame(index=g.edges(keys=True)).assign(
    crimes=nx.get_edge_attributes(g, "crimes"),
    passengers=nx.get_edge_attributes(g, "passengers"),
    crimes_per_million_passengers=lambda frame: 1_000_000 * frame["crimes"] / frame["passengers"],
)

In [19]:
# Line segments ranked by absolute crime count, highest first.
train_crime.sort_values("crimes", ascending=False)

Unnamed: 0,Unnamed: 1,Unnamed: 2,crimes,passengers,crimes_per_million_passengers
Oxford Circus,Green Park,victoria,220.015298,1.803596e+07,12.198702
Victoria,Green Park,victoria,212.007498,1.914742e+07,11.072380
King's Cross St. Pancras,Highbury & Islington,victoria,186.764068,1.716062e+07,10.883292
Oxford Circus,Warren Street,victoria,186.165003,1.584515e+07,11.749020
Waterloo,Westminster,jubilee,186.153148,1.406152e+07,13.238477
...,...,...,...,...,...
Cannon Street,Monument,circle,5.155800,3.970608e+06,1.298491
Blackfriars,Mansion House,circle,5.143708,4.179397e+06,1.230730
Goldhawk Road,Hammersmith (H&C Line),circle,4.923637,5.695222e+06,0.864521
Shepherd's Bush Market,Goldhawk Road,circle,4.756030,1.514957e+06,3.139384


In [20]:
# Line segments ranked by crimes per million passengers, highest first.
train_crime.sort_values("crimes_per_million_passengers", ascending=False)

Unnamed: 0,Unnamed: 1,Unnamed: 2,crimes,passengers,crimes_per_million_passengers
Chigwell,Roding Valley,central,15.748210,3.151400e+05,49.972107
Grange Hill,Chigwell,central,15.748210,4.601320e+05,34.225418
Ickenham,Hillingdon,piccadilly,13.963237,7.315990e+05,19.085917
Warren Street,Euston,victoria,181.842772,9.979945e+06,18.220818
Ruislip,Ickenham,piccadilly,13.173382,7.680816e+05,17.151019
...,...,...,...,...,...
Liverpool Street,Aldgate,circle,9.912169,1.122723e+07,0.882868
Goldhawk Road,Hammersmith (H&C Line),circle,4.923637,5.695222e+06,0.864521
Monument,Tower Hill,circle,6.193799,7.777753e+06,0.796348
Kew Gardens,Richmond,district,11.321123,1.625000e+07,0.696684


### Tube Lines

In [21]:
# Per-line summary table, one row per tube line.
line_crime = pd.DataFrame(columns=["train_crimes", "station_crimes", "total_crimes", "passengers", "crimes_per_million_passengers"])

for line in underground_lines.id:
    # View of the network restricted to this line's stations and segments.
    line_view = nx.subgraph_view(
        g,
        filter_node=lambda node: node in stations_by_line[line],
        filter_edge=lambda a, b, key: key == line
    )
    line_nodes = [attrs for _, attrs in line_view.nodes(data=True)]

    # A station's figures are divided equally between the lines that serve it.
    station_crimes = sum(attrs["crimes"] / len(attrs["lines"]) for attrs in line_nodes)
    station_passengers = sum(attrs["passengers"] / len(attrs["lines"]) for attrs in line_nodes)

    # Segment figures are taken in full: each edge belongs to exactly one line.
    train_crimes = sum(nx.get_edge_attributes(line_view, "crimes").values())
    train_passengers = sum(nx.get_edge_attributes(line_view, "passengers").values())

    total_crimes = train_crimes + station_crimes
    total_passengers = train_passengers + station_passengers

    line_crime.loc[line] = (
        train_crimes,
        station_crimes,
        total_crimes,
        total_passengers,
        1_000_000 * total_crimes / total_passengers
    )

# Column totals across all lines. The crimes_per_million_passengers entry is
# the sum of the per-line rates, not a network-wide rate.
line_crime.sum()

Unnamed: 0,0
train_crimes,13328.0
station_crimes,13649.0
total_crimes,26977.0
passengers,5604631000.0
crimes_per_million_passengers,51.74967


In [22]:
# Lines ranked by total (train + station) crimes, highest first.
line_crime.sort_values("total_crimes", ascending=False)

Unnamed: 0,train_crimes,station_crimes,total_crimes,passengers,crimes_per_million_passengers
northern,2208.056244,2174.435236,4382.49148,954286800.0,4.592426
central,2311.38253,1820.705109,4132.087638,752983200.0,5.487623
victoria,2119.83329,1962.763079,4082.596369,539197200.0,7.57162
jubilee,1948.733063,1625.497616,3574.230679,672049100.0,5.318407
piccadilly,1531.447995,1766.692366,3298.140361,621788300.0,5.304282
district,1016.718102,1258.728032,2275.446134,714107800.0,3.186418
bakerloo,488.858107,850.29944,1339.157547,326899100.0,4.096547
hammersmith-city,444.840439,814.366102,1259.206541,295132000.0,4.266588
circle,543.523433,698.23344,1241.756872,418853900.0,2.964654
metropolitan,617.52876,576.351491,1193.880251,264875500.0,4.507326


In [23]:
# Lines ranked by crimes per million passengers, highest first.
line_crime.sort_values("crimes_per_million_passengers", ascending=False)

Unnamed: 0,train_crimes,station_crimes,total_crimes,passengers,crimes_per_million_passengers
victoria,2119.83329,1962.763079,4082.596369,539197200.0,7.57162
central,2311.38253,1820.705109,4132.087638,752983200.0,5.487623
jubilee,1948.733063,1625.497616,3574.230679,672049100.0,5.318407
piccadilly,1531.447995,1766.692366,3298.140361,621788300.0,5.304282
northern,2208.056244,2174.435236,4382.49148,954286800.0,4.592426
metropolitan,617.52876,576.351491,1193.880251,264875500.0,4.507326
waterloo-city,97.078038,100.928089,198.006127,44458000.0,4.45378
hammersmith-city,444.840439,814.366102,1259.206541,295132000.0,4.266588
bakerloo,488.858107,850.29944,1339.157547,326899100.0,4.096547
district,1016.718102,1258.728032,2275.446134,714107800.0,3.186418


In [24]:
from pathlib import Path

# Make sure the output folder exists, then write each table sorted by its
# headline crime column (highest first).
Path("results").mkdir(exist_ok=True)

exports = (
    (station_crime, "crimes", "results/station_crime.csv"),
    (train_crime, "crimes", "results/train_crime.csv"),
    (line_crime, "total_crimes", "results/line_crime.csv"),
)

for table, sort_column, destination in exports:
    table.sort_values(sort_column, ascending=False).to_csv(destination)

### Visualising Results

In [25]:
# (red, green, blue) colour of each tube line, keyed by line id.
# source: https://content.tfl.gov.uk/tfl-colour-standard-issue-08.pdf
# NOTE: insertion order matters - the map cells use each line's position in
# this dict as the dash offset of its poly-lines.
line_colours = {
    "central": (225, 37, 27),
    "circle": (255, 205, 0),
    "bakerloo": (166, 90, 42),
    "district": (0, 121, 52),
    "jubilee": (123, 134, 140),
    "hammersmith-city": (236, 155, 173),
    "northern": (0, 0, 0),
    "metropolitan": (135, 15, 84),
    "piccadilly": (0, 15, 159),
    "victoria": (0, 160, 223),
    "waterloo-city": (107, 205, 178)
}

The map below shows total and normalised crimes across each London Underground line.

In [26]:
# Map of per-line results with two switchable layers: total crimes (shown by
# default) and crimes per million passengers.
# Named `line_map` rather than `map` so the builtin map() is not shadowed for
# the rest of the kernel session.
line_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    zoom_start=11,
)

feature_total = folium.FeatureGroup(name="Total Crimes").add_to(line_map)
feature_normalised = folium.FeatureGroup(name="Crimes per million", show=False).add_to(line_map)

# Loop invariants, computed once instead of once per edge.
dash_array = f"4 {len(underground_lines)}"
# Each line's dash pattern is shifted by its position in `line_colours`, so
# lines sharing a segment stay distinguishable.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}
# Stroke widths for the totals layer are expressed relative to the 1st
# percentile of the per-line totals.
total_scale = line_crime["total_crimes"].quantile(0.01)

for src, dest, line, data in g.edges(keys=True, data=True):
    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]

    colour = "#%02x%02x%02x" % line_colours[line]
    line_total = line_crime["total_crimes"][line]
    line_rate = line_crime["crimes_per_million_passengers"][line]

    # Every segment is drawn with its line's overall figure, not its own.
    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{line}: {line_total:.1f} total crimes",
        color=colour,
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        weight=line_total / total_scale
    ).add_to(feature_total)

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{line}: {line_rate:.1f} crimes per million passengers",
        color=colour,
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        weight=line_rate
    ).add_to(feature_normalised)

folium.LayerControl(collapsed=False).add_to(line_map)

line_map

The map below shows total crimes across all stations and segments of the London Underground network.

In [27]:
# Map of absolute crime counts: one circle per station and one poly-line per
# line segment, on separate switchable layers.
# Named `totals_map` rather than `map` so the builtin map() is not shadowed
# for the rest of the kernel session.
totals_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    zoom_start=11,
)

feature_at_station = folium.FeatureGroup(name="At-station crime").add_to(totals_map)
feature_on_train = folium.FeatureGroup(name="On-train crime").add_to(totals_map)

# Loop invariants for the segment styling, computed once instead of per edge.
dash_array = f"4 {len(underground_lines)}"
# Each line's dash pattern is shifted by its position in `line_colours`.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}

# Stations: circle radius is the crime count divided by 20.
for node, data in g.nodes(data=True):
    value = data["crimes"]

    folium.CircleMarker(
        location=[data["latitude"], data["longitude"]],
        radius=value / 20,
        tooltip=node,
        popup=f"{value:.1f} total crimes",
        fill=True,
        weight=1,
        color="darkred"
    ).add_to(feature_at_station)

# Segments: stroke width is the crime count divided by 20.
for src, dest, line, data in g.edges(keys=True, data=True):
    value = data["crimes"]

    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{src} - {dest} ({line})",
        popup=f"{value:.1f} total crimes",
        color="#%02x%02x%02x" % line_colours[line],
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        lineCap="square",
        weight=value / 20,
    ).add_to(feature_on_train)

folium.LayerControl(collapsed=False).add_to(totals_map)

totals_map

The map below shows crimes per million passengers across all stations and segments of the London Underground network.

In [28]:
# Map of crimes per million passengers: one circle per station and one
# poly-line per line segment, on separate switchable layers.
# Named `normalised_map` rather than `map` so the builtin map() is not
# shadowed for the rest of the kernel session.
normalised_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    zoom_start=11,
)

feature_at_station = folium.FeatureGroup(name="At-station crime").add_to(normalised_map)
feature_on_train = folium.FeatureGroup(name="On-train crime").add_to(normalised_map)

# Loop invariants for the segment styling, computed once instead of per edge.
dash_array = f"4 {len(underground_lines)}"
# Each line's dash pattern is shifted by its position in `line_colours`.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}

# Stations: circle radius equals the crime rate.
for node, data in g.nodes(data=True):
    value = data["crimes_per_million_passengers"]

    folium.CircleMarker(
        location=[data["latitude"], data["longitude"]],
        radius=value,
        tooltip=node,
        popup=f"{value:.1f} crimes per million passengers",
        fill=True,
        weight=1,
        color="darkred"
    ).add_to(feature_at_station)

# Segments: stroke width is half the crime rate.
for src, dest, line, data in g.edges(keys=True, data=True):
    value = data["crimes_per_million_passengers"]

    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{src} - {dest} ({line})",
        popup=f"{value:.1f} crimes per million passengers",
        color="#%02x%02x%02x" % line_colours[line],
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        lineCap="square",
        weight=value / 2,
    ).add_to(feature_on_train)

folium.LayerControl(collapsed=False).add_to(normalised_map)

normalised_map