# The London Underground, Graphed

In [1]:
%pip install folium
from collections import defaultdict
from pathlib import Path

import folium
import networkx as nx
import pandas as pd
import requests



## Constructing Datasets

Using the TfL API we construct tables of the London Underground lines, stations, and connections between those stations.

These datasets are used to build a graph of the transport network for crime analysis.

In [2]:
# Base URL of the TfL API; the cells below append endpoint paths to it.
tfl_api = "https://api.tfl.gov.uk"

### Underground Lines

In [3]:
# Load the tube lines from the local CSV cache; when the cache file does not
# exist yet, fetch them from the TfL API and write the cache.
try:
    underground_lines = pd.read_csv("lines.csv")
except FileNotFoundError:
    # Only a missing cache triggers a download. Any other failure (corrupt
    # CSV, keyboard interrupt, ...) now surfaces instead of being swallowed.
    response = requests.get(tfl_api + "/Line/Mode/tube", timeout=30)
    # Fail loudly on an HTTP error rather than caching an error payload.
    response.raise_for_status()
    line_data = response.json()

    # Keep only the identifier and display name of each line.
    underground_lines = pd.DataFrame(line_data, columns=["id", "name"])
    underground_lines.to_csv("lines.csv", index=False)

underground_lines

Unnamed: 0,id,name
0,bakerloo,Bakerloo
1,central,Central
2,circle,Circle
3,district,District
4,hammersmith-city,Hammersmith & City
5,jubilee,Jubilee
6,metropolitan,Metropolitan
7,northern,Northern
8,piccadilly,Piccadilly
9,victoria,Victoria


### Stations

In [4]:
# Load station metadata from the local CSV cache; when the cache file does not
# exist yet, rebuild it from the TfL API and write the cache.
try:
    stations = pd.read_csv("stations.csv", index_col="name")
except FileNotFoundError:
    # A set de-duplicates stations that are returned for more than one line.
    station_data = set()

    for line in underground_lines.id:
        response = requests.get(f"{tfl_api}/line/{line}/stoppoints", timeout=30)
        # Fail loudly on an HTTP error rather than parsing an error payload.
        response.raise_for_status()

        for point in response.json():
            # Named station_id so the built-in id() is not shadowed.
            station_id = point["id"]
            name = point["commonName"].removesuffix(" Underground Station").removesuffix("-Underground")
            lat = point["lat"]
            lon = point["lon"]

            # Keep only the line identifiers listed under the "tube" mode group.
            line_mode_tube = next(group for group in point["lineModeGroups"] if group["modeName"] == "tube")
            tube_lines = line_mode_tube["lineIdentifier"]

            station_data.add((station_id, name, lat, lon, ",".join(tube_lines)))

    # sorted() turns the unordered set into a list with a deterministic order
    # before it is handed to pandas.
    stations = pd.DataFrame(
        sorted(station_data),
        columns=["id", "name", "latitude", "longitude", "lines"],
    ).set_index("name").sort_values("name")
    stations.to_csv("stations.csv")

stations

Unnamed: 0_level_0,id,latitude,longitude,lines
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acton Town,940GZZLUACT,51.503057,-0.280462,"district,piccadilly"
Aldgate,940GZZLUALD,51.514246,-0.075689,"circle,metropolitan"
Aldgate East,940GZZLUADE,51.515037,-0.072384,"district,hammersmith-city"
Alperton,940GZZLUALP,51.540627,-0.299610,piccadilly
Amersham,940GZZLUAMS,51.674126,-0.607714,metropolitan
...,...,...,...,...
Wimbledon Park,940GZZLUWIP,51.434573,-0.199719,district
Wood Green,940GZZLUWOG,51.597479,-0.109886,piccadilly
Wood Lane,940GZZLUWLA,51.509669,-0.224530,"circle,hammersmith-city"
Woodford,940GZZLUWOF,51.606899,0.033970,central


In [5]:
# Load station-to-station connections from the local CSV cache; when the cache
# file does not exist yet, rebuild it from the TfL API and write the cache.
try:
    connections = pd.read_csv("connections.csv")
except FileNotFoundError:
    # Map each station id (the "id" column) to its name (the index).
    station_by_id = dict(zip(stations["id"], stations.index))

    connection_data = []

    for line in underground_lines.id:
        response = requests.get(f"{tfl_api}/line/{line}/Route/Sequence/all", timeout=30)
        # Fail loudly on an HTTP error rather than parsing an error payload.
        response.raise_for_status()
        route = response.json()

        for line_route in route["orderedLineRoutes"]:
            station_ids = line_route["naptanIds"]

            # Consecutive stops on a route form one connection.
            for source, dest in zip(station_ids, station_ids[1:]):
                # Sorting the pair gives both travel directions the same row,
                # so drop_duplicates below can collapse them.
                station1, station2 = sorted((station_by_id[source], station_by_id[dest]))
                connection_data.append((station1, station2, line))

    connections = pd.DataFrame(
        connection_data,
        columns=["station1", "station2", "line"],
    ).drop_duplicates(ignore_index=True)
    connections.to_csv("connections.csv", index=False)

connections

Unnamed: 0,station1,station2,line
0,Elephant & Castle,Lambeth North,bakerloo
1,Lambeth North,Waterloo,bakerloo
2,Embankment,Waterloo,bakerloo
3,Charing Cross,Embankment,bakerloo
4,Charing Cross,Piccadilly Circus,bakerloo
...,...,...,...
372,Finsbury Park,Seven Sisters,victoria
373,Seven Sisters,Tottenham Hale,victoria
374,Blackhorse Road,Tottenham Hale,victoria
375,Blackhorse Road,Walthamstow Central,victoria


### Dealing with Paddington

Since the crime data does not distinguish between Paddington and Paddington (H&C Line), we merge the two stations into a single "Paddington" station in our graph.

In [6]:
# Fold "Paddington (H&C Line)" into "Paddington".
# The guard makes this cell safe to re-run: without it a second run appends
# the line id again and then raises KeyError, because the H&C row is gone.
h_and_c_station = "Paddington (H&C Line)"

if h_and_c_station in stations.index:
    paddington_lines = stations.loc["Paddington", "lines"].split(",")
    if "hammersmith-city" not in paddington_lines:
        paddington_lines.append("hammersmith-city")
    stations.loc["Paddington", "lines"] = ",".join(paddington_lines)
    stations = stations.drop(h_and_c_station)

# Re-point connections at the merged station, then drop rows that became
# identical as a result.
connections = (
    connections
    .replace({h_and_c_station: "Paddington"})
    .drop_duplicates(ignore_index=True)
)

In [7]:
# Reverse index: line id -> set of the station names that list that line.
stations_by_line = defaultdict(set)

for station_name, line_ids in stations["lines"].items():
    # The "lines" column holds a comma-separated list of line ids.
    for line_id in line_ids.split(","):
        stations_by_line[line_id].add(station_name)

## Graphing the Underground Network

In [8]:
# Show floats with thousands separators and one decimal place in rendered tables.
pd.set_option("display.float_format", "{:,.1f}".format)

In [9]:
# A MultiGraph lets two stations be joined by several parallel edges; each
# edge is keyed by the id of the line it belongs to.
g = nx.MultiGraph()

# Columns are read by name, so the edges stay correct even if the column order
# of `connections` changes.
for connection in connections.itertuples(index=False):
    g.add_edge(connection.station1, connection.station2, key=connection.line)

# Attach station metadata to the nodes, again by column name. The station id
# is not stored on the node, so it is no longer unpacked into a variable that
# shadowed the built-in id().
for station, row in stations.iterrows():
    g.nodes[station].update(
        lines=row["lines"].split(","),
        latitude=row["latitude"],
        longitude=row["longitude"],
    )

Example: calculating the shortest path (the one with the fewest stops) between two stations.

In [10]:
# No weight is given, so every edge counts equally and the result is a path
# with the fewest edges between the two stations.
nx.shortest_path(g, "Victoria", "Camden Town")

['Victoria',
 'Green Park',
 'Oxford Circus',
 'Warren Street',
 'Euston',
 'Camden Town']

## Adding Crime Data

In [11]:
# Download the crime dataset and keep a local copy next to the notebook.
# The request is made and checked BEFORE the file is opened, so a failed
# download can no longer leave an empty or truncated crimes.csv behind.
response = requests.get(
    "https://raw.githubusercontent.com/undevised/crime-on-the-underground/main/content/data/crimes.csv",
    timeout=30,
)
response.raise_for_status()

# Write the raw bytes: this avoids a decode/re-encode round trip through the
# platform's default text encoding.
with open("crimes.csv", "wb") as f:
    f.write(response.content)

# keep_default_na=False keeps blank cells as empty strings instead of NaN; the
# processing loop further down relies on empty locations being falsy.
crimes = pd.read_csv("crimes.csv", index_col="Unique ID", keep_default_na=False)

# Turn operator names into line ids in the style used by the graph:
# " & " -> "-", " and " -> "," (several lines), then lower-case.
crimes["Lines"] = (
    crimes["Train Operating Company"]
    .str.replace(" & ", "-", regex=False)
    .str.replace(" and ", ",", regex=False)
    .str.lower()
)

crimes

Unnamed: 0_level_0,Category,Start Location,End Location,Reporting Location,Train Operating Company,Lines
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
126452,Station,Acton Town,Acton Town,Acton Town,Piccadilly,piccadilly
126453,Station,Acton Town,Acton Town,Acton Town,Piccadilly,piccadilly
137061,On Train,Alperton,Acton Town,Acton Town,Piccadilly,piccadilly
99258,On Train,Barons Court,Acton Town,Acton Town,Piccadilly,piccadilly
114712,On Train,Barons Court,Acton Town,Acton Town,Piccadilly,piccadilly
...,...,...,...,...,...,...
182258,Station,,,,Central,central
182865,On Train,,,,Central,central
184090,Station,,,,District,district
184078,Station,,,,Northern,northern


In [12]:
# Initialise crime records: every station (node) and every line segment (edge)
# starts with a zero "crimes" and a zero "crimes_distributed" tally.
for tally in ("crimes", "crimes_distributed"):
    nx.set_node_attributes(g, 0, tally)
    nx.set_edge_attributes(g, 0, tally)

def record_station_crime_at(graph, station):
    """Add one crime to the "crimes" tally of node `station` in `graph`."""
    node_data = graph.nodes[station]
    node_data["crimes"] = node_data["crimes"] + 1

def record_station_crime_on_line(graph, line, weight):
    """Spread `weight` crimes evenly over all stations of `line`.

    The stations are looked up in the notebook-level `stations_by_line`
    mapping, and each one has an equal share of `weight` added to its
    "crimes_distributed" tally. If the mapping holds no stations for `line`,
    nothing is recorded.
    """
    line_stations = stations_by_line[line]

    for name in line_stations:
        node_data = graph.nodes[name]
        node_data["crimes_distributed"] += weight / len(line_stations)

def record_train_crime_between(graph, start_station, end_station):
    """Spread one crime evenly over the edges joining the stations on a shortest path.

    The path comes from `nx.shortest_path`. The edges are those of the
    subgraph induced by the path's stations, listed per key, so parallel
    edges between the same two stations each receive a share.

    NOTE(review): when both stations are the same, the path presumably has no
    edges and the crime is not tallied anywhere - confirm this is intended.
    """
    route = nx.shortest_path(graph, start_station, end_station)

    route_edges = list(graph.subgraph(route).edges(keys=True))

    for edge_key in route_edges:
        graph.edges[edge_key]["crimes"] += 1 / len(route_edges)

def record_train_crime_on_line(graph, line, weight):
    """Spread `weight` crimes evenly over all segments (edges) of `line`.

    The edges are taken from `graph` itself. The previous version built the
    view from the notebook-global `g`, silently ignoring the `graph` argument.

    An edge belongs to the line when its key equals `line` and both endpoints
    are listed for that line in the notebook-level `stations_by_line` mapping.
    Each such edge has an equal share of `weight` added to its
    "crimes_distributed" tally. If no edge matches, nothing is recorded.
    """
    line_stations = stations_by_line[line]

    line_view = nx.subgraph_view(
        graph,
        filter_node=lambda node: node in line_stations,
        filter_edge=lambda a, b, key: key == line,
    )

    # Materialise once: the count is needed for the share and must not be
    # recomputed by walking the filtered view on every iteration.
    line_edges = list(line_view.edges(keys=True))
    if not line_edges:
        return

    share = weight / len(line_edges)
    for edge in line_edges:
        graph.edges[edge]["crimes_distributed"] += share

# Walk the crime table once and tally every record on the graph.
for unique_id, crime in crimes.iterrows():
    category = crime["Category"].strip()
    start = crime["Start Location"]
    reporting = crime["Reporting Location"]
    lines = crime["Lines"].split(",")

    if category == "Station":
        # Known reporting location: the crime belongs to that station.
        if reporting:
            record_station_crime_at(g, reporting)
            continue

        # Unknown location: share the crime between the listed line(s), and
        # within each line between all of its stations.
        for line in lines:
            record_station_crime_on_line(g, line, 1 / len(lines))
        continue

    if category == "On Train":
        # Both ends known: spread the crime along the path between them.
        if start and reporting:
            record_train_crime_between(g, start, reporting)
            continue

        # Otherwise: share the crime between the listed line(s), and within
        # each line between all of its segments.
        for line in lines:
            record_train_crime_on_line(g, line, 1 / len(lines))
        continue

    # Any other category is reported and otherwise ignored.
    print(f"WARNING: unexpected category '{category}'")

## Adding Passenger Data

In [13]:
# Download the passenger dataset and keep a local copy next to the notebook.
# The request is made and checked BEFORE the file is opened, so a failed
# download can no longer leave an empty or truncated passengers.csv behind.
response = requests.get(
    "https://raw.githubusercontent.com/undevised/crime-on-the-underground/main/content/data/passengers.csv",
    timeout=30,
)
response.raise_for_status()

# Write the raw bytes: this avoids a decode/re-encode round trip through the
# platform's default text encoding.
with open("passengers.csv", "wb") as f:
    f.write(response.content)

# One row per station, indexed by station name.
passengers = pd.read_csv("passengers.csv", index_col="Station")

passengers

Unnamed: 0_level_0,Entry/Exit
Station,Unnamed: 1_level_1
Acton Town,13701833
Aldgate,17129121
Aldgate East,24881038
Alperton,6564904
Amersham,4446831
...,...
Wimbledon Park,4838778
Wood Green,25259999
Wood Lane,11289998
Woodford,11980768


In [14]:
# Initialise passenger numbers: every station (node) and every line segment
# (edge) starts at zero.
for set_attributes in (nx.set_node_attributes, nx.set_edge_attributes):
    set_attributes(g, 0, "passengers")

def add_passengers(graph, station, volume):
    """Record `volume` passengers at `station` and share it over its edges.

    The node's "passengers" attribute is set to `volume`. The same volume is
    then split equally between all edges incident to `station` (one per
    neighbour and key) and added to each edge's "passengers" attribute.

    The incident edges are read from `graph` itself. The previous version
    queried the notebook-global `g`, silently ignoring the `graph` argument.
    """
    # Record the station's passenger volume.
    graph.nodes[station]["passengers"] = volume

    # Materialise once so the edge count is not recomputed per iteration.
    incident_edges = list(graph.edges(station, keys=True))
    for edge in incident_edges:
        graph.edges[edge]["passengers"] += volume / len(incident_edges)

# Feed every station's passenger figure into the graph (stations and their
# adjoining line segments).
for station_name, passenger_row in passengers.iterrows():
    entries_and_exits = passenger_row["Entry/Exit"]
    # Halved before recording - presumably because the figure counts each
    # passenger once on entry and once on exit; confirm against the data source.
    add_passengers(g, station_name, entries_and_exits / 2)

### Normalising Results

In [15]:
# Normalise station crimes by passenger figures. The dict yielded for each
# node is the node's own attribute dict, so the rate is written straight to it.
# NOTE(review): a station or segment whose passenger figure is still the
# initial 0 would make this division fail - confirm every one is covered.
for _, node_data in g.nodes(data=True):
    node_data["crimes_per_million_passengers"] = 1_000_000 * node_data["crimes"] / node_data["passengers"]

# Normalise segment crimes by passenger figures, same calculation per edge.
for *_, edge_data in g.edges(keys=True, data=True):
    edge_data["crimes_per_million_passengers"] = 1_000_000 * edge_data["crimes"] / edge_data["passengers"]

## Analysis

### Station Crime

In [16]:
# One row per station: crime count, passenger volume and the resulting rate.
station_crime = pd.DataFrame(index=g.nodes)

for attribute in ("crimes", "passengers"):
    station_crime[attribute] = nx.get_node_attributes(g, attribute)

station_crime["crimes_per_million_passengers"] = (
    station_crime["crimes"].mul(1_000_000).div(station_crime["passengers"])
)

In [17]:
# Stations ranked by number of recorded crimes, highest first.
station_crime.sort_values("crimes", ascending=False).round(2)

Unnamed: 0,crimes,passengers,crimes_per_million_passengers
Oxford Circus,602,70276328.0,8.6
King's Cross St. Pancras,528,88994137.5,5.9
Leicester Square,493,38267555.0,12.9
Tottenham Court Road,470,57321470.0,8.2
Finsbury Park,427,23862703.0,17.9
...,...,...,...
Northwood Hills,0,1850867.0,0.0
Chorleywood,0,978689.5,0.0
Battersea Power Station,0,7368741.0,0.0
South Woodford,0,5361054.5,0.0


In [18]:
# Stations ranked by crimes per million passengers, highest first.
station_crime.sort_values("crimes_per_million_passengers", ascending=False).round(2)

Unnamed: 0,crimes,passengers,crimes_per_million_passengers
West Ham,182,6672711.5,27.3
Finsbury Park,427,23862703.0,17.9
Upton Park,174,11241611.5,15.5
East Ham,188,14026321.0,13.4
Leicester Square,493,38267555.0,12.9
...,...,...,...
Heathrow Terminal 4,0,820937.0,0.0
South Woodford,0,5361054.5,0.0
Northwood Hills,0,1850867.0,0.0
Chorleywood,0,978689.5,0.0


### Train Crime

In [19]:
# One row per line segment, indexed by (station, station, line): crime tally,
# passenger volume and the resulting rate.
train_crime = pd.DataFrame(index=g.edges(keys=True))

for attribute in ("crimes", "passengers"):
    train_crime[attribute] = nx.get_edge_attributes(g, attribute)

train_crime["crimes_per_million_passengers"] = (
    train_crime["crimes"].mul(1_000_000).div(train_crime["passengers"])
)

In [20]:
# Line segments ranked by on-train crime tally, highest first.
train_crime.sort_values("crimes", ascending=False)

Unnamed: 0,Unnamed: 1,Unnamed: 2,crimes,passengers,crimes_per_million_passengers
Oxford Circus,Green Park,victoria,176.7,18035959.3,9.8
Victoria,Green Park,victoria,168.7,19147418.4,8.8
Waterloo,Westminster,jubilee,162.4,14061523.3,11.6
Bank,Liverpool Street,central,144.8,14608694.7,9.9
King's Cross St. Pancras,Highbury & Islington,victoria,143.4,17160622.4,8.4
...,...,...,...,...,...
Hatton Cross,Heathrow Terminal 4,piccadilly,0.0,1633060.0,0.0
Heathrow Terminal 4,Heathrow Terminals 2 & 3,piccadilly,0.0,1697133.2,0.0
Chalfont & Latimer,Amersham,metropolitan,0.0,2675413.2,0.0
Battersea Power Station,Nine Elms,northern,0.0,9125160.2,0.0


In [21]:
# Line segments ranked by crimes per million passengers, highest first.
train_crime.sort_values("crimes_per_million_passengers", ascending=False)

Unnamed: 0,Unnamed: 1,Unnamed: 2,crimes,passengers,crimes_per_million_passengers
Warren Street,Euston,northern,138.5,9979945.3,13.9
Warren Street,Euston,victoria,138.5,9979945.3,13.9
Baker Street,Finchley Road,metropolitan,69.0,4985569.0,13.8
Westminster,Green Park,jubilee,134.4,10265552.7,13.1
Stockwell,Clapham North,northern,79.4,6185253.9,12.8
...,...,...,...,...,...
Battersea Power Station,Nine Elms,northern,0.0,9125160.2,0.0
Hatton Cross,Heathrow Terminal 4,piccadilly,0.0,1633060.0,0.0
Heathrow Terminal 4,Heathrow Terminals 2 & 3,piccadilly,0.0,1697133.2,0.0
Acton Town,Chiswick Park,district,0.0,2610641.0,0.0


### Tube Lines

In [22]:
# Aggregate station and segment figures per tube line.
# Rows are collected in a dict and turned into a DataFrame once, instead of
# growing the frame row by row with `.loc`.
line_crime_columns = [
    "station_crimes",
    "station_crimes_distributed",
    "train_crimes",
    "train_crimes_distributed",
    "total_crimes",
    "passengers",
    "crimes_per_million_passengers",
]
line_crime_rows = {}

for line in underground_lines.id:
    # View of the graph restricted to this line's stations and edges. `line`
    # is bound as a default argument so the filters keep referring to this
    # line even if the view is evaluated after the loop variable has changed.
    subgraph = nx.subgraph_view(
        g,
        filter_node=lambda node, line=line: node in stations_by_line[line],
        filter_edge=lambda a, b, key, line=line: key == line,
    )

    # Collect the node attribute dicts once; they feed three sums below.
    node_data = [data for _, data in subgraph.nodes(data=True)]

    # Station-level figures are split equally between the lines that the
    # station lists, so that each line gets an equal share of them.
    station_crimes = sum(data["crimes"] / len(data["lines"]) for data in node_data)
    station_crimes_distributed = sum(data["crimes_distributed"] / len(data["lines"]) for data in node_data)

    train_crimes = sum(nx.get_edge_attributes(subgraph, "crimes").values())
    train_crimes_distributed = sum(nx.get_edge_attributes(subgraph, "crimes_distributed").values())

    total_crimes = station_crimes + station_crimes_distributed + train_crimes + train_crimes_distributed

    station_passengers = sum(data["passengers"] / len(data["lines"]) for data in node_data)
    train_passengers = sum(nx.get_edge_attributes(subgraph, "passengers").values())

    total_passengers = train_passengers + station_passengers

    line_crime_rows[line] = (
        station_crimes,
        station_crimes_distributed,
        train_crimes,
        train_crimes_distributed,
        total_crimes,
        total_passengers,
        1_000_000 * total_crimes / total_passengers,
    )

line_crime = pd.DataFrame.from_dict(line_crime_rows, orient="index", columns=line_crime_columns)

line_crime.sum()

Unnamed: 0,0
station_crimes,13525.0
station_crimes_distributed,124.0
train_crimes,9202.0
train_crimes_distributed,4126.0
total_crimes,26977.0
passengers,5604630901.0
crimes_per_million_passengers,51.7


In [23]:
# Tube lines ranked by total crimes, highest first.
line_crime.sort_values("total_crimes", ascending=False)

Unnamed: 0,station_crimes,station_crimes_distributed,train_crimes,train_crimes_distributed,total_crimes,passengers,crimes_per_million_passengers
northern,2161.7,12.8,1578.1,630.0,4382.5,954286778.9,4.6
central,1804.1,16.6,1609.4,702.0,4132.1,752983157.8,5.5
victoria,1951.7,11.1,1469.8,650.0,4082.6,539197219.9,7.6
jubilee,1617.7,7.8,1331.7,617.0,3574.2,672049083.7,5.3
piccadilly,1759.8,6.9,1045.4,486.0,3298.1,621788330.2,5.3
district,1250.4,8.3,583.7,433.0,2275.4,714107753.5,3.2
bakerloo,811.3,39.0,269.9,219.0,1339.2,326899125.6,4.1
hammersmith-city,810.7,3.6,355.8,89.0,1259.2,295132005.5,4.3
circle,693.2,5.0,468.5,75.0,1241.8,418853922.6,3.0
metropolitan,564.1,12.2,392.5,225.0,1193.9,264875524.6,4.5


In [24]:
# Tube lines ranked by crimes per million passengers, highest first.
line_crime.sort_values("crimes_per_million_passengers", ascending=False)

Unnamed: 0,station_crimes,station_crimes_distributed,train_crimes,train_crimes_distributed,total_crimes,passengers,crimes_per_million_passengers
victoria,1951.7,11.1,1469.8,650.0,4082.6,539197219.9,7.6
central,1804.1,16.6,1609.4,702.0,4132.1,752983157.8,5.5
jubilee,1617.7,7.8,1331.7,617.0,3574.2,672049083.7,5.3
piccadilly,1759.8,6.9,1045.4,486.0,3298.1,621788330.2,5.3
northern,2161.7,12.8,1578.1,630.0,4382.5,954286778.9,4.6
metropolitan,564.1,12.2,392.5,225.0,1193.9,264875524.6,4.5
waterloo-city,100.2,0.8,97.1,0.0,198.0,44457998.7,4.5
hammersmith-city,810.7,3.6,355.8,89.0,1259.2,295132005.5,4.3
bakerloo,811.3,39.0,269.9,219.0,1339.2,326899125.6,4.1
district,1250.4,8.3,583.7,433.0,2275.4,714107753.5,3.2


In [25]:
# Write the three result tables as CSV files under results/.
# `Path` is imported in the imports cell at the top of the notebook.
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)

station_crime.sort_values("crimes", ascending=False).to_csv(results_dir / "station_crime.csv")
train_crime.sort_values("crimes", ascending=False).to_csv(results_dir / "train_crime.csv")
line_crime.sort_values("total_crimes", ascending=False).to_csv(results_dir / "line_crime.csv")

### Visualising Results

In [26]:
# source: https://content.tfl.gov.uk/tfl-colour-standard-issue-08.pdf
# Colour of each line id as a three-number tuple; the map cells format these
# tuples into "#rrggbb" hex strings.
# NOTE: the order of the keys matters - the map cells use each key's position
# in this dict as that line's dash offset.
line_colours = {
    "central": (225, 37, 27),
    "circle": (255, 205, 0),
    "bakerloo": (166, 90, 42),
    "district": (0, 121, 52),
    "jubilee": (123, 134, 140),
    "hammersmith-city": (236, 155, 173),
    "northern": (0, 0, 0),
    "metropolitan": (135, 15, 84),
    "piccadilly": (0, 15, 159),
    "victoria": (0, 160, 223),
    "waterloo-city": (107, 205, 178)
}

In [27]:
# Absolutely positioned compass image; the map cells add it to each map's root HTML.
compass = folium.Element('<img src="https://openclipart.org/image/800px/100207" style="position:absolute; z-index:1000000; width:32px; bottom:48px; left:16px">')

The map below shows total and normalised crimes across each London Underground line.

In [28]:
# Per-line map: every segment of a line is drawn with the same stroke width,
# taken from that line's row in `line_crime`.
# Named `line_map` rather than `map` so the built-in map() is not shadowed.
line_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    control_scale=True,
    zoom_control=False,
    zoom_start=11,
)

feature_total = folium.FeatureGroup(name="Total Crimes").add_to(line_map)
feature_normalised = folium.FeatureGroup(name="Crimes per million", show=False).add_to(line_map)

# Loop-invariant values, computed once instead of once per edge.
dash_array = f"4 {len(underground_lines)}"
# A line's dash offset is its position in `line_colours`.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}
# Per-line totals are divided by this value to obtain a stroke width.
total_crimes_scale = line_crime["total_crimes"].quantile(0.01)

# NOTE(review): folium documents these path options as dash_array /
# dash_offset; confirm the camelCase keywords used below are honoured by the
# installed folium version.
for src, dest, line in g.edges(keys=True):
    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]
    colour = "#%02x%02x%02x" % line_colours[line]

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{line}: {line_crime['total_crimes'][line]:.1f} total crimes",
        color=colour,
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        weight=line_crime["total_crimes"][line] / total_crimes_scale,
    ).add_to(feature_total)

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{line}: {line_crime['crimes_per_million_passengers'][line]:.1f} crimes per million passengers",
        color=colour,
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        weight=line_crime["crimes_per_million_passengers"][line],
    ).add_to(feature_normalised)

folium.LayerControl(collapsed=False).add_to(line_map)
line_map.get_root().html.add_child(compass)

line_map

The map below shows total crimes across all stations and segments of the London Underground network.

In [29]:
# Map of absolute crime counts: circles for stations, lines for segments.
# Named `crime_map` rather than `map` so the built-in map() is not shadowed.
crime_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    control_scale=True,
    zoom_control=False,
    zoom_start=11,
)

feature_at_station = folium.FeatureGroup(name="At-station crime").add_to(crime_map)
feature_on_train = folium.FeatureGroup(name="On-train crime").add_to(crime_map)

# Divisors that turn a crime count into a marker radius / stroke width.
station_radius_divisor = 20
segment_weight_divisor = 10

# Loop-invariant values, computed once instead of once per edge.
dash_array = f"4 {len(underground_lines)}"
# A line's dash offset is its position in `line_colours`.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}

for node, data in g.nodes(data=True):
    value = data["crimes"]

    folium.CircleMarker(
        location=[data["latitude"], data["longitude"]],
        radius=value / station_radius_divisor,
        tooltip=node,
        popup=f"{value:.1f} total crimes",
        fill=True,
        weight=1,
        color="darkred",
    ).add_to(feature_at_station)

# NOTE(review): folium documents these path options as dash_array /
# dash_offset / line_cap; confirm the camelCase keywords used below are
# honoured by the installed folium version.
for src, dest, line, data in g.edges(keys=True, data=True):
    value = data["crimes"]

    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{src} - {dest} ({line})",
        popup=f"{value:.1f} total crimes",
        color="#%02x%02x%02x" % line_colours[line],
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        lineCap="square",
        weight=value / segment_weight_divisor,
    ).add_to(feature_on_train)

folium.LayerControl(collapsed=False).add_to(crime_map)
crime_map.get_root().html.add_child(compass)

crime_map

The map below shows crimes per million passengers across all stations and segments of the London Underground network.

In [30]:
# Map of crimes per million passengers: circles for stations, lines for
# segments. The rate is used directly as marker radius / stroke width.
# Named `rate_map` rather than `map` so the built-in map() is not shadowed.
rate_map = folium.Map(
    location=(51.5072, -0.1276),
    tiles="cartodb positron",
    control_scale=True,
    zoom_control=False,
    zoom_start=11,
)

feature_at_station = folium.FeatureGroup(name="At-station crime").add_to(rate_map)
feature_on_train = folium.FeatureGroup(name="On-train crime").add_to(rate_map)

# Loop-invariant values, computed once instead of once per edge.
dash_array = f"4 {len(underground_lines)}"
# A line's dash offset is its position in `line_colours`.
dash_offsets = {line_id: str(position) for position, line_id in enumerate(line_colours)}

for node, data in g.nodes(data=True):
    value = data["crimes_per_million_passengers"]

    folium.CircleMarker(
        location=[data["latitude"], data["longitude"]],
        radius=value,
        tooltip=node,
        popup=f"{value:.1f} crimes per million passengers",
        fill=True,
        weight=1,
        color="darkred",
    ).add_to(feature_at_station)

# NOTE(review): folium documents these path options as dash_array /
# dash_offset / line_cap; confirm the camelCase keywords used below are
# honoured by the installed folium version.
for src, dest, line, data in g.edges(keys=True, data=True):
    value = data["crimes_per_million_passengers"]

    pos1 = [g.nodes[src]["latitude"], g.nodes[src]["longitude"]]
    pos2 = [g.nodes[dest]["latitude"], g.nodes[dest]["longitude"]]

    folium.PolyLine(
        locations=[pos1, pos2],
        tooltip=f"{src} - {dest} ({line})",
        popup=f"{value:.1f} crimes per million passengers",
        color="#%02x%02x%02x" % line_colours[line],
        dashArray=dash_array,
        dashOffset=dash_offsets[line],
        lineCap="square",
        weight=value,
    ).add_to(feature_on_train)

folium.LayerControl(collapsed=False).add_to(rate_map)
rate_map.get_root().html.add_child(compass)

rate_map