<div class="alert alert-block alert-success">
    <h1>
        Example notebook - London Tube
    </h1>
    <p>
        Link to dataset : <a href="https://drive.google.com/drive/folders/1Tfl7D5Khh1uiQDMy20XngdFggaGyj6Vy?usp=drive_link">Google Drive</a>
    </p>
</div>

# Import modules and functions

In [1]:
import os
import pandas as pd
import re
from IPython.display import display, Markdown
import numpy as np
import time
from tqdm.auto import tqdm

from turingdb_examples.graph import (
    create_graph_from_df,
    build_create_command_from_networkx,
    split_cypher_commands
)
from turingdb_examples.llm import query_llm, natural_language_to_cypher

In [2]:
%load_ext autoreload
%autoreload 2

# Check data files are available

In [3]:
example_name = "london_transport_TfL"
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    raise ValueError(f"{path_data} does not exists")

list_csv_files = sorted(os.listdir(path_data))
if not all([file in list_csv_files for file in [
    "TfL_london_transport_tube.csv",
    "TfL_london_transport_tube_sightseeing.csv",
    "TfL_london_transport_tube_stations.csv",
]]):  
    raise ValueError(
        f"{filename} csv file is not available in {path_data}"
    )

# Import and format data

## Import dataframe containing linkage between each station

In [4]:
df_link_stations = pd.read_csv(
    f"{path_data}/TfL_london_transport_tube.csv", decimal=","
)
display(df_link_stations)
df_link_stations = df_link_stations.apply(
    lambda x: x.str.strip() if x.dtype == "object" else x
)

# Remove two unrelevant lines 'Docklands Light Railway' and 'East London'
df_link_stations = df_link_stations[~df_link_stations["Line"].isin(['Docklands Light Railway', 'East London'])]

# Replace all "and" by "&" in all station names 
df_link_stations["From_Station"] = df_link_stations["From_Station"].str.replace(" and ", " & ")
df_link_stations["To_Station"] = df_link_stations["To_Station"].str.replace(" and ", " & ")

# Replace "Bank" by "Bank & Monument" for convenience
df_link_stations["From_Station"] = df_link_stations["From_Station"].str.replace("^Bank$", "Bank & Monument", regex=True)
df_link_stations["To_Station"] = df_link_stations["To_Station"].str.replace("^Bank$", "Bank & Monument", regex=True)

# Remove (B) and (C) from Edgware Road for convenience
df_link_stations["From_Station"] = df_link_stations["From_Station"].str.replace(r" \([B|C]\)", "", regex=True)
df_link_stations["To_Station"] = df_link_stations["To_Station"].str.replace(r" \([B|C]\)", "", regex=True)

# Add spaces in "Heathrow Terminals 123"
df_link_stations["From_Station"] = df_link_stations["From_Station"].str.replace("Heathrow Terminals 123", "Heathrow Terminals 1 2 3", regex=False)
df_link_stations["To_Station"] = df_link_stations["To_Station"].str.replace("Heathrow Terminals 123", "Heathrow Terminals 1 2 3", regex=False)

# Unique stations
unique_stations = list(np.unique(df_link_stations[["From_Station", "To_Station"]].values.flatten()))
print(f"Number of unique stations: {len(unique_stations)}")

df_link_stations

Unnamed: 0,Line,Order,From_Station,To_Station,Total,PM Peak,Evening,Late
0,Bakerloo,1,Elephant & Castle,Lambeth North,13.432,4.118,1.288,398.000
1,Bakerloo,2,Lambeth North,Waterloo,18.597,5.718,1.916,580.000
2,Bakerloo,3,Waterloo,Embankment,46.020,13.746,4.609,1.466
3,Bakerloo,4,Embankment,Charing Cross,55.475,17.155,5.959,1.940
4,Bakerloo,5,Charing Cross,Piccadilly Circus,63.929,20.112,7.712,3.035
...,...,...,...,...,...,...,...,...
859,Victoria,13,Pimlico,Vauxhall,94.853,37.711,24.553,10.574
860,Victoria,14,Vauxhall,Stockwell,66.561,27.117,18.342,7.992
861,Victoria,15,Stockwell,Brixton,41.178,15.972,11.553,5.361
862,Waterloo & City,1,Waterloo,Bank and Monument,5.554,5.197,1.465,386.000


Number of unique stations: 266


Unnamed: 0,Line,Order,From_Station,To_Station,Total,PM Peak,Evening,Late
0,Bakerloo,1,Elephant & Castle,Lambeth North,13.432,4.118,1.288,398.000
1,Bakerloo,2,Lambeth North,Waterloo,18.597,5.718,1.916,580.000
2,Bakerloo,3,Waterloo,Embankment,46.020,13.746,4.609,1.466
3,Bakerloo,4,Embankment,Charing Cross,55.475,17.155,5.959,1.940
4,Bakerloo,5,Charing Cross,Piccadilly Circus,63.929,20.112,7.712,3.035
...,...,...,...,...,...,...,...,...
859,Victoria,13,Pimlico,Vauxhall,94.853,37.711,24.553,10.574
860,Victoria,14,Vauxhall,Stockwell,66.561,27.117,18.342,7.992
861,Victoria,15,Stockwell,Brixton,41.178,15.972,11.553,5.361
862,Waterloo & City,1,Waterloo,Bank & Monument,5.554,5.197,1.465,386.000


## Import dataframe containing station information

In [5]:
df_stations = pd.read_csv(f"{path_data}/TfL_london_transport_tube_stations.csv")
display(df_stations)
df_stations = df_stations.drop(["Zone_original", "OS X", "OS Y"], axis=1)
df_stations = df_stations.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_stations["Station"] = df_stations["Station"].str.replace(" and ", " & ")

# Keep only first occurence of station info (e.g. "Edgware Road" appears twice)
count_stations = df_stations["Station"].value_counts()
duplicated_stations = count_stations[count_stations > 1]
print(f"Duplicated stations:\n{duplicated_stations}\n")
df_stations = df_stations.drop_duplicates("Station", keep="first")

# Replace "Bank" by "Bank & Monument" for convenience
df_stations["Station"] = df_stations["Station"].str.replace("^Bank$", "Bank & Monument", regex=True)
df_stations = df_stations.drop(df_stations[df_stations["Station"] == "Monument"].index, axis=0)

# Add apostrophe in "Earls Court", "Kings Cross St. Pancras", "Queen's Park",
# "Regent's Park", "Shepherd's Bush", "Shepherd's Bush Market", "St. John's Wood", "St. Paul's"
df_stations["Station"] = df_stations["Station"].str.replace("^Earls", "Earl's", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^Kings Cross St. Pancras", "King's Cross St. Pancras", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^Queens Park", "Queen's Park", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^Regents Park", "Regent's Park", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^Shepherds Bush", "Shepherd's Bush", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^Shepherds Bush Market", "Shepherd's Bush Market", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^St. Johns Wood", "St. John's Wood", regex=True)
df_stations["Station"] = df_stations["Station"].str.replace("^St. Pauls", "St. Paul's", regex=True)

# Add info about Kensington (Olympia) station
df_stations.loc[-1] = {
    "Station": "Kensington (Olympia)",
    "Latitude": 51.4979934,
    "Longitude": -0.2099576,
    "Zone": 2,
    "Postcode": "W14 0NE"
}

# Keep only stations present in linkage dataframe above
all_stations = df_stations["Station"].values.tolist()
df_stations = df_stations[df_stations["Station"].isin(unique_stations)]

# Set clean index and sort rows alphabetically
df_stations = df_stations.sort_values("Station")
df_stations.index = range(len(df_stations))

df_stations

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode,Zone_original
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB,3
1,Abbey Wood,547297,179002,51.490784,0.120272,4,SE2 9RH,4
2,Acton Central,520613,180299,51.508757,-0.263430,2,W3 6BH,2
3,Acton Main Line,520296,181196,51.516886,-0.267690,3,W3 9EH,3
4,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN,3
...,...,...,...,...,...,...,...,...
648,Woodside Park,525725,192564,51.617868,-0.185426,4,N12 8SE,4
649,Woolwich,543931,178994,51.491578,0.071819,4,SE18 6EU,4
650,Woolwich Arsenal,543754,178803,51.489907,0.069194,4,SE18 6HX,4
651,Woolwich Dockyard,542738,178908,51.491108,0.054612,3,SE18 5JY,3


Duplicated stations:
Station
Hammersmith     2
Edgware Road    2
Name: count, dtype: int64



Unnamed: 0,Station,Latitude,Longitude,Zone,Postcode
0,Acton Town,51.503071,-0.280303,3,W3 8HN
1,Aldgate,51.514342,-0.075627,1,EC3N 1AH
2,Aldgate East,51.515082,-0.073001,1,E1 7PT
3,Alperton,51.541209,-0.299516,4,HA0 4LL
4,Amersham,51.674128,-0.606514,9,HP6 5AZ
...,...,...,...,...,...
261,Wimbledon Park,51.434581,-0.199186,3,SW19 7DZ
262,Wood Green,51.597454,-0.109527,3,N22 8HH
263,Wood Lane,51.509658,-0.224473,2,W12 7DS
264,Woodford,51.607202,0.034056,4,IG8 7QE


In [6]:
#assert not (set(unique_stations) - set(all_stations)), "Some station information have not been found"

## Import dataframe containing tube station information

In [7]:
df_sightseeing = pd.read_csv(
    f"{path_data}/TfL_london_transport_tube_sightseeing.csv", sep=";", header=1
)
df_sightseeing = df_sightseeing.T.drop_duplicates().T.rename(columns={"Blurb": "Note"})
df_sightseeing = df_sightseeing.apply(
    lambda x: x.str.strip() if x.dtype == "object" else x
)
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace(" and ", " & ")

# Replace "Bank" by "Bank & Monument" for convenience
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("^Bank$", "Bank & Monument", regex=True)
df_sightseeing = df_sightseeing.drop(df_sightseeing[df_sightseeing["Station"] == "Monument"].index, axis=0)

# Remove lines from Docklands Light Railway (DLR) line
df_sightseeing = df_sightseeing[~df_sightseeing["Station"].isin([
    "Crossharbour", "Custom House for ExCeL"
])]

# Remove (B) and (C) from Edgware Road
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace(r" \((Bakerloo|H & C)\)", "", regex=True)

# Remove (District & Picc) from Hammersmith
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace(r" \(District & Picc\)", "", regex=True)

# Add dashes to Harrow on the Hill station
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("Harrow on the Hill", "Harrow-on-the-Hill", regex=True)

# Add spaces in "Heathrow Terminals 123"
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("Heathrow Terminals 123", "Heathrow Terminals 1 2 3", regex=False)

# Add dot in King's Cross St Pancras, St John's Wood and St Paul's
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("King's Cross St Pancras", "King's Cross St. Pancras", regex=False)
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("St John's Wood", "St. John's Wood", regex=False)
df_sightseeing["Station"] = df_sightseeing["Station"].str.replace("St Paul's", "St. Paul's", regex=False)

df_sightseeing

Unnamed: 0,Station,Note
0,Abbey Road,Explore the historic House Mill and the Three ...
1,Acton Town,Gunnersbury Park and Museum offer landscaped g...
2,Aldgate,See the historic Aldgate Pump and visit the ne...
3,Aldgate East,"Head to Brick Lane for famous curry houses, vi..."
4,All Saints,Chrisp Street Market and the Lansbury Estate s...
...,...,...
294,Wimbledon Park,Row on the lake at Wimbledon Park and watch te...
295,Wood Green,Take in the views from Alexandra Palace and sh...
296,Woodford,Explore Epping Forest and play golf at Woodfor...
297,Woodside Park,Walk the Dollis Valley Greenwalk and shop alon...


In [8]:
#assert not (set(df_sightseeing["Station"]) - set(df_stations["Station"])), "Some station information have not been found"

## Merge station information

In [9]:
if "Note" not in df_stations.columns:
    df_stations = pd.merge(df_stations, df_sightseeing, on="Station", how="left")
df_stations

Unnamed: 0,Station,Latitude,Longitude,Zone,Postcode,Note
0,Acton Town,51.503071,-0.280303,3,W3 8HN,Gunnersbury Park and Museum offer landscaped g...
1,Aldgate,51.514342,-0.075627,1,EC3N 1AH,See the historic Aldgate Pump and visit the ne...
2,Aldgate East,51.515082,-0.073001,1,E1 7PT,"Head to Brick Lane for famous curry houses, vi..."
3,Alperton,51.541209,-0.299516,4,HA0 4LL,Walk along the Grand Union Canal or explore th...
4,Amersham,51.674128,-0.606514,9,HP6 5AZ,Wander through the picturesque old town of Ame...
...,...,...,...,...,...,...
263,Wimbledon Park,51.434581,-0.199186,3,SW19 7DZ,Row on the lake at Wimbledon Park and watch te...
264,Wood Green,51.597454,-0.109527,3,N22 8HH,Take in the views from Alexandra Palace and sh...
265,Wood Lane,51.509658,-0.224473,2,W12 7DS,
266,Woodford,51.607202,0.034056,4,IG8 7QE,Explore Epping Forest and play golf at Woodfor...


## Compute distances between stations (using euclidian distance between station coordinates)

In [10]:
import numpy as np

def euclidean_distance_geo_coord(df_coordinates, latitude_deg=51.5):  # lat1, lon1, lat2, lon2):
    """
    Euclidean distance in meters.
    Accurate enough for distances < 5km.

    By default, latitude is set to London (51.5°)
    """
    # Meters per degree at London's latitude (51.5°)
    meters_per_lat = 111320  # Constant everywhere
    meters_per_lon = 111320 * np.cos(latitude_deg)
    
    dy = (df_coordinates.iloc[0, 0] - df_coordinates.iloc[1, 0]) * meters_per_lat
    dx = (df_coordinates.iloc[0, 1] - df_coordinates.iloc[1, 1]) * meters_per_lon
    
    return np.sqrt(dx**2 + dy**2)

In [11]:
def add_stations_distances(df_link_stations):
    list_dist_stations = []

    for row in tqdm(df_link_stations.itertuples(), total=len(df_link_stations)):
        # Get origin and destination stations
        from_station = row.From_Station
        to_station = row.To_Station
        #print(f"From {from_station} to {to_station}")
    
        # Get stations coordinates
        df_coordinates = df_stations[df_stations["Station"].isin([from_station, to_station])].loc[:, ["Latitude", "Longitude"]]
        
        # Compute euclidean distance between stations coordinates and get data in meters
        dist = int(euclidean_distance_geo_coord(df_coordinates))
        #print(f"Euclidean distance: {dist}")
    
        # Save for df_link_stations
        list_dist_stations.append(dist)
    
    assert len(list_dist_stations) == len(df_link_stations), "List of distances should be the same size as df_links_stations"

    # Add Distance column to df_link_stations
    df_link_stations["DistanceMeters"] = list_dist_stations

In [12]:
# Add Distance column to df_link_stations
if "DistanceMeters" not in df_link_stations.columns:
    add_stations_distances(df_link_stations)
    print("Euclidean distance successfully added")

  0%|          | 0/714 [00:00<?, ?it/s]

Euclidean distance successfully added


In [13]:
df_link_stations

Unnamed: 0,Line,Order,From_Station,To_Station,Total,PM Peak,Evening,Late,DistanceMeters
0,Bakerloo,1,Elephant & Castle,Lambeth North,13.432,4.118,1.288,398.000,545
1,Bakerloo,2,Lambeth North,Waterloo,18.597,5.718,1.916,580.000,450
2,Bakerloo,3,Waterloo,Embankment,46.020,13.746,4.609,1.466,571
3,Bakerloo,4,Embankment,Charing Cross,55.475,17.155,5.959,1.940,146
4,Bakerloo,5,Charing Cross,Piccadilly Circus,63.929,20.112,7.712,3.035,359
...,...,...,...,...,...,...,...,...,...
859,Victoria,13,Pimlico,Vauxhall,94.853,37.711,24.553,10.574,544
860,Victoria,14,Vauxhall,Stockwell,66.561,27.117,18.342,7.992,1511
861,Victoria,15,Stockwell,Brixton,41.178,15.972,11.553,5.361,1092
862,Waterloo & City,1,Waterloo,Bank & Monument,5.554,5.197,1.465,386.000,1443


# Create graph from dataframe

In [14]:
G = create_graph_from_df(
    df_link_stations,
    source_node_col={"id": "From_Station", "type": "Station"},
    target_node_col={"id": "To_Station", "type": "Station"},
    node_attributes_df=df_stations,
    node_attributes_key_col="Station",
    #edge_col="Line",
    attributes_edges=["Line", "DistanceMeters"]
)

In [15]:
# Show 10 first nodes
for node in list(G.nodes(data=True))[:10]:
    print(node)

('Elephant & Castle', {'displayName': 'Elephant & Castle', 'type': 'Station', 'Latitude': 51.4958492200292, 'Longitude': -0.100738714256282, 'Zone': 1, 'Postcode': 'SE1 6LW', 'Note': 'Visit the Imperial War Museum or catch a show at Southwark Playhouse.'})
('Lambeth North', {'displayName': 'Lambeth North', 'type': 'Station', 'Latitude': 51.4991297653886, 'Longitude': -0.111768260460206, 'Zone': 1, 'Postcode': 'SE1 7XG', 'Note': 'Visit the Imperial War Museum and browse the Saturday market on Lower Marsh.'})
('Waterloo', {'displayName': 'Waterloo', 'type': 'Station', 'Latitude': 51.5031464567835, 'Longitude': -0.113259251258196, 'Zone': 1, 'Postcode': 'SE1 7NY', 'Note': 'Ride the London Eye, stroll along the South Bank and see a play at the National Theatre.'})
('Embankment', {'displayName': 'Embankment', 'type': 'Station', 'Latitude': 51.5073122969468, 'Longitude': -0.12236733566212, 'Zone': 1, 'Postcode': 'WC2N 6NS', 'Note': 'Relax in Victoria Embankment Gardens and see Cleopatra’s Ne

In [16]:
# Show 10 first edges
for edge in list(G.edges(data=True))[:10]:
    print(edge)

('Elephant & Castle', 'Lambeth North', {'Line': 'Bakerloo', 'DistanceMeters': 545})
('Elephant & Castle', 'Borough', {'Line': 'Northern', 'DistanceMeters': 626})
('Elephant & Castle', 'Kennington', {'Line': 'Northern', 'DistanceMeters': 815})
('Lambeth North', 'Waterloo', {'Line': 'Bakerloo', 'DistanceMeters': 450})
('Lambeth North', 'Elephant & Castle', {'Line': 'Bakerloo', 'DistanceMeters': 545})
('Waterloo', 'Embankment', {'Line': 'Northern', 'DistanceMeters': 571})
('Waterloo', 'Lambeth North', {'Line': 'Bakerloo', 'DistanceMeters': 450})
('Waterloo', 'Westminster', {'Line': 'Jubilee', 'DistanceMeters': 473})
('Waterloo', 'Southwark', {'Line': 'Jubilee', 'DistanceMeters': 285})
('Waterloo', 'Kennington', {'Line': 'Northern', 'DistanceMeters': 1640})


In [17]:
print(G)

DiGraph with 266 nodes and 614 edges


# Create Cypher CREATE command

## Build CREATE command

In [18]:
%%time

# Build CREATE command from networkx object
graph_CREATE_command = build_create_command_from_networkx(G, node_type_key='type', edge_type_key='CONNECTION')

print(f"""
Cypher CREATE command :
* size: {len(graph_CREATE_command.encode('utf-8'))/1024/1000:.4f} MB\n
{100 * '*'}
{graph_CREATE_command if len(graph_CREATE_command.split("\n")) < 1000 else "\n".join(graph_CREATE_command.split('\n')[:10]) + "\n...\n" + "\n".join(graph_CREATE_command.split('\n')[-10:])}
{100 * '*'}
""")

Cypher query will create graph with 266 nodes and 614 edges

Cypher CREATE command :
* size: 0.1528 MB

****************************************************************************************************
CREATE (:Station {id: "Elephant & Castle", displayName: "Elephant & Castle", type: "Station", Latitude: 51.4958492200292, Longitude: -0.100738714256282, Zone: 1, Postcode: "SE1 6LW", Note: "Visit the Imperial War Museum or catch a show at Southwark Playhouse."}),
(:Station {id: "Lambeth North", displayName: "Lambeth North", type: "Station", Latitude: 51.4991297653886, Longitude: -0.111768260460206, Zone: 1, Postcode: "SE1 7XG", Note: "Visit the Imperial War Museum and browse the Saturday market on Lower Marsh."}),
(:Station {id: "Waterloo", displayName: "Waterloo", type: "Station", Latitude: 51.5031464567835, Longitude: -0.113259251258196, Zone: 1, Postcode: "SE1 7NY", Note: "Ride the London Eye, stroll along the South Bank and see a play at the National Theatre."}),
(:Station {id: "E

## Split command into chunks

In [19]:
%%time

chunks = split_cypher_commands(graph_CREATE_command, max_size_mb=1)

print(f"✓ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

print("\nNode chunks:")
for i, chunk in enumerate(chunks['node_chunks']):
    print(f"  Node chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break
        
print("\nEdge chunks:")
for i, chunk in enumerate(chunks['edge_chunks']):
    print(f"  Edge chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

✓ Split into 1 node chunk(s) and 614 edge chunk(s)

Node chunks:
  Node chunk 1: 65.3 KB

Edge chunks:
  Edge chunk 1: 0.1 KB
  Edge chunk 2: 0.1 KB
  Edge chunk 3: 0.1 KB
  Edge chunk 4: 0.1 KB
  Edge chunk 5: 0.1 KB
  Edge chunk 6: 0.1 KB
  Edge chunk 7: 0.1 KB
  Edge chunk 8: 0.1 KB
  Edge chunk 9: 0.1 KB
  Edge chunk 10: 0.1 KB
  Edge chunk 11: 0.1 KB
  ...
CPU times: user 9.35 ms, sys: 907 μs, total: 10.3 ms
Wall time: 9.92 ms


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h4>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h4>
</div>

In [20]:
from turingdb import TuringDB

# Create TuringDB client
# set host parameter to the URL (as string) on which TuringDB is running,
# default "http://localhost:6666"
client = TuringDB(host="http://localhost:6666")
try:
    client.warmup()
except Exception as e:
    print(f"TuringDB not started, please run `uv run turingdb` in your terminal")

In [21]:
# Get list of available graphs
list_graphs = client.list_available_graphs()

In [22]:
# Get list of loaded graphs
#client.list_loaded_graphs()

In [23]:
# Set graph name
graph_name_prefix = example_name
graph_name_nb_suffix = str(
    max(
        [
            int(re.sub(graph_name_prefix, "", g))
            for g in list_graphs
            if g.startswith(graph_name_prefix)
            and re.sub(graph_name_prefix, "", g).isdigit()
        ]
        + [0]
    )
    + 1
)
graph_name = graph_name_prefix + graph_name_nb_suffix
graph_name = re.sub("-", "_", graph_name)
graph_name

'london_transport_TfL2'

In [24]:
from turingdb.exceptions import TuringDBException

In [25]:
%%time

# Set graph
try:
    client.create_graph(graph_name)
except TuringDBException as e:
    print(e)

# Set working graph
client.set_graph(graph_name)

CPU times: user 1.07 ms, sys: 2.52 ms, total: 3.59 ms
Wall time: 9.99 ms


In [26]:
%%time

# Create a new change on the graph
change = client.new_change()
print(f"Current change {change}")

# Checkout into the change
client.checkout(change=change)

Current change 0
CPU times: user 3.51 ms, sys: 0 ns, total: 3.51 ms
Wall time: 3.34 ms


In [27]:
%%time

# Run CREATE command
print("\nExecuting query on TuringDB...")
start_time = time.time()

print(f"✓ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

# CREATE nodes
print("\nNode chunks:")
for i, chunk in enumerate(tqdm(chunks['node_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"✓ {len(chunks['node_chunks'])} node chunks done")

# CREATE edges
print("\nEdge chunks:")
for i, chunk in enumerate(tqdm(chunks['edge_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"✓ {len(chunks['edge_chunks'])} edge chunks done")

execution_time = time.time() - start_time
print(f"\n✓ Graph created successfully in {execution_time:.2f} seconds")

# Submit changes
start_time = time.time()
client.query("CHANGE SUBMIT")
execution_time = time.time() - start_time
print(f"\n✓ Changes successfully submitted in {execution_time:.2f} seconds")

# Checkout into main
client.checkout()


Executing query on TuringDB...
✓ Split into 1 node chunk(s) and 614 edge chunk(s)

Node chunks:


  0%|          | 0/1 [00:00<?, ?it/s]

✓ 1 node chunks done

Edge chunks:


  0%|          | 0/614 [00:00<?, ?it/s]

✓ 614 edge chunks done

✓ Graph created successfully in 0.61 seconds

✓ Changes successfully submitted in 0.08 seconds
CPU times: user 512 ms, sys: 43 ms, total: 555 ms
Wall time: 689 ms


<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about 📮 Metaqueries, please check TuringDB documentation on this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [28]:
%%time

# CALL propertyTypes() - returns a column of all the different node and edge properties and their types in the database
command = """
CALL db.propertyTypes()
"""
df_propertyTypes = client.query(command)
if df_propertyTypes.empty:
    print("No result found")
else:
    display(df_propertyTypes)

Unnamed: 0,id,propertyType,valueType
0,0,Note,String
1,1,Zone,Int64
2,2,Longitude,Double
3,3,Latitude,Double
4,4,type,String
5,5,displayName,String
6,6,Postcode,String
7,7,id,String
8,8,DistanceMeters,Int64
9,9,Line,String


CPU times: user 7.43 ms, sys: 1.83 ms, total: 9.27 ms
Wall time: 8.25 ms


In [29]:
# Get node properties
nodes_properties = df_propertyTypes["propertyType"].values.tolist()
print(f"Node properties: {nodes_properties}")

Node properties: ['Note', 'Zone', 'Longitude', 'Latitude', 'type', 'displayName', 'Postcode', 'id', 'DistanceMeters', 'Line']


In [30]:
%%time

# CALL labels () - returns a column of all the different node labels
command = """
CALL db.labels()
"""
df_labels = client.query(command)
if df_labels.empty:
    print("No result found")
else:
    display(df_labels)

Unnamed: 0,id,label
0,0,Station


CPU times: user 7.09 ms, sys: 1.01 ms, total: 8.1 ms
Wall time: 7.36 ms


In [31]:
%%time

# CALL edgeTypes() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL db.edgeTypes()
"""
df_edgeTypes = client.query(command)
if df_edgeTypes.empty:
    print("No result found")
else:
    display(df_edgeTypes)

Unnamed: 0,id,edgeType
0,0,CONNECTION


CPU times: user 7.68 ms, sys: 91 μs, total: 7.77 ms
Wall time: 7.14 ms


## Counts

In [32]:
%%time

# Find number of nodes and number of edges in the graph
n_nodes = len(client.query("MATCH (n) RETURN n"))
n_edges = len(client.query("MATCH (n)-->(m) RETURN n, m"))
print(f"Graph: {n_nodes:,} nodes and {n_edges:,} edges")

Graph: 266 nodes and 614 edges
CPU times: user 5.75 ms, sys: 1.11 ms, total: 6.86 ms
Wall time: 5.84 ms


In [33]:
%%time

# Count all nodes
command = """
MATCH (n)
RETURN COUNT(n)
"""
df_count_nodes = client.query(command)
display(df_count_nodes)

# Count all edges
command = """
MATCH (n)-->()
RETURN COUNT(n)
"""
df_count_edges = client.query(command)
display(df_count_edges)

# Find number of nodes and number of edges in the graph
n_nodes = int(df_count_nodes.loc[0, "COUNT(n)"])
n_edges = int(df_count_edges.loc[0, "COUNT(n)"])
print(f"Graph: {n_nodes:,} nodes and {n_edges:,} edges")

Unnamed: 0,COUNT(n)
0,266


Unnamed: 0,COUNT(n)
0,614


Graph: 266 nodes and 614 edges
CPU times: user 13.4 ms, sys: 2.05 ms, total: 15.4 ms
Wall time: 13.9 ms


In [34]:
# Count number of nodes for each label
for label in df_labels["label"]:
    print(100 * '-')
    print(f"label: {label}")
    df_curr_label = client.query(f"""
    MATCH (n:{label})
    RETURN n.displayName
    """)
    df_curr_label_count = client.query(f"""
    MATCH (n:{label})
    RETURN count(n)
    """)
    display(df_curr_label)
    display(df_curr_label_count)
    
    print()
print(100 * '-')

----------------------------------------------------------------------------------------------------
label: Station


Unnamed: 0,n.displayName
0,Northwood Hills
1,Northwood
2,Moor Park
3,Croxley
4,Watford
...,...
261,Wimbledon Park
262,Wimbledon
263,Earl s Court
264,West Kensington


Unnamed: 0,count(n)
0,266



----------------------------------------------------------------------------------------------------


## Queries

In [35]:
%%time

# Match all edges and return them - all links between two stations
command = """
MATCH (n)-[e]->(m)
RETURN n.displayName, e.Line, m.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,n.displayName,e.Line,m.displayName
0,Northwood Hills,Metropolitan,Northwood
1,Northwood Hills,Metropolitan,Pinner
2,Northwood,Metropolitan,Moor Park
3,Northwood,Metropolitan,Northwood Hills
4,Moor Park,Metropolitan,Croxley
...,...,...,...
609,West Kensington,District,Earl s Court
610,West Kensington,District,Barons Court
611,Barons Court,District,West Kensington
612,Barons Court,Piccadilly,Hammersmith


CPU times: user 9.43 ms, sys: 1.92 ms, total: 11.3 ms
Wall time: 10.4 ms


In [36]:
%%time

# Check if there exists a path between Hoxton Station and South Kensington in 1 hop
command = """
MATCH (departure)-[link]->(arrival)
WHERE departure.displayName = 'Hoxton Station'
AND arrival.displayName = 'South Kensington'
RETURN departure.displayName, link, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

No result found
CPU times: user 3.66 ms, sys: 0 ns, total: 3.66 ms
Wall time: 3.31 ms


In [37]:
%%time

# Check if there exists a path between Blackfriars and Mansion House in 1 hop
command = """
MATCH (departure)-[link]->(arrival)
WHERE departure.displayName = 'Blackfriars'
AND arrival.displayName = 'Mansion House'
RETURN departure.displayName, link.Line, link.DistanceMeters, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,departure.displayName,link.Line,link.DistanceMeters,arrival.displayName
0,Blackfriars,Circle,328,Mansion House


CPU times: user 8.78 ms, sys: 964 μs, total: 9.75 ms
Wall time: 8.99 ms


In [38]:
%%time

# Check if there exists a path between Victoria and South Kensington in 2 hops
command = """
MATCH (departure)-->(intermediate)-->(arrival)
WHERE departure.displayName = 'Victoria'
AND arrival.displayName = 'South Kensington'
RETURN departure.displayName, intermediate.displayName, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,departure.displayName,intermediate.displayName,arrival.displayName
0,Victoria,Sloane Square,South Kensington


CPU times: user 7.67 ms, sys: 1.11 ms, total: 8.77 ms
Wall time: 8.05 ms


In [39]:
%%time

# Check if there exists a path between Victoria and South Kensington in 2 hops returning connections
command = """
MATCH (departure)-[l1:CONNECTION]->(intermediate)-[l2:CONNECTION]->(arrival)
WHERE departure.displayName = 'Victoria'
AND arrival.displayName = 'South Kensington'
RETURN departure.displayName, l1.Line, intermediate.displayName, l2.Line, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,departure.displayName,l1.Line,intermediate.displayName,l2.Line,arrival.displayName
0,Victoria,Circle,Sloane Square,Circle,South Kensington


CPU times: user 8.71 ms, sys: 950 μs, total: 9.66 ms
Wall time: 9.07 ms


In [40]:
%%time

# Check if there exists a path between Victoria and South Kensington in 2 hops using District Line only
command = """
MATCH (departure)-[l1:CONNECTION]->(intermediate)-[l2:CONNECTION]->(arrival)
WHERE departure.displayName = 'Victoria'
AND arrival.displayName = 'South Kensington'
AND l1.Line = 'District Line'
AND l2.Line = 'District Line'
RETURN departure.displayName, l1.Line, intermediate.displayName, l2.Line, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

No result found
CPU times: user 4.28 ms, sys: 44 μs, total: 4.32 ms
Wall time: 3.52 ms


In [41]:
%%time

# Check if there exists a path between Victoria and South Kensington in 2 hops using Circle Line only
command = """
MATCH (departure)-[l1:CONNECTION]->(intermediate)-[l2:CONNECTION]->(arrival)
WHERE departure.displayName = 'Victoria'
AND arrival.displayName = 'South Kensington'
AND l1.Line = 'Circle'
AND l2.Line = 'Circle'
RETURN departure.displayName, l1.Line, intermediate.displayName, l2.Line, arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,departure.displayName,l1.Line,intermediate.displayName,l2.Line,arrival.displayName
0,Victoria,Circle,Sloane Square,Circle,South Kensington


CPU times: user 10.5 ms, sys: 85 μs, total: 10.6 ms
Wall time: 9.7 ms


In [42]:
%%time

# Get route between Stratford and Liverpool Street getting distances between stations
command = """
MATCH (departure)-[l1:CONNECTION]->(i1)-[l2:CONNECTION]->(i2)-[l3:CONNECTION]->(arrival)
WHERE departure.displayName = 'Stratford'
AND arrival.displayName = 'Liverpool Street'
RETURN departure.displayName, l1.Line, l1.DistanceMeters,
i1.displayName, l2.Line, l2.DistanceMeters,
i2.displayName, l3.Line, l3.DistanceMeters,
arrival.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,departure.displayName,l1.Line,l1.DistanceMeters,i1.displayName,l2.Line,l2.DistanceMeters,i2.displayName,l3.Line,l3.DistanceMeters,arrival.displayName
0,Stratford,Central,2110,Mile End,Central,860,Bethnal Green,Central,1453,Liverpool Street


CPU times: user 14.1 ms, sys: 1.18 ms, total: 15.3 ms
Wall time: 14.2 ms


# Python functions to construct Cypher queries

In [43]:
# Define build_query_hops_with_edge function
def build_query_hops_with_edge(
    departure_station: str, arrival_station: str, hopCount: int
) -> tuple[str, list[str]]:
    """
    Build query string and corresponding column names.

    Returns:
        tuple: (query_string, column_names)
    """
    #query = "MATCH "
    #query += '(departure:Station{displayName:"'
    #query += departure_station
    #query += '"})'
    query = "MATCH (departure:Station)"

    for k in range(1, hopCount):
        query += f"-[c{k}:CONNECTION]->(st{k}:Station)"

    query += f"""-[c{hopCount}:CONNECTION]->(arrival:Station)\n
    WHERE departure.displayName = '{departure_station}'
    AND arrival.displayName = '{arrival_station}'
    """

    #query += (
    #    "-[c"
    #    + str(hopCount)
    #    + ':CONNECTED]-(arrival:Station{displayName="'
    #    + arrival_station
    #    + '"})'
    #)

    query += " RETURN departure, departure.displayName, departure.Note, "
    for k in range(1, hopCount):
        query += f"c{k}.Line, st{k}, st{k}.displayName, st{k}.Note, "
    query += f"c{hopCount}.Line, arrival, arrival.displayName, arrival.Note"

    return re.sub(' +', ' ', query.replace('\n', ''))

In [44]:
departure_station = "Paddington"
arrival_station = "Blackfriars"

distmax = 14

In [45]:
#%%time

# Loop over distance (number of hops) until we find a path between requested start and end stations
for k in range(1, distmax + 1):
    print(100 * "*")
    print(f"*** k = {k}\n")

    # We get a query and column names
    query_str = build_query_hops_with_edge(departure_station, arrival_station, k)
    print(f"query_str :\n{query_str}")
    df_path = client.query(query_str)

    if not df_path.empty:
        display(df_path)
        break

print(100 * "*")

****************************************************************************************************
*** k = 1

query_str :
MATCH (departure:Station)-[c1:CONNECTION]->(arrival:Station) WHERE departure.displayName = 'Paddington' AND arrival.displayName = 'Blackfriars' RETURN departure, departure.displayName, departure.Note, c1.Line, arrival, arrival.displayName, arrival.Note
****************************************************************************************************
*** k = 2

query_str :
MATCH (departure:Station)-[c1:CONNECTION]->(st1:Station)-[c2:CONNECTION]->(arrival:Station) WHERE departure.displayName = 'Paddington' AND arrival.displayName = 'Blackfriars' RETURN departure, departure.displayName, departure.Note, c1.Line, st1, st1.displayName, st1.Note, c2.Line, arrival, arrival.displayName, arrival.Note
****************************************************************************************************
*** k = 3

query_str :
MATCH (departure:Station)-[c1:CONNECTION]->(st1:St

Unnamed: 0,departure,departure.displayName,departure.Note,c1.Line,st1,st1.displayName,st1.Note,c2.Line,st2,st2.displayName,...,st6.displayName,st6.Note,c7.Line,st7,st7.displayName,st7.Note,c8.Line,arrival,arrival.displayName,arrival.Note
0,177,Paddington,Stroll along the canals at Paddington Basin an...,Circle,176,Edgware Road,Explore Little Venice s canals and visit the b...,Circle,174,Baker Street,...,Embankment,Relax in Victoria Embankment Gardens and see C...,Circle,202,Temple,Discover the historic Temple Church and stroll...,Circle,203,Blackfriars,Cross the Blackfriars Bridge to the Tate Moder...


****************************************************************************************************


# Process returned dataframe

In [46]:
# Define create_journey_string function
def create_journey_string(row):
    """Create journey string from DataFrame row - safer version"""
    # Start with departure station
    result = f"({row['departure.displayName']})"

    # Find all available segment numbers
    line_cols = [
        col for col in row.index if col.startswith("c") and col.endswith(".Line")
    ]
    segment_numbers = sorted([int(col.split(".")[0][1:]) for col in line_cols])

    # Process each available segment
    for i in segment_numbers:
        line_col = f"c{i}.Line"
        station_col = f"st{i}.displayName"

        # Only process if both columns exist and have values
        if (
            line_col in row.index
            and station_col in row.index
            and not pd.isna(row[line_col])
            and not pd.isna(row[station_col])
        ):
            line = row[line_col]
            station = row[station_col]
            result += f"--[{line}]-->({station})"

    # Add arrival station if it exists
    if "arrival.displayName" in row.index and not pd.isna(row["arrival.displayName"]):
        # Check if there's a final line segment to the arrival
        max_segment = max(segment_numbers) if segment_numbers else 0
        final_line_col = f"l{max_segment}.Line"
        if final_line_col in row.index and not pd.isna(row[final_line_col]):
            result += f"--[{row[final_line_col]}]-->({row['arrival.displayName']})"

    return result

In [47]:
# Show journey strings (multiple paths of the same length can exist between two stations)
for i in range(min(len(df_path), 10)):
    journey_path = create_journey_string(
        df_path.filter(regex="displayName$|Line$", axis=1).iloc[i]
    )
    print(f"Path n°{i}: {journey_path}\n")

Path n°0: (Paddington)--[Circle]-->(Edgware Road)--[Circle]-->(Baker Street)--[Jubilee]-->(Bond Street)--[Jubilee]-->(Green Park)--[Jubilee]-->(Westminster)--[Circle]-->(Embankment)--[Circle]-->(Temple)



# Create subgraph to visualise

In [48]:
def get_subgraph(G, df):
    subset_nodes = df.filter(regex="displayName$", axis=1).iloc[0].values.tolist()
    subG = G.subgraph(subset_nodes).copy()

    return subG

In [49]:
# Get subgraph with stations from matched path
subG = get_subgraph(G, df_path)
print(subG)

# Build CREATE command from subgraph
create_command_subG = build_create_command_from_networkx(subG)
print(f"""
Cypher CREATE command :
* size: {len(create_command_subG.encode('utf-8'))/1024/1000:.4f} MB\n
{100 * '*'}
{create_command_subG \
if len(create_command_subG.split("\n")) < 10000 \
else "\n".join(create_command_subG.split('\n')[:5]) + "\n...\n" + "\n".join(create_command_subG.split('\n')[-5:])}
{100 * '*'}
""")

DiGraph with 9 nodes and 16 edges
Cypher query will create graph with 9 nodes and 16 edges

Cypher CREATE command :
* size: 0.0044 MB

****************************************************************************************************
CREATE (:Node {id: "Embankment", displayName: "Embankment", type: "Station", Latitude: 51.5073122969468, Longitude: -0.12236733566212, Zone: 1, Postcode: "WC2N 6NS", Note: "Relax in Victoria Embankment Gardens and see Cleopatra s Needle along the Thames."}),
(:Node {id: "Blackfriars", displayName: "Blackfriars", type: "Station", Latitude: 51.511586761695, Longitude: -0.102995384624505, Zone: 1, Postcode: "EC4V 4DD", Note: "Cross the Blackfriars Bridge to the Tate Modern or explore the riverside walkways."}),
(:Node {id: "Edgware Road", displayName: "Edgware Road", type: "Station", Latitude: 51.5206450845361, Longitude: -0.17053954989974, Zone: 1, Postcode: "W2 1DY", Note: "Explore Little Venice s canals and visit the beautiful St Mary Magdalene church."}

In [50]:
subgraph_name = f"{graph_name}_subgraph"
subgraph_name

'london_transport_TfL2_subgraph'

In [51]:
%%time

# Set graph
try:
    client.create_graph(subgraph_name)
except TuringDBException as e:
    print(e)

# Set working graph
client.set_graph(subgraph_name)

# Create a new change on the graph
client.checkout()
change = client.new_change()
print(f"Current change {change}")

# Checkout into the change
client.checkout(change=change)

Current change 0
CPU times: user 3.91 ms, sys: 1.18 ms, total: 5.09 ms
Wall time: 15.4 ms


In [52]:
%%time

chunks = split_cypher_commands(create_command_subG, max_size_mb=1)

print(f"✓ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

print("\nNode chunks:")
for i, chunk in enumerate(chunks['node_chunks']):
    print(f"  Node chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

print("\nEdge chunks:")
for i, chunk in enumerate(chunks['edge_chunks']):
    print(f"  Edge chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

✓ Split into 1 node chunk(s) and 16 edge chunk(s)

Node chunks:
  Node chunk 1: 2.2 KB

Edge chunks:
  Edge chunk 1: 0.1 KB
  Edge chunk 2: 0.1 KB
  Edge chunk 3: 0.1 KB
  Edge chunk 4: 0.1 KB
  Edge chunk 5: 0.1 KB
  Edge chunk 6: 0.1 KB
  Edge chunk 7: 0.1 KB
  Edge chunk 8: 0.1 KB
  Edge chunk 9: 0.1 KB
  Edge chunk 10: 0.1 KB
  Edge chunk 11: 0.1 KB
  ...
CPU times: user 770 μs, sys: 0 ns, total: 770 μs
Wall time: 714 μs


In [53]:
%%time

# Run CREATE command
print("\nExecuting query on TuringDB...")
start_time = time.time()

print(f"✓ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

# CREATE nodes
print("\nNode chunks:")
for i, chunk in enumerate(tqdm(chunks['node_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"✓ {len(chunks['node_chunks'])} node chunks done")

# CREATE edges
print("\nEdge chunks:")
for i, chunk in enumerate(tqdm(chunks['edge_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"✓ {len(chunks['edge_chunks'])} edge chunks done")

execution_time = time.time() - start_time
print(f"\n✓ Graph created successfully in {execution_time:.2f} seconds")

# Submit changes
start_time = time.time()
client.query("CHANGE SUBMIT")
execution_time = time.time() - start_time
print(f"\n✓ Changes successfully submitted in {execution_time:.2f} seconds")

# Checkout into main
client.checkout()


Executing query on TuringDB...
✓ Split into 1 node chunk(s) and 16 edge chunk(s)

Node chunks:


  0%|          | 0/1 [00:00<?, ?it/s]

✓ 1 node chunks done

Edge chunks:


  0%|          | 0/16 [00:00<?, ?it/s]

✓ 16 edge chunks done

✓ Graph created successfully in 0.04 seconds

✓ Changes successfully submitted in 0.07 seconds
CPU times: user 39.8 ms, sys: 7.42 ms, total: 47.2 ms
Wall time: 107 ms


<div class="alert alert-block alert-info">
    <h2>
        You can visualise the subgraph directly in the notebook below. For more details on nodes and edges, you can go to TuringDB visualizer (running on your instance)
    </h2>
</div>

<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

In [54]:
from pyvis.network import Network

net = Network(
    height="750px",
    width="100%",
    notebook=True,
    bgcolor="#ffffff",
    font_color="#000000",
    directed=True,
)

# Line colors (official TfL colors)
line_colors = {
    "Circle": "#FFD329",
    "Jubilee": "#A1A5A7",
    "Central": "#DC241F",
    "District": "#00782A",
    "Metropolitan": "#9B0058",
}

for node, data in subG.nodes(data=True):
    net.add_node(
        node,
        label=data.get("displayName", str(node)),
        title=f"{data.get('displayName', '')}<br>{data.get('Note', '')}",
        color="#3498db",
        size=25,
    )

for source, target, data in subG.edges(data=True):
    line = data.get("Line", "")
    color = line_colors.get(line, "#95a5a6")
    net.add_edge(source, target, title=line, label=line, color=color, width=3)

net.toggle_physics(True)
net.show(f"{example_name}_subgraph.html")

london_transport_TfL_subgraph.html


# Use LLM to generate Cypher query

Before running this section, create a `.env` file in the project root with your API keys:

```env
ANTHROPIC_API_KEY=your_key_here
OPENAI_API_KEY=your_key_here
MISTRAL_API_KEY=your_key_here

In [55]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [56]:
api_keys = {
    "Anthropic": os.getenv("ANTHROPIC_API_KEY"),
    "Mistral": os.getenv("MISTRAL_API_KEY"),
    "OpenAI": os.getenv("OPENAI_API_KEY"),
}

In [60]:
%%time

system_prompt = """
Only base your response on the data provided.
Do not add knowledge but describe it in a natural way as if you were a touristic guide, be friendly !
You can add forms but not content.
Format the output with nice Markdown format and emojis.
"""

prompt = f"""
Here is the path I got from stations {departure_station} to {arrival_station} : {df_path.to_dict()}.
Describe the path between these two stations.
Give me an itinerary of things I could visit on the way.
"""

provider = "Anthropic"

result = query_llm(
    prompt=prompt,
    system_prompt=system_prompt,
    provider=provider,
    model="claude-haiku-4-5-20251001",
    api_key=api_keys[provider],
    temperature=0.2,
)

In [62]:
display(Markdown(result))

# 🚇 Your Journey: Paddington to Blackfriars

Welcome! Let me guide you through this wonderful route across London's iconic Underground network!

---

## 📍 Your Complete Route

**Starting Point:** 🚂 **Paddington** → **Destination:** 🏛️ **Blackfriars**

You'll be traveling through **3 different Underground lines**: Circle, Jubilee, and back to Circle!

---

## 🎫 Stop-by-Stop Itinerary

### 1️⃣ **Paddington** (Circle Line)
🐻 *Your starting point!*
- Stroll along the charming canals at **Paddington Basin**
- Say hello to the famous **Paddington Bear statue**

### 2️⃣ **Edgware Road** (Circle Line)
🏘️ *A hidden gem!*
- Explore the picturesque canals of **Little Venice**
- Visit the beautiful **St Mary Magdalene Church**

### 3️⃣ **Baker Street** (Circle Line)
🔍 *Step into literary history!*
- Discover **221B Baker Street** - Sherlock Holmes' world
- Meet wax celebrities at **Madame Tussauds**

### 4️⃣ **Bond Street** (Switch to Jubilee Line)
💎 *Shopper's paradise!*
- Browse luxury boutiques on **Bond Street**
- Explore high street stores along **Oxford Street**

### 5️⃣ **Green Park** (Jubilee Line)
🌳 *Royal London awaits!*
- Stroll through the elegant **Green Park**
- Head towards **Buckingham Palace**
- Visit nearby **Piccadilly**

### 6️⃣ **Westminster** (Jubilee Line)
🏰 *The heart of British politics!*
- Marvel at the **Houses of Parliament** and **Big Ben**
- Visit the magnificent **Westminster Abbey**

### 7️⃣ **Embankment** (Switch back to Circle Line)
🌊 *Riverside relaxation!*
- Relax in **Victoria Embankment Gardens**
- See the iconic **Cleopatra's Needle** along the Thames

### 8️⃣ **Temple** (Circle Line)
⛪ *Historic treasures!*
- Discover the historic **Temple Church**
- Stroll through **Victoria Embankment Gardens** again

### 🎯 **Blackfriars** (Final Destination!)
🎨 *Your journey ends here!*
- Cross the **Blackfriars Bridge** to the **Tate Modern**
- Explore the beautiful **riverside walkways**

---

## ✨ Pro Tips for Your Journey

- 📸 Don't miss photo opportunities at Westminster with Big Ben!
- 👜 Allow extra time at Bond Street if you love shopping
- 🚶 Consider exiting at Embankment or Temple to walk along the Thames
- 🎭 The Tate Modern at Blackfriars is perfect for art lovers!

**Enjoy your London adventure!** 🇬🇧✨

# Use LLM to generate Cypher query

In [63]:
"""Build system prompt with TuringDB schema and examples"""

turingdb_cypher_system_prompt = """
You are an expert at converting natural language questions into TuringDB queries.

Your task is to generate syntactically correct TuringDB queries based on natural language input.

VERY IMPORTANT - YOU MUST FOLLOW THESE REQUESTS - TuringDB Syntax Guidelines:
1. Return ONLY the TuringDB query, no explanations or markdown formatting
2. Use MATCH, CREATE and WHERE operations only
3. Nodes: (n:Label {property = "value"}) or (n:Label {property: value})
4. Edges: Use DIRECTED syntax with ->
5. Pattern matching: MATCH (n)-[e]->(m)
6. Property matching: Use = operator for exact matching
7. Multiple constraints: (n:Person:Engineer {name = "John", age = 30})
8. Return all matched entities: RETURN n, e, m or use RETURN * for all
9. Filter using WHERE clause: MATCH (n:Person) WHERE n.name = 'John' RETURN n.firstname, n.lastname

VERY IMPORTANT - YOU ARE NOT ALLOWED TO USE THE FOLLOWING - FORBIDDEN in TuringDB:
- Do NOT use AS aliases
- Do NOT use LIMIT, SKIP clauses
- Do NOT use WITH clauses
- Do NOT use CALL (except for metaqueries)
- Do NOT use toLower() or other functions
- Do NOT use wildcard character (*)
- Do NOT use multi-hops pattern for edges: e.g. `-[e:CONNECTED*1..10]->`
- Do NOT use "end" or "s3" variable name

Supported TuringDB Operations:
- MATCH queries: MATCH (n:Label)-[e:Type]->(m) RETURN n, m
- CREATE queries: CREATE (n:Label{property="value"})-[e:Type]->(m:Label)
- Metaqueries: CALL db.propertyTypes(), CALL db.labels(), CALL db.edgeTypes()
- Property types: String ("text" or `text`), Boolean (true/false), Integer (20), Double (20.5)

Examples for few-shot learning:
- Find all persons: MATCH (n:Person) RETURN n
- Find connections: MATCH (n:Person)-[e]->(m:Person) RETURN n, e, m
- Create person: CREATE (n:Person{name="John", age=30})
- Match person with specific name: MATCH (p:Person) WHERE p.name = "John" RETURN p
- Path with 1 hop between Station Paddington and Blackfriars:  MATCH (first:Station{displayName:"Paddington"})-[e1:CONNECTED]->(last:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, last, last.displayName, last.Note
- Path with 2 hops between Station Paddington and Blackfriars: MATCH (first:Station{displayName:"Paddington"})-[e1:CONNECTED]->(s1:Station)-[e2:CONNECTED]->(last:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, last, last.displayName, last.Note
- Path with 8 hops between Station Paddington and Blackfriars: MATCH (first:Station{displayName:"Paddington"})-[e1:CONNECTED]->(s1:Station)-[e2:CONNECTED]->(s2:Station)-[e3:CONNECTED]->(s3:Station)-[e4:CONNECTED]->(s4:Station)-[e5:CONNECTED]->(s5:Station)-[e6:CONNECTED]->(s6:Station)-[e7:CONNECTED]->(s7:Station)-[e8:CONNECTED]->(last:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, s2, s2.displayName, s2.Note, e3.Line, s3, s3.displayName, s3.Note, e4.Line, s4, s4.displayName, s4.Note, e5.Line, s5, s5.displayName, s5.Note, e6.Line, s6, s6.displayName, s6.Note, e7.Line, s7, s7.displayName, s7.Note, e8.Line, last, last.displayName, last.Note
- Find all Chinese providers and what they supply: MATCH (n{provider_country:"CHN"}) RETURN n, n.provider_name, n.displayName, n.share_provided, n.type
- Find all deposition tools and their types: MATCH (specific)-[e:IS_TYPE_OF]->(general:Tool_Resource{displayName:"Deposition tools"}) RETURN specific, specific.displayName, specific.provider_name, e, general, general.displayName
"""

In [66]:
%%time

# Define maximum number of hops between two
distmax = 20

# Get subset of CREATE command to avoid exceeding context window
graph_CREATE_command_subset = "\n".join(
    graph_CREATE_command.split("\n")[:5] + graph_CREATE_command.split("\n")[-5:]
)

for k in range(1, distmax + 1):
    print(100 * "*")
    print(f"*** k = {k}\n")

    #########################################################################################################
    question = f"""
    Give me the Cypher query to find the path using London Tube between stations {departure_station} and {arrival_station}.
    The path between the two stations has to contain {k} hops.
    Very important :
    - Make sure you respect the number of hops and stations required between the two stations.
    - Pay attention to use the correct node and edge properties name in the MATCH section.
    - Pay attention to use the correct node and edge properties name in the RETURN section.
    - If no contrary information is asked by the user :
        - return all the matched nodes and edges and their properties in the RETURN section.
    """

    system_prompt = f"""
    TuringDB Cypher prompt :
    {turingdb_cypher_system_prompt}
    
    Here is a subset of the CREATE command used to create the graph, this way you know graph structure.
    Only a subset is passed because the whole command is too long. Please use this as the most relevant source of information :
    {graph_CREATE_command_subset}
    
    Here is also the output of "CALL LABELS ()" command, showing the different node types of the graph :
    {client.query("CALL db.labels()")}
    
    Here is also the output of "CALL EDGETYPES ()" command, showing the different edge types of the graph :
    {client.query("CALL db.edgeTypes()")}
    
    Very important :
    - You MUST follow current TuringDB Syntax Guidelines
    - You MUST NOT USE what is FORBIDDEN in TuringDB
    - By default, RETURN ALL THE MATCHED NODES AND EDGES AND THEIR PROPERTIES in the RETURN section (except contrary demand from user)
    - Use the correct node and edge properties name in the MATCH section.
    - Use the correct node and edge properties name in the RETURN section.
    - Pay attention to which properties come from nodes or edges, to create a functioning query
    - Pay attention to lower and uppercases in properties
    - If some properties contain spaces, be careful to wrap them
    
    Give me the query FOLLOWING TURINGDB GUIDELINES AND NOT USING WHAT IS FORBIDDEN for this specific question :
    """

    cypher_query = natural_language_to_cypher(
        question=question,
        system_prompt=system_prompt,
        provider=provider,
        model="claude-haiku-4-5-20251001",
        api_key=api_keys[provider],
    )
    print(f"cypher_query : {cypher_query}")

    #########################################################################################################

    ## Build column names to exactly match the RETURN clause
    # column_names = ['start', 'start.displayName', 'start.Note']
    #
    # for k_ in range(1, k):
    #    column_names.extend([
    #        f'e{k_}.Line',
    #        f's{k_}',
    #        f's{k_}.displayName',
    #        f's{k_}.Note'
    #    ])
    #
    # column_names.extend([
    #    f'e{k}.Line',
    #    'end',
    #    'end.displayName',
    #    'end.Note'
    # ])
    cypher_query

    #########################################################################################################

    try:
        df_path = client.query(cypher_query)
        if df_path.empty:
            print("--> No result found\n")
        else:
            display(df_path)
    except TuringDBException:
        print(f"Query generated by LLM failed.")

    break
    
print(100 * "*")

****************************************************************************************************
*** k = 1

cypher_query : MATCH (first:Station{displayName:"Paddington"})-[e1:CONNECTED]->(last:Station{displayName:"Blackfriars"}) RETURN first, first.displayName, first.Note, e1, last, last.displayName, last.Note
Query generated by LLM failed.
****************************************************************************************************
CPU times: user 41.6 ms, sys: 1.86 ms, total: 43.5 ms
Wall time: 1.21 s


In [67]:
# Show journey strings (multiple paths of the same length can exist between two stations)
for i in range(len(df_path)):
    journey_path = create_journey_string(
        df_path.filter(regex="displayName$|Line$", axis=1).iloc[0]
    )
    print(f"Path n°{i}: {journey_path}")

Path n°0: (Paddington)--[Circle]-->(Edgware Road)--[Circle]-->(Baker Street)--[Jubilee]-->(Bond Street)--[Jubilee]-->(Green Park)--[Jubilee]-->(Westminster)--[Circle]-->(Embankment)--[Circle]-->(Temple)


In [68]:
print("Notebook finished !")

Notebook finished !
