<div class="alert alert-block alert-success">
    <h1>
        Example notebook - London Tube
    </h1>
    <p>
        Link to dataset : <a href="https://drive.google.com/drive/folders/1Tfl7D5Khh1uiQDMy20XngdFggaGyj6Vy?usp=drive_link">Google Drive</a>
    </p>
</div>

# Import modules and functions

In [1]:
import os
import pandas as pd
import re
from IPython.display import display, Markdown

from turingdb_examples.graph import (
    create_graph_from_df,
    build_create_command_from_networkx,
)
from turingdb_examples.llm import query_llm, natural_language_to_cypher
from turingdb_examples.utils import get_return_statements

# Check data files are available

In [2]:
example_name = "london_transport_TfL"
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    raise ValueError(f"{path_data} does not exists")

list_csv_files = sorted(os.listdir(path_data))
if not list_csv_files == [
    "TfL_london_transport_tube.csv",
    "TfL_london_transport_tube_sightseeing.csv",
    "TfL_london_transport_tube_stations.csv",
]:
    raise ValueError(
        f"At least one of the {len(list_csv_files)} csv files is not available in {path_data}"
    )

# Import and format data

In [3]:
df_link_stations = pd.read_csv(
    f"{path_data}/TfL_london_transport_tube.csv", decimal=","
)
df_link_stations = df_link_stations.apply(
    lambda x: x.str.strip() if x.dtype == "object" else x
)
df_link_stations

Unnamed: 0,Line,Order,From_Station,To_Station,Total,PM Peak,Evening,Late
0,Bakerloo,1,Elephant & Castle,Lambeth North,13.432,4.118,1.288,398.000
1,Bakerloo,2,Lambeth North,Waterloo,18.597,5.718,1.916,580.000
2,Bakerloo,3,Waterloo,Embankment,46.020,13.746,4.609,1.466
3,Bakerloo,4,Embankment,Charing Cross,55.475,17.155,5.959,1.940
4,Bakerloo,5,Charing Cross,Piccadilly Circus,63.929,20.112,7.712,3.035
...,...,...,...,...,...,...,...,...
859,Victoria,13,Pimlico,Vauxhall,94.853,37.711,24.553,10.574
860,Victoria,14,Vauxhall,Stockwell,66.561,27.117,18.342,7.992
861,Victoria,15,Stockwell,Brixton,41.178,15.972,11.553,5.361
862,Waterloo & City,1,Waterloo,Bank and Monument,5.554,5.197,1.465,386.000


In [4]:
df_stations = pd.read_csv(f"{path_data}/TfL_london_transport_tube_stations.csv")
df_stations = df_stations.drop(["Zone_original", "OS X", "OS Y"], axis=1)
df_stations = df_stations.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df_stations["Station"] = df_stations["Station"].str.replace(" and ", " & ")
df_stations

Unnamed: 0,Station,Latitude,Longitude,Zone,Postcode
0,Abbey Road,51.531952,0.003723,3,E15 3NB
1,Abbey Wood,51.490784,0.120272,4,SE2 9RH
2,Acton Central,51.508757,-0.263430,2,W3 6BH
3,Acton Main Line,51.516886,-0.267690,3,W3 9EH
4,Acton Town,51.503071,-0.280303,3,W3 8HN
...,...,...,...,...,...
648,Woodside Park,51.617868,-0.185426,4,N12 8SE
649,Woolwich,51.491578,0.071819,4,SE18 6EU
650,Woolwich Arsenal,51.489907,0.069194,4,SE18 6HX
651,Woolwich Dockyard,51.491108,0.054612,3,SE18 5JY


In [5]:
df_sightseeing = pd.read_csv(
    f"{path_data}/TfL_london_transport_tube_sightseeing.csv", sep=";", header=1
)
df_sightseeing = df_sightseeing.T.drop_duplicates().T.rename(columns={"Blurb": "Note"})
df_sightseeing = df_sightseeing.apply(
    lambda x: x.str.strip() if x.dtype == "object" else x
)
df_sightseeing

Unnamed: 0,Station,Note
0,Abbey Road,Explore the historic House Mill and the Three ...
1,Acton Town,Gunnersbury Park and Museum offer landscaped g...
2,Aldgate,See the historic Aldgate Pump and visit the ne...
3,Aldgate East,"Head to Brick Lane for famous curry houses, vi..."
4,All Saints,Chrisp Street Market and the Lansbury Estate s...
...,...,...
294,Wimbledon Park,Row on the lake at Wimbledon Park and watch te...
295,Wood Green,Take in the views from Alexandra Palace and sh...
296,Woodford,Explore Epping Forest and play golf at Woodfor...
297,Woodside Park,Walk the Dollis Valley Greenwalk and shop alon...


In [6]:
df_stations = pd.merge(df_stations, df_sightseeing, on="Station", how="left")
df_stations

Unnamed: 0,Station,Latitude,Longitude,Zone,Postcode,Note
0,Abbey Road,51.531952,0.003723,3,E15 3NB,Explore the historic House Mill and the Three ...
1,Abbey Wood,51.490784,0.120272,4,SE2 9RH,
2,Acton Central,51.508757,-0.263430,2,W3 6BH,
3,Acton Main Line,51.516886,-0.267690,3,W3 9EH,
4,Acton Town,51.503071,-0.280303,3,W3 8HN,Gunnersbury Park and Museum offer landscaped g...
...,...,...,...,...,...,...
648,Woodside Park,51.617868,-0.185426,4,N12 8SE,Walk the Dollis Valley Greenwalk and shop alon...
649,Woolwich,51.491578,0.071819,4,SE18 6EU,
650,Woolwich Arsenal,51.489907,0.069194,4,SE18 6HX,Explore the Royal Arsenal Riverside and take t...
651,Woolwich Dockyard,51.491108,0.054612,3,SE18 5JY,


In [7]:
df_link_stations

Unnamed: 0,Line,Order,From_Station,To_Station,Total,PM Peak,Evening,Late
0,Bakerloo,1,Elephant & Castle,Lambeth North,13.432,4.118,1.288,398.000
1,Bakerloo,2,Lambeth North,Waterloo,18.597,5.718,1.916,580.000
2,Bakerloo,3,Waterloo,Embankment,46.020,13.746,4.609,1.466
3,Bakerloo,4,Embankment,Charing Cross,55.475,17.155,5.959,1.940
4,Bakerloo,5,Charing Cross,Piccadilly Circus,63.929,20.112,7.712,3.035
...,...,...,...,...,...,...,...,...
859,Victoria,13,Pimlico,Vauxhall,94.853,37.711,24.553,10.574
860,Victoria,14,Vauxhall,Stockwell,66.561,27.117,18.342,7.992
861,Victoria,15,Stockwell,Brixton,41.178,15.972,11.553,5.361
862,Waterloo & City,1,Waterloo,Bank and Monument,5.554,5.197,1.465,386.000


# Create graph from dataframe

In [8]:
G = create_graph_from_df(
    df_link_stations,
    source_node_col={"id": "From_Station", "type": "Station"},
    target_node_col={"id": "To_Station", "type": "Station"},
    node_attributes_df=df_stations,
    node_attributes_key_col="Station",
    edge_col="Line",
)

In [9]:
# Show 10 first nodes
for node in list(G.nodes(data=True))[:10]:
    print(node)

('Elephant & Castle', {'displayName': 'Elephant & Castle', 'type': 'Station', 'Latitude': 51.4958492200292, 'Longitude': -0.100738714256282, 'Zone': 1, 'Postcode': 'SE1 6LW', 'Note': 'nan'})
('Lambeth North', {'displayName': 'Lambeth North', 'type': 'Station', 'Latitude': 51.4991297653886, 'Longitude': -0.111768260460206, 'Zone': 1, 'Postcode': 'SE1 7XG', 'Note': 'Visit the Imperial War Museum and browse the Saturday market on Lower Marsh.'})
('Waterloo', {'displayName': 'Waterloo', 'type': 'Station', 'Latitude': 51.5031464567835, 'Longitude': -0.113259251258196, 'Zone': 1, 'Postcode': 'SE1 7NY', 'Note': 'Ride the London Eye, stroll along the South Bank and see a play at the National Theatre.'})
('Embankment', {'displayName': 'Embankment', 'type': 'Station', 'Latitude': 51.5073122969468, 'Longitude': -0.12236733566212, 'Zone': 1, 'Postcode': 'WC2N 6NS', 'Note': 'Relax in Victoria Embankment Gardens and see Cleopatra’s Needle along the Thames.'})
('Charing Cross', {'displayName': 'Chari

In [10]:
# Show 10 first edges
for edge in list(G.edges(data=True))[:10]:
    print(edge)

('Elephant & Castle', 'Lambeth North', {'Line': 'Bakerloo'})
('Elephant & Castle', 'Borough', {'Line': 'Northern'})
('Elephant & Castle', 'Kennington', {'Line': 'Northern'})
('Lambeth North', 'Waterloo', {'Line': 'Bakerloo'})
('Lambeth North', 'Elephant & Castle', {'Line': 'Bakerloo'})
('Waterloo', 'Embankment', {'Line': 'Northern'})
('Waterloo', 'Lambeth North', {'Line': 'Bakerloo'})
('Waterloo', 'Westminster', {'Line': 'Jubilee'})
('Waterloo', 'Southwark', {'Line': 'Jubilee'})
('Waterloo', 'Kennington', {'Line': 'Northern'})


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h4>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h4>
</div>

In [11]:
from turingdb import TuringDB

# Create TuringDB client
client = TuringDB(
    host="http://localhost:6666"  # Remove this parameter and set the two parameters below
    # instance_id="...",  # Replace by your instance id
    # auth_token="...",  # Replace by your API token
)

In [12]:
# Get list of available graphs
list_graphs = client.query("LIST GRAPH").loc[:, 0].tolist()

In [13]:
# Set graph name
graph_name_prefix = example_name
graph_name_nb_suffix = str(
    max(
        [
            int(re.sub(graph_name_prefix, "", g))
            for g in list_graphs
            if g.startswith(graph_name_prefix)
            and re.sub(graph_name_prefix, "", g).isdigit()
        ]
        + [0]
    )
    + 1
)
graph_name = graph_name_prefix + graph_name_nb_suffix
graph_name = re.sub("-", "_", graph_name)
graph_name

'london_transport_TfL1'

In [14]:
%%time

# Create a new graph
client.query(f"CREATE GRAPH {graph_name}")
client.set_graph(graph_name)

# Create a new change on the graph
change = client.query("CHANGE NEW").loc[0, 0]

# Checkout into the change
client.checkout(change=change)

CPU times: user 6.2 ms, sys: 58 μs, total: 6.26 ms
Wall time: 21.8 ms


In [15]:
# Build CREATE command from networkx object
create_command = build_create_command_from_networkx(G)
print(f"Cypher CREATE command :\n\n{100 * '*'}\n{create_command}\n{100 * '*'}")

Cypher CREATE command :

****************************************************************************************************
CREATE (n0:Station {"id":"Elephant & Castle", "displayName":"Elephant & Castle", "type":"Station", "Latitude":"51.4958492200292", "Longitude":"-0.100738714256282", "Zone":"1", "Postcode":"SE1 6LW", "Note":"nan"}),
(n1:Station {"id":"Lambeth North", "displayName":"Lambeth North", "type":"Station", "Latitude":"51.4991297653886", "Longitude":"-0.111768260460206", "Zone":"1", "Postcode":"SE1 7XG", "Note":"Visit the Imperial War Museum and browse the Saturday market on Lower Marsh."}),
(n2:Station {"id":"Waterloo", "displayName":"Waterloo", "type":"Station", "Latitude":"51.5031464567835", "Longitude":"-0.113259251258196", "Zone":"1", "Postcode":"SE1 7NY", "Note":"Ride the London Eye, stroll along the South Bank and see a play at the National Theatre."}),
(n3:Station {"id":"Embankment", "displayName":"Embankment", "type":"Station", "Latitude":"51.5073122969468", "Long

In [16]:
%%time

# Run CREATE command
client.query(create_command)

# Commit the change
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main
client.checkout()

CPU times: user 6.62 ms, sys: 1.09 ms, total: 7.71 ms
Wall time: 145 ms


<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about 📮 Metaqueries, please check TuringDB documentation on this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [17]:
%%time

# CALL LABELS () - returns a column of all the different node labels
command = """
CALL LABELS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_type"]
    display(df)

Unnamed: 0,Node_type_ID,Node_type
0,0,Station


CPU times: user 8.65 ms, sys: 47 μs, total: 8.7 ms
Wall time: 7.22 ms


In [18]:
%%time

# CALL EDGETYPES() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL EDGETYPES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Edge_type_ID", "Edge_type"]
    display(df)

Unnamed: 0,Edge_type_ID,Edge_type
0,0,CONNECTED


CPU times: user 6.78 ms, sys: 1.04 ms, total: 7.82 ms
Wall time: 6.6 ms


## Simple queries

In [19]:
%%time

# Match all edges and return them - all links between two stations
command = """
MATCH (n)-[e]-(m)
RETURN n.displayName, e.Line, m.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,n.displayName,e.Line,m.displayName
0,Canonbury,East London,Dalston Junction
1,Canonbury,East London,Highbury & Islington
2,Dalston Junction,East London,Haggerston
3,Dalston Junction,East London,Canonbury
4,Haggerston,East London,Hoxton
...,...,...,...
766,Cannon Street,Circle,Bank
767,Mansion House,Circle,Blackfriars
768,Mansion House,Circle,Cannon Street
769,Blackfriars,Circle,Temple


CPU times: user 11.5 ms, sys: 1.93 ms, total: 13.4 ms
Wall time: 11.8 ms


In [20]:
%%time

# Check if there exists a path between Hoxton Station and South Kensington in 1 hop
command = """
MATCH (start{displayName: "Hoxton Station"})--(end{displayName:"South Kensington"})
RETURN start,end
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

No result found
CPU times: user 2.68 ms, sys: 0 ns, total: 2.68 ms
Wall time: 2.22 ms


In [21]:
%%time

# Check if there exists a path between Blackfriars and Mansion House in 1 hop
command = """
MATCH (start{displayName: "Blackfriars"})--(end{displayName:"Mansion House"})
RETURN start, start.displayName, end, end.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,start,start.displayName,end,end.displayName
0,334,Blackfriars,333,Mansion House


CPU times: user 9.37 ms, sys: 70 μs, total: 9.44 ms
Wall time: 8.06 ms


In [22]:
%%time

# Check if there exists a path between Victoria and South Kensington in 2 hops
command = """
MATCH (start{displayName: "Victoria"})--(s1)--(end{displayName:"South Kensington"})
RETURN start, start.displayName, s1, s1.displayName, end, end.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,start,start.displayName,s1,s1.displayName,end,end.displayName
0,296,Victoria,297,Sloane Square,298,South Kensington


CPU times: user 9.18 ms, sys: 1.01 ms, total: 10.2 ms
Wall time: 8.91 ms


In [23]:
%%time

# Check if there exists a path between Victoria and South Kensington in 1 hop specifically using District Line
command = """
MATCH (start{displayName: "Victoria"})-[:line{displayName:"District"}]-(end{displayName:"South Kensington"})
RETURN start, end
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

No result found
CPU times: user 3.56 ms, sys: 81 μs, total: 3.64 ms
Wall time: 2.35 ms


In [24]:
%%time

# Check if there exists a path between Notting Hill Gate and High Street Kensington in 1 hop specifically using Circle Line
command = """
MATCH (start{displayName: "Notting Hill Gate"})-[e:CONNECTED{Line:"Circle"}]-(end{displayName:"High Street Kensington"})
RETURN start, start.displayName, e, e.Line, end, end.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,start,start.displayName,e,e.Line,end,end.displayName
0,232,Notting Hill Gate,538,Circle,302,High Street Kensington


CPU times: user 7.1 ms, sys: 3.87 ms, total: 11 ms
Wall time: 9.4 ms


# Python functions to construct Cypher queries

In [25]:
# Define build_query_hops_with_edge function
def build_query_hops_with_edge(
    start_station: str, end_station: str, hopCount: int
) -> tuple[str, list[str]]:
    """
    Build query string and corresponding column names.

    Returns:
        tuple: (query_string, column_names)
    """
    query = "MATCH "
    query += '(start:Station{displayName:"'
    query += start_station
    query += '"})'

    for k in range(1, hopCount):
        query += f"-[e{k}:CONNECTED]-(s{k}:Station)"

    query += (
        "-[e"
        + str(hopCount)
        + ':CONNECTED]-(end:Station{displayName="'
        + end_station
        + '"})'
    )

    query += " RETURN start, start.displayName, start.Note, "
    for k in range(1, hopCount):
        query += f"e{k}.Line, s{k}, s{k}.displayName, s{k}.Note, "
    query += f"e{hopCount}.Line, end, end.displayName, end.Note"

    return query

In [26]:
start_station = "Paddington"
end_station = "Blackfriars"

distmax = 20

In [27]:
%%time

# Loop over distance (number of hops) until we find a path between requested start and end stations
for k in range(1, distmax + 1):
    print(100 * "*")
    print(f"*** k = {k}\n")

    # We get a query and column names
    query_str = build_query_hops_with_edge(start_station, end_station, k)
    print(f"query_str : {query_str}")
    df_path = client.query(query_str)

    if not df_path.empty:
        df_path.columns = get_return_statements(query_str)
        display(df_path)
        break

print(100 * "*")

****************************************************************************************************
*** k = 1

query_str : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, end, end.displayName, end.Note
****************************************************************************************************
*** k = 2

query_str : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, end, end.displayName, end.Note
****************************************************************************************************
*** k = 3

query_str : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(s2:Station)-[e3:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.disp

Unnamed: 0,start,start.displayName,start.Note,e1.Line,s1,s1.displayName,s1.Note,e2.Line,s2,s2.displayName,...,s6.displayName,s6.Note,e7.Line,s7,s7.displayName,s7.Note,e8.Line,end,end.displayName,end.Note
0,218,Paddington,Stroll along the canals at Paddington Basin an...,Circle,300,Edgware Road (C),,Circle,221,Baker Street,...,Embankment,Relax in Victoria Embankment Gardens and see C...,Circle,293,Temple,Discover the historic Temple Church and stroll...,Circle,334,Blackfriars,Cross the Blackfriars Bridge to the Tate Moder...


****************************************************************************************************
CPU times: user 31.3 ms, sys: 3.97 ms, total: 35.3 ms
Wall time: 35.2 ms


# Process returned dataframe

In [28]:
# Define create_journey_string function
def create_journey_string(row):
    """Create journey string from DataFrame row - safer version"""
    # Start with initial station
    result = f"({row['start.displayName']})"

    # Find all available segment numbers
    line_cols = [
        col for col in row.index if col.startswith("e") and col.endswith(".Line")
    ]
    segment_numbers = sorted([int(col.split(".")[0][1:]) for col in line_cols])

    # Process each available segment
    for i in segment_numbers:
        line_col = f"e{i}.Line"
        station_col = f"s{i}.displayName"

        # Only process if both columns exist and have values
        if (
            line_col in row.index
            and station_col in row.index
            and not pd.isna(row[line_col])
            and not pd.isna(row[station_col])
        ):
            line = row[line_col]
            station = row[station_col]
            result += f"--[{line}]-->({station})"

    # Add end station if it exists
    if "end.displayName" in row.index and not pd.isna(row["end.displayName"]):
        # Check if there's a final line segment to the end
        max_segment = max(segment_numbers) if segment_numbers else 0
        final_line_col = f"e{max_segment}.Line"
        if final_line_col in row.index and not pd.isna(row[final_line_col]):
            result += f"--[{row[final_line_col]}]-->({row['end.displayName']})"

    return result

In [29]:
# Show journey strings (multiple paths of the same length can exist between two stations)
for i in range(len(df_path)):
    journey_path = create_journey_string(
        df_path.filter(regex="displayName$|Line$", axis=1).iloc[0]
    )
    print(f"Path n°{i}: {journey_path}")

Path n°0: (Paddington)--[Circle]-->(Edgware Road (C))--[Circle]-->(Baker Street)--[Jubilee]-->(Bond Street)--[Jubilee]-->(Green Park)--[Jubilee]-->(Westminster)--[Circle]-->(Embankment)--[Circle]-->(Temple)--[Circle]-->(Blackfriars)


# Create subgraph to visualise

In [30]:
def get_subgraph(G, df):
    subset_nodes = df.filter(regex="displayName$", axis=1).iloc[0].values.tolist()
    subG = G.subgraph(subset_nodes).copy()

    return subG

In [31]:
# Get subgraph with stations from matched path
subG = get_subgraph(G, df_path)
print(subG)

# Build CREATE command from subgraph
create_command_subG = build_create_command_from_networkx(subG)
print(f"Cypher CREATE command :\n\n{100 * '*'}\n{create_command_subG}\n{100 * '*'}")

DiGraph with 9 nodes and 16 edges
Cypher CREATE command :

****************************************************************************************************
CREATE (n0:Station {"id":"Temple", "displayName":"Temple", "type":"Station", "Latitude":"51.5110404236047", "Longitude":"-0.113725749531254", "Zone":"1", "Postcode":"WC2R 2PH", "Note":"Discover the historic Temple Church and stroll through Victoria Embankment Gardens."}),
(n1:Station {"id":"Paddington", "displayName":"Paddington", "type":"Station", "Latitude":"51.5153935002265", "Longitude":"-0.175736750265772", "Zone":"1", "Postcode":"W2 1RH", "Note":"Stroll along the canals at Paddington Basin and say hello to the Paddington Bear statue."}),
(n2:Station {"id":"Westminster", "displayName":"Westminster", "type":"Station", "Latitude":"51.5014013891933", "Longitude":"-0.12500207834678", "Zone":"1", "Postcode":"SW1A 2JR", "Note":"Marvel at the Houses of Parliament and Big Ben and visit Westminster Abbey."}),
(n3:Station {"id":"Bake

In [32]:
subgraph_name = f"{graph_name}_subgraph"
subgraph_name

'london_transport_TfL1_subgraph'

In [33]:
%%time

# Create new graph
client.query(f"CREATE GRAPH {subgraph_name}")
client.set_graph(subgraph_name)

# Create a new change on the graph
change = client.query("CHANGE NEW").loc[0, 0]

# Checkout into the change
client.checkout(change=change)

# Run CREATE command
client.query(create_command_subG)

# Commit the change
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main
client.checkout()

CPU times: user 7.39 ms, sys: 4.09 ms, total: 11.5 ms
Wall time: 112 ms


<div class="alert alert-block alert-info">
    <h2>
        You can visualise the subgraph directly in the notebook below. For more details on nodes and edges, you can go to TuringDB visualizer (running on your instance)
    </h2>
</div>

<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

In [34]:
from pyvis.network import Network

net = Network(
    height="750px",
    width="100%",
    notebook=True,
    bgcolor="#ffffff",
    font_color="#000000",
    directed=True,
)

# Line colors (official TfL colors)
line_colors = {
    "Circle": "#FFD329",
    "Jubilee": "#A1A5A7",
    "Central": "#DC241F",
    "District": "#00782A",
    "Metropolitan": "#9B0058",
}

for node, data in subG.nodes(data=True):
    net.add_node(
        node,
        label=data.get("displayName", str(node)),
        title=f"{data.get('displayName', '')}<br>{data.get('Note', '')}",
        color="#3498db",
        size=25,
    )

for source, target, data in subG.edges(data=True):
    line = data.get("Line", "")
    color = line_colors.get(line, "#95a5a6")
    net.add_edge(source, target, title=line, label=line, color=color, width=3)

net.toggle_physics(True)
net.show(f"{example_name}_subgraph.html")

london_transport_TfL_subgraph.html


# Use LLM to generate Cypher query

Before running this section, create a `.env` file in the project root with your API keys:

```env
ANTHROPIC_API_KEY=your_key_here
OPENAI_API_KEY=your_key_here
MISTRAL_API_KEY=your_key_here

In [35]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [36]:
api_keys = {
    "Anthropic": os.getenv("ANTHROPIC_API_KEY"),
    "Mistral": os.getenv("MISTRAL_API_KEY"),
    "OpenAI": os.getenv("OPENAI_API_KEY"),
}

In [37]:
%%time

system_prompt = """
Only base your response on the data provided.
Do not add knowledge but describe it in a natural way as if you were a touristic guide, be friendly !
You can add forms but not content.
Format the output with nice Markdown format and emojis.
"""

prompt = f"""
Here is the path I got from stations {start_station} to {end_station} : {df_path.to_dict()}.
Describe the path between these two stations.
Give me an itinerary of things I could visit on the way.
"""

provider = "Anthropic"

result = query_llm(
    prompt=prompt,
    system_prompt=system_prompt,
    provider=provider,
    api_key=api_keys[provider],
    temperature=0.2,
)

CPU times: user 206 ms, sys: 29.2 ms, total: 236 ms
Wall time: 7.76 s


In [38]:
display(Markdown(result))

# 🚇 Paddington to Blackfriars: A London Underground Adventure! 🇬🇧

## 🛤️ Your Journey Highlights:

1. **Start at Paddington** 🐻
   - Kick off your trip by saying hello to the Paddington Bear statue
   - Enjoy a lovely stroll along Paddington Basin

2. **Edgware Road (Circle Line)** 
   - Quick transit station

3. **Baker Street** 🕵️
   - Sherlock Holmes fans, rejoice! Visit the famous 221B Baker Street
   - Pop into Madame Tussauds wax museum

4. **Bond Street** 💎
   - Luxury shopping paradise!
   - Browse high-end boutiques
   - Explore Oxford Street's fantastic stores

5. **Green Park** 🌳
   - Beautiful royal park
   - Perfect for a quick walk
   - Close to Buckingham Palace

6. **Westminster** 🏛️
   - Iconic London landmarks
   - See the Houses of Parliament
   - Admire Big Ben
   - Visit Westminster Abbey

7. **Embankment** 🌊
   - Relax in Victoria Embankment Gardens
   - See Cleopatra's Needle
   - Enjoy Thames River views

8. **Temple** ⚖️
   - Explore historic Temple Church
   - More garden walks

9. **Final Stop: Blackfriars** 🎨
   - Cross Blackfriars Bridge
   - Visit Tate Modern
   - Explore riverside walkways

### 🚉 Lines Used:
- Circle Line
- Jubilee Line

**Total Journey: A fantastic London exploration!** 🇬🇧✨

# Use LLM to generate Cypher query

In [39]:
"""Build system prompt with TuringDB schema and examples"""

turingdb_cypher_system_prompt = """
You are an expert at converting natural language questions into TuringDB queries.

Your task is to generate syntactically correct TuringDB queries based on natural language input.

VERY IMPORTANT - TuringDB Syntax Guidelines:
1. Return ONLY the TuringDB query, no explanations or markdown formatting
2. Use MATCH or CREATE operations only
3. Nodes: (n:Label{property="value"}) or (n:Label{property:value})
4. Edges: Use UNDIRECTED syntax with - (NOT ->)
5. Pattern matching: MATCH (n)-[e]-(m)
6. Property matching: Use = or : operators for exact matching
7. String approximation: Use ~= for approximate string matching
8. Node ID injection: Use @ operator or AT keyword: (n @ 1) or (n AT 1)
9. Multiple constraints: (n:Person,Engineer{name="John", age=30})
10. Return all matched entities: RETURN n, e, m or use RETURN * for all

VERY IMPORTANT - FORBIDDEN in TuringDB:
- Do NOT use directed edges (-> or <-)
- Do NOT use AS aliases
- Do NOT use LIMIT, SKIP clauses
- Do NOT use WHERE clauses
- Do NOT use WITH clauses
- Do NOT use CALL (except for metaqueries)
- Do NOT use toLower() or other functions

Supported TuringDB Operations:
- MATCH queries: MATCH (n:Label)-[e:Type]-(m) RETURN n, m
- CREATE queries: CREATE (n:Label{property="value"})-[e:Type]-(m:Label)
- Metaqueries: CALL PROPERTIES(), CALL LABELS(), CALL EDGETYPES(), CALL LABELSETS()
- Property types: String ("text" or `text`), Boolean (true/false), Integer (20), Unsigned (20u), Double (20.5)

Examples for few-shot learning:
- Find all persons: MATCH (n:Person) RETURN n
- Find connections: MATCH (n:Person)-[e]-(m:Person) RETURN n, e, m
- Create person: CREATE (n:Person{name="John", age=30})
- String approximation: MATCH (n{name~="John"}) RETURN n
- Node by ID: MATCH (n @ 1) RETURN n
- Multiple IDs: MATCH (n:Person @ 1, 2, 3) RETURN n
- Path with 1 hop between Station Paddington and Blackfriars:  MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, end, end.displayName, end.Note
- Path with 2 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, end, end.displayName, end.Note
- Path with 8 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(s2:Station)-[e3:CONNECTED]-(s3:Station)-[e4:CONNECTED]-(s4:Station)-[e5:CONNECTED]-(s5:Station)-[e6:CONNECTED]-(s6:Station)-[e7:CONNECTED]-(s7:Station)-[e8:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, s2, s2.displayName, s2.Note, e3.Line, s3, s3.displayName, s3.Note, e4.Line, s4, s4.displayName, s4.Note, e5.Line, s5, s5.displayName, s5.Note, e6.Line, s6, s6.displayName, s6.Note, e7.Line, s7, s7.displayName, s7.Note, e8.Line, end, end.displayName, end.Note
- Find all Chinese providers and what they supply: MATCH (n{provider_country:"CHN"}) RETURN n, n.provider_name, n.displayName, n.share_provided, n.type
- Find all deposition tools and their types: MATCH (specific)-[e:IS_TYPE_OF]-(general:Tool_Resource{displayName:"Deposition tools"}) RETURN specific, specific.displayName, specific.provider_name, e, general, general.displayName
"""

In [40]:
%%time

# Define maximum number of hops between two
distmax = 20

# Get subset of CREATE command to avoid exceeding context window
create_command_subset = "\n".join(
    create_command.split("\n")[:5] + create_command.split("\n")[-5:]
)

for k in range(1, distmax + 1):
    print(100 * "*")
    print(f"*** k = {k}\n")

    #########################################################################################################
    question = f"""
    Give me the Cypher query to find the path using London Tube between stations {start_station} and {end_station}.
    The path between the two stations has to contain {k} hops.
    Very important :
    - Make sure you respect the number of hops and stations required between the two stations.
    - Pay attention to use the correct node and edge properties name in the MATCH section.
    - Pay attention to use the correct node and edge properties name in the RETURN section.
    - If no contrary information is asked by the user :
        - return all the matched nodes and edges and their properties in the RETURN section.
    """

    system_prompt = f"""
    TuringDB Cypher prompt :
    {turingdb_cypher_system_prompt}
    
    Here is a subset of the CREATE command used to create the graph, this way you know graph structure.
    Only a subset is passed because the whole command is to long :
    {create_command_subset}
    
    Here is also the output of "CALL LABELS ()" command, showing the different node types of the graph :
    {client.query("CALL LABELS ()")}
    
    Here is also the output of "CALL EDGETYPES ()" command, showing the different edge types of the graph :
    {client.query("CALL EDGETYPES ()")}
    
    Very important :
    - You MUST follow current TuringDB Syntax Guidelines
    - You MUST NOT USE what is FORBIDDEN in TuringDB
    - By default, RETURN ALL THE MATCHED NODES AND EDGES AND THEIR PROPERTIES in the RETURN section (except contrary demand from user)
    - Use the correct node and edge properties name in the MATCH section.
    - Use the correct node and edge properties name in the RETURN section.
    - Pay attention to which properties come from nodes or edges, to create a functioning query
    - Pay attention to lower and uppercases in properties
    - If some properties contain spaces, be careful to wrap them
    
    Give me the query FOLLOWING TURINGDB GUIDELINES AND NOT USING WHAT IS FORBIDDEN for this specific question :
    """

    cypher_query = natural_language_to_cypher(
        question=question,
        system_prompt=system_prompt,
        provider=provider,
        api_key=api_keys[provider],
    )
    print(f"cypher_query : {cypher_query}")

    #########################################################################################################

    ## Build column names to exactly match the RETURN clause
    # column_names = ['start', 'start.displayName', 'start.Note']
    #
    # for k_ in range(1, k):
    #    column_names.extend([
    #        f'e{k_}.Line',
    #        f's{k_}',
    #        f's{k_}.displayName',
    #        f's{k_}.Note'
    #    ])
    #
    # column_names.extend([
    #    f'e{k}.Line',
    #    'end',
    #    'end.displayName',
    #    'end.Note'
    # ])
    cypher_query

    #########################################################################################################

    df_path = client.query(cypher_query)
    if df_path.empty:
        print("--> No result found\n")
    else:
        df_path.columns = get_return_statements(query_str)
        display(df_path)
        break

print(100 * "*")

****************************************************************************************************
*** k = 1

cypher_query : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(end:Station{displayName:"Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, end, end.displayName, end.Note
--> No result found

****************************************************************************************************
*** k = 2

cypher_query : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(end:Station{displayName:"Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, end, end.displayName, end.Note
--> No result found

****************************************************************************************************
*** k = 3

cypher_query : MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(s2:Station)-[e3:CONNECTED]-(end:Station{d

Unnamed: 0,start,start.displayName,start.Note,e1.Line,s1,s1.displayName,s1.Note,e2.Line,s2,s2.displayName,...,s6.displayName,s6.Note,e7.Line,s7,s7.displayName,s7.Note,e8.Line,end,end.displayName,end.Note
0,1,Paddington,Stroll along the canals at Paddington Basin an...,Circle,7,Edgware Road (C),,Circle,3,Baker Street,...,Embankment,Relax in Victoria Embankment Gardens and see C...,Circle,0,Temple,Discover the historic Temple Church and stroll...,Circle,8,Blackfriars,Cross the Blackfriars Bridge to the Tate Moder...


****************************************************************************************************
CPU times: user 437 ms, sys: 25.9 ms, total: 463 ms
Wall time: 30.1 s


In [41]:
# Show journey strings (multiple paths of the same length can exist between two stations)
for i in range(len(df_path)):
    journey_path = create_journey_string(
        df_path.filter(regex="displayName$|Line$", axis=1).iloc[0]
    )
    print(f"Path n°{i}: {journey_path}")

Path n°0: (Paddington)--[Circle]-->(Edgware Road (C))--[Circle]-->(Baker Street)--[Jubilee]-->(Bond Street)--[Jubilee]-->(Green Park)--[Jubilee]-->(Westminster)--[Circle]-->(Embankment)--[Circle]-->(Temple)--[Circle]-->(Blackfriars)


In [42]:
print("Notebook finished !")

Notebook finished !
