In [2]:
# Check that every road referenced in the bridges dataset also exists in the roads dataset

import pandas as pd


def check_bridges_invalid_roads(roads_df, bridges_df):
    """
    Select the bridges whose road does not appear in roads_df.

    Parameters:
    - roads_df: DataFrame with a "road" column (the reference set of roads)
    - bridges_df: DataFrame with a "road" column (one row per bridge)

    Returns:
    - pd.DataFrame holding the rows of bridges_df whose "road" value is
      absent from roads_df["road"]; columns and index labels are kept
      as they are in bridges_df. bridges_df itself is not modified.
    """
    # Collapse the reference roads to a set of distinct values for the lookup.
    known_roads = set(roads_df["road"])
    # True where the bridge's road is present in the reference set.
    on_known_road = bridges_df["road"].isin(known_roads)
    # Keep only the rows that failed the membership test.
    return bridges_df.loc[~on_known_road]




In [3]:
# Load the two inputs that the cells below compare on their "road" column:
# the roads table (CSV) and the bridges table (Excel).
df_roads = pd.read_csv("../data/raw/Roads_InfoAboutEachLRP.csv")
df_bridges = pd.read_excel("../data/processed/BMMS_cleaned_monotony.xlsx")

In [5]:
# Despite its name, `mask` is not a boolean mask: check_bridges_invalid_roads
# returns the rows of df_bridges whose "road" is absent from df_roads.
mask = check_bridges_invalid_roads(df_roads, df_bridges)
# .count() tallies non-null "road" values only, so a returned row with a
# missing road would not be included in this number.
count_of_invalid_bridges = mask["road"].count()
# Distinct "road" values among those rows.
list_of_invalid_roads = mask["road"].unique()
print(count_of_invalid_bridges, "bridges are in roads that don't exist on the roads dataset")
print("These roads don't exist on the roads dataset:\n", list_of_invalid_roads)


551 bridges are in roads that don't exist on the roads dataset
These roads don't exist on the roads dataset:
 ['R505' 'R680' 'R750' 'R856' 'Z1006' 'Z1090' 'Z1211' 'Z1463' 'Z1503'
 'Z1613' 'Z1632' 'Z1705' 'Z2022' 'Z2033' 'Z2063' 'Z3614' 'Z5071' 'Z5073'
 'Z5208' 'Z5458' 'Z5459' 'Z5478' 'Z6801' 'Z6814' 'Z6815' 'Z7048' 'Z7049'
 'Z8711' 'Z8948']


In [6]:
# Drop bridges whose road is not in the roads dataset


def drop_invalid_bridges(roads_df, bridges_df):
    """
    Build a copy of bridges_df without the bridges on unknown roads.

    A bridge is dropped when its "road" value does not occur in
    roads_df["road"]. The input frames are left untouched; the result is a
    new, independent DataFrame.

    Parameters:
    - roads_df: DataFrame with a "road" column listing the valid roads
    - bridges_df: DataFrame with a "road" column, the bridges to filter

    Returns:
    - pd.DataFrame: the remaining rows of bridges_df, as a copy
    """
    # Rows of bridges_df that reference a road missing from roads_df.
    rejected_rows = check_bridges_invalid_roads(roads_df, bridges_df)
    roads_to_drop = rejected_rows["road"].unique()

    # Keep every bridge whose road is not among the rejected road values.
    keep = ~bridges_df["road"].isin(roads_to_drop)
    return bridges_df[keep].copy()



In [7]:
# Filter the bridges, then re-run the same check on the filtered frame as a
# sanity test: the list printed below is expected to be empty.
cleaned_bridges = drop_invalid_bridges(df_roads, df_bridges)
# As in the earlier cell, this holds DataFrame rows rather than a boolean mask.
mask2 = check_bridges_invalid_roads(df_roads, cleaned_bridges)
list_of_invalid_roads2 = mask2["road"].unique()
print("These bridges are on roads that don't exist on the roads dataset:\n", list_of_invalid_roads2)

These bridges are on roads that don't exist on the roads dataset:
 []


In [8]:
# Persist the filtered bridges for later processing steps.
# NOTE(review): to_excel writes the DataFrame index as an extra leading column
# by default, and after filtering that index has gaps. Pass index=False if
# downstream readers do not expect that column — confirm before changing.
cleaned_bridges.to_excel("../data/processed/BMMS_cleaned_bridges.xlsx")
