# Get a Trip Sample

In [8]:
import geopandas as gpd
import multiprocessing as mp
import numpy as np
import os
import osmnx as ox
import pandas as pd
from statsmodels.discrete.count_model import ZeroInflatedPoisson
from statsmodels.tools.tools import add_constant

from scipy._lib._util import _lazywhere
import statsmodels.distributions.discrete
statsmodels.distributions.discrete._lazywhere = _lazywhere

from config import rpath
os.chdir(rpath)

In [3]:
path = "data/Berlin_2017/data/trips"
files = os.listdir(path)
colnames = ["TripID","DeviceID","ProviderID","Mode","StartDate","StartWday","EndDate","EndWDay","StartLocLat",
            "StartLocLon", "EndLocLat","EndLocLon","IsStartHome","IsEndHome", "GeospatialType","ProviderType",
            "ProviderDrivingProfile","VehicleWeightClass","ProbeSourceType","OriginZoneName","DestinationZoneName",
            "MultipleZones","MultipleCorridors","EndpointType","TripMeanSpeedKph","TripMaxSpeedKph","TripDistanceMeters",
            "MovementType","OriginCensusBlockGroup","DestinationCensucBlockGroup"]
keepcols = ["TripID","DeviceID","StartDate","EndDate","StartLocLat","StartLocLon", "EndLocLat","EndLocLon", "ProviderType",
            "ProviderDrivingProfile","VehicleWeightClass","TripMeanSpeedKph","TripMaxSpeedKph","TripDistanceMeters",
            "MovementType"]

In [6]:
for i, file in enumerate(files):
    if i== 0:
        df = pd.read_csv(os.path.join(path,file), names = colnames)
        df = df[keepcols]
    else:
        df = df.append(pd.read_csv(os.path.join(path,file), names=colnames)[keepcols])
    print("Read file", i+1, "of", len(files))

Read file 1 of 14
Read file 2 of 14
Read file 3 of 14
Read file 4 of 14
Read file 5 of 14
Read file 6 of 14
Read file 7 of 14
Read file 8 of 14
Read file 9 of 14
Read file 10 of 14
Read file 11 of 14
Read file 12 of 14
Read file 13 of 14
Read file 14 of 14


In [7]:
N = len(set(df.TripID.tolist()))
n_sample = int(np.round(N*0.8729,0))
print("Drawing a sample of size", n_sample, "out of", N, "total trips.")
trip_sample = df["TripID"].sample(n=n_sample, random_state=1337).tolist()
np.save("tripsample_5-3.npy", np.array(trip_sample))

Drawing a sample of size 29861135 out of 34209113 total trips.


## Calculate new Rv
Everything taken from the original RV notebook

In [2]:
# Get Waypoint Files
path = "data/Berlin_2017/data/waypoints"
files = []
for file in os.listdir(path):
    if file.endswith("edges.csv"):
        files.append(os.path.join(path,file))
files.sort()
keepcols = ['TripID', 'WaypointSequence', 'CaptureDate','RawSpeed', 'edge_1', 'edge_2', 'edge_dist']

tripsample = np.load("tripsample_5-3.npy")

In [3]:
def read_filter_index(file):
    '''This function reads a waypoint file, gets the CET timestamp and filters the data frame 
       to only include files from the peak hours and those which could be sensibly matched with 
       a road from the street network. It then adds an "edge_id" and a "time_id" to the data frame,
       which are unique edge identifiers and 15-minute time interval identifiers within the
       defined time windows, respectively.
    '''
    # Read file
    gdf = pd.read_csv(file)[keepcols] # note to self: dtypes are being read correctly
    len_gdf = len(gdf)
    gdf = gdf[gdf.TripID.isin(tripsample)] # keep only those which are in tripsample
    print(len(gdf)-len_gdf, "waypoints discarded.")
    
    # Convert CaptureDate to TimeStamp and convert to CET
    gdf["Timestamp_CET"] = pd.to_datetime(gdf["CaptureDate"], format='%Y-%m-%dT%H:%M:%S') 
    gdf["Timestamp_CET"] = pd.DatetimeIndex(gdf.Timestamp_CET).tz_convert('Europe/Berlin').to_series().tolist() 
    gdf = gdf.drop(columns="CaptureDate")
    
    # Get the edge distances to eventually create a histogram or a quantile overview
    edge_dist_list = gdf.edge_dist
    
    # Filter dataframe to only include peak hours during weekdays
    gdf = gdf[gdf.Timestamp_CET.dt.weekday < 5] # 5 = Saturday, 6 = Sunday
    gdf = gdf[((gdf.Timestamp_CET.dt.hour >= 6) & (gdf.Timestamp_CET.dt.hour < 10)) |
              ((gdf.Timestamp_CET.dt.hour >= 14) & (gdf.Timestamp_CET.dt.hour < 20))]
    
    
    # Drop observations that couldn't be matched to a street segment very well
    gdf = gdf[gdf["edge_dist"]<=50] 
    gdf = gdf.drop(columns="edge_dist")
    
    # Create new unique edge ID column
    gdf["edge_id"] = gdf[["edge_1","edge_2"]].astype(str).agg('-'.join, axis=1)
    
    # Add time identifier
    # Create time index for 15 minute blocks
    conditions = [
        ((gdf.Timestamp_CET.dt.hour == 6) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 6) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 6) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 6) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 7) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 7) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 7) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 7) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 8) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 8) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 8) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 8) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 9) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 9) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 9) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 9) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 14) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 14) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 14) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 14) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 15) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 15) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 15) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 15) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 16) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 16) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 16) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 16) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 17) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 17) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 17) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 17) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 18) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 18) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 18) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 18) & (gdf.Timestamp_CET.dt.minute >= 45)),
        ((gdf.Timestamp_CET.dt.hour == 19) & (gdf.Timestamp_CET.dt.minute < 15)),
        ((gdf.Timestamp_CET.dt.hour == 19) & (gdf.Timestamp_CET.dt.minute >= 15) & (gdf.Timestamp_CET.dt.minute < 30)),
        ((gdf.Timestamp_CET.dt.hour == 19) & (gdf.Timestamp_CET.dt.minute >= 30) & (gdf.Timestamp_CET.dt.minute < 45)),
        ((gdf.Timestamp_CET.dt.hour == 19) & (gdf.Timestamp_CET.dt.minute >= 45)),
    ]
    values = list(range(len(conditions))) # Corresponding condition values
    gdf['time_id'] = np.select(conditions, values) 
    
    return (gdf, edge_dist_list)

def get_Rv(file, verbose=True, use_files=True):
    '''Reads a file, applies filtering (see read_filter_index), saves the file, and
       computes Rv data frame for each segment and time id. Additionally returns a 
       list of edge_distances.
    '''
    pathfiles = [path + "/" + f for f in os.listdir(path)]
    
    if use_files:
        if file[:-4] + "_small_2020-sample.csv" in pathfiles:
            print("Reading old file...")
            gdf = pd.read_csv(file[:-4] + "_small_2020-sample.csv")
            edge_dist_list = []
        else:
            print("Reading new file...")
            gdf, edge_dist_list = read_filter_index(file)
    else:
        gdf, edge_dist_list = read_filter_index(file)
        
    if verbose:
        print("A file was processed")
    
    # Save
    if not file[:-4] + "_small_2020-sample.csv" in os.listdir(path):
        outfile = file[:-4] + "_small_2020-sample.csv"
        gdf.to_csv(outfile, index=False)
        
    # Drop unnecessary columns
    gdf = gdf.drop(columns=["edge_1", "edge_2"])
    
    # Return Rv df
    return gdf[['edge_id', 'time_id', 'RawSpeed']].groupby(['edge_id', 'time_id']).agg(['count', 'sum']), edge_dist_list

def get_and_process_Rv(files, verbose=True, cpus=6):
    ''' Gets Rv for every file, processes it to get a clean Rv table for every edge and quarter of an hour.
        Prints out quantiles of edge distances and returns clean Rv data frame.
    '''
    # Start the multiprocessing pool and get Rv
    pool = mp.Pool(processes=min(len(files), cpus))
    out = pool.map(get_Rv, files)
    pool.close()
    
    if verbose:
        print("Pool closed")

    Rv_list = [x[0] for x in out]
    edge_dist_list_list = [x[1] for x in out]
    
    del out # for space
    
    if verbose:
        print("Begin concatenating")
        
    Rv = pd.concat(Rv_list).groupby(['edge_id', 'time_id']).agg(['sum'])
    
    del Rv_list # for space
    
    # Remove entries that have n = 0 (only nan Speeds)
    Rv = Rv[Rv.xs(('RawSpeed','count', 'sum'), axis=1) != 0]
    
    # Remove obs with n < median n (20) on selected edge_id
    Rv = Rv[Rv.xs(('RawSpeed','count', 'sum'), axis=1) >= np.median(Rv.xs(("RawSpeed", "count", "sum"), axis=1))]
    print("The median count of observations on an edge is", np.median(Rv.xs(("RawSpeed", "count", "sum"), axis=1)))
    # Add meanSpeed column
    Rv.loc[:,"meanSpeed"] = Rv.xs(('RawSpeed','sum', 'sum'), axis=1)/Rv.xs(('RawSpeed','count', 'sum'), axis=1)
    
    if verbose:
        print("Begin merging lists")
        
    edge_dists = [item for sublist in edge_dist_list_list for item in sublist]
    edge_dists = np.array(edge_dists)
    print("# all waypoints: " + str(len(edge_dists)))
    print("# waypoints with dist >50: " + str(len(edge_dists[edge_dists<=50])))
    
    return Rv

In [4]:
Rv = get_and_process_Rv(files, cpus=5)

Reading old file...Reading old file...Reading old file...Reading new file...Reading old file...




A file was processed
A file was processed
A file was processed
A file was processed
Reading old file...
Reading old file...
Reading old file...
Reading new file...
A file was processed
A file was processed
A file was processed
Reading old file...
Reading new file...
Reading old file...
A file was processed
A file was processed
-1054574 waypoints discarded.
Reading new file...
Reading old file...
-1059038 waypoints discarded.
A file was processed
Reading new file...
-1064497 waypoints discarded.
-1083189 waypoints discarded.
-1051615 waypoints discarded.
A file was processed
A file was processed
A file was processed
A file was processed
Reading old file...
Reading old file...
A file was processed
A file was processed
A file was processed
Reading old file...
Reading old file...
A file was processed
A file was processed
Reading old file...
A file was processed
Reading old file...
Reading ol

In [5]:
Rv.to_csv("data/Rv-2020-sample.csv")

# Calculate new RSI
Everything taken from the original RSI notebook

In [26]:
Rv = pd.read_csv("data/Rv-2020-sample.csv", names = ['edge_id', 'time_id', 'n', 'speed', 'meanspeed'], skiprows=4)

In [27]:
# Create edge data frame to get speeds from the OSM graph
edges = pd.DataFrame({'edge_id':Rv.edge_id.unique()})
edges[["edge_1", "edge_2"]] = edges["edge_id"].str.split("-", expand=True)
edges["edge_1"] = edges["edge_1"].astype(int)
edges["edge_2"] = edges["edge_2"].astype(int)

# Get the graphs
G = ox.io.load_graphml("data/graphs/berlin-2851.graphml") # 
H = ox.simplification.simplify_graph(ox.io.load_graphml("data/graphs/Berlin-raw.graphml"))

In [28]:
nomaxspeeds = []
maxSpeeds = []

for i in range(len(edges)): # For every edge
    if (i+1) % 1000 == 0:
        print("Processing", i+1, "of", len(edges), "edges")
    
    # Check conditions that max speed is available in G
    maxspeedyes = 'maxspeed' in G[edges.iloc[i,1]][edges.iloc[i,2]][0]
    speed = None
    
    if maxspeedyes:
        speed = G[edges.iloc[i,1]][edges.iloc[i,2]][0]['maxspeed']
        if speed == "DE:urban":
            speed = 50

    # In case G does not have max speed info, look it up in newer graph
    if speed is None: # If there's no info in geoff's belin graph
        osmids = G[edges.iloc[i,1]][edges.iloc[i,2]][0]['osmid'] # get the individual OSM way IDs
        if type(osmids) == int: # If osmids is not a list but a single integer
            osmids = [osmids] # make it a list so we can loop through
        for oid in osmids: # for every way ID
            speed = None
            for u,v in H.edges(): # loop through all the edges in the unsimplified berlin graph
                if H[u][v][0]["osmid"] == oid: # if we find the way ID
                    if "maxspeed" in H[u][v][0]: # If we find maxspeed in graph
                        print("Found max speed in original graph")
                        speed = H[u][v][0]["maxspeed"]
                        break
                    elif "maxspeed:type" in H[u][v][0]: # If we find maxspeed type in graph
                        speed = H[u][v][0]["maxspeed:type"]
                        print("Found max speed type in original graph:", speed)
                        if speed == "DE:urban":
                            speed = 50
                            break
                        elif speed == "DE:zone30":
                            speed = 30
                            break
                        else:
                            print("ALARMALARM", speed)
                            raise Exception("TEST")
                            
    if speed is None: # If there's still no speed info
        print("Neither maxspeed nor maxspeed type info available for osmid", oid)
        nomaxspeeds.append(oid) 
        maxSpeeds.append(np.nan)
        continue        
            
    if type(speed) == list: # if there is more than one edge speed
        osmids = G[edges.iloc[i,1]][edges.iloc[i,2]][0]['osmid'] # get the individual OSM way IDs
        segmentlengths = []
        speeds = []
        if 'none' in speed: # if 'none' is in the speed list
                idx = speed.index('none') # find the index of the none entry
                speed.pop(idx) # pop the none entry
                speed = [float(x) for x in speed] # convert to float
        for oid in osmids: # for every way ID
            for u,v in H.edges(): # loop through all the edges in the unsimplified berlin graph
                try:
                    if H[u][v][0]["osmid"] == oid: # if we find the way ID
                        if "length" not in H[u][v][0] or "maxspeed" not in H[u][v][0]:
                            continue
                        else:
                            segmentlengths.append(H[u][v][0]["length"]) # get the segment length
                            speeds.append(H[u][v][0]["maxspeed"]) # and segment max speed
                except Exception as e:
                    print("An unknown exception incurred.")
                    print(i)
                    raise e
        if len(segmentlengths) == 0 or len(speeds) == 0: # if no way ID can be found
            speed = max(speed) # take the max speed of the existing speed list
        else:
            speed = (pd.DataFrame({'length':segmentlengths,'speed':speeds}) # Data frame with all segment lengths and speeds
                 .groupby("speed") # group by speeds
                 .agg("sum") # sum over segment lengths
                 .sort_values("length",ascending=False) # sort so that the longest segment is first
                 .index[0]) # get the index (speed) of the first entry (longest segment)
    maxSpeeds.append(speed)

Neither maxspeed nor maxspeed type info available for osmid 25368898
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 5116453
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 29112676
Neither maxspeed nor maxspeed type info available for osmid 167410411
Neither maxspeed nor maxspeed type info available for osmid 26406925
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 11405545
Neither maxspeed nor maxspeed type info available for osmid 11405545
Neither maxspeed nor maxspeed type info available for osmid 11405545
Neither maxspeed nor maxspeed type info available for osmid 11405545
Neither maxspeed nor maxspeed type info available for osmid 27156341
Neither maxspeed nor maxspeed type info available for osmid 8792775
Neither maxspeed nor maxspeed type info available for osmid 15920080
Neither maxspeed nor maxspeed type info a

Neither maxspeed nor maxspeed type info available for osmid 128978595
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 154946524
Neither maxspeed nor maxspeed type info available for osmid 154946524
Neither maxspeed nor maxspeed type info available for osmid 154946523
Neither maxspeed nor maxspeed type info available for osmid 131577614
Neither maxspeed nor maxspeed type info available for osmid 131577614
Neither maxspeed nor maxspeed type info available for osmid 19064220
Neither maxspeed nor maxspeed type info available for osmid 19064220
Neither maxspeed nor maxspeed type info available for osmid 82121192
Neither maxspeed nor maxspeed type info available for osmid 82121192
Neither maxspeed nor maxspeed type info available for osmid 14674294
Neither maxspeed nor maxspeed type info available for osmid 211562073
Neither maxspeed nor maxspeed type info available for osmid 14674289
Neither maxspeed nor maxspeed type info available for osmid 14

Neither maxspeed nor maxspeed type info available for osmid 149125751
Neither maxspeed nor maxspeed type info available for osmid 16109891
Neither maxspeed nor maxspeed type info available for osmid 149488400
Neither maxspeed nor maxspeed type info available for osmid 149488401
Neither maxspeed nor maxspeed type info available for osmid 149488399
Neither maxspeed nor maxspeed type info available for osmid 149611534
Neither maxspeed nor maxspeed type info available for osmid 149611534
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 150732372
Neither maxspeed nor maxspeed type info available for osmid 4662762
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 40193019
Neither maxspeed nor maxspeed type info available for osmid 151621

Neither maxspeed nor maxspeed type info available for osmid 172183870
Neither maxspeed nor maxspeed type info available for osmid 172183867
Neither maxspeed nor maxspeed type info available for osmid 172183871
Neither maxspeed nor maxspeed type info available for osmid 172183869
Neither maxspeed nor maxspeed type info available for osmid 23809468
Neither maxspeed nor maxspeed type info available for osmid 192487146
Neither maxspeed nor maxspeed type info available for osmid 25939421
Neither maxspeed nor maxspeed type info available for osmid 26807992
Neither maxspeed nor maxspeed type info available for osmid 176037629
Neither maxspeed nor maxspeed type info available for osmid 172699421
Neither maxspeed nor maxspeed type info available for osmid 175298126
Neither maxspeed nor maxspeed type info available for osmid 43963437
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspee

Neither maxspeed nor maxspeed type info available for osmid 166974626
Found max speed type in original graph: DE:urban
Processing 9000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 156066238
Neither maxspeed nor maxspeed type info available for osmid 708515798
Neither maxspeed nor maxspeed type info available for osmid 149611534
Neither maxspeed nor maxspeed type info available for osmid 228635045
Neither maxspeed nor maxspeed type info available for osmid 228635045
Neither maxspeed nor maxspeed type info available for osmid 228635046
Neither maxspeed nor maxspeed type info available for osmid 148326359
Neither maxspeed nor maxspeed type info available for osmid 135403248
Neither maxspeed nor maxspeed type info available for osmid 135403248
Neither maxspeed nor maxspeed type info available for osmid 229475732
Neither maxspeed nor maxspeed type info available for osmid 135403248
Neither maxspeed nor maxspeed type info available for osmid 229475750
Neither ma

Neither maxspeed nor maxspeed type info available for osmid 24036305
Neither maxspeed nor maxspeed type info available for osmid 23382150
Neither maxspeed nor maxspeed type info available for osmid 24036305
Neither maxspeed nor maxspeed type info available for osmid 23382155
Neither maxspeed nor maxspeed type info available for osmid 23389632
Neither maxspeed nor maxspeed type info available for osmid 23389632
Neither maxspeed nor maxspeed type info available for osmid 23389943
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 23409834
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 23409831
Neither maxspeed nor maxspeed type info available for osmid 23409834
Neither maxspeed nor maxspeed type info available for osmid 23409831
Neither maxspeed nor maxspeed type info available for osmid 23410053
Neither maxspeed nor maxspeed type info available for osmid 23410059
Neith

Neither maxspeed nor maxspeed type info available for osmid 24204432
Neither maxspeed nor maxspeed type info available for osmid 24204432
Neither maxspeed nor maxspeed type info available for osmid 24204437
Neither maxspeed nor maxspeed type info available for osmid 24204439
Neither maxspeed nor maxspeed type info available for osmid 24204443
Processing 12000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 24204623
Neither maxspeed nor maxspeed type info available for osmid 24204678
Neither maxspeed nor maxspeed type info available for osmid 31521784
Neither maxspeed nor maxspeed type info available for osmid 31521784
Neither maxspeed nor maxspeed type info available for osmid 31521784
Neither maxspeed nor maxspeed type info available for osmid 151465254
Neither maxspeed nor maxspeed type info available for osmid 448030438
Neither maxspeed nor maxspeed type info available for osmid 151465254
Neither maxspeed nor maxspeed type info available for osmid 15146525

Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 489521492
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 489521492
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 342159185
Neither 

Neither maxspeed nor maxspeed type info available for osmid 828836631
Neither maxspeed nor maxspeed type info available for osmid 828836631
Neither maxspeed nor maxspeed type info available for osmid 14369669
Neither maxspeed nor maxspeed type info available for osmid 14369553
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 24695140
Neither maxspeed nor maxspeed type info available for osmid 24695140
Neither maxspeed nor maxspeed type info available for osmid 191377893
Neither maxspeed nor maxspeed type info available for osmid 160120060
Neither maxspeed nor maxspeed type info available for osmid 100405511
Neither maxspeed nor maxspeed type info available for osmid 101290100
Neither maxspeed nor maxspeed type info available for osmid 101290100
Neither maxspeed nor maxspeed type info available for osmid 814845183
Neither maxspeed nor maxspeed type info available for osmid 4663104
Neither maxspeed nor maxspee

Neither maxspeed nor maxspeed type info available for osmid 69126512
Neither maxspeed nor maxspeed type info available for osmid 454285423
Neither maxspeed nor maxspeed type info available for osmid 151457956
Processing 21000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 25199006
Neither maxspeed nor maxspeed type info available for osmid 645343613
Neither maxspeed nor maxspeed type info available for osmid 4484149
Neither maxspeed nor maxspeed type info available for osmid 28012464
Neither maxspeed nor maxspeed type info available for osmid 207412830
Neither maxspeed nor maxspeed type info available for osmid 22913804
Neither maxspeed nor maxspeed type info available for osmid 111531489
Neither maxspeed nor maxspeed type info available for osmid 529250472
Neither maxspeed nor maxspeed type info available for osmid 4493120
Processing 22000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 8008906
Neither maxspeed nor maxspeed type i

Neither maxspeed nor maxspeed type info available for osmid 166974626
Neither maxspeed nor maxspeed type info available for osmid 166973638
Neither maxspeed nor maxspeed type info available for osmid 166973638
Neither maxspeed nor maxspeed type info available for osmid 25367771
Neither maxspeed nor maxspeed type info available for osmid 41350279
Neither maxspeed nor maxspeed type info available for osmid 41350236
Neither maxspeed nor maxspeed type info available for osmid 41350236
Neither maxspeed nor maxspeed type info available for osmid 41350236
Neither maxspeed nor maxspeed type info available for osmid 41350307
Neither maxspeed nor maxspeed type info available for osmid 590230774
Neither maxspeed nor maxspeed type info available for osmid 24517027
Neither maxspeed nor maxspeed type info available for osmid 154901587
Neither maxspeed nor maxspeed type info available for osmid 178427798
Neither maxspeed nor maxspeed type info available for osmid 178427798
Neither maxspeed nor maxspe

Found max speed in original graph
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 391407797
Neither maxspeed nor maxspeed type info available for osmid 4696063
Neither maxspeed nor maxspeed type info available for osmid 26269149
Neither maxspeed nor maxspeed type info available for osmid 26269149
Neither maxspeed nor maxspeed type info available for osmid 4797951
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 433933813
Neither maxspeed nor maxspeed type info available for osmid 232663848
Neither maxspeed nor maxspeed type info available for osmid 25258502
Neither maxspeed nor maxspeed type info available for osmid 30705210
Neither maxspeed nor maxspeed type info available for osmid 4566442
Neither maxspeed nor maxspeed type info available for osmid 364209183
Neither ma

Neither maxspeed nor maxspeed type info available for osmid 24591982
Neither maxspeed nor maxspeed type info available for osmid 26938217
Neither maxspeed nor maxspeed type info available for osmid 42997865
Neither maxspeed nor maxspeed type info available for osmid 26180781
Neither maxspeed nor maxspeed type info available for osmid 30958679
Neither maxspeed nor maxspeed type info available for osmid 26180781
Processing 28000 of 46828 edges
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 529250469
Neither maxspeed nor maxspeed type info available for osmid 529250471
Neither maxspeed nor maxspeed type info available for osmid 156346105
Neither maxspeed nor maxspeed type info available for osmid 57386690
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 37116781
Neither maxspeed nor maxspeed type info available for osmid 11025429
Neith

Neither maxspeed nor maxspeed type info available for osmid 198585468
Neither maxspeed nor maxspeed type info available for osmid 385402821
Neither maxspeed nor maxspeed type info available for osmid 470350077
Neither maxspeed nor maxspeed type info available for osmid 385402821
Neither maxspeed nor maxspeed type info available for osmid 31024362
Neither maxspeed nor maxspeed type info available for osmid 27168761
Neither maxspeed nor maxspeed type info available for osmid 45273284
Found max speed in original graph
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 47419493
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 26244468
Neither maxspeed nor maxspeed type info available for osmid 391407781
Neither maxspeed nor maxspeed type info available for osmid 616670793
Neither maxspeed nor maxspeed type info available for 

Neither maxspeed nor maxspeed type info available for osmid 142063347
Neither maxspeed nor maxspeed type info available for osmid 142062866
Neither maxspeed nor maxspeed type info available for osmid 142062866
Neither maxspeed nor maxspeed type info available for osmid 142062901
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 22057215
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 302574975
Neither maxspeed nor maxspeed type info available for osmid 26406925
Neither maxspeed nor maxspeed type info available for osmid 624116223
Neither maxspeed nor maxspeed type info available for osmid 302634911
Neither maxspeed nor maxspeed type info available for osmid 425498708
Neither maxspeed nor maxspeed type info available for osmid 4788443
Neither maxspeed nor maxspeed type info available for osmid 4788443
Neither maxspeed nor maxspeed type info available for osmid 3209

Neither maxspeed nor maxspeed type info available for osmid 444656564
Neither maxspeed nor maxspeed type info available for osmid 444656564
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 25383671
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 148809347
Neither maxspeed nor maxspeed type info available for osmid 148809347
Neither maxspeed nor maxspeed type info available for osmid 148809347
Neither maxspeed nor maxspeed type info available for osmid 28541642
Neither maxspeed nor maxspeed type info available for osmid 156346105
Neither maxspeed nor maxspeed type info available for osmid 156346041
Neither maxspeed nor maxspeed type info available for osmid 156346064
Neither maxspeed nor maxspeed type info available for osmid 175084450
Neither maxspeed nor maxspeed type i

Neither maxspeed nor maxspeed type info available for osmid 24028197
Neither maxspeed nor maxspeed type info available for osmid 263787931
Neither maxspeed nor maxspeed type info available for osmid 35576841
Neither maxspeed nor maxspeed type info available for osmid 35576841
Neither maxspeed nor maxspeed type info available for osmid 35576841
Neither maxspeed nor maxspeed type info available for osmid 35576841
Neither maxspeed nor maxspeed type info available for osmid 9933012
Neither maxspeed nor maxspeed type info available for osmid 263787931
Neither maxspeed nor maxspeed type info available for osmid 35576841
Neither maxspeed nor maxspeed type info available for osmid 263787931
Neither maxspeed nor maxspeed type info available for osmid 37291757
Neither maxspeed nor maxspeed type info available for osmid 23436566
Neither maxspeed nor maxspeed type info available for osmid 23436566
Neither maxspeed nor maxspeed type info available for osmid 37291757
Neither maxspeed nor maxspeed ty

Neither maxspeed nor maxspeed type info available for osmid 4603529
Neither maxspeed nor maxspeed type info available for osmid 5096537
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 138017549
Neither maxspeed nor maxspeed type info available for osmid 24069852
Neither maxspeed nor maxspeed type info available for osmid 176501670
Found max speed in original graph
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 39070578
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 38717602
Found max speed in original graph
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type inf

Neither maxspeed nor maxspeed type info available for osmid 36969682
Neither maxspeed nor maxspeed type info available for osmid 25113727
Neither maxspeed nor maxspeed type info available for osmid 23436561
Neither maxspeed nor maxspeed type info available for osmid 836498506
Neither maxspeed nor maxspeed type info available for osmid 431571678
Neither maxspeed nor maxspeed type info available for osmid 562166783
Neither maxspeed nor maxspeed type info available for osmid 23006207
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 37229268
Neither maxspeed nor maxspeed type info available for osmid 25799452
Neither maxspeed nor maxspeed type info available for osmid 530416910
Neither maxspeed nor maxspeed type info available for osmid 645183212
Neither maxspeed nor maxspeed type info available for osmid 645183213
Neither maxspeed nor maxspeed type info available for osmid 645183212
Neither maxspeed nor maxspeed type info availab

Neither maxspeed nor maxspeed type info available for osmid 6139005
Neither maxspeed nor maxspeed type info available for osmid 6139041
Neither maxspeed nor maxspeed type info available for osmid 22823712
Neither maxspeed nor maxspeed type info available for osmid 22823712
Neither maxspeed nor maxspeed type info available for osmid 22823712
Neither maxspeed nor maxspeed type info available for osmid 6139947
Neither maxspeed nor maxspeed type info available for osmid 6139947
Neither maxspeed nor maxspeed type info available for osmid 6139947
Neither maxspeed nor maxspeed type info available for osmid 6139947
Neither maxspeed nor maxspeed type info available for osmid 6139947
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 486418992
Processing 41000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 464171230
Found max speed type in original graph: DE:urban
Found max speed in original 

Neither maxspeed nor maxspeed type info available for osmid 671122316
Neither maxspeed nor maxspeed type info available for osmid 618924838
Neither maxspeed nor maxspeed type info available for osmid 45904914
Neither maxspeed nor maxspeed type info available for osmid 45904914
Neither maxspeed nor maxspeed type info available for osmid 619455600
Neither maxspeed nor maxspeed type info available for osmid 662900858
Neither maxspeed nor maxspeed type info available for osmid 619455592
Neither maxspeed nor maxspeed type info available for osmid 31428146
Neither maxspeed nor maxspeed type info available for osmid 46115661
Neither maxspeed nor maxspeed type info available for osmid 623949510
Neither maxspeed nor maxspeed type info available for osmid 623949510
Neither maxspeed nor maxspeed type info available for osmid 623949512
Neither maxspeed nor maxspeed type info available for osmid 623949513
Neither maxspeed nor maxspeed type info available for osmid 624437776
Neither maxspeed nor max

Neither maxspeed nor maxspeed type info available for osmid 4592581
Neither maxspeed nor maxspeed type info available for osmid 51807954
Neither maxspeed nor maxspeed type info available for osmid 51807954
Neither maxspeed nor maxspeed type info available for osmid 703963597
Neither maxspeed nor maxspeed type info available for osmid 703963598
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 9030910
Neither maxspeed nor maxspeed type info available for osmid 9031117
Neither maxspeed nor maxspeed type info available for osmid 105679491
Neither maxspeed nor maxspeed type info available for osmid 9031117
Neither maxspeed nor maxspeed type info available for osmid 9031306
Neither maxspeed nor maxspeed type info available for osmid 9031306
Processing 44000 of 46828 edges
Neither maxspeed nor maxspeed type info available for osmid 45249568
Neither maxspeed nor maxspeed type info available for osmid 9031633
Neither maxspeed nor maxspeed type info a

Found max speed in original graph
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 60827398
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 60827398
Neither maxspeed nor maxspeed type info available for osmid 694526969
Neither maxspeed nor maxspeed type info available for osmid 694526969
Neither maxspeed nor maxspeed type info available for osmid 694526969
Neither maxspeed nor maxspeed type info available for osmid 694526969
Neither maxspeed nor maxspeed type info available for osmid 60917396
Neither maxspeed nor maxspeed type info available for osmid 694526969
Found max speed type in original graph: DE:urban
Neither maxspeed nor maxspeed type info available for osmid 29560616
Neither maxspeed nor maxspeed type info available for osmid 746642315
Found max speed in original graph
Found max speed in original graph
Neither maxspeed nor maxspeed type info available for osmid 847039379


Neither maxspeed nor maxspeed type info available for osmid 733106599
Found max speed in original graph


In [30]:
# Convert maxSpeeds to numeric (includes some string artifacts)
print(np.unique(maxSpeeds))
edges["maxSpeed"] = pd.to_numeric(maxSpeeds, errors='coerce')
print("Road Segments without max speed:",
      round((edges.maxSpeed.isna().sum()/len(edges))*100,2), "%")
print("Number of considered edge segments:", len(edges))
edges.head()

['10' '100' '120' '120.0' '15' '20' '30' '40' '5' '50' '60' '7' '70' '80'
 'nan' 'walk']
Road Segments without max speed: 4.97 %
Number of considered edge segments: 46828


Unnamed: 0,edge_id,edge_1,edge_2,maxSpeed
0,100027768-29690836,100027768,29690836,30.0
1,100032633-29796810,100032633,29796810,50.0
2,100032633-433557530,100032633,433557530,50.0
3,100069498-1370811846,100069498,1370811846,30.0
4,100069498-268224213,100069498,268224213,30.0


In [31]:
# Merge above df with Rv data and get actual Rv
Rv = Rv.merge(edges, on="edge_id")
Rv = Rv.dropna(subset=["maxSpeed"]) # Drop rows with NaN
Rv.loc[:,"Rv"] = (Rv["meanspeed"]/Rv["maxSpeed"])
Rv.loc[Rv.Rv > 1, "Rv"] = 1  # Avoid artifacts where Rv > 1 because of wrong road speed info

In [32]:
# Add road segment status for each time
conditions = [
    (Rv.Rv < .25),
    ((Rv.Rv > .25) & (Rv.Rv <= .50)),
    ((Rv.Rv > .50) & (Rv.Rv <= .75)),
    (Rv.Rv > .75)
]

values = ["Heavy Congestion", "Mild Congestion", "Smooth", "Very Smooth"]
Rv['status'] = np.select(conditions, values) 

In [33]:
Rv_status = Rv.merge((Rv[["edge_id", "time_id"]]
                      .groupby("edge_id")
                      .agg("count")), on="edge_id").rename(columns={"time_id_y": "T", "time_id_x": "time_id"})
Rv_status.head()

Unnamed: 0,edge_id,time_id,n,speed,meanspeed,edge_1,edge_2,maxSpeed,Rv,status,T
0,100027768-29690836,7,25,231.0,9.24,100027768,29690836,30.0,0.308,Mild Congestion,6
1,100027768-29690836,8,26,257.0,9.884615,100027768,29690836,30.0,0.329487,Mild Congestion,6
2,100027768-29690836,9,26,156.0,6.0,100027768,29690836,30.0,0.2,Heavy Congestion,6
3,100027768-29690836,10,19,376.0,19.789474,100027768,29690836,30.0,0.659649,Smooth,6
4,100027768-29690836,29,33,392.0,11.878788,100027768,29690836,30.0,0.39596,Mild Congestion,6


In [35]:
print(np.quantile(Rv_status.drop_duplicates(subset="edge_id")["T"], np.arange(0,1,.1)))

T_med = 0 # np.median(Rv_status.drop_duplicates(subset="edge_id")["T"])
print("Median:", T_med)

print("Number of observations with less than",
      T_med,
      "observation periods within the rush hour window:", 
      len(Rv_status[Rv_status["T"] < T_med].edge_id.unique()))
print("In percent: ", len(Rv_status[Rv_status["T"] < T_med].edge_id.unique())/len(edges))
      
Rv_status = Rv_status[Rv_status["T"] >= T_med] # drop T < T_med

[ 1.  2.  4. 11. 22. 34. 39. 40. 40. 40.]
Median: 0
Number of observations with less than 0 observation periods within the rush hour window: 0
In percent:  0.0


In [36]:
# Add R_NC
Rv_status = Rv_status.merge(
    (Rv_status[["edge_id", "status"]]
     .groupby('edge_id')
     .status
     .apply(lambda x: ((x == 'Smooth') | (x == "Very Smooth")).mean())),
    on = "edge_id"
).rename(columns={'status_x':'status', 'status_y':'R_NC'})
Rv_status.head()

Unnamed: 0,edge_id,time_id,n,speed,meanspeed,edge_1,edge_2,maxSpeed,Rv,status,T,R_NC
0,100027768-29690836,7,25,231.0,9.24,100027768,29690836,30.0,0.308,Mild Congestion,6,0.166667
1,100027768-29690836,8,26,257.0,9.884615,100027768,29690836,30.0,0.329487,Mild Congestion,6,0.166667
2,100027768-29690836,9,26,156.0,6.0,100027768,29690836,30.0,0.2,Heavy Congestion,6,0.166667
3,100027768-29690836,10,19,376.0,19.789474,100027768,29690836,30.0,0.659649,Smooth,6,0.166667
4,100027768-29690836,29,33,392.0,11.878788,100027768,29690836,30.0,0.39596,Mild Congestion,6,0.166667


In [37]:
# Calculate R_i
RSI = Rv_status[["edge_id", "R_NC", "Rv", "maxSpeed"]].groupby(["edge_id", "R_NC", "maxSpeed"]).agg("mean")
RSI = RSI.reset_index(level=['R_NC', "maxSpeed"])
RSI.loc[:,"RSI"] = RSI.R_NC*RSI.Rv
RSI = RSI.sort_values("RSI")
RSI = RSI.reset_index(level="edge_id")
RSI[["edge_1", "edge_2"]] = RSI.edge_id.str.split("-", expand=True)
RSI["edge_1"] = RSI.edge_1.astype(int)
RSI["edge_2"] = RSI.edge_2.astype(int)
RSI = RSI.set_index("edge_id")
print(RSI.head())
print("We have the RSI for", len(RSI), "segments.")
print("In percent, this is", len(RSI)/len(edges),".")

                      R_NC  maxSpeed        Rv  RSI      edge_1     edge_2
edge_id                                                                   
999309349-999309353    0.0      30.0  0.282238  0.0   999309349  999309353
28378389-28378390      0.0      30.0  0.178342  0.0    28378389   28378390
28378390-28385627      0.0      30.0  0.140404  0.0    28378390   28385627
1656195152-260985798   0.0      50.0  0.324747  0.0  1656195152  260985798
28378395-28394275      0.0      30.0  0.303509  0.0    28378395   28394275
We have the RSI for 44500 segments.
In percent, this is 0.9502861535833262 .


In [35]:
# Print out the "top 10" congested road segments
x = []
for i in range(10):
    e1, e2 = RSI.index[i].split("-")
    e1, e2 = int(e1), int(e2)
    x.append(G[e1][e2])
    print(G[e1][e2])

{0: {'osmid': 37633352, 'lanes': '2', 'name': 'Scharnweberstraße', 'highway': 'tertiary', 'maxspeed': '50', 'oneway': False, 'length': 177.692, 'geometry': <shapely.geometry.linestring.LineString object at 0x2aeee11b7e20>, 'grade': 0.006, 'grade_abs': 0.006}}
{0: {'osmid': 431571658, 'oneway': True, 'lanes': '3', 'name': 'Fritz-Erler-Allee', 'highway': 'tertiary', 'maxspeed': '50', 'length': 71.17699999999999, 'geometry': <shapely.geometry.linestring.LineString object at 0x2aeee5794d30>, 'grade': 0.0, 'grade_abs': 0.0}}
{0: {'osmid': 15815831, 'name': 'Schwechtenstraße', 'highway': 'unclassified', 'maxspeed': '50', 'oneway': False, 'length': 441.963, 'geometry': <shapely.geometry.linestring.LineString object at 0x2aeef2dc8a90>, 'grade': 0.0, 'grade_abs': 0.0}}
{0: {'osmid': 150257670, 'lanes': '3', 'ref': 'B 96', 'name': 'Oranienburger Chaussee', 'highway': 'primary', 'maxspeed': '50', 'oneway': False, 'length': 41.075, 'geometry': <shapely.geometry.linestring.LineString object at 0x2a

In [38]:
# And we save it
RSI.to_csv("data/RSI-2020-sample-NOT-FILTERED.csv")

# Accident Prediction

In [39]:
# First: Train model
edges_accidents = pd.read_csv("data/edges_buffered_all_accidents.csv")
edges_accidents = edges_accidents[edges_accidents.RSI <= 0.5]

# Read RV data with n data for edge ids and time ids
Rv = pd.read_csv("data/Rv.csv", names = ['edge_id', 'time_id', 'n', 'speed', 'meanspeed'], skiprows=4)

# Take the sum over all times to get total n per year on edge
Rv = Rv["n"].groupby(Rv["edge_id"]).sum().reset_index()
Rv.rename(columns={"n":"flow_n"}, inplace=True)

# Merge with dataframe
edges_accidents = edges_accidents.merge(Rv, on="edge_id")

# Create logged variables
edges_accidents["log_flow_n"] = np.log(edges_accidents["flow_n"])
edges_accidents["log_flow_n_2"] =  np.power(edges_accidents["log_flow_n"], 2)

upper_thresh = 20
predictors = ["log_flow_n", "maxSpeed"]
target = "n_accidents"

# Get data
all_data = edges_accidents[edges_accidents.n_accidents <= upper_thresh]
all_data = all_data.sample(frac=1) # shuffle data randomly

# Get the model
zip_results = ZeroInflatedPoisson(endog = all_data[target], 
                                           exog = add_constant(all_data[predictors]),
                                           exog_infl = add_constant(all_data[predictors]), 
                                           inflation = 'logit').fit(maxiter=100)

  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.878981
         Iterations: 34
         Function evaluations: 39
         Gradient evaluations: 39


In [40]:
RSI = pd.read_csv("data/RSI-2020-sample-NOT-FILTERED.csv", index_col=0)

In [41]:
# Retrieve nodes and edges as geodataframe
G = ox.io.load_graphml("data/graphs/berlin-2851.graphml")
H = ox.io.load_graphml("data/graphs/Berlin-raw.graphml")
nodes, edges = ox.graph_to_gdfs(G)

# MERGE TOGETHER THE G GDF WITH THE MORE INFORMATIVE H GDF AND ADD LANE INFO TO G.
H_edges = ox.graph_to_gdfs(H)[1][['osmid', 'maxspeed', 'surface', 'oneway', 'length',
       'lanes', 'maxspeed:type', 'ref', 'bridge', 'junction', 'width',
       'access', 'tunnel', 'service', 'area', 'est_width', 'geometry']]

edges = edges.join(H_edges, rsuffix="_y")

keepcols = ['osmid', 'name', 'highway', 'maxspeed', 'oneway', 'length', 'geometry',
       'grade', 'grade_abs', 'lanes', 'surface', 'maxspeed:type']
edges = edges[keepcols]

In [42]:
edges_buffered_all = gpd.GeoDataFrame(edges, geometry="geometry").set_crs(4326).to_crs(25833).reset_index()
edges_buffered_all['u'] = edges_buffered_all['u'].astype(str)
edges_buffered_all['v'] = edges_buffered_all['v'].astype(str)
edges_buffered_all["edge_id"] = edges_buffered_all[['u', 'v']].agg('-'.join, axis=1)
edges_buffered_all = edges_buffered_all.merge(RSI, on="edge_id")
edges_buffered_all = edges_buffered_all.dropna(subset=["RSI"]) # Drop all edges without RSI value

In [43]:
# Read RV data with n data for edge ids and time ids
Rv = pd.read_csv("data/Rv-2020-sample.csv", names = ['edge_id', 'time_id', 'n', 'speed', 'meanspeed'], skiprows=4)

# Take the sum over all times to get total n per year on edge
Rv = Rv["n"].groupby(Rv["edge_id"]).sum().reset_index()
Rv.rename(columns={"n":"flow_n"}, inplace=True)

# Merge with dataframe
edges_buffered_all = edges_buffered_all.merge(Rv, on="edge_id")

In [44]:
(edges_buffered_all[["edge_id", "maxSpeed", "RSI", "flow_n"]].
 to_csv("data/sample-to-predict-accidents-NOT-FILTERED.csv", index=False))

In [55]:
# Read in new data and filter to only include part of the data where RSI in 2017 is smaller than .5 as well
new = pd.read_csv("data/sample-to-predict-accidents-NOT-FILTERED.csv")
print(len(new)) # 22407
new = new[new.edge_id.isin(all_data.edge_id)].drop_duplicates(subset="edge_id")
print(len(new)) # 7725
new["log_flow_n"] = np.log(new.flow_n)

44735
8011


In [56]:
# Get the predictions
new_results = np.round(
    zip_results.predict(exog=add_constant(new[predictors]),
                        exog_infl=add_constant(new[predictors])),
    0)

  x = pd.concat(x[::order], 1)


In [59]:
print("N accidents in 2017:", all_data[target].sum())
print("N accidents in 2020:", int(np.sum(new_results)))
print("Difference:", int(all_data[target].sum()-np.sum(new_results)))

N accidents in 2017: 3833
N accidents in 2020: 3217
Difference: 616
