In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from pyproj import CRS
import pathlib
from pathlib import Path
from shapely import wkt
from tqdm import tqdm

import math
import codecs
import osm2geojson
from shapely import wkt

import gzip
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
import xml.etree.ElementTree as ET

import os

# set the working directory
BASE_DIR = Path.cwd()
# define the exported folder path
# Check if folder exists
folder_path = pathlib.Path(BASE_DIR.parent.joinpath("Exported_Files","census_tract","agg_network"))
folder_path.mkdir(parents=True, exist_ok=True)
# print(BASE_DIR)

## Fetch SF Champ network for 2010 and join the frames

In [2]:
# Network for YR 2010
dfSFRd2010am = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_AM.shp"))
dfSFRd2010am["peak"]="AM"
dfSFRd2010am["Tot_CAP"]=dfSFRd2010am["CAP"]*3

dfSFRd2010pm = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_PM.shp"))
dfSFRd2010pm["peak"]="PM"
dfSFRd2010pm["Tot_CAP"]=dfSFRd2010pm["CAP"]*3

dfSFRd2010ea = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_EA.shp"))
dfSFRd2010ea["peak"]="EA"
dfSFRd2010ea["Tot_CAP"]=dfSFRd2010ea["CAP"]*3

dfSFRd2010ev = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_EV.shp"))
dfSFRd2010ev["peak"]="EV"
dfSFRd2010ev["Tot_CAP"]=dfSFRd2010ev["CAP"]*8.5

dfSFRd2010md = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_MD.shp"))
dfSFRd2010md["peak"]="MD"
dfSFRd2010md["Tot_CAP"]=dfSFRd2010md["CAP"]*6.5

dfSFRdNtwrk2010 = pd.concat([dfSFRd2010am,dfSFRd2010pm,dfSFRd2010ea,dfSFRd2010ev,dfSFRd2010md])
# create empty columns: OOS, PUDO, Tot_Vol
dfSFRdNtwrk2010 = dfSFRdNtwrk2010.assign(OOS=0, PUDO=0, Tot_Vol=0, TNC_Tot_Vol=0)
dfSFRdNtwrk2010["A_B"] = dfSFRdNtwrk2010["A"].astype(str)  + "_" + dfSFRdNtwrk2010["B"].astype(str)
dfSFRdNtwrk2010["A"] = dfSFRdNtwrk2010["A"].astype(str)
dfSFRdNtwrk2010["B"] = dfSFRdNtwrk2010["B"].astype(str)
# get the columns which together formm Tot_Vol
add_2010 = ['V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'BUSVOL_AM','BUSVOL_PM','BUSVOL_EA','BUSVOL_MD','BUSVOL_EV','OOS']
# add them up
dfSFRdNtwrk2010["Tot_Vol"] = dfSFRdNtwrk2010[add_2010].sum(axis=1)

# Keep only FT types representing real road-network
# 1: Fwy-Fwy Connector; 2: Freeway; 3: Expressway; 4: Collector; 5: Ramp; 6: Centroid Connector;
# 7: Major Arterial; 8: ; 9: Alley (only for DTA); 10: ; 11: Local; 12: Minor Arterial; 13: Bike only;
# 14: ; 15: Super Arterial
dfSFRdNtwrk2010=dfSFRdNtwrk2010[dfSFRdNtwrk2010.FT.isin([1,2,3,4,5,7,9,10,11,12,13,15])]
# dfSFRdNtwrk2010=dfSFRdNtwrk2010[dfSFRdNtwrk2010.FT.isin([4,7,10,11,12,15])]

# convert back to geopandas dataframe
dfSFRdNtwrk2010 = gpd.GeoDataFrame(dfSFRdNtwrk2010, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2010=dfSFRdNtwrk2010.to_crs("EPSG:4326")

# # read the SF_Boundary file
dfSFBoundary = gpd.read_file(BASE_DIR.parent.joinpath("Data","SF_County","SFBay_Boundary.shp"))
dfSFBoundary=dfSFBoundary.to_crs("EPSG:4326")

dfSFRdNtwrk2010.reset_index(drop=True,inplace=True)
# # Overlay and select only links which are within the SF Bay Area Polygon
dfSFRdNtwrk2010 = gpd.clip(dfSFRdNtwrk2010, dfSFBoundary)

dfSFRdNtwrk2010.to_csv(BASE_DIR.parent.joinpath(folder_path,"Raw_SFRdNtwrk_2010_cores.csv"))

# aggregate the dataframe using A_B
wt_avg = lambda x: np.ma.average(x, weights = dfSFRdNtwrk2010.loc[x.index, "Tot_Vol"])

lst_col = ["SPEED","TIME","TIME_1","CSPD_1"]
# average the columns
avg_col = [ 'DISTANCE',
            'CAP', "FT","AT",
            'TIMESEED',
            'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM', 
            'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 
            'TOLLEA_DA', 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA',
            'TOLLEV_SR2', 'TOLLEV_SR3',"USE","Tot_CAP"]

def agg_func(df):    
    d = {}
    for col in df.select_dtypes(np.number).columns:
        if col in lst_col:
            d[col] = wt_avg
        elif col in avg_col:
            d[col]="mean"
        else:
            d[col] = "sum"
    for col in df.select_dtypes(object).columns:
        d[col] = "first"    
    d["geometry"] = "first"
    return d

dfSFRdNtwrk2010_agg = dfSFRdNtwrk2010.groupby(['A_B'],as_index=False).aggregate(agg_func(dfSFRdNtwrk2010.copy())).copy()
# above merge converts the geo-dataframe to pandas dataframe. So re-convert it into geodataframe
dfSFRdNtwrk2010_agg = gpd.GeoDataFrame(dfSFRdNtwrk2010_agg, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2010_agg=dfSFRdNtwrk2010_agg.to_crs("EPSG:4326")

# # Overlay and select only links which are within the SF Bay Area Polygon
dfSFRdNtwrk2010_agg = gpd.clip(dfSFRdNtwrk2010_agg, dfSFBoundary)

# export the geodataframe
dfSFRdNtwrk2010_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2010_agg.geojson"), driver='GeoJSON')
dfSFRdNtwrk2010_agg.to_csv(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2010_agg.csv"))

# reproject the geodataframe to EPSG:3857
dfSFRdNtwrk2010_agg = dfSFRdNtwrk2010_agg.to_crs("EPSG:3857")
dfSFRdNtwrk2010_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2010_agg_PCS.geojson"), driver='GeoJSON')

  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl


## Fetch SF Champ network for 2016 and join the frames

In [3]:
# read 2016 file
dfSFRd2016am = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_AM.shp"))
dfSFRd2016am["peak"]="AM"
dfSFRd2016am["Tot_CAP"]=dfSFRd2016am["CAP"]*3
dfSFRd2016pm = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_PM.shp"))
dfSFRd2016pm["peak"]="PM"
dfSFRd2016pm["Tot_CAP"]=dfSFRd2016pm["CAP"]*3
dfSFRd2016ea = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_EA.shp"))
dfSFRd2016ea["peak"]="EA"
dfSFRd2016ea["Tot_CAP"]=dfSFRd2016ea["CAP"]*3
dfSFRd2016ev = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_EV.shp"))
dfSFRd2016ev["peak"]="EV"
dfSFRd2016ev["Tot_CAP"]=dfSFRd2016ev["CAP"]*8.5
dfSFRd2016md = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_MD.shp"))
dfSFRd2016md["peak"]="MD"
dfSFRd2016md["Tot_CAP"]=dfSFRd2016md["CAP"]*6.5

dfSFRdNtwrk2016 = pd.concat([dfSFRd2016am,dfSFRd2016pm,dfSFRd2016ea,dfSFRd2016ev,dfSFRd2016md])

# create empty columns: OOS, PUDO, Tot_Vol
dfSFRdNtwrk2016 = dfSFRdNtwrk2016.assign(Tot_Vol=0)
dfSFRdNtwrk2016["A_B"] = dfSFRdNtwrk2016["A"].astype(str)  + "_" + dfSFRdNtwrk2016["B"].astype(str)
dfSFRdNtwrk2016["A"] = dfSFRdNtwrk2016["A"].astype(str)
dfSFRdNtwrk2016["B"] = dfSFRdNtwrk2016["B"].astype(str)
# get the columns which together formm Tot_Vol
add_2016 = ['V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'V13_1','V14_1','V15_1','V16_1','V17_1','V18_1','V19_1',
            'BUSVOL_AM','BUSVOL_PM','BUSVOL_EA','BUSVOL_MD','BUSVOL_EV','OOS']
# add them up
dfSFRdNtwrk2016["Tot_Vol"] = dfSFRdNtwrk2016[add_2016].sum(axis=1)

# add to TNC Vol
# TNC_2016 = ['V16_1','V17_1','V18_1','OOS'] # updated 26 Jan 2022 to reflect TNC volumes which were mis-represented
TNC_2016 = ['V13_1','OOS'] # V13_1 is TNC_Volumnes plying on the road segment
dfSFRdNtwrk2016["TNC_Tot_Vol"] = dfSFRdNtwrk2016[TNC_2016].sum(axis=1)

# Keep only FT types representing real road-network
dfSFRdNtwrk2016=dfSFRdNtwrk2016[dfSFRdNtwrk2016.FT.isin([1,2,3,4,5,7,9,10,11,12,13,15])]
# dfSFRdNtwrk2016=dfSFRdNtwrk2016[dfSFRdNtwrk2016.FT.isin([4,7,10,11,12,15])]
                                
# convert back to geopandas dataframe
dfSFRdNtwrk2016 = gpd.GeoDataFrame(dfSFRdNtwrk2016, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2016=dfSFRdNtwrk2016.to_crs("EPSG:4326")

# # read the SF_Boundary file
dfSFBoundary = gpd.read_file(BASE_DIR.parent.joinpath("Data","SF_County","SFBay_Boundary.shp"))
dfSFBoundary=dfSFBoundary.to_crs("EPSG:4326")

dfSFRdNtwrk2016.reset_index(drop=True,inplace=True)
# # Overlay and select only links which are within the SF Bay Area Polygon
dfSFRdNtwrk2016 = gpd.clip(dfSFRdNtwrk2016, dfSFBoundary)

dfSFRdNtwrk2016.to_csv(BASE_DIR.parent.joinpath(folder_path,"Raw_SFRdNtwrk_2016_cores.csv"))

# aggregate the dataframe using A_B
wt_avg = lambda x: np.ma.average(x, weights = dfSFRdNtwrk2016.loc[x.index, "Tot_Vol"])

lst_col = ["SPEED","TIME","TIME_1","CSPD_1"]
# average the columns
avg_col = [ 'DISTANCE',
            'CAP', "FT","AT",
            'TIMESEED',
            'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM', 
            'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 
            'TOLLEA_DA', 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA',
            'TOLLEV_SR2', 'TOLLEV_SR3',"USE","Tot_CAP"]

def agg_func(df):    
    d = {}
    for col in df.select_dtypes(np.number).columns:
        if col in lst_col:
            d[col] = wt_avg
        elif col in avg_col:
            d[col]="mean"
        else:
            d[col] = "sum"
    for col in df.select_dtypes(object).columns:
        d[col] = "first"    
    d["geometry"] = "first"
    return d

dfSFRdNtwrk2016_agg = dfSFRdNtwrk2016.groupby(['A_B'],as_index=False).aggregate(agg_func(dfSFRdNtwrk2016.copy())).copy()
# above merge converts the geo-dataframe to pandas dataframe. So re-convert it into geodataframe
dfSFRdNtwrk2016_agg = gpd.GeoDataFrame(dfSFRdNtwrk2016_agg, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2016_agg=dfSFRdNtwrk2016_agg.to_crs("EPSG:4326")

# # Overlay and select only links which are within the SF Bay Area Polygon
dfSFRdNtwrk2016_agg = gpd.clip(dfSFRdNtwrk2016_agg, dfSFBoundary)
# export the geodataframe
dfSFRdNtwrk2016_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg.geojson"), driver='GeoJSON')
dfSFRdNtwrk2016_agg.to_csv(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg.csv"))

# reproject the geodataframe to EPSG:3857
dfSFRdNtwrk2016_agg = dfSFRdNtwrk2016_agg.to_crs("EPSG:3857")
dfSFRdNtwrk2016_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg_PCS.geojson"), driver='GeoJSON')

  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl


In [4]:
# Also reproject the Census Tract file to EPSG:3857
dfSF_CensusTract = gpd.read_file(BASE_DIR.parent.joinpath("CensusTract","SF_CT_2010.geojson"))
dfSF_CensusTract = dfSF_CensusTract.to_crs("EPSG:3857")
dfSF_CensusTract.to_file(BASE_DIR.parent.joinpath(folder_path,"SF_CensusTract_PCS.geojson"), driver='GeoJSON')

In [None]:
# Check in QGIS if the re-projection is successful .i.e.
# 1. Both SFChamp_2010_agg.geojson and SFChamp_2016_agg.geojson into EPSG:3857, are named with _PCS suffix
# 2. The SF_CensusTract to EPSG:3857, is also name with _PCS suffix

# After the above process do the following in QGIS
# 3. Intersect with SF_CensusTract and RoadNetwork and save the output as SFChamp_201x_agg_CT_PCS.geojson
# 4. Road Crash for each year i.e. 2010 and 2016, convert it to EPSG:3857, name it SFCrash_2010_PCS.geojson & SFCrash_2016_PCS.geojson
# 5. Perform NearestNeighbour join of respective year road crash with respective year road network. Name them as NN_SFCrash_2010_PCS & NN_SFCrash_2016_PCS.
#    Keep only FT, A_B, A, B attribute from roadnetwork in the output file
# 6. Intersect NN_SFCrash_2010_PCS & NN_SFCrash_2016_PCS with SF_CensusTract, name the file as NN_SFCrash_CT_2010_PCS and NN_SFCrash_CT_2016_PCS.
#    Keep only tractce10 attribute from SF_CensusTract in the output file
# This ends QGIS manipulation

In [2]:
# Fetch SF_Census Tract, SF_RoadNetwork and SF_RoadCrash
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SF_CT_PCS.geojson"), crs = "EPSG:3857")
# road network
dfSFRdNtwrk2010_agg_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2010_agg_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2010_agg_CT.fillna(dfSFRdNtwrk2010_agg_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFRdNtwrk2016_agg_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2016_agg_CT.fillna(dfSFRdNtwrk2016_agg_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
# road crashes
dfSFCrash2010_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"NN_SFCrash_CT_2010_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2010_CT.fillna(dfSFCrash2010_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFCrash2016_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"NN_SFCrash_CT_2016_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2016_CT.fillna(dfSFCrash2016_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)

#remember to rename DISTANCE variable (as this is no longer the actual distance (in miles), given that feature is split-up)
dfSFRdNtwrk2010_agg_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2010_agg_CT["Length_meters"] = dfSFRdNtwrk2010_agg_CT.geometry.length
dfSFRdNtwrk2016_agg_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2016_agg_CT["Length_meters"] = dfSFRdNtwrk2016_agg_CT.geometry.length

In [3]:
## attaching TMCEquiv for future usage, possible
# read the TMC file to map A_B <--> TMCEquiv
dfTMCEquiv = pd.read_csv(BASE_DIR.parent.joinpath("TMC", 'CHAMP_ModifiedTMC_Equiv.csv'))
dfEST2010 = pd.read_csv(BASE_DIR.parent.joinpath("TMC",'ESTFILE_2010.csv'))
dfEST2016 = pd.read_csv(BASE_DIR.parent.joinpath("TMC", 'ESTFILE_2016.csv'))

# Map the road network "A_B" with "TMCLink" using a lambda
TMCEquiv_dict = dict(zip(dfTMCEquiv["A_B"],dfTMCEquiv["ModifiedTMC"]))
dfSFRdNtwrk2010_agg_CT["TMCEquiv"] = dfSFRdNtwrk2010_agg_CT["A_B"].map(TMCEquiv_dict) # this geodataframe only contains "A_B" & "TMC_Equiv" fields
dfSFRdNtwrk2010_agg_CT["TMCEquiv"] = dfSFRdNtwrk2010_agg_CT["A_B"].map(TMCEquiv_dict) # this geodataframe only contains "A_B" & "TMC_Equiv" fields

In [4]:
### drop all columns which have information related to links except A_B, FT, AT.
# Also drop all columns which have information related to Census Tract except "tractce"
# on crash feature class
rename_col = { "join_A_B": "A_B",
               "join_FT": "FT"}
dfSFCrash2010_CT.rename(columns=rename_col,inplace=True)
unwanted = dfSFCrash2010_CT.columns[dfSFCrash2010_CT.columns.str.startswith('join_')]
dfSFCrash2010_CT.drop(unwanted, axis=1, inplace=True)
dfSFCrash2010_CT.drop(columns="Unnamed: 0", axis=1, inplace=True)

dfSFCrash2016_CT.rename(columns=rename_col,inplace=True)
unwanted = dfSFCrash2016_CT.columns[dfSFCrash2016_CT.columns.str.startswith('join_')]
dfSFCrash2016_CT.drop(unwanted, axis=1, inplace=True)
dfSFCrash2016_CT.drop(columns="Unnamed: 0", axis=1, inplace=True)

In [5]:
### create different categories of the network and crashes - in our case full network

In [6]:
# for road network
# save full network as a featureclass
# for cat in [1,2,3]:
# filter the road network by category value
dfSFRdNtwrk2010_agg_CT.to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork.geojson"),driver="GeoJSON")
dfSFRdNtwrk2016_agg_CT.to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork.geojson"),driver="GeoJSON")
# filter the road crash by category value
dfSFCrash2010_CT[(dfSFCrash2010_CT["D2NL"]<=10)].to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2010_fullnetwork.geojson"),driver="GeoJSON")
dfSFCrash2016_CT[(dfSFCrash2016_CT["D2NL"]<=10)].to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2016_fullnetwork.geojson"),driver="GeoJSON")

# export it to CSV
# for cat in [1,2,3]:
# export to CSV
dfSFRdNtwrk2010_agg_CT.loc[:, ~dfSFRdNtwrk2010_agg_CT.columns.isin(['geometry'])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork.csv"))
dfSFRdNtwrk2016_agg_CT.loc[:, ~dfSFRdNtwrk2016_agg_CT.columns.isin(['geometry'])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork.csv"))
dfSFCrash2010_CT[(dfSFCrash2010_CT["D2NL"]<=10)].loc[:,~dfSFCrash2010_CT.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2010_fullnetwork.csv"))
dfSFCrash2016_CT[(dfSFCrash2016_CT["D2NL"]<=10)].loc[:,~dfSFCrash2016_CT.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2016_fullnetwork.csv"))

In [7]:
### Above exported dataframes are exactly what we want.
# Now aggregate variables in each dataframe according to "censustract" it belongs
# merge them with Census Tract shapefile to get geodataframe
# merge both crash and road network with Census Tract to help explore visually

In [None]:
### aggregate the road network along the census_tract IDs

In [8]:
def agg_network_CT(_df):
    _df ["A"] = _df["A"].astype(str)
    _df["B"] = _df["B"].astype(str)
    _df ["A_B"] = _df["A_B"].astype(str)
    #     _df["AT"] = _df["AT"].astype(str)
    _df["FT"] = _df["FT"].astype(str)
    #     _df["category"] = _df["category"].astype(str)
    # aggregate the dataframe using A_B
    wt_avg = lambda x: np.ma.average(x, weights = _df.loc[x.index, "Tot_Vol"])
    # Aggregating rows based on one column with “, ”.join
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        d = {}
        for col in df.select_dtypes(np.number).columns:
            if col in wt_col:
                d[col] = wt_avg
            else:
                d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            if col in str_col:
                d[col] = "first"
            else:
                d[col] = concat_agg
        return d

    wt_col = ["SPEED","TIME","CSPD_1"]
    sum_col = [ "CAP", "DISTANCE_MILES",'Length_meters','V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',"V13_1",'V14_1', 'V15_1',"V16_1",'V17_1', 'V18_1',"V19_1",
                'VT_1', 'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1', 'V11T_1','V12T_1',"V13T_1",'V14T_1', 'V15T_1',"V16T_1",'V17T_1', 'V18T_1',"V19T_1",
                'OOS', 'PUDO', 'Tot_Vol',"TNC_Tot_Vol","TIME_1",'TIMESEED',"Tot_CAP"]

    str_col = ['tractce10',"TMCEquiv"]
    concat_col = ["A_B","FT"]
    drop_col = [ 'A', 'B',"USE",'PER_RISE', 'ONEWAY',"TOLL",'PROJ', 'ACTION', 'AB','peak',
                 'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 'TOLLEA_DA',
                 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA', 'TOLLEV_SR2', 'TOLLEV_SR3',
                 'DTA_EDIT_F', 'TOLLTIME', 'PHASE', 'AMBUSSAVE', 'MDBUSSAVE', 'PMBUSSAVE', 'EVBUSSAVE', 'EABUSSAVE', 'SPDC', 'CAPC',
                 'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
                 'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
                 'STREETNAME', 'TYPE', 'MTYPE','TSIN']
    #                  'statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10']

    _df.drop(drop_col,axis=1,inplace=True)
    df = _df.groupby(['tractce10'],as_index=False).aggregate(agg_func(_df.copy())).copy()
    return df

# aggregate the road network information according to the CT it belongs
# for cat in [1,2,3]:
# census tract #
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SF_CT_PCS.geojson"), crs = "EPSG:3857")
gdfRdNtwrk = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork.geojson"))
gdfRdNtwrk = gdfRdNtwrk.assign(V13_1=0, V14_1=0, V15_1=0, V16_1=0, V17_1=0, V18_1=0, V19_1=0,
                               V13T_1=0, V14T_1=0, V15T_1=0, V16T_1=0, V17T_1=0, V18T_1=0, V19T_1=0,)
gdfRdNtwrk_agg = agg_network_CT(gdfRdNtwrk)
# calculate some additional fields
gdfRdNtwrk_agg["Tot_Vol_yr"] = gdfRdNtwrk_agg["Tot_Vol"]*365
# Calculate Vehicle Distance Travelled
gdfRdNtwrk_agg["Veh_Dist_Travel"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["Tot_Vol"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["Tot_Vol"]*0.000621371*365
# Calculate congested speed
gdfRdNtwrk_agg["CONGESTED_SPEED"] = (((gdfRdNtwrk_agg["Length_meters"]*0.000621371).divide(gdfRdNtwrk_agg["TIME_1"]))*60)
gdfRdNtwrk_agg["CONGESTED_SPEED_yr"] = (((gdfRdNtwrk_agg["Length_meters"]*0.000621371).divide(gdfRdNtwrk_agg["TIME_1"]))*60)
# Calculate TNC Volume
cols = ['V13_1',"OOS"]
gdfRdNtwrk_agg["TNC_VOLUME"] = gdfRdNtwrk_agg[cols].sum(axis=1)
gdfRdNtwrk_agg["TNC_VOLUME_yr"] = gdfRdNtwrk_agg["TNC_VOLUME"]*365

gdfRdNtwrk_agg["Veh_Dist_Travel_TNC"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["TNC_VOLUME"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_TNC_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["TNC_VOLUME"]*0.000621371*365

gdfRdNtwrk_agg["NON_TNC_VOLUME"] = gdfRdNtwrk_agg["Tot_Vol"] - gdfRdNtwrk_agg["TNC_VOLUME"]
gdfRdNtwrk_agg["NON_TNC_VOLUME_yr"] = gdfRdNtwrk_agg["Tot_Vol_yr"] - gdfRdNtwrk_agg["TNC_VOLUME_yr"]

gdfRdNtwrk_agg["Veh_Dist_Travel_NON_TNC"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["NON_TNC_VOLUME"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_NON_TNC_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["NON_TNC_VOLUME"]*0.000621371*365

gdfRdNtwrk_agg["PUDO_yr"] = gdfRdNtwrk_agg["PUDO"]*365

# Calculate - PCT of the TNC Vol, Pick-up & Drop-off and VC Ratio on the segment
gdfRdNtwrk_agg["PCT_TNC_VOL"] = 0
gdfRdNtwrk_agg["PCT_TNC_PUDO"] = 0
gdfRdNtwrk_agg["PCT_TNC_VOL"] = gdfRdNtwrk_agg["TNC_VOLUME"].divide(gdfRdNtwrk_agg["Tot_Vol"])
gdfRdNtwrk_agg["PCT_TNC_PUDO"] = gdfRdNtwrk_agg["PUDO"].divide(gdfRdNtwrk_agg["Tot_Vol"])
gdfRdNtwrk_agg["VC_ratio"] = gdfRdNtwrk_agg["Tot_Vol"].divide(gdfRdNtwrk_agg["Tot_CAP"])
gdfRdNtwrk_agg.to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork_agg.csv"))
# convert the db2010 and db2016 as geojson files
SF_CT_2010 = SF_CT.merge(gdfRdNtwrk_agg,on='tractce10',how="left")
# save the file
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10']
SF_CT_2010.loc[:,~SF_CT_2010.columns.isin(unwanted_cols)].to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork_agg.geojson"),driver="GeoJSON")
# save the file as csv
SF_CT_2010.loc[:,~SF_CT_2010.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork_agg_merged.csv"))

# for cat in [1,2,3]:
# census tract #
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SF_CT_PCS.geojson"), crs = "EPSG:3857")
gdfRdNtwrk = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork.geojson"))
gdfRdNtwrk_agg = agg_network_CT(gdfRdNtwrk)
# calculate some additional fields
gdfRdNtwrk_agg["Tot_Vol_yr"] = gdfRdNtwrk_agg["Tot_Vol"]*365
# Calculate Vehicle Distance Travelled
gdfRdNtwrk_agg["Veh_Dist_Travel"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["Tot_Vol"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["Tot_Vol"]*0.000621371*365
# Calculate congested speed
gdfRdNtwrk_agg["CONGESTED_SPEED"] = (((gdfRdNtwrk_agg["Length_meters"]*0.000621371).divide(gdfRdNtwrk_agg["TIME_1"]))*60)
gdfRdNtwrk_agg["CONGESTED_SPEED_yr"] = (((gdfRdNtwrk_agg["Length_meters"]*0.000621371).divide(gdfRdNtwrk_agg["TIME_1"]))*60)
# Calculate TNC Volume
cols = ['V13_1',"OOS"]
gdfRdNtwrk_agg["TNC_VOLUME"] = gdfRdNtwrk_agg[cols].sum(axis=1)
gdfRdNtwrk_agg["TNC_VOLUME_yr"] = gdfRdNtwrk_agg["TNC_VOLUME"]*365

gdfRdNtwrk_agg["Veh_Dist_Travel_TNC"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["TNC_VOLUME"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_TNC_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["TNC_VOLUME"]*0.000621371*365

gdfRdNtwrk_agg["NON_TNC_VOLUME"] = gdfRdNtwrk_agg["Tot_Vol"] - gdfRdNtwrk_agg["TNC_VOLUME"]
gdfRdNtwrk_agg["NON_TNC_VOLUME_yr"] = gdfRdNtwrk_agg["Tot_Vol_yr"] - gdfRdNtwrk_agg["TNC_VOLUME_yr"]

gdfRdNtwrk_agg["Veh_Dist_Travel_NON_TNC"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["NON_TNC_VOLUME"]*0.000621371
gdfRdNtwrk_agg["Veh_Dist_Travel_NON_TNC_yr"] = gdfRdNtwrk_agg["Length_meters"]*gdfRdNtwrk_agg["NON_TNC_VOLUME"]*0.000621371*365

gdfRdNtwrk_agg["PUDO_yr"] = gdfRdNtwrk_agg["PUDO"]*365

# Calculate - PCT of the TNC Vol, Pick-up & Drop-off and VC Ratio on the segment
gdfRdNtwrk_agg["PCT_TNC_VOL"] = 0
gdfRdNtwrk_agg["PCT_TNC_PUDO"] = 0
gdfRdNtwrk_agg["PCT_TNC_VOL"] = gdfRdNtwrk_agg["TNC_VOLUME"].divide(gdfRdNtwrk_agg["Tot_Vol"])
gdfRdNtwrk_agg["PCT_TNC_PUDO"] = gdfRdNtwrk_agg["PUDO"].divide(gdfRdNtwrk_agg["Tot_Vol"])
gdfRdNtwrk_agg["VC_ratio"] = gdfRdNtwrk_agg["Tot_Vol"].divide(gdfRdNtwrk_agg["Tot_CAP"])
gdfRdNtwrk_agg.to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork_agg.csv"))
# convert the db2010 and db2016 as geojson files
SF_CT_2016 = SF_CT.merge(gdfRdNtwrk_agg,on='tractce10',how="left")
# save the file as geodataframe
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10']
SF_CT_2016.loc[:,~SF_CT_2016.columns.isin(unwanted_cols)].to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork_agg.geojson"),driver="GeoJSON")
# save the file as csv
SF_CT_2016.loc[:,~SF_CT_2016.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork_agg.csv"))

In [9]:
### aggregate the road CRASH along the census_tract IDs

In [10]:
# aggregate crashes along the CensusTract
def agg_crash_CT(_df):
    _df ["A_B"] = _df["A_B"].astype(str)
    #     _df["category"] = _df["category"].astype(str)
    _df["CASE_ID"] = _df["CASE_ID"].astype(str)
    _df["FT"] = _df["FT"].astype(str)
    concat_col = ["A_B","CASE_ID","FT"]
    str_col = ['tractce10']
    sum_col = ['NUMBER_KILLED', 'NUMBER_INJURED','COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN', 'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
               'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED', 'COUNT_MC_KILLED', 'COUNT_MC_INJURED','Total_Crash', 'COUNT_Fatal', 'COUNT_Severe_Injury',
               'COUNT_Visible_Injury', 'COUNT_Other_Injury']
    drop_col = [ 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION',
                 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD',
                 'DISTANCE', 'DIRECTION', 'INTERSECTION', 'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT', 'STATE_ROUTE', 'ROUTE_SUFFIX',
                 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION', 'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY', 'PARTY_COUNT', 'PRIMARY_COLL_FACTOR',
                 'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION', 'ROAD_SURFACE',
                 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE', 'CHP_ROAD_TYPE', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT',
                 'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY', 'ALCOHOL_INVOLVED', 'STWD_VEHTYPE_AT_FAULT', 'CHP_VEHTYPE_AT_FAULT',  'PRIMARY_RAMP', 'SECONDARY_RAMP',
                 'LATITUDE', 'LONGITUDE', 'COUNTY', 'CITY', 'POINT_X', 'POINT_Y', 'PRIMARY_RD_3', 'SECONDARY_RD_3','rdntwrk_tractce10', 'D2NL']

    # Aggregating rows based on one column with “, ”.join
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        d = {}
        for col in df.select_dtypes(np.number).columns:
            d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            if col in str_col:
                d[col] = "first"
            else:
                d[col] = concat_agg
        return d

    _df.drop(drop_col,axis=1,inplace=True)
    df = _df.groupby(['tractce10'],as_index=False).aggregate(agg_func(_df.copy())).copy()
    return df

# aggregate the road crash information according to the CT it belongs
# for cat in [1,2,3]:
# 2010 Crash
gdfCrash = gpd.read_file(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2010_fullnetwork.geojson"))
gdfCrash_agg = agg_crash_CT(gdfCrash)
gdfCrash_agg["ACCIDENT_YEAR"]=2010
gdfCrash_agg.to_csv(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2010_fullnetwork_agg.csv"))
# convert this as geojson files
SF_CT_2010 = SF_CT.merge(gdfCrash_agg,on='tractce10',how="left")
# save the file
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10']
SF_CT_2010.loc[:,~SF_CT_2010.columns.isin(unwanted_cols)].to_file(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2010_fullnetwork_agg.geojson"),driver="GeoJSON")
# save the file as csv
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10',"geometry"]
SF_CT_2010.loc[:,~SF_CT_2010.columns.isin(unwanted_cols)].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2010_fullnetwork_agg_merged.csv"))

# for cat in [1,2,3]:
# 2016 Crash
gdfCrash = gpd.read_file(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2016_fullnetwork.geojson"))
gdfCrash_agg = agg_crash_CT(gdfCrash)
gdfCrash_agg["ACCIDENT_YEAR"]=2016
gdfCrash_agg.to_csv(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2016_fullnetwork_agg.csv"))
# convert this as geojson files
SF_CT_2016 = SF_CT.merge(gdfCrash_agg,on='tractce10',how="left")
# save the file
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10']
SF_CT_2016.loc[:,~SF_CT_2016.columns.isin(unwanted_cols)].to_file(BASE_DIR.parent.joinpath(folder_path.parent.parent,"census_tract","by_FTs","SFCrash_CT_2016_fullnetwork_agg.geojson"),driver="GeoJSON")
# save the file as csv
unwanted_cols = ['statefp10','mtfcc10','name10','intptlat10','awater10','namelsad10','funcstat10','aland10','geoid10','intptlon10','countyfp10',"geometry"]
SF_CT_2016.loc[:,~SF_CT_2016.columns.isin(unwanted_cols)].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2016_fullnetwork_agg_merged.csv"))

In [11]:
# merge the network and the road crash

In [12]:
# Census Tract file
dfSFCT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"SF_CT_PCS.geojson"), crs = "EPSG:3857")

# for cat in [1,2,3]:
gpdSFCrash = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2010_fullnetwork_agg.geojson"), crs = "EPSG:3857")
dfSFCrash = pd.DataFrame(gpdSFCrash.drop(columns="geometry"),copy=True)
gpdSFRdNtwrk = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2010_fullnetwork_agg.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk = pd.DataFrame(gpdSFRdNtwrk.drop(columns="geometry"),copy=True)
gpd_CT_Crash = dfSFCT.merge(dfSFCrash,on="tractce10",how="left")
gpd_CT_Crash["ACCIDENT_YEAR"]=2010
#convert back to geopandas dataframe
# gpd_CT_Crash = gpd.GeoDataFrame(gpd_CT_Crash, geometry='geometry_x',crs="EPSG:4326")
gpd_CT_Crash_Rdntwrk = gpd_CT_Crash.merge(dfSFRdNtwrk,on="tractce10",how="left")
gpd_CT_Crash_Rdntwrk["ACCIDENT_YEAR"]=2010
gpd_CT_Crash_Rdntwrk.to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2010_fullnetwork_agg.geojson"),driver="GeoJSON")
gpd_CT_Crash_Rdntwrk.loc[:,~gpd_CT_Crash_Rdntwrk.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2010_fullnetwork_agg.csv"))

# for cat in [1,2,3]:
gpdSFCrash = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFCrash_CT_2016_fullnetwork_agg.geojson"), crs = "EPSG:3857")
dfSFCrash = pd.DataFrame(gpdSFCrash.drop(columns="geometry"),copy=True)
gpdSFRdNtwrk = gpd.read_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SFRdNtwrk_CT_2016_fullnetwork_agg.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk = pd.DataFrame(gpdSFRdNtwrk.drop(columns="geometry"),copy=True)
gpd_CT_Crash = dfSFCT.merge(dfSFCrash,on="tractce10",how="left")
gpd_CT_Crash["ACCIDENT_YEAR"]=2016
#convert back to geopandas dataframe
# gpd_CT_Crash = gpd.GeoDataFrame(gpd_CT_Crash, geometry='geometry_x',crs="EPSG:4326")
gpd_CT_Crash_Rdntwrk = gpd_CT_Crash.merge(dfSFRdNtwrk,on="tractce10",how="left")
gpd_CT_Crash_Rdntwrk["ACCIDENT_YEAR"]=2016
gpd_CT_Crash_Rdntwrk.to_file(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2016_fullnetwork_agg.geojson"),driver="GeoJSON")
gpd_CT_Crash_Rdntwrk.loc[:,~gpd_CT_Crash_Rdntwrk.columns.isin(["geometry"])].to_csv(BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2016_fullnetwork_agg.csv"))

In [13]:
df2010 = pd.read_csv((BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2010_fullnetwork_agg.csv")))
df2016 = pd.read_csv((BASE_DIR.parent.joinpath("Exported_Files","census_tract","by_FTs","SF_merge_CT_2016_fullnetwork_agg.csv")))

In [14]:
dfmerged = pd.concat([df2010,df2016], ignore_index=True)
dfmerged.fillna(0,inplace=True)
# Calculate TNC Volume
cols = ["V13_1","OOS"]
dfmerged["TNC_VOLUME"]= dfmerged[cols].sum(axis=1)
dfmerged["TNC_VOLUME_yr"]= dfmerged["TNC_VOLUME"]*365

dfmerged["TNC_Tot_Vol"]= dfmerged[cols].sum(axis=1)
dfmerged["TNC_Tot_Vol_yr"]= dfmerged["TNC_Tot_Vol"]*365

dfmerged["Tot_Vol_yr"] = dfmerged["Tot_Vol"] * 365
dfmerged["PUDO_yr"] = dfmerged["PUDO"]*365

dfmerged["NON_TNC_VOLUME"] = dfmerged["Tot_Vol"] - dfmerged["TNC_VOLUME"]
dfmerged["NON_TNC_VOLUME_yr"] = dfmerged["Tot_Vol_yr"] - dfmerged["TNC_VOLUME_yr"]

# Calculate VDTs
dfmerged["Veh_Dist_Travel_TNC"] = dfmerged["Length_meters"]*dfmerged["TNC_VOLUME"]*0.000621371
dfmerged["Veh_Dist_Travel_NON_TNC"] = dfmerged["Length_meters"]*dfmerged["NON_TNC_VOLUME"]*0.000621371

dfmerged["Veh_Dist_Travel_TNC_yr"] = dfmerged["Length_meters"]*dfmerged["TNC_VOLUME"]*0.000621371*365
dfmerged["Veh_Dist_Travel_NON_TNC_yr"] = dfmerged["Length_meters"]*dfmerged["NON_TNC_VOLUME"]*0.000621371*365

# Calculate - PCT of the TNC Vol, Pick-up & Drop-off and VC Ratio on the segment
dfmerged["PCT_TNC_VOL"] = 0
dfmerged["PCT_TNC_PUDO"] = 0
dfmerged["PCT_TNC_VOL"] = dfmerged["TNC_VOLUME"].divide(dfmerged["Tot_Vol"])
dfmerged["PCT_PUDO_Tot_Vol"] = dfmerged["PUDO"].divide(dfmerged["Tot_Vol"])
dfmerged["PCT_PUDO_TNC_Vol"] = dfmerged["PUDO"].divide(dfmerged["TNC_VOLUME"])

dfmerged["PCT_TNC_VOL_yr"] = 0
dfmerged["PCT_TNC_PUDO_yr"] = 0
dfmerged["PCT_TNC_VOL_yr"] = dfmerged["TNC_VOLUME_yr"].divide(dfmerged["Tot_Vol_yr"])
dfmerged["PCT_PUDO_Tot_Vol_yr"] = dfmerged["PUDO_yr"].divide(dfmerged["Tot_Vol_yr"])
dfmerged["PCT_PUDO_TNC_Vol_yr"] = dfmerged["PUDO"].divide(dfmerged["TNC_VOLUME"])

# PUDO
dfmerged["PUDO_millions"] = dfmerged["PUDO"].divide(1000000)
dfmerged["log_PUDO_millions"] = np.log(dfmerged["PUDO_millions"]+1)

dfmerged["PUDO_millions_yr"] = dfmerged["PUDO"].divide(1000000)*365
dfmerged["log_PUDO_millions_yr"] = np.log((dfmerged["PUDO_millions"]*365)+1)

In [15]:
# Tot_Vol: log, square of and log of the square of
dfmerged["log_Tot_Vol"] = np.log(dfmerged["Tot_Vol"]+1)
dfmerged["Tot_Vol_SQR"] = dfmerged["Tot_Vol"]*dfmerged["Tot_Vol"]
dfmerged["log_Tot_Vol_SQR"] = np.log(dfmerged["Tot_Vol_SQR"]+1)

# TNC_Vol: log, square of and log of the square of
dfmerged["log_TNC_VOLUME"] = np.log(dfmerged["TNC_VOLUME"]+1)
dfmerged["TNC_VOLUME_SQR"] = dfmerged["TNC_VOLUME"]*dfmerged["TNC_VOLUME"]
dfmerged["log_TNC_VOLUME_SQR"] = np.log(dfmerged["TNC_VOLUME_SQR"]+1)

# Non_TNC_Vol: log, square of and log of the square of
dfmerged["log_NON_TNC_VOLUME"] = np.log(dfmerged["NON_TNC_VOLUME"]+1)
dfmerged["NON_TNC_VOLUME_SQR"] = dfmerged["NON_TNC_VOLUME"]*dfmerged["NON_TNC_VOLUME"]
dfmerged["log_NON_TNC_VOLUME_SQR"] = np.log(dfmerged["TNC_VOLUME_SQR"]+1)

# PCT_NON_TNC_VOL: log, square of and log of the square of
dfmerged["PCT_NON_TNC_VOL"] = dfmerged["NON_TNC_VOLUME"].divide(dfmerged["Tot_Vol"])

# Veh_Dist_Travel: log, square of and log of the square of
dfmerged["log_Veh_Dist_Travel"] = np.log(dfmerged["Veh_Dist_Travel"]+1)
dfmerged["Veh_Dist_Travel_SQR"] = dfmerged["Veh_Dist_Travel"]*dfmerged["Veh_Dist_Travel"]
dfmerged["log_Veh_Dist_Travel_SQR"] = np.log(dfmerged["Veh_Dist_Travel_SQR"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_millions"] = dfmerged["Veh_Dist_Travel"].divide(1000000)
dfmerged["log_Veh_Dist_Travel_millions"] = np.log(dfmerged["Veh_Dist_Travel_millions"]+1)
dfmerged["Veh_Dist_Travel_millions_SQR"] = dfmerged["Veh_Dist_Travel_millions"]*dfmerged["Veh_Dist_Travel_millions"]
dfmerged["log_Veh_Dist_Travel_millions_SQR"] = np.log(dfmerged["Veh_Dist_Travel_millions_SQR"]+1)

# Veh_Dist_Travel_TNC: log, square of and log of the square of
dfmerged["log_Veh_Dist_Travel_TNC"] = np.log(dfmerged["Veh_Dist_Travel_TNC"]+1)
dfmerged["Veh_Dist_Travel_TNC_SQR"] = dfmerged["Veh_Dist_Travel_TNC"]*dfmerged["Veh_Dist_Travel_TNC"]
dfmerged["log_Veh_Dist_Travel_TNC_SQR"] = np.log(dfmerged["Veh_Dist_Travel_TNC_SQR"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_TNC_millions"] = dfmerged["Veh_Dist_Travel_TNC"].divide(1000000)
dfmerged["log_Veh_Dist_Travel_TNC_millions"] = np.log(dfmerged["Veh_Dist_Travel_TNC_millions"]+1)
dfmerged["Veh_Dist_Travel_TNC_millions_SQR"] = dfmerged["Veh_Dist_Travel_TNC_millions"]*dfmerged["Veh_Dist_Travel_TNC_millions"]
dfmerged["log_Veh_Dist_Travel_TNC_millions_SQR"] = np.log(dfmerged["Veh_Dist_Travel_TNC_millions_SQR"]+1)

# Veh_Dist_Travel_NON_TNC: log, square of and log of the square of
dfmerged["log_Veh_Dist_Travel_NON_TNC"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC"]+1)
dfmerged["Veh_Dist_Travel_NON_TNC_SQR"] = dfmerged["Veh_Dist_Travel_NON_TNC"]*dfmerged["Veh_Dist_Travel_NON_TNC"]
dfmerged["log_Veh_Dist_Travel_NON_TNC_SQR"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_SQR"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_NON_TNC_millions"] = dfmerged["Veh_Dist_Travel_NON_TNC"].divide(1000000)
dfmerged["log_Veh_Dist_Travel_NON_TNC_millions"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_millions"]+1)
dfmerged["Veh_Dist_Travel_NON_TNC_millions_SQR"] = dfmerged["Veh_Dist_Travel_NON_TNC_millions"]*dfmerged["Veh_Dist_Travel_NON_TNC_millions"]
dfmerged["log_Veh_Dist_Travel_NON_TNC_millions_SQR"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_millions_SQR"]+1)

# CONGESTED_SPEED
dfmerged["log_CONGESTED_SPEED"] = np.log(dfmerged["CONGESTED_SPEED"]+1)
dfmerged["CONGESTED_SPEED_SQR"] = dfmerged["CONGESTED_SPEED"]*dfmerged["CONGESTED_SPEED"]
dfmerged["log_CONGESTED_SPEED_SQR"] = np.log(dfmerged["CONGESTED_SPEED_SQR"]+1)

# CONGESTED_SPEED
dfmerged["log_VC_ratio"] = np.log(dfmerged["VC_ratio"]+1)
dfmerged["VC_ratio_SQR"] = dfmerged["VC_ratio"]*dfmerged["VC_ratio"]
dfmerged["log_VC_ratio_SQR"] = np.log(dfmerged["VC_ratio_SQR"]+1)

In [16]:
# Tot_Vol: log, square of and log of the square of
dfmerged["log_Tot_Vol_yr"] = np.log(dfmerged["Tot_Vol_yr"]+1)
dfmerged["Tot_Vol_SQR_yr"] = dfmerged["Tot_Vol_yr"]*dfmerged["Tot_Vol_yr"]
dfmerged["log_Tot_Vol_SQR_yr"] = np.log(dfmerged["Tot_Vol_SQR_yr"]+1)

# TNC_Vol: log, square of and log of the square of
dfmerged["log_TNC_VOLUME_yr"] = np.log(dfmerged["TNC_VOLUME_yr"]+1)
dfmerged["TNC_VOLUME_SQR_yr"] = dfmerged["TNC_VOLUME_yr"]*dfmerged["TNC_VOLUME_yr"]
dfmerged["log_TNC_VOLUME_SQR_yr"] = np.log(dfmerged["TNC_VOLUME_SQR_yr"]+1)

# Non_TNC_Vol: log, square of and log of the square of
dfmerged["log_NON_TNC_VOLUME_yr"] = np.log(dfmerged["NON_TNC_VOLUME_yr"]+1)
dfmerged["NON_TNC_VOLUME_SQR_yr"] = dfmerged["NON_TNC_VOLUME_yr"]*dfmerged["NON_TNC_VOLUME_yr"]
dfmerged["log_NON_TNC_VOLUME_SQR_yr"] = np.log(dfmerged["TNC_VOLUME_SQR_yr"]+1)

# PCT_NON_TNC_VOL: log, square of and log of the square of
dfmerged["PCT_NON_TNC_VOL_yr"] = dfmerged["NON_TNC_VOLUME_yr"].divide(dfmerged["Tot_Vol_yr"])

# Veh_Dist_Travel: log, square of and log of the square of
dfmerged["Veh_Dist_Travel_yr"] = (dfmerged["Veh_Dist_Travel"]*365)
dfmerged["log_Veh_Dist_Travel_yr"] = np.log((dfmerged["Veh_Dist_Travel_yr"])+1)
dfmerged["Veh_Dist_Travel_SQR_yr"] = dfmerged["Veh_Dist_Travel_yr"]*dfmerged["Veh_Dist_Travel_yr"]
dfmerged["log_Veh_Dist_Travel_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_SQR_yr"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_millions_yr"] = dfmerged["Veh_Dist_Travel"].divide(1000000)*365
dfmerged["log_Veh_Dist_Travel_millions_yr"] = np.log((dfmerged["Veh_Dist_Travel_millions_yr"])+1)
dfmerged["Veh_Dist_Travel_millions_SQR_yr"] = dfmerged["Veh_Dist_Travel_millions_yr"]*dfmerged["Veh_Dist_Travel_millions_yr"]
dfmerged["log_Veh_Dist_Travel_millions_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_millions_SQR_yr"]+1)

# Veh_Dist_Travel_TNC: log, square of and log of the square of
dfmerged["log_Veh_Dist_Travel_TNC_yr"] = np.log(dfmerged["Veh_Dist_Travel_TNC_yr"]+1)
dfmerged["Veh_Dist_Travel_TNC_SQR_yr"] = dfmerged["Veh_Dist_Travel_TNC_yr"]*dfmerged["Veh_Dist_Travel_TNC_yr"]
dfmerged["log_Veh_Dist_Travel_TNC_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_TNC_SQR_yr"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_TNC_millions_yr"] = dfmerged["Veh_Dist_Travel_TNC_yr"].divide(1000000)*365
dfmerged["log_Veh_Dist_Travel_TNC_millions_yr"] = np.log(dfmerged["Veh_Dist_Travel_TNC_millions_yr"]+1)
dfmerged["Veh_Dist_Travel_TNC_millions_SQR_yr"] = dfmerged["Veh_Dist_Travel_TNC_millions_yr"]*dfmerged["Veh_Dist_Travel_TNC_millions_yr"]
dfmerged["log_Veh_Dist_Travel_TNC_millions_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_TNC_millions_SQR_yr"]+1)

# Veh_Dist_Travel_NON_TNC: log, square of and log of the square of
dfmerged["log_Veh_Dist_Travel_NON_TNC_yr"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_yr"]+1)
dfmerged["Veh_Dist_Travel_NON_TNC_SQR_yr"] = dfmerged["Veh_Dist_Travel_NON_TNC_yr"]*dfmerged["Veh_Dist_Travel_NON_TNC_yr"]
dfmerged["log_Veh_Dist_Travel_NON_TNC_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_SQR_yr"]+1)
# also convert VDT in millions
dfmerged["Veh_Dist_Travel_NON_TNC_millions_yr"] = dfmerged["Veh_Dist_Travel_NON_TNC_yr"].divide(1000000)
dfmerged["log_Veh_Dist_Travel_NON_TNC_millions_yr"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_millions_yr"]+1)
dfmerged["Veh_Dist_Travel_NON_TNC_millions_SQR_yr"] = dfmerged["Veh_Dist_Travel_NON_TNC_millions_yr"]*dfmerged["Veh_Dist_Travel_NON_TNC_millions_yr"]
dfmerged["log_Veh_Dist_Travel_NON_TNC_millions_SQR_yr"] = np.log(dfmerged["Veh_Dist_Travel_NON_TNC_millions_SQR_yr"]+1)

# CONGESTED_SPEED
dfmerged["log_CONGESTED_SPEED_yr"] = np.log(dfmerged["CONGESTED_SPEED"]+1)
dfmerged["CONGESTED_SPEED_SQR_yr"] = dfmerged["CONGESTED_SPEED"]*dfmerged["CONGESTED_SPEED"]
dfmerged["log_CONGESTED_SPEED_SQR_yr"] = np.log(dfmerged["CONGESTED_SPEED_SQR"]+1)

# CONGESTED_SPEED
dfmerged["log_VC_ratio_yr"] = np.log(dfmerged["VC_ratio"]+1)
dfmerged["VC_ratio_SQR_yr"] = dfmerged["VC_ratio"]*dfmerged["VC_ratio"]
dfmerged["log_VC_ratio_SQR_yr"] = np.log(dfmerged["VC_ratio_SQR_yr"]+1)

In [17]:
dfmerged.to_csv((BASE_DIR.parent.joinpath("Exported_Files","census_tract","sample","SF_merge_CT_fullnetwork_5Feb2022.csv")))

In [None]:
# Find the difference

dfmerged.sort_values(["tractce10","ACCIDENT_YEAR"], inplace=True)
cols = ['NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN', 'COUNT_PED_KILLED', 'COUNT_PED_INJURED', 'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED', 'COUNT_MC_KILLED', 'COUNT_MC_INJURED', 'Total_Crash', 'COUNT_Fatal', 'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', 'COUNT_PDO','Tot_CAP', 'OOS', 'PUDO', 'Tot_Vol', 'TNC_Tot_Vol','Veh_Dist_Travel', 'CONGESTED_SPEED', 'TNC_VOLUME', 'Veh_Dist_Travel_TNC', 'NON_TNC_VOLUME', 'Veh_Dist_Travel_NON_TNC', 'PCT_TNC_VOL', 'PCT_TNC_PUDO', 'VC_ratio', 'log_Tot_Vol', 'Tot_Vol_SQR', 'log_Tot_Vol_SQR', 'log_TNC_VOLUME', 'TNC_VOLUME_SQR', 'log_TNC_VOLUME_SQR', 'log_NON_TNC_VOLUME', 'NON_TNC_VOLUME_SQR', 'log_NON_TNC_VOLUME_SQR', 'PCT_NON_TNC_VOL', 'log_Veh_Dist_Travel', 'Veh_Dist_Travel_SQR', 'log_Veh_Dist_Travel_SQR', 'log_Veh_Dist_Travel_TNC', 'Veh_Dist_Travel_TNC_SQR', 'log_Veh_Dist_Travel_TNC_SQR', 'log_Veh_Dist_Travel_NON_TNC', 'Veh_Dist_Travel_NON_TNC_SQR', 'log_Veh_Dist_Travel_NON_TNC_SQR', 'log_CONGESTED_SPEED', 'CONGESTED_SPEED_SQR', 'log_CONGESTED_SPEED_SQR', 'log_VC_ratio', 'VC_ratio_SQR', 'log_VC_ratio_SQR']
dfmerged.groupby(["tractce10"],as_index=True)[cols].diff(1)