In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from pyproj import CRS
import pathlib
from pathlib import Path
from shapely import wkt
from tqdm import tqdm
# Project paths are resolved relative to the notebook's working directory.
BASE_DIR = Path.cwd()
# Root folder for everything this notebook exports.
folder_path = BASE_DIR.parent.joinpath("Exported_Files", "census_tract", "agg_network")
# Dated run folder that the cells below read from and write to. It has to exist
# before the first ``to_file``/``to_csv`` call, otherwise the export fails on a
# fresh checkout; ``parents=True`` also creates ``folder_path`` itself.
RUN_DIR = folder_path.joinpath("Feb162022")
RUN_DIR.mkdir(parents=True, exist_ok=True)
# print(BASE_DIR)

In [2]:
"""
Fetch SF Champ network both for year 2010, year 2016 and join the frames
"""

'\nFetch SF Champ network both for year 2010, year 2016 and join the frames\n'

In [3]:
# Network for YR 2010: one shapefile per time-of-day period (AM, PM, EA, EV, MD).
# Each frame is tagged with its period and gets Tot_CAP = CAP * multiplier.
# NOTE(review): the multipliers (3, 3, 3, 8.5, 6.5) add up to 24, so they are
# presumably the number of hours in each period -- confirm.
dfSFRd2010am = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_AM.shp"))
dfSFRd2010am["peak"]="AM"
dfSFRd2010am["Tot_CAP"]=dfSFRd2010am["CAP"]*3

dfSFRd2010pm = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_PM.shp"))
dfSFRd2010pm["peak"]="PM"
dfSFRd2010pm["Tot_CAP"]=dfSFRd2010pm["CAP"]*3

dfSFRd2010ea = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_EA.shp"))
dfSFRd2010ea["peak"]="EA"
dfSFRd2010ea["Tot_CAP"]=dfSFRd2010ea["CAP"]*3

dfSFRd2010ev = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_EV.shp"))
dfSFRd2010ev["peak"]="EV"
dfSFRd2010ev["Tot_CAP"]=dfSFRd2010ev["CAP"]*8.5

dfSFRd2010md = gpd.read_file(BASE_DIR.parent.joinpath("2010","2010_MD.shp"))
dfSFRd2010md["peak"]="MD"
dfSFRd2010md["Tot_CAP"]=dfSFRd2010md["CAP"]*6.5

# Stack the five period frames; every link (A -> B) now appears once per period.
dfSFRdNtwrk2010 = pd.concat([dfSFRd2010am,dfSFRd2010pm,dfSFRd2010ea,dfSFRd2010ev,dfSFRd2010md])
# Zero-initialise OOS, PUDO, Tot_Vol and TNC_Tot_Vol for the 2010 network
# (any existing column with one of these names is overwritten with 0).
dfSFRdNtwrk2010 = dfSFRdNtwrk2010.assign(OOS=0, PUDO=0, Tot_Vol=0, TNC_Tot_Vol=0)
# A_B is the link key ("<A>_<B>") used for the per-link aggregation below.
dfSFRdNtwrk2010["A_B"] = dfSFRdNtwrk2010["A"].astype(str)  + "_" + dfSFRdNtwrk2010["B"].astype(str)
dfSFRdNtwrk2010["A"] = dfSFRdNtwrk2010["A"].astype(str)
dfSFRdNtwrk2010["B"] = dfSFRdNtwrk2010["B"].astype(str)
# Columns which together form Tot_Vol
add_2010 = ['V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'BUSVOL_AM','BUSVOL_PM','BUSVOL_EA','BUSVOL_MD','BUSVOL_EV','OOS']
# Row-wise sum; OOS was just set to 0 above, so it contributes nothing for 2010.
# NOTE(review): all five BUSVOL_* columns are added to every period row and the
# rows are summed again per link further down -- confirm this does not count bus
# volumes more than once.
dfSFRdNtwrk2010["Tot_Vol"] = dfSFRdNtwrk2010[add_2010].sum(axis=1)

# Keep only FT types representing real road-network
# 1: Fwy-Fwy Connector; 2: Freeway; 3: Expressway; 4: Collector; 5: Ramp; 6: Centroid Connector;
# 7: Major Arterial; 8: Not used; 9: Alley (only for DTA); 10: Metered Ramp; 11: Local; 12: Minor Arterial;
# 13: Bike only; 14: Not used; 15: Super Arterial
# The filter below drops FT 6, 8 and 14.
dfSFRdNtwrk2010=dfSFRdNtwrk2010[dfSFRdNtwrk2010.FT.isin([1,2,3,4,5,7,9,10,11,12,13,15])]
# dfSFRdNtwrk2010=dfSFRdNtwrk2010[dfSFRdNtwrk2010.FT.isin([4,7,10,11,12,15])]

# Make sure the result is a GeoDataFrame declared as EPSG:4326.
# NOTE(review): the to_crs call targets the CRS that was just declared, so it
# should be a no-op; this assumes the shapefiles really are in EPSG:4326 -- confirm.
dfSFRdNtwrk2010 = gpd.GeoDataFrame(dfSFRdNtwrk2010, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2010=dfSFRdNtwrk2010.to_crs("EPSG:4326")

# Read the boundary polygon used for clipping and bring it to the same CRS.
dfSFBoundary = gpd.read_file(BASE_DIR.parent.joinpath("Data","SF_County","SFBay_Boundary.shp"))
dfSFBoundary=dfSFBoundary.to_crs("EPSG:4326")

# pd.concat kept each file's own row labels, so labels repeat across periods;
# give every row a unique label because wt_avg below looks weights up by label.
dfSFRdNtwrk2010.reset_index(drop=True,inplace=True)
# Clip the links to the boundary polygon.
dfSFRdNtwrk2010 = gpd.clip(dfSFRdNtwrk2010, dfSFBoundary)

# dfSFRdNtwrk2010.to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb122022","Raw_SFRdNtwrk_2010_cores.csv"))

# Tot_Vol-weighted average of a group's values. The weights are fetched from the
# module-level dfSFRdNtwrk2010 using the group's row labels, so this lambda is
# only valid for groups taken from that frame.
wt_avg = lambda x: np.ma.average(x, weights = dfSFRdNtwrk2010.loc[x.index, "Tot_Vol"])

# Numeric columns aggregated with wt_avg.
lst_col = ["SPEED","TIME","TIME_1","CSPD_1"]
# Numeric columns aggregated with a plain mean.
avg_col = [ 'DISTANCE',
            "FT","AT",
            'TIMESEED',
            'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
            'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3',
            'TOLLEA_DA', 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA',
            'TOLLEV_SR2', 'TOLLEV_SR3',"USE",]
# custom aggregate function: weighted average, summation or concatanation
def agg_func(df):
    """Build the column -> aggregation mapping handed to ``groupby(...).aggregate``.

    Numeric columns named in the notebook-level ``lst_col`` use the ``wt_avg``
    callable, numeric columns named in ``avg_col`` use ``"mean"`` and every
    other numeric column uses ``"sum"``. Object-dtype columns and the
    ``geometry`` column keep the first value of each group.

    Parameters
    ----------
    df : DataFrame
        Frame whose column dtypes decide the mapping; it is not modified.

    Returns
    -------
    dict
        Keys in the order: numeric columns, object columns, ``"geometry"``.
    """
    def _numeric_rule(name):
        # wt_avg is only looked up for the weighted columns
        if name in lst_col:
            return wt_avg
        return "mean" if name in avg_col else "sum"

    spec = {name: _numeric_rule(name) for name in df.select_dtypes(np.number).columns}
    spec.update({name: "first" for name in df.select_dtypes(object).columns})
    spec["geometry"] = "first"
    return spec

# Collapse the per-period rows of each link (same A_B) into a single row, using
# the per-column rules produced by agg_func.
dfSFRdNtwrk2010_agg = dfSFRdNtwrk2010.groupby(['A_B'],as_index=False).aggregate(agg_func(dfSFRdNtwrk2010.copy())).copy()
# The groupby/aggregate above yields a plain pandas DataFrame, so re-wrap it as a GeoDataFrame
dfSFRdNtwrk2010_agg = gpd.GeoDataFrame(dfSFRdNtwrk2010_agg, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2010_agg=dfSFRdNtwrk2010_agg.to_crs("EPSG:4326")

# Clip to the boundary polygon again (the un-aggregated frame was already
# clipped with the same polygon before grouping).
dfSFRdNtwrk2010_agg = gpd.clip(dfSFRdNtwrk2010_agg, dfSFBoundary)

# export the geodataframe
# dfSFRdNtwrk2010_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb122022","SFChamp_2010_agg.geojson"), driver='GeoJSON')
# dfSFRdNtwrk2010_agg.to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb122022","SFChamp_2010_agg.csv"))

# Reproject to EPSG:3857 and write the "_PCS" GeoJSON into the Feb162022 run folder.
dfSFRdNtwrk2010_agg = dfSFRdNtwrk2010_agg.to_crs("EPSG:3857")
dfSFRdNtwrk2010_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2010_PCS.geojson"), driver='GeoJSON')

In [4]:
# Network for YR 2016: one shapefile per time-of-day period (AM, PM, EA, EV, MD).
# Each frame is tagged with its period and gets Tot_CAP = CAP * multiplier.
# NOTE(review): the multipliers (3, 3, 3, 8.5, 6.5) add up to 24, so they are
# presumably the number of hours in each period -- confirm.
dfSFRd2016am = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_AM.shp"))
dfSFRd2016am["peak"]="AM"
dfSFRd2016am["Tot_CAP"]=dfSFRd2016am["CAP"]*3
dfSFRd2016pm = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_PM.shp"))
dfSFRd2016pm["peak"]="PM"
dfSFRd2016pm["Tot_CAP"]=dfSFRd2016pm["CAP"]*3
dfSFRd2016ea = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_EA.shp"))
dfSFRd2016ea["peak"]="EA"
dfSFRd2016ea["Tot_CAP"]=dfSFRd2016ea["CAP"]*3
dfSFRd2016ev = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_EV.shp"))
dfSFRd2016ev["peak"]="EV"
dfSFRd2016ev["Tot_CAP"]=dfSFRd2016ev["CAP"]*8.5
dfSFRd2016md = gpd.read_file(BASE_DIR.parent.joinpath("2016","2016_MD.shp"))
dfSFRd2016md["peak"]="MD"
dfSFRd2016md["Tot_CAP"]=dfSFRd2016md["CAP"]*6.5

# Stack the five period frames; every link (A -> B) now appears once per period.
dfSFRdNtwrk2016 = pd.concat([dfSFRd2016am,dfSFRd2016pm,dfSFRd2016ea,dfSFRd2016ev,dfSFRd2016md])

# Only Tot_Vol is initialised here; unlike the 2010 cell, OOS and PUDO are not
# created, so OOS (used below) has to come from the 2016 shapefiles themselves.
dfSFRdNtwrk2016 = dfSFRdNtwrk2016.assign(Tot_Vol=0)
# A_B is the link key ("<A>_<B>") used for the per-link aggregation below.
dfSFRdNtwrk2016["A_B"] = dfSFRdNtwrk2016["A"].astype(str)  + "_" + dfSFRdNtwrk2016["B"].astype(str)
dfSFRdNtwrk2016["A"] = dfSFRdNtwrk2016["A"].astype(str)
dfSFRdNtwrk2016["B"] = dfSFRdNtwrk2016["B"].astype(str)
# Columns which together form Tot_Vol (2016 adds V13_1..V19_1 to the 2010 list)
add_2016 = ['V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'V13_1','V14_1','V15_1','V16_1','V17_1','V18_1','V19_1',
            'BUSVOL_AM','BUSVOL_PM','BUSVOL_EA','BUSVOL_MD','BUSVOL_EV','OOS']
# Row-wise sum of the columns above.
# NOTE(review): all five BUSVOL_* columns are added to every period row and the
# rows are summed again per link further down -- confirm this does not count bus
# volumes more than once.
dfSFRdNtwrk2016["Tot_Vol"] = dfSFRdNtwrk2016[add_2016].sum(axis=1)

# TNC volume = V13_1 + OOS
# TNC_2016 = ['V16_1','V17_1','V18_1','OOS'] # updated 26 Jan 2022 to reflect TNC volumes which were mis-represented
TNC_2016 = ['V13_1','OOS'] # V13_1 is TNC_Volumes plying on the road segment
dfSFRdNtwrk2016["TNC_Tot_Vol"] = dfSFRdNtwrk2016[TNC_2016].sum(axis=1)

# Keep only FT types representing real road-network (drops FT 6, 8 and 14)
dfSFRdNtwrk2016=dfSFRdNtwrk2016[dfSFRdNtwrk2016.FT.isin([1,2,3,4,5,7,9,10,11,12,13,15])]
# dfSFRdNtwrk2016=dfSFRdNtwrk2016[dfSFRdNtwrk2016.FT.isin([4,7,10,11,12,15])]

# Make sure the result is a GeoDataFrame declared as EPSG:4326.
# NOTE(review): the to_crs call targets the CRS that was just declared, so it
# should be a no-op; this assumes the shapefiles really are in EPSG:4326 -- confirm.
dfSFRdNtwrk2016 = gpd.GeoDataFrame(dfSFRdNtwrk2016, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2016=dfSFRdNtwrk2016.to_crs("EPSG:4326")

# Read the boundary polygon used for clipping (same file as in the 2010 cell)
# and bring it to the same CRS.
dfSFBoundary = gpd.read_file(BASE_DIR.parent.joinpath("Data","SF_County","SFBay_Boundary.shp"))
dfSFBoundary=dfSFBoundary.to_crs("EPSG:4326")

# pd.concat kept each file's own row labels, so labels repeat across periods;
# give every row a unique label because wt_avg below looks weights up by label.
dfSFRdNtwrk2016.reset_index(drop=True,inplace=True)
# Clip the links to the boundary polygon.
dfSFRdNtwrk2016 = gpd.clip(dfSFRdNtwrk2016, dfSFBoundary)

# dfSFRdNtwrk2016.to_csv(BASE_DIR.parent.joinpath(folder_path,"Raw_SFRdNtwrk_2016_cores.csv"))

# Tot_Vol-weighted average of a group's values. This rebinds the notebook-level
# wt_avg so that the weights now come from dfSFRdNtwrk2016.
wt_avg = lambda x: np.ma.average(x, weights = dfSFRdNtwrk2016.loc[x.index, "Tot_Vol"])

# Numeric columns aggregated with wt_avg.
lst_col = ["SPEED","TIME","TIME_1","CSPD_1"]
# Numeric columns aggregated with a plain mean.
avg_col = [ 'DISTANCE',
            "FT","AT",
            'TIMESEED',
            'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
            'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3',
            'TOLLEA_DA', 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA',
            'TOLLEV_SR2', 'TOLLEV_SR3',"USE"]

def agg_func(df):
    """Return the per-column aggregation rules for ``groupby(...).aggregate``.

    Rules, by column kind:

    * numeric and listed in the notebook-level ``lst_col`` -> ``wt_avg`` callable
    * numeric and listed in ``avg_col``                    -> ``"mean"``
    * any other numeric column                             -> ``"sum"``
    * object-dtype columns and ``"geometry"``              -> ``"first"``

    ``df`` is only inspected for its column dtypes and is left unchanged.
    """
    rules = {}
    for name in df.select_dtypes(np.number).columns:
        rule = "sum"
        if name in lst_col:
            rule = wt_avg
        elif name in avg_col:
            rule = "mean"
        rules[name] = rule
    # text-like columns simply keep the first value seen in each group
    rules.update(dict.fromkeys(df.select_dtypes(object).columns, "first"))
    rules["geometry"] = "first"
    return rules

# Collapse the per-period rows of each link (same A_B) into a single row, using
# the per-column rules produced by agg_func.
dfSFRdNtwrk2016_agg = dfSFRdNtwrk2016.groupby(['A_B'],as_index=False).aggregate(agg_func(dfSFRdNtwrk2016.copy())).copy()
# The groupby/aggregate above yields a plain pandas DataFrame, so re-wrap it as a GeoDataFrame
dfSFRdNtwrk2016_agg = gpd.GeoDataFrame(dfSFRdNtwrk2016_agg, geometry='geometry',crs="EPSG:4326")
dfSFRdNtwrk2016_agg=dfSFRdNtwrk2016_agg.to_crs("EPSG:4326")

# Clip to the boundary polygon again (the un-aggregated frame was already
# clipped with the same polygon before grouping).
dfSFRdNtwrk2016_agg = gpd.clip(dfSFRdNtwrk2016_agg, dfSFBoundary)
# export the geodataframe
# dfSFRdNtwrk2016_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg.geojson"), driver='GeoJSON')
# dfSFRdNtwrk2016_agg.to_csv(BASE_DIR.parent.joinpath(folder_path,"SFChamp_2016_agg.csv"))

# Reproject to EPSG:3857 and write the "_PCS" GeoJSON into the Feb162022 run folder.
dfSFRdNtwrk2016_agg = dfSFRdNtwrk2016_agg.to_crs("EPSG:3857")
dfSFRdNtwrk2016_agg.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2016_PCS.geojson"), driver='GeoJSON')

In [5]:
# # Also reproject the Census Tract file to EPSG:3857
# dfSF_CensusTract = gpd.read_file(BASE_DIR.parent.joinpath("CensusTract","SF_CT_2010.geojson"))
# dfSF_CensusTract = dfSF_CensusTract.to_crs("EPSG:3857")
# dfSF_CensusTract["tractce10"]=dfSF_CensusTract["tractce10"].astype(str)
# dfSF_CensusTract.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_CensusTract_PCS.geojson"), driver='GeoJSON')

In [6]:
# Check in QGIS that the re-projection succeeded, i.e.:
# 1. Both SFChamp_2010_agg.geojson and SFChamp_2016_agg.geojson are re-projected to EPSG:3857 and named with the _PCS suffix
# 2. SF_CensusTract is also re-projected to EPSG:3857 and named with the _PCS suffix

# After the above checks, do the following in QGIS:
# 3. Convert the road crashes for each year (2010 and 2016) to EPSG:3857 and name them SFCrash_2010_PCS.geojson & SFCrash_2016_PCS.geojson
# 4. Perform the spatial intersections:
        # Intersect SF_CensusTract with the road network and name the output SFChamp_201x_CT_PCS.geojson (the name the next cell reads)
        # Intersect the road crashes with SF_CensusTract and name the output SFCrash_201x_CT_PCS.geojson
        # For both, keep the "tractce10" column from SF_CensusTract in the output file
# This ends the QGIS manipulation

In [7]:
# Load the census tracts plus the road-network and crash layers that were
# intersected with the tracts outside this notebook (see the QGIS notes above).
# For every layer: NaNs are filled per column dtype (0.0 for float64 columns,
# 'NULL' for object columns) and tractce10 is cast to str so it can be used as
# a join/group key.
# NOTE(review): crs= is passed to read_file as an extra keyword; confirm it has
# any effect when reading (the files' own CRS metadata is normally used).
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_CensusTract_PCS.geojson"), crs = "EPSG:3857")
# road network
dfSFRdNtwrk2010_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2010_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2010_CT.fillna(dfSFRdNtwrk2010_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFRdNtwrk2010_CT["tractce10"]=dfSFRdNtwrk2010_CT["tractce10"].astype(str)
# dfSFRdNtwrk2010_CT["ACCIDENT_YEAR"] = 2010
dfSFRdNtwrk2016_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2016_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2016_CT.fillna(dfSFRdNtwrk2016_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFRdNtwrk2016_CT["tractce10"]=dfSFRdNtwrk2016_CT["tractce10"].astype(str)
# dfSFRdNtwrk2016_CT["ACCIDENT_YEAR"] = 2016
# road crashes (each layer is stamped with its year)
dfSFCrash2010_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFCrash_2010_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2010_CT.fillna(dfSFCrash2010_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFCrash2010_CT["tractce10"]=dfSFCrash2010_CT["tractce10"].astype(str)
dfSFCrash2010_CT["ACCIDENT_YEAR"] = 2010
dfSFCrash2016_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFCrash_2016_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2016_CT.fillna(dfSFCrash2016_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFCrash2016_CT["ACCIDENT_YEAR"] = 2016
dfSFCrash2016_CT["tractce10"]=dfSFCrash2016_CT["tractce10"].astype(str)

def add_length_columns(_df, measure_crs=None):
    """Return ``_df`` with ``Length_meters`` and ``Length_miles`` columns appended.

    The lengths are those of the (possibly split) geometries, so a ``DISTANCE``
    attribute carried over from the source network no longer describes the
    feature and should be renamed by the caller.

    Parameters
    ----------
    _df : geopandas.GeoDataFrame or pandas.DataFrame
        Input frame; it is not modified. For a plain DataFrame no column is
        added and an equivalent frame is returned.
    measure_crs : optional
        CRS to reproject the geometries to before measuring. ``None`` (the
        default) measures in the frame's own CRS, exactly as before.

    Returns
    -------
    DataFrame
        ``_df`` plus the two length columns (GeoDataFrame input only).

    NOTE(review): ``geometry.length`` is expressed in the units of the CRS used
    for measuring. The notebook passes frames read from ``*_PCS.geojson``
    (EPSG:3857, Web Mercator), where distances are stretched away from the
    equator, so ``Length_meters`` is presumably larger than the true ground
    length. Consider passing a local projected CRS (e.g. ``"EPSG:26910"``,
    UTM zone 10N) -- confirm with the analysis owner before changing outputs.
    """
    meters_to_miles = 0.000621371
    d = {}
    if isinstance(_df, gpd.GeoDataFrame):
        geoms = _df.geometry if measure_crs is None else _df.geometry.to_crs(measure_crs)
        d["Length_meters"] = geoms.length
        d["Length_miles"] = d["Length_meters"] * meters_to_miles

    return pd.concat([_df, pd.DataFrame(d, index=_df.index)], axis=1)

# The links were split at tract boundaries, so the model's DISTANCE attribute no
# longer matches each piece; keep it under an explicit name and add the measured
# geometry lengths next to it.
dfSFRdNtwrk2010_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2010_CT = add_length_columns(dfSFRdNtwrk2010_CT.copy())

dfSFRdNtwrk2016_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2016_CT = add_length_columns(dfSFRdNtwrk2016_CT.copy())

In [8]:
# aggregate the SFChamp network files by Census Tract
def reqd_colmns(_df):
    """Return a copy of ``_df`` in which every expected volume column exists.

    Columns from the expected list that are missing are appended, in list
    order, filled with 0; columns already present are left as they are. The
    input frame itself is not modified.
    """
    frame = _df.copy()
    expected = [
        'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1',
        'V11_1', 'V12_1', "V13_1", 'V14_1', 'V15_1', "V16_1", 'V17_1', 'V18_1', "V19_1",
        'OOS', 'PUDO', 'Tot_Vol', "TNC_Tot_Vol",
        'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
    ]
    # a dict keeps one entry per name, so the repeated 'BUSVOL_AM' is harmless
    absent = {name: 0 for name in expected if name not in frame.columns}
    filler = pd.DataFrame(absent, index=frame.index)
    return pd.concat([frame, filler], axis=1)

# create supplemental fields for estimation
def get_required_fields(_df):
    """Append the derived volume, VMT, speed, PUDO and OOS fields used for estimation.

    Parameters
    ----------
    _df : DataFrame
        Must contain ``V13_1``, ``OOS``, ``PUDO``, ``Tot_Vol``,
        ``Length_meters`` and ``TIME_1``. It is not modified.

    Returns
    -------
    DataFrame
        ``_df`` followed by the derived columns, in this order:

        * ``Tot_TNC_Vol`` (``V13_1 + OOS``), ``Tot_Non_TNC_Vol``
          (``Tot_Vol`` minus the TNC volume), ``Tot_VMT``, ``Tot_TNC_VMT`` and
          ``Tot_Non_TNC_VMT`` (volume x length in miles), each with the
          variants ``_mil`` (/1e6), ``_yr`` (x365), ``_mil_yr`` and
          ``log_*`` = ln(x + 1) of the base, ``_mil`` and ``_mil_yr`` values;
        * ``Congested_Speed`` (length in miles / ``TIME_1`` x 60) and its copy
          ``Congested_Speed_yr``;
        * ``Tot_Vol_*`` variants (the base ``Tot_Vol`` column already exists);
        * ``PUDO_*`` and ``OOS_*`` variants (``_yr``, ``_thousands``,
          ``_thousands_yr``, ``_mil``, ``_mil_yr`` and their ``log_*``).

    Rows where ``TIME_1`` is 0 produce inf/NaN speeds; no clean-up is done here.
    """
    meters_to_miles = 0.000621371
    derived = {}

    def _add_scaled(name, base):
        # base value plus millions / annual / annual-millions and ln(x + 1) variants
        derived[name] = base
        derived[f"{name}_mil"] = base.divide(1000000)
        derived[f"{name}_yr"] = base * 365
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = np.log(base + 1)
        derived[f"log_{name}_mil"] = np.log(derived[f"{name}_mil"] + 1)
        derived[f"log_{name}_mil_yr"] = np.log(derived[f"{name}_mil_yr"] + 1)

    def _add_count_variants(name):
        # annual / thousands / millions variants of an existing column, then their logs
        base = _df[name]
        derived[f"{name}_yr"] = base * 365
        derived[f"{name}_thousands"] = base.divide(1000)
        derived[f"{name}_thousands_yr"] = derived[f"{name}_thousands"] * 365
        derived[f"{name}_mil"] = base.divide(1000000)
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = np.log(base + 1)
        for suffix in ("_yr", "_thousands", "_thousands_yr", "_mil", "_mil_yr"):
            derived[f"log_{name}{suffix}"] = np.log(derived[f"{name}{suffix}"] + 1)

    # V13_1 holds the TNC volume on the segment; OOS is added to it
    tnc_vol = _df[['V13_1', "OOS"]].sum(axis=1)
    non_tnc_vol = _df["Tot_Vol"] - tnc_vol

    _add_scaled("Tot_TNC_Vol", tnc_vol)
    _add_scaled("Tot_Non_TNC_Vol", non_tnc_vol)
    # the multiplication order (volume * meters * factor) is kept as-is so the
    # floating-point results stay identical to earlier exports
    _add_scaled("Tot_VMT", _df["Tot_Vol"] * _df["Length_meters"] * meters_to_miles)
    _add_scaled("Tot_TNC_VMT", tnc_vol * _df["Length_meters"] * meters_to_miles)
    _add_scaled("Tot_Non_TNC_VMT", non_tnc_vol * _df["Length_meters"] * meters_to_miles)

    # miles per unit of TIME_1, scaled by 60
    # NOTE(review): presumably TIME_1 is in minutes, giving mph -- confirm.
    derived["Congested_Speed"] = (_df["Length_meters"] * meters_to_miles).divide(_df["TIME_1"]) * 60
    derived["Congested_Speed_yr"] = derived["Congested_Speed"]

    derived["Tot_Vol_yr"] = _df["Tot_Vol"] * 365
    derived["Tot_Vol_mil"] = _df["Tot_Vol"].divide(1000000)
    derived["Tot_Vol_mil_yr"] = derived["Tot_Vol_yr"].divide(1000000)
    derived["log_Tot_Vol"] = np.log(_df["Tot_Vol"] + 1)
    derived["log_Tot_Vol_yr"] = np.log(derived["Tot_Vol_yr"] + 1)
    derived["log_Tot_Vol_mil"] = np.log(derived["Tot_Vol_mil"] + 1)
    derived["log_Tot_Vol_mil_yr"] = np.log(derived["Tot_Vol_mil_yr"] + 1)

    _add_count_variants("PUDO")
    _add_count_variants("OOS")

    return pd.concat([_df, pd.DataFrame(derived, index=_df.index)], axis=1)

# aggregate the fields by CensusTract
def agg_network_CT(_df):
    """Aggregate tract-split network links to one row per census tract.

    Steps: make sure the expected volume columns exist (``reqd_colmns``), drop
    the attributes listed in ``drop_col``, group by ``tractce10`` and finally
    append the derived estimation fields (``get_required_fields``).

    Aggregation rules per column:

    * numeric columns in ``wt_col``  -> ``Tot_Vol``-weighted average
    * every other numeric column     -> sum (this includes the lengths,
      ``TIME_1`` and, if it is numeric, ``AT``)
    * ``tractce10``                  -> first value
    * other object columns (``A_B`` and ``FT``, which are cast to str here)
      -> non-empty values joined with ", "

    Columns that are neither numeric nor object dtype (e.g. geometry) are not
    part of the aggregation and therefore do not appear in the result.

    Parameters
    ----------
    _df : DataFrame or GeoDataFrame
        Link-level frame with a ``tractce10`` column; it is not modified.
        Every column named in ``drop_col`` must be present (KeyError otherwise).

    Returns
    -------
    DataFrame
        One row per ``tractce10`` with aggregated and derived columns.
    """
    # numeric columns averaged with Tot_Vol as weight; all other numerics are summed
    wt_col = ["SPEED","TIME","CSPD_1", 'VDT_1', 'VHT_1','VC_1',]
    # object columns that keep their first value instead of being concatenated
    str_col = ['tractce10']
    # attributes that are not carried into the tract-level table
    drop_col = [ 'A', 'B',"USE",'PER_RISE', 'ONEWAY',"TOLL",'PROJ', 'ACTION', 'AB','peak',
                 'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 'TOLLEA_DA',
                 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA', 'TOLLEV_SR2', 'TOLLEV_SR3',
                 'DTA_EDIT_F', 'TOLLTIME', 'PHASE', 'AMBUSSAVE', 'MDBUSSAVE', 'PMBUSSAVE', 'EVBUSSAVE', 'EABUSSAVE', 'SPDC', 'CAPC',
                 'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
                 'STREETNAME', 'TYPE', 'MTYPE','TSIN',
                 'VALUETOLL_', 'PASSTHRU', 'BUSTPS_AM', 'BUSTPS_OP', 'BUSTPS_PM', 'TSVA', 'BIKE_CLASS',
                 'TIMESEED',
                 ]

    links = reqd_colmns(_df.copy())
    # identifiers are treated as text so they are concatenated / kept, not summed
    for key_col in ("A_B", "FT", "tractce10"):
        links[key_col] = links[key_col].astype(str)
    links = links.drop(columns=drop_col)

    # weights are looked up by row label, which the drop above leaves untouched
    wt_avg = lambda x: np.ma.average(x, weights = links.loc[x.index, "Tot_Vol"])
    # join the non-empty values of a group with ", "
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        d = {}
        for col in df.select_dtypes(np.number).columns:
            d[col] = wt_avg if col in wt_col else "sum"
        for col in df.select_dtypes(object).columns:
            d[col] = "first" if col in str_col else concat_agg
        return d

    df = links.groupby(['tractce10'],as_index=False).aggregate(agg_func(links)).copy()

    return get_required_fields(df)

# Tract-level network tables for both years, each stamped with its year.
dfSFRdNtwrk2010_CT_agg = agg_network_CT(dfSFRdNtwrk2010_CT.copy())
dfSFRdNtwrk2010_CT_agg["Crash_Year"] = 2010
dfSFRdNtwrk2016_CT_agg = agg_network_CT(dfSFRdNtwrk2016_CT.copy())
dfSFRdNtwrk2016_CT_agg["Crash_Year"] = 2016

#unwanted columns
# unwanted_cols = ['V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1',
#                  'V11T_1','V12T_1',"V13T_1",'V14T_1', 'V15T_1',"V16T_1",'V17T_1', 'V18T_1',"V19T_1",
#                  "FT","AT","A_B"]

# Export with columns sorted alphabetically (the row index is written as well).
dfSFRdNtwrk2010_CT_agg.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2010_CT_agg_PCS.csv"))
dfSFRdNtwrk2016_CT_agg.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2016_CT_agg_PCS.csv"))

In [9]:
# aggregate crashes along the CensusTract
def agg_crash_CT(_df):
    """Aggregate crash records to one row per census tract.

    The record-level attributes in ``drop_fld`` are removed, ``tractce10`` is
    cast to str and the remaining columns are grouped by tract: numeric
    columns are summed, ``tractce10`` keeps its first value and any other
    object column has its non-empty values joined with ", ". Columns that are
    neither numeric nor object dtype (e.g. geometry) are left out.

    Parameters
    ----------
    _df : DataFrame or GeoDataFrame
        Crash records with a ``tractce10`` column. The frame is not modified.
        Every column named in ``drop_fld`` must be present (KeyError otherwise).

    Returns
    -------
    DataFrame
        One row per ``tractce10``.
    """
    drop_fld = ['CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT',
                'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE',
                'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION',
                'INTERSECTION', 'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT',
                'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION',
                'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY','PARTY_COUNT', 'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL',
                'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW',
                'PED_ACTION', 'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE', 'CHP_ROAD_TYPE',
                'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY',
                'ALCOHOL_INVOLVED', 'STWD_VEHTYPE_AT_FAULT', 'CHP_VEHTYPE_AT_FAULT', 'PRIMARY_RAMP', 'SECONDARY_RAMP',
                'LATITUDE', 'LONGITUDE', 'COUNTY', 'CITY', 'POINT_X', 'POINT_Y', 'PRIMARY_RD_3', 'SECONDARY_RD_3', ]
    # object columns that keep their first value instead of being concatenated
    str_col = ['tractce10']

    # drop() returns a new frame, so the caller's data is left untouched
    crashes = _df.drop(columns=drop_fld)
    crashes["tractce10"] = crashes["tractce10"].astype(str)

    # join the non-empty values of a group with ", "
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        d = {}
        for col in df.select_dtypes(np.number).columns:
            d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            d[col] = "first" if col in str_col else concat_agg
        return d

    df = crashes.groupby(['tractce10'],as_index=False).aggregate(agg_func(crashes)).copy()
    return df

# Tract-level crash tables for both years. ACCIDENT_YEAR is (re)assigned after
# aggregating because agg_crash_CT sums every numeric column, the year included.
dfSFCrash2010_CT_agg = agg_crash_CT(dfSFCrash2010_CT.copy())
dfSFCrash2010_CT_agg["ACCIDENT_YEAR"]=2010

dfSFCrash2016_CT_agg = agg_crash_CT(dfSFCrash2016_CT.copy())
dfSFCrash2016_CT_agg["ACCIDENT_YEAR"]=2016

# Export with columns sorted alphabetically (the row index is written as well).
dfSFCrash2010_CT_agg.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFCrash_2010_CT_agg_PCS.csv"))
dfSFCrash2016_CT_agg.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFCrash_2016_CT_agg_PCS.csv"))

In [10]:
# Attach the tract-level crash totals to the tract-level network table for each
# year. how="left" keeps every tract that has network links; tracts without a
# crash row get NaN in the crash columns.
dfSF_RdNtwrk_Crash_2010 = pd.merge(dfSFRdNtwrk2010_CT_agg.copy(),dfSFCrash2010_CT_agg.copy(), left_on="tractce10",right_on="tractce10",how="left")
dfSF_RdNtwrk_Crash_2016 = pd.merge(dfSFRdNtwrk2016_CT_agg.copy(),dfSFCrash2016_CT_agg.copy(), left_on="tractce10",right_on="tractce10",how="left")

In [19]:
# Join the per-year network+crash tables onto the census-tract polygons, export
# one layer per year and one stacked (2010 + 2016) layer.
# NOTE(review): this cell carries execution count 19 although it sits between
# cells 10 and 12, i.e. it was last run out of order -- re-check with
# Restart Kernel -> Run All.
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_CensusTract_PCS.geojson"), crs = "EPSG:3857")
# Tract attributes and concatenated link identifiers that are not exported.
drop_cols = ['statefp10', 'mtfcc10', 'name10', 'intptlat10', 'awater10', 'namelsad10', 'funcstat10', 'aland10', 'geoid10','intptlon10', 'countyfp10','FT', 'A_B', "AT",]
SF_CT["tractce10"] = SF_CT["tractce10"].astype(str)

# Left merge: every tract polygon is kept, tracts without data get NaN.
dfSF2010 = SF_CT.merge(dfSF_RdNtwrk_Crash_2010,on="tractce10",how="left")
dfSF2010.drop(columns=drop_cols,inplace=True)
# Stamp the year on every row, including the tracts that had no match above.
dfSF2010[["Crash_Year","ACCIDENT_YEAR"]]=2010

dfSF2016 = SF_CT.merge(dfSF_RdNtwrk_Crash_2016,on="tractce10",how="left")
dfSF2016.drop(columns=drop_cols,inplace=True)
dfSF2016[["Crash_Year","ACCIDENT_YEAR"]]=2016

# dfSF2010.loc[:,~dfSF2010.columns.isin(["geometry"])].sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF2010_CT_agg_PCS.csv"))
# dfSF2016.loc[:,~dfSF2016.columns.isin(["geometry"])].sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF2016_CT_agg_PCS.csv"))

dfSF2010 = dfSF2010.to_crs(3857)
dfSF2016 = dfSF2016.to_crs(3857)

dfSF2010.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_2010_CT_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")
dfSF2016.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_2016_CT_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")

# Stack both years into one long table (one row per tract and year).
dfSFmerged = gpd.GeoDataFrame(pd.concat([dfSF2010,dfSF2016],ignore_index=True),crs=dfSF2010.crs)
# NOTE(review): "Unnamed: 0" is not created anywhere in this notebook, so it
# presumably comes from one of the externally prepared input layers; this line
# raises KeyError if the column is absent -- confirm.
dfSFmerged.drop(columns=["Unnamed: 0"],inplace=True)
# Turn +/-inf (e.g. from a division by zero) into NaN, then all NaN into 0.
dfSFmerged.replace([np.inf, -np.inf], np.nan, inplace=True)
dfSFmerged.fillna(0, inplace=True)

dfSFmerged = dfSFmerged.to_crs(3857)
dfSFmerged.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFmerged_CT_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")
# Attribute table without geometry, columns sorted alphabetically.
dfSFmerged.loc[:,~dfSFmerged.columns.isin(["geometry"])].sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFmerged_CT_PCS.csv"))

  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,


In [12]:
# Build the wide table (one row per tract, 2010 and 2016 columns side by side)
# from which the difference and pct_change columns are derived.
# The CSV was written with its row index and is read back without index_col, so
# gdSFdb also carries an "Unnamed: 0" column.
gdSFdb = pd.read_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFmerged_CT_PCS.csv"))
# Five types of crashes: 'COUNT_Fatal', 'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', "COUNT_PDO"
# COUNT_Fatal_and_Injury is the sum of the first four of them.
gdSFdb["COUNT_Fatal_and_Injury"] = gdSFdb["COUNT_Fatal"] + gdSFdb["COUNT_Visible_Injury"] + gdSFdb["COUNT_Severe_Injury"]+ gdSFdb["COUNT_Other_Injury"]

# Split by year and suffix every column (tractce10 included) with the year.
gdSFdb2010 = gdSFdb[gdSFdb["ACCIDENT_YEAR"]==2010].add_suffix("_2010").copy()
gdSFdb2016 = gdSFdb[gdSFdb["ACCIDENT_YEAR"]==2016].add_suffix("_2016").copy()

gdSFdb2010["tractce10_2010"] = gdSFdb2010["tractce10_2010"].astype(str)
gdSFdb2016["tractce10_2016"] = gdSFdb2016["tractce10_2016"].astype(str)

# Inner join: only tracts present in both years are kept.
dfSFJoined = pd.merge(gdSFdb2010,gdSFdb2016, left_on="tractce10_2010",right_on="tractce10_2016",how="inner")
dfSFJoined.rename(columns={"tractce10_2010":"tractce10"},inplace=True)
dfSFJoined["tractce10"]=dfSFJoined["tractce10"].astype(str)
dfSFJoined = dfSFJoined.sort_index(axis=1)
dfSFJoined.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_joined.csv"))

def add_column(df):
    """Append ``<col>_diff`` and ``<col>_pct_change`` for every tracked measure.

    For each name in ``cols`` the frame must hold ``<col>_2010`` and
    ``<col>_2016``. ``_diff`` is 2016 minus 2010 and ``_pct_change`` is that
    difference divided by the 2010 value (a fraction, not multiplied by 100).
    A zero 2010 value yields inf/NaN; no clean-up is done here.

    Returns a new frame: ``df`` followed by the new columns in ``cols`` order.
    """
    cols = [
        'CAP', 'SPEED', 'TIME', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
        'V_1', 'TIME_1', 'VC_1', 'CSPD_1', 'VDT_1', 'VHT_1',
        'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
        'VT_1',
        'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1', 'V11T_1', 'V12T_1',
        'Tot_CAP', 'OOS', 'PUDO', 'Tot_Vol', 'TNC_Tot_Vol',
        'V13_1', 'V14_1', 'V15_1', 'V16_1', 'V17_1', 'V18_1', 'V19_1',
        'Tot_TNC_Vol', 'Tot_TNC_Vol_mil', 'Tot_TNC_Vol_yr', 'Tot_TNC_Vol_mil_yr',
        'log_Tot_TNC_Vol', 'log_Tot_TNC_Vol_mil', 'log_Tot_TNC_Vol_mil_yr',
        'Tot_Non_TNC_Vol', 'Tot_Non_TNC_Vol_mil', 'Tot_Non_TNC_Vol_yr', 'Tot_Non_TNC_Vol_mil_yr',
        'log_Tot_Non_TNC_Vol', 'log_Tot_Non_TNC_Vol_mil', 'log_Tot_Non_TNC_Vol_mil_yr',
        'Tot_VMT', 'Tot_VMT_mil', 'Tot_VMT_yr', 'Tot_VMT_mil_yr',
        'log_Tot_VMT', 'log_Tot_VMT_mil', 'log_Tot_VMT_mil_yr',
        'Tot_TNC_VMT', 'Tot_TNC_VMT_mil', 'Tot_TNC_VMT_yr', 'Tot_TNC_VMT_mil_yr',
        'log_Tot_TNC_VMT', 'log_Tot_TNC_VMT_mil', 'log_Tot_TNC_VMT_mil_yr',
        'Tot_Non_TNC_VMT', 'Tot_Non_TNC_VMT_mil', 'Tot_Non_TNC_VMT_yr', 'Tot_Non_TNC_VMT_mil_yr',
        'log_Tot_Non_TNC_VMT', 'log_Tot_Non_TNC_VMT_mil', 'log_Tot_Non_TNC_VMT_mil_yr',
        'Congested_Speed', 'Congested_Speed_yr',
        'Tot_Vol_yr', 'Tot_Vol_mil', 'Tot_Vol_mil_yr',
        'log_Tot_Vol', 'log_Tot_Vol_yr', 'log_Tot_Vol_mil', 'log_Tot_Vol_mil_yr',
        'PUDO_yr', 'PUDO_thousands', 'PUDO_thousands_yr', 'PUDO_mil', 'PUDO_mil_yr',
        'log_PUDO', 'log_PUDO_yr', 'log_PUDO_thousands', 'log_PUDO_thousands_yr', 'log_PUDO_mil', 'log_PUDO_mil_yr',
        'OOS_yr', 'OOS_thousands', 'OOS_thousands_yr', 'OOS_mil', 'OOS_mil_yr',
        'log_OOS', 'log_OOS_yr', 'log_OOS_thousands', 'log_OOS_thousands_yr', 'log_OOS_mil', 'log_OOS_mil_yr',
        'NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN',
        'COUNT_PED_KILLED', 'COUNT_PED_INJURED', 'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED',
        'COUNT_MC_KILLED', 'COUNT_MC_INJURED', 'Total_Crash',
        'COUNT_Fatal', 'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', 'COUNT_PDO',
        "COUNT_Fatal_and_Injury",
        'V13T_1', 'V14T_1', 'V15T_1', 'V16T_1', 'V17T_1', 'V18T_1', 'V19T_1',
    ]
    changes = {}
    for base in cols:
        delta = df[base + "_2016"] - df[base + "_2010"]
        changes[base + "_diff"] = delta
        # relative change with respect to the 2010 value
        changes[base + "_pct_change"] = delta.divide(df[base + "_2010"])
    return pd.concat([df, pd.DataFrame(changes, index=df.index)], axis=1)

# Add the *_diff / *_pct_change columns, then neutralise the results of
# dividing by a zero 2010 value: +/-inf -> NaN -> 0.
# NOTE(review): after this step a pct_change of 0 can mean either "no change"
# or "undefined because the 2010 value was 0" -- confirm that is acceptable.
dfSFJoined = add_column(dfSFJoined)
dfSFJoined.replace([np.inf, -np.inf], np.nan, inplace=True)
dfSFJoined.fillna(0,inplace=True)

dfSFJoined.sort_index(axis=1).to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_joined_diff_pct_chnge.csv"))

In [13]:
# Attach the diff / pct_change table to the tract polygons and export it as a layer.
dfSFJoined = pd.read_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_joined_diff_pct_chnge.csv"))
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_CensusTract_PCS.geojson"), crs = "EPSG:3857")
# cols = ['statefp10', 'mtfcc10', 'name10', 'intptlat10', 'awater10', 'namelsad10', 'funcstat10', 'aland10', 'geoid10', 'intptlon10', 'countyfp10',]
# SF_CT.drop(columns=cols,inplace=True)
# Going through int removes any leading zeros from the tract code.
# NOTE(review): presumably needed because read_csv parsed tractce10 as a number
# (losing leading zeros), so both sides of the merge must use the same form -- confirm.
SF_CT["tractce10"]=SF_CT["tractce10"].astype(int).astype(str)
# SF_CT["tractce10"]=SF_CT["tractce10"].astype(str)
# dfSFJoined.rename(columns={"tractce10_2010":"tractce10"},inplace=True)
# clmn = dfSFJoined.columns.to_list()
# dfSFJoined[clmn] = dfSFJoined[clmn].apply(pd.to_numeric, errors='coerce')
dfSFJoined["tractce10"]=dfSFJoined["tractce10"].astype(str)
# dfSFJoined[["tractce10","tractce10_2016"]]=dfSFJoined[["tractce10","tractce10_2016"]].astype(str)
SF_CT = SF_CT.to_crs(3857)
# Left merge keeps every tract polygon; tracts without a joined row get NaN attributes.
SF_CT.merge(dfSFJoined,left_on="tractce10",right_on="tractce10",how="left").to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_joined_diff_pct_chnge_CT_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")

  pd.Int64Index,


In [14]:
"""
Perform analysis based on Facility Type
FT = 1:Fwy-Fwy Connector, 2:Freeway, 3:Expressway, 4:Collector, 5:Ramp, 6:Centroid Connector, 7:Major Arterial, 8:Not used,
9:Alley, 10:Metered Ramp, 11:Local, 12:Minor Arterial,13:Bike-Only!, 14:Not used, 15:Super Arterial,
Segregate the road network into three categories
1. Category 1 = contains FT = [1, 2, 3, 5]
2. Category 2 = contains FT = [4,7,12,13,15]
3. Category 3 = contains FT = [9,11]
"""

'\nPerform analysis based on Facility Type\nFT = 1:Fwy-Fwy Connector, 2:Freeway, 3:Expressway, 4:Collector, 5:Ramp, 6:Centroid Connector, 7:Major Arterial, 8:Not used,\n9:Alley, 10:Metered Ramp, 11:Local, 12:Minor Arterial,13:Bike-Only!, 14:Not used, 15:Super Arterial,\nSegregate the road network into three categories\n1. Category 1 = contains FT = [1, 2, 3, 5]\n2. Category 2 = contains FT = [4,7,12,,13,15]\n3. Category 3 = contains FT = [9,11]\n'

In [15]:
# modify the dataframe to create a new column "CATEGORY" using "FacilityType"
def label_df_by_road_category(_df,fld):
    """Add an integer ``category`` column derived from the facility-type column ``fld``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    _df : DataFrame (or GeoDataFrame) holding the facility-type column.
    fld : name of the facility-type column.

    Returns
    -------
    The same ``_df`` object with ``category`` set to 1, 2 or 3, or 0 for any
    facility type not listed below.
    """
    # NOTE(review): the notes cell above this function lists FT 13 (Bike-Only) under
    # category 2, while this table assigns it to category 1 - confirm which is intended.
    facility_types_by_category = {
        1: [1, 2, 3, 5, 13],
        2: [4, 7, 12, 15],
        3: [9, 11],
    }
    _df["category"] = 0
    for label, facility_types in facility_types_by_category.items():
        _df.loc[_df[fld].isin(facility_types), 'category'] = label
    return _df

# Fetch SF_Census Tract
# Re-reads the tract layer from disk (the previous cell converted its tractce10 column to str).
# NOTE(review): `crs` is passed to read_file as an extra keyword argument; confirm it has
# the intended effect with the installed geopandas / reader version.
SF_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SF_CensusTract_PCS.geojson"), crs = "EPSG:3857")
SF_CT = SF_CT.to_crs(3857)

# road network
dfSFRdNtwrk2010_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2010_CT_PCS.geojson"), crs = "EPSG:3857")
# Per-column fill values are derived from the column dtypes: the replace maps 'float64' to 0.0
# and 'O' (object) to 'NULL'.
# NOTE(review): `downcast` is deprecated in recent pandas releases - confirm against the pinned version.
dfSFRdNtwrk2010_CT.fillna(dfSFRdNtwrk2010_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
# Label a copy so the frame read from disk keeps its original columns.
dfSFRdNtwrk2010_cat_CT =  label_df_by_road_category(dfSFRdNtwrk2010_CT.copy(),"FT")
dfSFRdNtwrk2010_cat_CT = dfSFRdNtwrk2010_cat_CT.to_crs(3857)

# dfSFRdNtwrk2010_CT["ACCIDENT_YEAR"] = 2010
# Same steps for the 2016 network.
dfSFRdNtwrk2016_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFChamp_2016_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2016_CT.fillna(dfSFRdNtwrk2016_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFRdNtwrk2016_cat_CT =  label_df_by_road_category(dfSFRdNtwrk2016_CT.copy(),"FT")
dfSFRdNtwrk2016_cat_CT = dfSFRdNtwrk2016_cat_CT.to_crs(3857)

def add_column(df):
    """Return ``df`` with an extra ``COUNT_Fatal_and_Injury`` column.

    The new column is the row-wise sum of ``COUNT_Fatal``, ``COUNT_Visible_Injury``,
    ``COUNT_Severe_Injury`` and ``COUNT_Other_Injury``. The input frame is not
    modified; a new frame with the column appended on the right is returned.
    """
    fatal_and_injury = (
        df['COUNT_Fatal']
        + df['COUNT_Visible_Injury']
        + df['COUNT_Severe_Injury']
        + df['COUNT_Other_Injury']
    )
    extra = pd.DataFrame({'COUNT_Fatal_and_Injury': fatal_and_injury}, index=df.index)
    return pd.concat([df, extra], axis=1)

# road crashes
# For each year: read the crash layer, fill missing values by dtype, add the combined
# fatal-and-injury count (add_column as defined earlier in this cell), stamp the year,
# filter on D2NL, label by road category and set the CRS to EPSG:3857.
dfSFCrash2010_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","NN_SFCrash_SFChamp_2010_CT_PCS.geojson"), crs = "EPSG:3857")
# Fill values come from the column dtypes: 'float64' -> 0.0, 'O' (object) -> 'NULL'.
dfSFCrash2010_CT.fillna(dfSFCrash2010_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFCrash2010_CT = add_column(dfSFCrash2010_CT)
dfSFCrash2010_CT["ACCIDENT_YEAR"] = 2010
# Keep only crashes whose D2NL value is below 10.
# NOTE(review): D2NL looks like a distance-to-nearest-link field - confirm meaning and units.
dfSFCrash2010_CT= dfSFCrash2010_CT.loc[dfSFCrash2010_CT["D2NL"]<10,:]
dfSFCrash2010_cat_CT =  label_df_by_road_category(dfSFCrash2010_CT.copy(),"FT")
dfSFCrash2010_cat_CT = dfSFCrash2010_cat_CT.to_crs(3857)

# Same steps for the 2016 crashes.
dfSFCrash2016_CT = gpd.read_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","NN_SFCrash_SFChamp_2016_CT_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2016_CT.fillna(dfSFCrash2016_CT.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
dfSFCrash2016_CT = add_column(dfSFCrash2016_CT)
dfSFCrash2016_CT["ACCIDENT_YEAR"] = 2016
dfSFCrash2016_CT= dfSFCrash2016_CT.loc[dfSFCrash2016_CT["D2NL"]<10,:]
dfSFCrash2016_cat_CT =  label_df_by_road_category(dfSFCrash2016_CT.copy(),"FT")
dfSFCrash2016_cat_CT = dfSFCrash2016_cat_CT.to_crs(3857)

#remember to rename DISTANCE variable (as this is no longer the actual distance (in miles), given that feature is split-up)
def add_length_columns(_df, length_crs=None):
    """Return ``_df`` with ``Length_meters`` and ``Length_miles`` columns appended.

    Lengths are planar geometry lengths, so they are expressed in the units of the
    CRS they are measured in; the column names assume that unit is the metre.

    Parameters
    ----------
    _df : GeoDataFrame whose geometry lengths are wanted. For any other type no
        length columns are added and an unchanged copy is returned.
    length_crs : optional CRS (anything ``GeoSeries.to_crs`` accepts). When given,
        the geometry is reprojected to it for the measurement only; the geometry
        stored in the returned frame is left untouched. When ``None`` (the default)
        lengths are measured in the frame's current CRS.

    Returns
    -------
    A new frame: ``_df`` with the two length columns concatenated on the right.

    Raises
    ------
    ValueError from geopandas if ``length_crs`` is given but ``_df`` has no CRS set.
    """
    # NOTE(review): callers in this notebook pass frames in EPSG:3857 (Web Mercator),
    # whose planar lengths are stretched away from the equator. Pass a local projected
    # CRS via `length_crs` if ground distances are needed - confirm with the analysis owner.
    d = {}
    if isinstance(_df, gpd.GeoDataFrame):
        geometry = _df.geometry if length_crs is None else _df.geometry.to_crs(length_crs)
        d["Length_meters"] = geometry.length
        d["Length_miles"] = d["Length_meters"] * 0.000621371  # metres -> miles

    return pd.concat([_df, pd.DataFrame(d, index=_df.index)], axis=1)

# Rename the network's DISTANCE column to DISTANCE_MILES, then append geometry-based
# Length_meters / Length_miles columns to each year's categorised network.
dfSFRdNtwrk2010_cat_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2010_cat_CT = add_length_columns(dfSFRdNtwrk2010_cat_CT.copy())
dfSFRdNtwrk2016_cat_CT.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2016_cat_CT = add_length_columns(dfSFRdNtwrk2016_cat_CT.copy())

In [16]:
# aggregate the SFChamp network files by TAZ by category
def reqd_colmns(_df):# road network
    """Return a copy of ``_df`` in which every expected volume column exists.

    Any of the expected columns that is absent is appended on the right, filled
    with 0. Columns already present are left as they are, and the input frame is
    not modified.
    """
    frame = _df.copy()
    expected = [
        'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1',
        'V11_1', 'V12_1', "V13_1", 'V14_1', 'V15_1', "V16_1", 'V17_1', 'V18_1', "V19_1",
        'OOS', 'PUDO', 'Tot_Vol', "TNC_Tot_Vol",
        'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
    ]
    # A dict keeps one entry per name, so the repeated 'BUSVOL_AM' adds a single column.
    zero_fill = {name: 0 for name in expected if name not in frame.columns}
    return pd.concat([frame, pd.DataFrame(zero_fill, index=frame.index)], axis=1)

def get_req_fields(_df):
    """Return ``_df`` with derived volume, VMT, speed, PUDO and OOS columns appended.

    Reads the columns ``V13_1``, ``OOS``, ``Tot_Vol``, ``Length_meters``, ``TIME_1``
    and ``PUDO``. The input frame is not modified.

    Derived columns, in the order they are appended:

    * For each of ``Tot_TNC_Vol`` (V13_1 + OOS), ``Tot_Non_TNC_Vol`` (Tot_Vol minus
      that sum), ``Tot_VMT``, ``Tot_TNC_VMT`` and ``Tot_Non_TNC_VMT`` (the matching
      volume times Length_meters times 0.000621371): the value itself, ``_mil``
      (divided by 1,000,000), ``_yr`` (times 365), ``_mil_yr``, and ``log_`` versions
      of the value, ``_mil`` and ``_mil_yr`` computed as ``log(x + 1)``.
    * ``Congested_Speed`` (length in miles / TIME_1 * 60) and an identical
      ``Congested_Speed_yr``.
    * ``Tot_Vol_yr``, ``Tot_Vol_mil``, ``Tot_Vol_mil_yr`` and their ``log_`` columns.
    * For ``PUDO`` and ``OOS``: ``_yr``, ``_thousands``, ``_thousands_yr``, ``_mil``,
      ``_mil_yr`` and the ``log_`` version of the raw column and of each of those.
    """
    derived = {}

    def _log_plus_one(values):
        return np.log(values+1)

    def _add_measure(name, base):
        # base, per-million, per-year, per-million-per-year, then logs of three of them
        derived[name] = base
        derived[f"{name}_mil"] = base.divide(1000000)
        derived[f"{name}_yr"] = base*365
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = _log_plus_one(base)
        derived[f"log_{name}_mil"] = _log_plus_one(derived[f"{name}_mil"])
        derived[f"log_{name}_mil_yr"] = _log_plus_one(derived[f"{name}_mil_yr"])

    def _add_count(name):
        # Scaled variants of an existing column; the raw column itself is not re-added.
        raw = _df[name]
        derived[f"{name}_yr"] = (raw*365)
        derived[f"{name}_thousands"] = raw.divide(1000)
        derived[f"{name}_thousands_yr"] = derived[f"{name}_thousands"]*365
        derived[f"{name}_mil"] = raw.divide(1000000)
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = _log_plus_one(raw)
        for suffix in ("yr", "thousands", "thousands_yr", "mil", "mil_yr"):
            derived[f"log_{name}_{suffix}"] = _log_plus_one(derived[f"{name}_{suffix}"])

    tnc_volume = _df[['V13_1', "OOS"]].sum(axis=1)
    non_tnc_volume = _df["Tot_Vol"] - tnc_volume

    _add_measure("Tot_TNC_Vol", tnc_volume)
    _add_measure("Tot_Non_TNC_Vol", non_tnc_volume)
    # Multiplication order (volume * metres * factor) is kept as written so floats match.
    _add_measure("Tot_VMT", _df["Tot_Vol"]*_df["Length_meters"]*0.000621371)
    _add_measure("Tot_TNC_VMT", tnc_volume*_df["Length_meters"]*0.000621371)
    _add_measure("Tot_Non_TNC_VMT", non_tnc_volume*_df["Length_meters"]*0.000621371)

    # A TIME_1 of zero gives inf/NaN here; nothing in this function cleans that up.
    congested_speed = (((_df["Length_meters"]*0.000621371).divide(_df["TIME_1"]))*60)
    derived["Congested_Speed"] = congested_speed
    derived["Congested_Speed_yr"] = congested_speed

    total_volume = _df["Tot_Vol"]
    derived["Tot_Vol_yr"] = (total_volume*365)
    derived["Tot_Vol_mil"] = total_volume.divide(1000000)
    derived["Tot_Vol_mil_yr"] = derived["Tot_Vol_yr"].divide(1000000)
    derived["log_Tot_Vol"] = _log_plus_one(total_volume)
    for suffix in ("yr", "mil", "mil_yr"):
        derived[f"log_Tot_Vol_{suffix}"] = _log_plus_one(derived[f"Tot_Vol_{suffix}"])

    _add_count("PUDO")
    _add_count("OOS")

    return pd.concat([_df, pd.DataFrame(derived, index=_df.index)],axis=1)

def agg_network(_df):
    """Aggregate a road-network frame to one row per ``tractce10`` and add derived fields.

    Steps:
    1. Work on a copy with any missing volume columns added (``reqd_colmns``).
    2. Cast ``A``, ``B``, ``A_B``, ``FT`` and ``tractce10`` to str.
    3. Drop the link-level attribute columns listed below (all must exist).
    4. Group by ``tractce10``. Numeric columns are summed, except the weighted
       columns, which get a ``Tot_Vol``-weighted average. Object columns are joined
       with ", ", except ``tractce10``, which keeps its first value.
    5. Pass the grouped frame through ``get_req_fields``.

    The caller's frame is not modified.
    """
    _df = reqd_colmns(_df.copy())
    for key_col in ("A", "B", "A_B", "FT", "tractce10"):
        _df[key_col] = _df[key_col].astype(str)

    def weighted_by_volume(values):
        # Weights are looked up by row label in the frame being grouped.
        return np.ma.average(values, weights = _df.loc[values.index, "Tot_Vol"])

    def join_non_empty(values):
        # Aggregating rows based on one column with ", ".join
        return ', '.join([item for item in values if item])

    volume_weighted = ["SPEED", "TIME", "CSPD_1", 'VDT_1', 'VHT_1', 'VC_1']
    keep_first = ['tractce10']
    # Same labels as before with repeated entries removed; the resulting drop is unchanged.
    link_only_cols = [
        'A', 'B', "USE", 'PER_RISE', 'ONEWAY', "TOLL", 'PROJ', 'ACTION', 'AB', 'peak',
        'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3',
        'TOLLEA_DA', 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3',
        'TOLLEV_DA', 'TOLLEV_SR2', 'TOLLEV_SR3',
        'DTA_EDIT_F', 'TOLLTIME', 'PHASE',
        'AMBUSSAVE', 'MDBUSSAVE', 'PMBUSSAVE', 'EVBUSSAVE', 'EABUSSAVE', 'SPDC', 'CAPC',
        'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
        'STREETNAME', 'TYPE', 'MTYPE', 'TSIN',
        'VALUETOLL_', 'PASSTHRU', 'BUSTPS_AM', 'BUSTPS_OP', 'BUSTPS_PM', 'TSVA', 'BIKE_CLASS',
        'TIMESEED', "A_B", "AT",
    ]
    _df = _df.drop(link_only_cols, axis=1)

    # Columns that are neither numeric nor object dtype get no entry in the spec.
    agg_spec = {}
    for col in _df.select_dtypes(np.number).columns:
        agg_spec[col] = weighted_by_volume if col in volume_weighted else "sum"
    for col in _df.select_dtypes(object).columns:
        agg_spec[col] = "first" if col in keep_first else join_non_empty

    by_tract = _df.groupby(['tractce10'], as_index=False).aggregate(agg_spec).copy()

    return get_req_fields(by_tract)

def agg_crash(_df):
    """Aggregate crash records to one row per ``tractce10``.

    The record-level columns listed below are dropped from ``_df`` in place (all must
    exist), and ``FT``, ``join_tractce10`` and ``tractce10`` are cast to str on the
    same frame, so callers should pass a copy if they need the original.

    Within each tract, numeric columns are summed. Object columns keep their first
    value when they are one of ``FT``, ``join_tractce10`` or ``tractce10``, and are
    joined with ", " otherwise.

    Returns the grouped frame.
    """
    record_level_cols = [
        'CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID',
        'REPORTING_DISTRICT', 'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC',
        'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS',
        'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION',
        'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT',
        'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE',
        'RAMP_INTERSECTION', 'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY', 'PARTY_COUNT',
        'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION',
        'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION',
        'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE',
        'CHP_ROAD_TYPE', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT',
        'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY', 'ALCOHOL_INVOLVED', 'STWD_VEHTYPE_AT_FAULT',
        'CHP_VEHTYPE_AT_FAULT', 'PRIMARY_RAMP', 'SECONDARY_RAMP', 'LATITUDE', 'LONGITUDE',
        'COUNTY', 'CITY', 'POINT_X', 'POINT_Y', 'PRIMARY_RD_3', 'SECONDARY_RD_3',
    ]
    _df.drop(columns=record_level_cols, inplace=True)

    keep_first = ["FT", "join_tractce10", "tractce10"]
    for key_col in keep_first:
        _df[key_col] = _df[key_col].astype(str)

    def join_non_empty(values):
        # Aggregating rows based on one column with ", ".join
        return ', '.join([item for item in values if item])

    agg_spec = {col: "sum" for col in _df.select_dtypes(np.number).columns}
    for col in _df.select_dtypes(object).columns:
        agg_spec[col] = "first" if col in keep_first else join_non_empty

    by_tract = _df.groupby(['tractce10'], as_index=False).aggregate(agg_spec).copy()
    return by_tract

def perform_merge(_dfrdntwrk,_dfcrash,_dfSFTAZ,crash_yr):
    """Build one row per census tract combining network and crash aggregates for a year.

    Parameters
    ----------
    _dfrdntwrk : road-network frame, aggregated here with ``agg_network``.
    _dfcrash : crash frame, aggregated here with ``agg_crash``.
    _dfSFTAZ : tract polygons with a ``tractce10`` column; a copy is used.
    crash_yr : year written to the ``ACCIDENT_YEAR`` and ``Crash_Year`` columns.

    Returns
    -------
    The tract frame left-joined with the network/crash aggregates. inf values are
    replaced, missing numeric values are filled with 0 and other missing values
    with '', and ``tractce10`` is cast to int.
    """
    network_by_tract = agg_network(_dfrdntwrk)
    crash_by_tract = agg_crash(_dfcrash)
    crash_by_tract["ACCIDENT_YEAR"] = crash_yr

    # Both sides carry tractce10 / FT / category, so the overlap is suffixed _x / _y;
    # the network-side tract id becomes the join key for the next merge.
    network_crash = pd.merge(
        network_by_tract, crash_by_tract,
        left_on="tractce10", right_on="join_tractce10", how="left",
    ).rename(columns={"tractce10_x": "tractce10"})

    tracts = _dfSFTAZ.copy()
    tracts["tractce10"] = tracts["tractce10"].astype(str)

    tract_attr_cols = ['statefp10', 'mtfcc10', 'name10', 'intptlat10', 'awater10', 'namelsad10',
                       'funcstat10', 'aland10', 'geoid10', 'intptlon10', 'countyfp10']
    merged = tracts.merge(network_crash, left_on="tractce10", right_on="tractce10", how="left")
    merged = merged.drop(columns=tract_attr_cols)
    merged = merged.replace([np.inf, -np.inf], np.nan)

    # Fill NaN with 0 in numeric columns and with '' in every other column
    fill_values = {col: 0 for col in merged.select_dtypes(np.number).columns}
    fill_values.update({col: '' for col in merged.select_dtypes(exclude=np.number).columns})
    merged = merged.fillna(fill_values)

    merge_leftovers = ['category_x', 'category_y', 'tractce10_y', 'FT_y', 'join_tractce10', 'FT_x']
    merged = merged.drop(columns=merge_leftovers)
    merged["tractce10"] = merged["tractce10"].astype("int")
    merged["ACCIDENT_YEAR"] = crash_yr
    merged["Crash_Year"] = crash_yr

    return merged

In [17]:
def add_column(df, cols=None):
    """Return ``df`` with 2016-vs-2010 difference and relative-change columns appended.

    For every base name ``c`` in ``cols`` the frame must contain ``c_2010`` and
    ``c_2016``. Two columns are added per base name:

    * ``c_diff``       = ``c_2016 - c_2010``
    * ``c_pct_change`` = ``c_diff / c_2010`` (a fraction, not multiplied by 100;
      a zero 2010 value yields inf or NaN, which is left for the caller to handle)

    Parameters
    ----------
    df : wide frame holding the ``_2010`` / ``_2016`` suffixed columns.
    cols : optional iterable of base column names. When ``None`` (the default) the
        full list of network and crash measures below is used.

    Returns
    -------
    A new frame with the extra columns concatenated on the right; ``df`` is not modified.

    Raises
    ------
    KeyError if a required ``_2010`` or ``_2016`` column is missing.

    Note: this definition replaces the earlier ``add_column`` helper from the
    crash-loading cell once this cell has run.
    """
    if cols is None:
        # NOTE(review): 'log_PUDO' and 'log_OOS' are not in this list although
        # 'log_Tot_Vol' is - confirm whether that omission is intentional.
        cols = [
            # network attributes and per-class volumes
            'CAP', 'SPEED', 'TIME', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
            'V_1', 'TIME_1', 'VC_1', 'CSPD_1', 'VDT_1', 'VHT_1',
            'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'VT_1',
            'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1', 'V11T_1', 'V12T_1',
            'Tot_CAP', 'OOS', 'PUDO', 'Tot_Vol', 'TNC_Tot_Vol',
            'V13_1', 'V14_1', 'V15_1', 'V16_1', 'V17_1', 'V18_1', 'V19_1',
            # derived volume / VMT measures
            'Tot_TNC_Vol', 'Tot_TNC_Vol_mil', 'Tot_TNC_Vol_yr', 'Tot_TNC_Vol_mil_yr',
            'log_Tot_TNC_Vol', 'log_Tot_TNC_Vol_mil', 'log_Tot_TNC_Vol_mil_yr',
            'Tot_Non_TNC_Vol', 'Tot_Non_TNC_Vol_mil', 'Tot_Non_TNC_Vol_yr', 'Tot_Non_TNC_Vol_mil_yr',
            'log_Tot_Non_TNC_Vol', 'log_Tot_Non_TNC_Vol_mil', 'log_Tot_Non_TNC_Vol_mil_yr',
            'Tot_VMT', 'Tot_VMT_mil', 'Tot_VMT_yr', 'Tot_VMT_mil_yr',
            'log_Tot_VMT', 'log_Tot_VMT_mil', 'log_Tot_VMT_mil_yr',
            'Tot_TNC_VMT', 'Tot_TNC_VMT_mil', 'Tot_TNC_VMT_yr', 'Tot_TNC_VMT_mil_yr',
            'log_Tot_TNC_VMT', 'log_Tot_TNC_VMT_mil', 'log_Tot_TNC_VMT_mil_yr',
            'Tot_Non_TNC_VMT', 'Tot_Non_TNC_VMT_mil', 'Tot_Non_TNC_VMT_yr', 'Tot_Non_TNC_VMT_mil_yr',
            'log_Tot_Non_TNC_VMT', 'log_Tot_Non_TNC_VMT_mil', 'log_Tot_Non_TNC_VMT_mil_yr',
            'Congested_Speed', 'Congested_Speed_yr',
            'Tot_Vol_yr', 'Tot_Vol_mil', 'Tot_Vol_mil_yr',
            'log_Tot_Vol', 'log_Tot_Vol_yr', 'log_Tot_Vol_mil', 'log_Tot_Vol_mil_yr',
            'PUDO_yr', 'PUDO_thousands', 'PUDO_thousands_yr', 'PUDO_mil', 'PUDO_mil_yr',
            'log_PUDO_yr', 'log_PUDO_thousands', 'log_PUDO_thousands_yr', 'log_PUDO_mil', 'log_PUDO_mil_yr',
            'OOS_yr', 'OOS_thousands', 'OOS_thousands_yr', 'OOS_mil', 'OOS_mil_yr',
            'log_OOS_yr', 'log_OOS_thousands', 'log_OOS_thousands_yr', 'log_OOS_mil', 'log_OOS_mil_yr',
            # crash counts
            'NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN',
            'COUNT_PED_KILLED', 'COUNT_PED_INJURED', 'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED',
            'COUNT_MC_KILLED', 'COUNT_MC_INJURED', 'Total_Crash', 'COUNT_Fatal',
            'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', 'COUNT_PDO', "COUNT_Fatal_and_Injury",
            'V13T_1', 'V14T_1', 'V15T_1', 'V16T_1', 'V17T_1', 'V18T_1', 'V19T_1',
        ]
    d = {}
    for col in cols:
        baseline = df[f'{col}_2010']
        diff = df[f'{col}_2016'] - baseline
        d[f'{col}_diff'] = diff
        d[f'{col}_pct_change'] = diff.divide(baseline)
    return pd.concat([df, pd.DataFrame(d, index=df.index)],axis=1)

def create_pct_change_files(_dfmerged,_dfSFTAZ,cat):
    """Write the 2010-vs-2016 comparison files for one road category.

    Parameters
    ----------
    _dfmerged : attribute table containing both years, told apart by ``ACCIDENT_YEAR``.
    _dfSFTAZ : tract polygons; its ``tractce10`` column is rewritten in place
        (cast to int, then str) before the final join.
    cat : category label used in the output file names.

    Files written under ``<folder_path>/Feb162022``:
    * ``SF_joined_cat_{cat}.csv`` - the two years side by side, with ``_2010`` /
      ``_2016`` suffixes, inner-joined on tract id and columns sorted by name.
    * ``SF_joined_cat_{cat}_diff_pct_chnge.csv`` - the same table plus the columns
      added by ``add_column``, with inf and NaN replaced by 0.
    * ``SF_joined_cat_{cat}_diff_pct_chnge_CT_PCS.geojson`` - the tract polygons
      left-joined with the second CSV after it has been read back from disk.

    Returns nothing.
    """
    out_dir = BASE_DIR.parent.joinpath(folder_path, "Feb162022")
    diff_csv_path = out_dir.joinpath(f"SF_joined_cat_{cat}_diff_pct_chnge.csv")

    # One suffixed copy of the table per year.
    per_year = {
        year: _dfmerged[_dfmerged["ACCIDENT_YEAR"]==year].add_suffix(f"_{year}").copy()
        for year in (2010, 2016)
    }

    side_by_side = pd.merge(per_year[2010], per_year[2016],
                            left_on="tractce10_2010", right_on="tractce10_2016", how="inner")
    side_by_side.rename(columns={"tractce10_2010": "tractce10"}, inplace=True)
    side_by_side["tractce10"] = side_by_side["tractce10"].astype(str)
    side_by_side = side_by_side.sort_index(axis=1)
    side_by_side.to_csv(out_dir.joinpath(f"SF_joined_cat_{cat}.csv"))

    with_changes = add_column(side_by_side)
    with_changes.replace([np.inf, -np.inf], np.nan, inplace=True)
    with_changes.fillna(0, inplace=True)
    with_changes.sort_index(axis=1).to_csv(diff_csv_path)

    # The table that gets joined to the polygons is the one read back from the CSV.
    reloaded = pd.read_csv(diff_csv_path)

    _dfSFTAZ["tractce10"] = _dfSFTAZ["tractce10"].astype(int).astype(str)
    reloaded["tractce10"] = reloaded["tractce10"].astype(str)

    tracts_with_changes = _dfSFTAZ.merge(reloaded, left_on="tractce10", right_on="tractce10", how="left")
    tracts_with_changes.to_file(out_dir.joinpath(f"SF_joined_cat_{cat}_diff_pct_chnge_CT_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")


In [18]:
def perform_manipulation_by_category(_dfrdntwrk2010,_dfrdntwrk2016,_dfcrash2010,_dfcrash2016,_dfSF_CT,cat):
    """Run the per-category pipeline for both years and write its output files.

    Parameters
    ----------
    _dfrdntwrk2010, _dfrdntwrk2016 : road-network frames for the category.
    _dfcrash2010, _dfcrash2016 : crash frames for the category.
    _dfSF_CT : census-tract polygons.
    cat : category label used in the output file names.

    Side effects
    ------------
    Writes ``SFmerged_CT_cat_{cat}_PCS.geojson`` and ``SFmerged_CT_cat_{cat}_PCS.csv``
    (the latter without the geometry column) under ``<folder_path>/Feb162022``,
    then calls ``create_pct_change_files`` on the attribute table.

    Returns
    -------
    The merged GeoDataFrame holding both years stacked. Previously the function
    returned nothing even though its caller binds the result.
    """
    dfSF2010 = perform_merge(_dfrdntwrk2010,_dfcrash2010,_dfSF_CT,2010)
    dfSF2016 = perform_merge(_dfrdntwrk2016,_dfcrash2016,_dfSF_CT,2016)
    dfSFmerged = gpd.GeoDataFrame(pd.concat([dfSF2010,dfSF2016],ignore_index=True),crs=dfSF2010.crs)
    dfSFmerged = dfSFmerged.set_crs(3857)

    out_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022")
    dfSFmerged.to_file(out_dir.joinpath(f"SFmerged_CT_cat_{cat}_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")

    # Attribute table without geometry, shared by the CSV export and the comparison step.
    attributes = dfSFmerged.loc[:,~dfSFmerged.columns.isin(["geometry"])]
    attributes.to_csv(out_dir.joinpath(f"SFmerged_CT_cat_{cat}_PCS.csv"))
    create_pct_change_files(attributes,_dfSF_CT,cat)
    return dfSFmerged

# Run the pipeline once per road category (1, 2, 3). Each call receives fresh copies of the
# category-filtered network and crash frames and of the tract layer.
# dfMerged is rebound on every iteration to the function's return value, so only the
# last category's value survives the loop.
for cat in [1,2,3]:
    dfMerged = perform_manipulation_by_category(dfSFRdNtwrk2010_cat_CT.loc[dfSFRdNtwrk2010_cat_CT["category"]==cat,:].copy(),
                                                dfSFRdNtwrk2016_cat_CT.loc[dfSFRdNtwrk2016_cat_CT["category"]==cat,:].copy(),
                                                dfSFCrash2010_cat_CT.loc[dfSFCrash2010_cat_CT["category"]==cat,:].copy(),
                                                dfSFCrash2016_cat_CT.loc[dfSFCrash2016_cat_CT["category"]==cat,:].copy(),
                                                SF_CT.copy(),
                                                cat)

  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  pd.Int64Index,
  pd.Int64Index,
