In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from pyproj import CRS
import pathlib
from pathlib import Path
from shapely import wkt
from tqdm import tqdm
# Working directory of the running notebook
BASE_DIR = Path.cwd()
# Export folder sits beside the working directory; create it (and any parents) when absent
folder_path = BASE_DIR.parent / "Exported_Files" / "census_tract" / "agg_network"
folder_path.mkdir(parents=True, exist_ok=True)

In [2]:
"""
This notebook is for undertaking TAZ level of analysis.
It assumes that the QGIS-level analysis has been completed and that the following files are available
1. The projection is EPSG: 3857
2. RdNetwork files are named in the SFChamp_201x_PCS.geojson format
3. RoadCrash files are named in the SFCrash_201x_PCS.geojson format
"""

'\nThis notebook is for undertaking TAZ level of analysis.\nIt assumes that QGIS level of analysis is been completed and following files are made available\n1. The projection is EPSG: 3857\n2. RdNetwork files are named in the SFChamp_201x_PCS.geojson format\n3. RoadCrash files are named in the SFCrash_201x_PCS.geojson format\n'

In [3]:
# Fetch the TAZ polygons, the TAZ-split road networks and the TAZ-tagged crashes
taz_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
SF_TAZ = gpd.read_file(taz_dir.joinpath("SF_TAZ_PCS.geojson"), crs = "EPSG:3857")

# road network
dfSFRdNtwrk2010_taz = gpd.read_file(taz_dir.joinpath("SFChamp_2010_TAZ_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2016_taz = gpd.read_file(taz_dir.joinpath("SFChamp_2016_TAZ_PCS.geojson"), crs = "EPSG:3857")

# road crashes
dfSFCrash2010_taz = gpd.read_file(taz_dir.joinpath("SFCrash_2010_TAZ_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2016_taz = gpd.read_file(taz_dir.joinpath("SFCrash_2016_TAZ_PCS.geojson"), crs = "EPSG:3857")

# Same clean-up for every layer: float gaps become 0.0, object gaps become 'NULL',
# and the TAZ key is stored as text so later merges compare like with like.
for taz_layer in (dfSFRdNtwrk2010_taz, dfSFRdNtwrk2016_taz, dfSFCrash2010_taz, dfSFCrash2016_taz):
    taz_layer.fillna(taz_layer.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)
    taz_layer["TAZ"] = taz_layer["TAZ"].astype(str)

# Stamp each crash layer with the year it represents
dfSFCrash2010_taz["ACCIDENT_YEAR"] = 2010
dfSFCrash2016_taz["ACCIDENT_YEAR"] = 2016


# DISTANCE is the pre-split link length in miles, so it no longer describes the
# split features: keep it under a new name and measure the actual geometry instead.
# geometry.length is in CRS units (the notebook states EPSG:3857, i.e. meters);
# 0.000621371 converts meters to miles.
dfSFRdNtwrk2010_taz.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2010_taz["Length_meters"] = dfSFRdNtwrk2010_taz.geometry.length
dfSFRdNtwrk2010_taz["Length_miles"] = dfSFRdNtwrk2010_taz["Length_meters"]* 0.000621371
dfSFRdNtwrk2016_taz.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2016_taz["Length_meters"] = dfSFRdNtwrk2016_taz.geometry.length
# Fixed: this line used to assign the 2016 lengths to the 2010 frame, overwriting the
# 2010 miles and leaving the 2016 frame without a Length_miles column.
dfSFRdNtwrk2016_taz["Length_miles"] = dfSFRdNtwrk2016_taz["Length_meters"] * 0.000621371

  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)


In [4]:
# aggregate the SFChamp network files by TAZ
def reqd_colmns(_df):
    """Return a copy of `_df` in which every expected volume column exists.

    Columns from the expected list that are absent from `_df` are appended,
    in list order, filled with 0. Existing columns are left untouched.
    """
    frame = _df.copy()
    expected = [
        'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1',
        'V11_1', 'V12_1', "V13_1", 'V14_1', 'V15_1', "V16_1", 'V17_1', 'V18_1', "V19_1",
        'OOS', 'PUDO', 'Tot_Vol', "TNC_Tot_Vol",
        'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
    ]
    # Dict keys collapse the repeated 'BUSVOL_AM' entry, so it is added at most once
    zero_fill = {name: 0 for name in expected if name not in frame.columns}
    return pd.concat([frame, pd.DataFrame(zero_fill, index=frame.index)], axis=1)
def get_required_fields(_df):
    """Append derived volume, VMT, speed, PUDO and OOS measures to `_df`.

    Reads the columns 'V13_1', 'OOS', 'Tot_Vol', 'Length_meters', 'TIME_1' and
    'PUDO'. TNC volume is 'V13_1' + 'OOS'; non-TNC volume is 'Tot_Vol' minus that.
    Each measure is also emitted scaled (per million / per thousand), multiplied
    by 365 ("_yr") and as log(x + 1).

    Returns a new frame: `_df` followed by the derived columns.
    """
    meters_to_miles = 0.000621371
    derived = {}

    def scaled_family(name, daily):
        # base value, per-million, x365, x365 per-million, then log(x + 1) of three of them
        derived[name] = daily
        derived[f"{name}_mil"] = daily.divide(1000000)
        derived[f"{name}_yr"] = daily*365
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = np.log(daily+1)
        derived[f"log_{name}_mil"] = np.log(derived[f"{name}_mil"]+1)
        derived[f"log_{name}_mil_yr"] = np.log(derived[f"{name}_mil_yr"]+1)

    def count_family(name):
        # x365, per-thousand and per-million variants of an existing column, plus their log(x + 1)
        daily = _df[name]
        derived[f"{name}_yr"] = daily*365
        derived[f"{name}_thousands"] = daily.divide(1000)
        derived[f"{name}_thousands_yr"] = derived[f"{name}_thousands"]*365
        derived[f"{name}_mil"] = daily.divide(1000000)
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        for suffix in ("yr", "thousands", "thousands_yr", "mil", "mil_yr"):
            derived[f"log_{name}_{suffix}"] = np.log(derived[f"{name}_{suffix}"]+1)

    tnc_daily = _df[['V13_1', "OOS"]].sum(axis=1)
    non_tnc_daily = _df["Tot_Vol"] - tnc_daily
    length_m = _df["Length_meters"]

    scaled_family("Tot_TNC_Vol", tnc_daily)
    scaled_family("Tot_Non_TNC_Vol", non_tnc_daily)
    # VMT = volume x length (meters) x meters-to-miles factor
    scaled_family("Tot_VMT", _df["Tot_Vol"]*length_m*meters_to_miles)
    scaled_family("Tot_TNC_VMT", tnc_daily*length_m*meters_to_miles)
    scaled_family("Tot_Non_TNC_VMT", non_tnc_daily*length_m*meters_to_miles)

    # miles / TIME_1 * 60 (presumably TIME_1 is in minutes, giving mph -- TODO confirm)
    derived["Congested_Speed"] = ((length_m*meters_to_miles).divide(_df["TIME_1"]))*60
    derived["Congested_Speed_yr"] = derived["Congested_Speed"]

    total_daily = _df["Tot_Vol"]
    derived["Tot_Vol_yr"] = total_daily*365
    derived["Tot_Vol_mil"] = total_daily.divide(1000000)
    derived["Tot_Vol_mil_yr"] = derived["Tot_Vol_yr"].divide(1000000)
    derived["log_Tot_Vol"] = np.log(total_daily+1)
    derived["log_Tot_Vol_yr"] = np.log(derived["Tot_Vol_yr"]+1)
    derived["log_Tot_Vol_mil"] = np.log(derived["Tot_Vol_mil"]+1)
    derived["log_Tot_Vol_mil_yr"] = np.log(derived["Tot_Vol_mil_yr"]+1)

    count_family("PUDO")
    count_family("OOS")

    return pd.concat([_df, pd.DataFrame(derived, index=_df.index)], axis=1)
def agg_network_CT(_df):
    """Aggregate a TAZ-split road-network frame to one row per TAZ.

    Steps visible in this function:
      1. `reqd_colmns` adds any missing volume columns (zero-filled) to a copy of `_df`.
      2. 'A', 'B', 'A_B' and 'FT' are cast to str; the columns in `drop_col` are removed.
      3. Rows are grouped by 'TAZ'. Numeric columns named in `wt_col` get a
         'Tot_Vol'-weighted average, every other numeric column is summed, 'TAZ'
         keeps its first value and the remaining object columns are joined with ", ".
      4. `get_required_fields` appends the derived measures to the grouped frame.

    `_df.drop` raises KeyError if a column named in `drop_col` is absent.
    """
    _df = reqd_colmns(_df.copy())
    _df ["A"] = _df["A"].astype(str)
    _df["B"] = _df["B"].astype(str)
    _df ["A_B"] = _df["A_B"].astype(str)
    #     _df["AT"] = _df["AT"].astype(str)
    _df["FT"] = _df["FT"].astype(str)
    #     _df["category"] = _df["category"].astype(str)
    # Aggregation helpers; the grouping itself (below) is by 'TAZ'.
    # Weighted mean of a group's values, weights taken from the same rows' 'Tot_Vol'
    # (the lambda looks the weights up in the enclosing `_df` via the group's index).
    wt_avg = lambda x: np.ma.average(x, weights = _df.loc[x.index, "Tot_Vol"])
    # Join the non-empty items of a group with ", "
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        # Build the {column: aggregator} mapping handed to groupby().aggregate().
        # `wt_col` and `str_col` are assigned further down in the enclosing function;
        # they are resolved when agg_func is called, which happens after those assignments.
        d = {}
        for col in df.select_dtypes(np.number).columns:
            if col in wt_col:
                d[col] = wt_avg
            else:
                d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            if col in str_col:
                d[col] = "first"
            else:
                d[col] = concat_agg
        return d

    # Numeric columns aggregated with the volume-weighted average
    wt_col = ["SPEED","TIME","CSPD_1", 'VDT_1', 'VHT_1','VC_1',]
    # NOTE(review): `sum_col` is not referenced anywhere in this function; agg_func sums
    # every numeric column that is not in `wt_col`.
    sum_col = [ "CAP", "DISTANCE_MILES",'Length_meters', 'Length_miles',
                'V_1',
                'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1',
                'V11_1', 'V12_1',"V13_1",'V14_1', 'V15_1',"V16_1",'V17_1', 'V18_1',"V19_1",
                'VT_1',
                'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1',
                'V11T_1','V12T_1',"V13T_1",'V14T_1', 'V15T_1',"V16T_1",'V17T_1', 'V18T_1',"V19T_1",
                'OOS', 'PUDO',
                'Tot_Vol',"TNC_Tot_Vol",
                "TIME_1",'TIMESEED',
                "Tot_CAP",
                'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',]

    # Object columns that keep the first value of each group
    str_col = ['TAZ']
    # NOTE(review): `concat_col` is not referenced either; every object column outside
    # `str_col` is concatenated.
    concat_col = ["A_B","FT","AT"]
    # Columns removed before grouping (some names appear more than once in the list)
    drop_col = [ 'A', 'B',"USE",'PER_RISE', 'ONEWAY',"TOLL",'PROJ', 'ACTION', 'AB','peak',
                 'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 'TOLLEA_DA',
                 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA', 'TOLLEV_SR2', 'TOLLEV_SR3',
                 'DTA_EDIT_F', 'TOLLTIME', 'PHASE', 'AMBUSSAVE', 'MDBUSSAVE', 'PMBUSSAVE', 'EVBUSSAVE', 'EABUSSAVE', 'SPDC', 'CAPC',
                 'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
                 'STREETNAME', 'TYPE', 'MTYPE','TSIN',
                 'VALUETOLL_', 'PASSTHRU', 'BUSTPS_AM', 'BUSTPS_OP', 'BUSTPS_PM', 'TSVA', 'BIKE_CLASS', 'PER_RISE', 'ONEWAY',
                 'TOLL',
                 'TIMESEED',
                 ]

    _df.drop(drop_col,axis=1,inplace=True)
    # Columns that are neither numeric nor object dtype receive no aggregator in agg_func.
    df = _df.groupby(['TAZ'],as_index=False).aggregate(agg_func(_df.copy())).copy()

    return get_required_fields(df)

# Aggregate each year's network to TAZ level and stamp the year it represents
dfSFRdNtwrk2010_TAZ_agg = agg_network_CT(dfSFRdNtwrk2010_taz.copy())
dfSFRdNtwrk2016_TAZ_agg = agg_network_CT(dfSFRdNtwrk2016_taz.copy())
dfSFRdNtwrk2010_TAZ_agg["Crash_Year"] = 2010
dfSFRdNtwrk2016_TAZ_agg["Crash_Year"] = 2016

# Persist both aggregates next to the input layers
network_agg_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
dfSFRdNtwrk2010_TAZ_agg.to_csv(network_agg_dir.joinpath("SFChamp_2010_TAZ_agg_PCS.csv"))
dfSFRdNtwrk2016_TAZ_agg.to_csv(network_agg_dir.joinpath("SFChamp_2016_TAZ_agg_PCS.csv"))

In [5]:
# aggregate crashes along the CensusTract
def agg_crash_CT(_df):
    """Aggregate crash records to one row per TAZ.

    The columns named in ``drop_fld`` are discarded. Within each TAZ every
    remaining numeric column is summed, ``TAZ`` keeps its first value and any
    other object column is joined with ", " (empty items skipped).

    Parameters
    ----------
    _df : DataFrame
        Crash records holding a ``TAZ`` column and every column listed in
        ``drop_fld``. The frame is left unmodified (the previous implementation
        dropped the columns from the caller's object in place).

    Returns
    -------
    DataFrame
        One row per distinct ``TAZ`` value.

    Raises
    ------
    KeyError
        If a column listed in ``drop_fld`` is missing from ``_df``.
    """
    drop_fld = ['CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION', 'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT', 'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION', 'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY','PARTY_COUNT', 'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION', 'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE', 'CHP_ROAD_TYPE', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY', 'ALCOHOL_INVOLVED', 'STWD_VEHTYPE_AT_FAULT', 'CHP_VEHTYPE_AT_FAULT', 'PRIMARY_RAMP', 'SECONDARY_RAMP', 'LATITUDE', 'LONGITUDE', 'COUNTY', 'CITY', 'POINT_X', 'POINT_Y', 'PRIMARY_RD_3', 'SECONDARY_RD_3', ]
    # Non-mutating drop: the caller's frame keeps all of its columns
    crashes = _df.drop(columns=drop_fld)
    # Object columns that keep the first value of each group
    str_col = ['TAZ']
    # Join the non-empty items of a group with ", "
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func(df):
        # {column: aggregator}: sum for numeric columns, first/concat for object columns
        d = {}
        for col in df.select_dtypes(np.number).columns:
            d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            if col in str_col:
                d[col] = "first"
            else:
                d[col] = concat_agg
        return d

    return crashes.groupby(['TAZ'],as_index=False).aggregate(agg_func(crashes)).copy()
# Aggregate each year's crashes to TAZ level and stamp the year on the result
dfSFCrash2010_TAZ_agg = agg_crash_CT(dfSFCrash2010_taz.copy())
dfSFCrash2016_TAZ_agg = agg_crash_CT(dfSFCrash2016_taz.copy())
dfSFCrash2010_TAZ_agg["ACCIDENT_YEAR"]=2010
dfSFCrash2016_TAZ_agg["ACCIDENT_YEAR"]=2016

# Persist both aggregates next to the input layers
crash_agg_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
dfSFCrash2010_TAZ_agg.to_csv(crash_agg_dir.joinpath("SFCrash_2010_TAZ_agg_PCS.csv"))
dfSFCrash2016_TAZ_agg.to_csv(crash_agg_dir.joinpath("SFCrash_2016_TAZ_agg_PCS.csv"))

In [6]:
# Left-join the crash totals onto the network aggregates, TAZ by TAZ
# (a TAZ without crash rows keeps its network values and gets NaN crash columns)
dfSF_RdNtwrk_Crash_2010 = dfSFRdNtwrk2010_TAZ_agg.copy().merge(
    dfSFCrash2010_TAZ_agg.copy(), left_on="TAZ", right_on="TAZ", how="left"
)
dfSF_RdNtwrk_Crash_2016 = dfSFRdNtwrk2016_TAZ_agg.copy().merge(
    dfSFCrash2016_TAZ_agg.copy(), left_on="TAZ", right_on="TAZ", how="left"
)

In [7]:
# Attach each year's aggregates to the TAZ polygons, then export CSV and GeoJSON
taz_out_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
SF_TAZ = gpd.read_file(taz_out_dir.joinpath("SF_TAZ_PCS.geojson"), crs = "EPSG:3857")
SF_TAZ["TAZ"]=SF_TAZ["TAZ"].astype(str)
# TAZ attributes that are not carried into the outputs
drop_cols = ['SUPERD', 'TAZ1454', 'AREALAND', 'AREAWATR', 'WATRACRE', 'SD', 'COUNTY', 'SFBLKGRP', 'X_COORD', 'Y_COORD',
             'Nhood', 'nhood_num', 'lg_nhood', 'lg_nhd_num', 'SQ_MILE', 'DIST20',]

dfSF2010 = SF_TAZ.merge(dfSF_RdNtwrk_Crash_2010,on="TAZ",how="left").drop(columns=drop_cols)
# TAZs without a match come out of the left join with NaN years, so set them explicitly
dfSF2010[["Crash_Year","ACCIDENT_YEAR"]]=2010
dfSF2016 = SF_TAZ.merge(dfSF_RdNtwrk_Crash_2016,on="TAZ",how="left").drop(columns=drop_cols)
dfSF2016[["Crash_Year","ACCIDENT_YEAR"]]=2016

# Attribute tables (everything except the geometry column) as CSV
for year_layer, csv_name in ((dfSF2010, "SF2010_TAZ_agg_PCS.csv"), (dfSF2016, "SF2016_TAZ_agg_PCS.csv")):
    attribute_mask = ~year_layer.columns.isin(["geometry"])
    year_layer.loc[:, attribute_mask].to_csv(taz_out_dir.joinpath(csv_name))

# Full layers as GeoJSON
for year_layer, geojson_name in ((dfSF2010, "SF_2010_TAZ_PCS.geojson"), (dfSF2016, "SF_2016_TAZ_PCS.geojson")):
    year_layer.to_file(taz_out_dir.joinpath(geojson_name), driver='GeoJSON', crs = "EPSG:3857")

# Stack both years into one layer
dfSFmerged = gpd.GeoDataFrame(pd.concat([dfSF2010,dfSF2016],ignore_index=True),crs=dfSF2010.crs)
# Raises KeyError when no "Unnamed: 0" column is present
dfSFmerged.drop(columns=["Unnamed: 0"],inplace=True)
# NOTE(review): this GeoJSON is written one level above the TAZ folder, unlike the other outputs -- confirm
dfSFmerged.to_file(BASE_DIR.parent.joinpath(folder_path,"Feb162022","SFmerged_TAZ_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")
merged_attribute_mask = ~dfSFmerged.columns.isin(["geometry"])
dfSFmerged.loc[:, merged_attribute_mask].to_csv(taz_out_dir.joinpath("SFmerged_TAZ_PCS.csv"))

  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,


In [8]:
# Re-read the stacked 2010/2016 table and place the two years side by side, one row per TAZ
gdSFdb = pd.read_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ","SFmerged_TAZ_PCS.csv"))
rows_2010 = gdSFdb["ACCIDENT_YEAR"]==2010
rows_2016 = gdSFdb["ACCIDENT_YEAR"]==2016
gdSFdb2010 = gdSFdb[rows_2010].add_suffix("_2010").copy()
gdSFdb2016 = gdSFdb[rows_2016].add_suffix("_2016").copy()

# Inner join on the TAZ key; the 2010 key column becomes the plain "TAZ" column
dfSFJoined = (
    gdSFdb2010
    .merge(gdSFdb2016, left_on="TAZ_2010", right_on="TAZ_2016", how="inner")
    .rename(columns={"TAZ_2010":"TAZ"})
)
dfSFJoined["TAZ"]=dfSFJoined["TAZ"].astype(str)
# Columns in alphabetical order for the export
dfSFJoined = dfSFJoined.sort_index(axis=1)
dfSFJoined.to_csv(BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ","SF_TAZ_joined.csv"))


def add_column(df):
    """Append 2016-minus-2010 change columns for every tracked measure.

    For each base name in the list below, `df` must hold `<name>_2016` and
    `<name>_2010`. Two columns are appended per name: `<name>_diff`
    (2016 - 2010) and `<name>_pct_change` (diff / 2010 value).
    """
    cols = [
        'CAP', 'SPEED', 'TIME',
        'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
        'V_1', 'TIME_1', 'VC_1', 'CSPD_1', 'VDT_1', 'VHT_1',
        'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
        'VT_1',
        'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1', 'V11T_1', 'V12T_1',
        'Tot_CAP', 'OOS', 'PUDO', 'Tot_Vol', 'TNC_Tot_Vol',
        'V13_1', 'V14_1', 'V15_1', 'V16_1', 'V17_1', 'V18_1', 'V19_1',
        'Tot_TNC_Vol', 'Tot_TNC_Vol_mil', 'Tot_TNC_Vol_yr', 'Tot_TNC_Vol_mil_yr',
        'log_Tot_TNC_Vol', 'log_Tot_TNC_Vol_mil', 'log_Tot_TNC_Vol_mil_yr',
        'Tot_Non_TNC_Vol', 'Tot_Non_TNC_Vol_mil', 'Tot_Non_TNC_Vol_yr', 'Tot_Non_TNC_Vol_mil_yr',
        'log_Tot_Non_TNC_Vol', 'log_Tot_Non_TNC_Vol_mil', 'log_Tot_Non_TNC_Vol_mil_yr',
        'Tot_VMT', 'Tot_VMT_mil', 'Tot_VMT_yr', 'Tot_VMT_mil_yr',
        'log_Tot_VMT', 'log_Tot_VMT_mil', 'log_Tot_VMT_mil_yr',
        'Tot_TNC_VMT', 'Tot_TNC_VMT_mil', 'Tot_TNC_VMT_yr', 'Tot_TNC_VMT_mil_yr',
        'log_Tot_TNC_VMT', 'log_Tot_TNC_VMT_mil', 'log_Tot_TNC_VMT_mil_yr',
        'Tot_Non_TNC_VMT', 'Tot_Non_TNC_VMT_mil', 'Tot_Non_TNC_VMT_yr', 'Tot_Non_TNC_VMT_mil_yr',
        'log_Tot_Non_TNC_VMT', 'log_Tot_Non_TNC_VMT_mil', 'log_Tot_Non_TNC_VMT_mil_yr',
        'Congested_Speed', 'Congested_Speed_yr',
        'Tot_Vol_yr', 'Tot_Vol_mil', 'Tot_Vol_mil_yr',
        'log_Tot_Vol', 'log_Tot_Vol_yr', 'log_Tot_Vol_mil', 'log_Tot_Vol_mil_yr',
        'PUDO_yr', 'PUDO_thousands', 'PUDO_thousands_yr', 'PUDO_mil', 'PUDO_mil_yr',
        'log_PUDO_yr', 'log_PUDO_thousands', 'log_PUDO_thousands_yr', 'log_PUDO_mil', 'log_PUDO_mil_yr',
        'OOS_yr', 'OOS_thousands', 'OOS_thousands_yr', 'OOS_mil', 'OOS_mil_yr',
        'log_OOS_yr', 'log_OOS_thousands', 'log_OOS_thousands_yr', 'log_OOS_mil', 'log_OOS_mil_yr',
        'NUMBER_KILLED', 'NUMBER_INJURED',
        'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN',
        'COUNT_PED_KILLED', 'COUNT_PED_INJURED',
        'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED',
        'COUNT_MC_KILLED', 'COUNT_MC_INJURED',
        'Total_Crash', 'COUNT_Fatal',
        'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', 'COUNT_PDO',
        'V13T_1', 'V14T_1', 'V15T_1', 'V16T_1', 'V17T_1', 'V18T_1', 'V19T_1',
    ]
    changes = {}
    for base in cols:
        after = df[f"{base}_2016"]
        before = df[f"{base}_2010"]
        delta = after - before
        changes[f"{base}_diff"] = delta
        # A zero 2010 value yields inf or NaN here; nothing in this function cleans that up
        changes[f"{base}_pct_change"] = delta.divide(before)
    return pd.concat([df, pd.DataFrame(changes, index=df.index)], axis=1)
# Append the change columns, then neutralise divide-by-zero artefacts:
# +/-inf is turned into NaN and every NaN is finally written as 0
dfSFJoined = add_column(dfSFJoined)
dfSFJoined = dfSFJoined.replace([np.inf, -np.inf], np.nan).fillna(0)
joined_changes_csv = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ","SF_TAZ_joined_diff_pct_chnge.csv")
dfSFJoined.sort_index(axis=1).to_csv(joined_changes_csv)

In [9]:
# Re-read the change table, attach it to the TAZ polygons and write the result as GeoJSON
joined_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
dfSFJoined = pd.read_csv(joined_dir.joinpath("SF_TAZ_joined_diff_pct_chnge.csv"))
dfSFJoined["TAZ"]=dfSFJoined["TAZ"].astype(str)
SF_TAZ = gpd.read_file(joined_dir.joinpath("SF_TAZ_PCS.geojson"), crs = "EPSG:3857")
SF_TAZ["TAZ"]=SF_TAZ["TAZ"].astype(str)
# Both keys are text at this point, so the left join matches TAZ ids as strings
taz_with_changes = SF_TAZ.merge(dfSFJoined,left_on="TAZ",right_on="TAZ",how="left")
taz_with_changes.to_file(joined_dir.joinpath("SF_TAZ_joined_diff_pct_chnge_TAZ_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")

  pd.Int64Index,


In [10]:
"""
Perform analysis based on Facility Type
FT = 1:Fwy-Fwy Connector, 2:Freeway, 3:Expressway, 4:Collector, 5:Ramp, 6:Centroid Connector, 7:Major Arterial, 8:Not used,
9:Alley, 10:Metered Ramp, 11:Local, 12:Minor Arterial,13:Bike-Only!, 14:Not used, 15:Super Arterial,
Segregate the road network into three categories
1. Category 1 = contains FT = [1, 2, 3, 5]
2. Category 2 = contains FT = [7,12,13,15]
3. Category 3 = contains FT = [4,9,11]
"""

'\nPerform analysis based on Facility Type\nFT = 1:Fwy-Fwy Connector, 2:Freeway, 3:Expressway, 4:Collector, 5:Ramp, 6:Centroid Connector, 7:Major Arterial, 8:Not used,\n9:Alley, 10:Metered Ramp, 11:Local, 12:Minor Arterial,13:Bike-Only!, 14:Not used, 15:Super Arterial,\nSegregate the road network into three categories\n1. Category 1 = contains FT = [1, 2, 3, 5]\n2. Category 2 = contains FT = [7,12,,13,15]\n3. Category 3 = contains FT = [4,9,11]\n'

In [11]:
# modify the dataframe to create a new column "CATEGORY" using "FacilityType"
def label_df_by_road_category(_df,fld):
    """Add a 'category' column derived from the facility-type codes in `_df[fld]`.

    Rows start at category 0 and are relabelled 1, 2 or 3 when their code is in
    the matching list below. `_df` is modified in place and also returned.
    """
    # NOTE(review): FT 13 (Bike-Only) is labelled category 1 here, while the notebook's
    # category description lists it under category 2 -- confirm which is intended.
    codes_by_category = (
        (1, [1, 2, 3, 5, 13]),
        (2, [7, 12, 15]),
        (3, [4, 9, 11]),
    )
    _df["category"] = 0
    for label, facility_codes in codes_by_category:
        _df.loc[_df[fld].isin(facility_codes), 'category'] = label
    return _df

# Fetch the TAZ polygons plus the network and nearest-network ("NN_") crash layers
cat_taz_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")
SF_TAZ = gpd.read_file(cat_taz_dir.joinpath("SF_TAZ_PCS.geojson"), crs = "EPSG:3857")

# road network
dfSFRdNtwrk2010_taz = gpd.read_file(cat_taz_dir.joinpath("SFChamp_2010_TAZ_PCS.geojson"), crs = "EPSG:3857")
dfSFRdNtwrk2016_taz = gpd.read_file(cat_taz_dir.joinpath("SFChamp_2016_TAZ_PCS.geojson"), crs = "EPSG:3857")

# road crashes
dfSFCrash2010_taz = gpd.read_file(cat_taz_dir.joinpath("NN_SFCrash_2010_TAZ_PCS.geojson"), crs = "EPSG:3857")
dfSFCrash2016_taz = gpd.read_file(cat_taz_dir.joinpath("NN_SFCrash_2016_TAZ_PCS.geojson"), crs = "EPSG:3857")

# Same gap filling for every layer: float gaps become 0.0, object gaps become 'NULL'
for cat_layer in (dfSFRdNtwrk2010_taz, dfSFRdNtwrk2016_taz, dfSFCrash2010_taz, dfSFCrash2016_taz):
    cat_layer.fillna(cat_layer.dtypes.replace({'float64': 0.0, 'O': 'NULL'}),downcast='infer', inplace=True)

# Stamp each crash layer with the year it represents
dfSFCrash2010_taz["ACCIDENT_YEAR"] = 2010
dfSFCrash2016_taz["ACCIDENT_YEAR"] = 2016

# Labelled copies: the "category" column is derived from each layer's FT code
dfSFRdNtwrk2010_cat_taz =  label_df_by_road_category(dfSFRdNtwrk2010_taz.copy(),"FT")
dfSFRdNtwrk2016_cat_taz =  label_df_by_road_category(dfSFRdNtwrk2016_taz.copy(),"FT")
dfSFCrash2010_cat_taz =  label_df_by_road_category(dfSFCrash2010_taz.copy(),"FT")
dfSFCrash2016_cat_taz =  label_df_by_road_category(dfSFCrash2016_taz.copy(),"FT")


#remember to rename DISTANCE variable (as this is no longer the actual distance (in miles), given that feature is split-up)
def add_length_columns(_df):
    """Return `_df` with 'Length_meters' and 'Length_miles' appended.

    The lengths come from `_df.geometry.length` and are only added when `_df`
    is a GeoDataFrame; any other input is returned (as a new object) without
    extra columns. Miles are the geometry length times 0.000621371.
    """
    extra = {}
    if isinstance(_df,gpd.GeoDataFrame):
        # geometry.length is in CRS units -- presumably meters (EPSG:3857); confirm
        measured = _df.geometry.length
        extra["Length_meters"] = measured
        extra["Length_miles"] = measured* 0.000621371
    length_frame = pd.DataFrame(extra, index=_df.index)
    return pd.concat([_df, length_frame],axis=1)

# DISTANCE describes the unsplit link, so keep it under a new name before measuring the split features
dfSFRdNtwrk2010_cat_taz.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
dfSFRdNtwrk2016_cat_taz.rename(columns={"DISTANCE":"DISTANCE_MILES"},inplace=True)
# Lengths measured from the geometry of each split feature
dfSFRdNtwrk2010_cat_taz = add_length_columns(dfSFRdNtwrk2010_cat_taz.copy())
dfSFRdNtwrk2016_cat_taz = add_length_columns(dfSFRdNtwrk2016_cat_taz.copy())

In [12]:
# aggregate the SFChamp network files by TAZ by category
def reqd_colmns(_df):# road network
    """Return a copy of `_df` that is guaranteed to hold every expected volume column.

    Any name from the expected list that `_df` lacks is appended (in list order)
    as a column of zeros; columns already present are not changed.
    """
    frame = _df.copy()
    expected = (
        ['V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1']
        + ['V11_1', 'V12_1', "V13_1", 'V14_1', 'V15_1', "V16_1", 'V17_1', 'V18_1', "V19_1"]
        + ['OOS', 'PUDO', 'Tot_Vol', "TNC_Tot_Vol"]
        + ['BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA']
    )
    absent = [name for name in expected if name not in frame.columns]
    # dict.fromkeys keeps first-seen order and collapses the repeated 'BUSVOL_AM'
    zero_fill = dict.fromkeys(absent, 0)
    return pd.concat([frame, pd.DataFrame(zero_fill, index=frame.index)],axis=1)
def get_req_fields(_df):
    """Append derived volume, VMT, speed, PUDO and OOS measures to `_df`.

    Reads the columns 'V13_1', 'OOS', 'Tot_Vol', 'Length_meters', 'TIME_1' and
    'PUDO'. TNC volume is 'V13_1' + 'OOS'; non-TNC volume is 'Tot_Vol' minus that.
    Each measure is also emitted scaled (per million / per thousand), multiplied
    by 365 ("_yr") and as log(x + 1).

    Returns a new frame: `_df` followed by the derived columns.
    """
    meters_to_miles = 0.000621371
    derived = {}

    def scaled_family(name, daily):
        # base value, per-million, x365, x365 per-million, then log(x + 1) of three of them
        derived[name] = daily
        derived[f"{name}_mil"] = daily.divide(1000000)
        derived[f"{name}_yr"] = daily*365
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        derived[f"log_{name}"] = np.log(daily+1)
        derived[f"log_{name}_mil"] = np.log(derived[f"{name}_mil"]+1)
        derived[f"log_{name}_mil_yr"] = np.log(derived[f"{name}_mil_yr"]+1)

    def count_family(name):
        # x365, per-thousand and per-million variants of an existing column, plus their log(x + 1)
        daily = _df[name]
        derived[f"{name}_yr"] = daily*365
        derived[f"{name}_thousands"] = daily.divide(1000)
        derived[f"{name}_thousands_yr"] = derived[f"{name}_thousands"]*365
        derived[f"{name}_mil"] = daily.divide(1000000)
        derived[f"{name}_mil_yr"] = derived[f"{name}_yr"].divide(1000000)
        for suffix in ("yr", "thousands", "thousands_yr", "mil", "mil_yr"):
            derived[f"log_{name}_{suffix}"] = np.log(derived[f"{name}_{suffix}"]+1)

    tnc_daily = _df[['V13_1', "OOS"]].sum(axis=1)
    non_tnc_daily = _df["Tot_Vol"] - tnc_daily
    length_m = _df["Length_meters"]

    scaled_family("Tot_TNC_Vol", tnc_daily)
    scaled_family("Tot_Non_TNC_Vol", non_tnc_daily)
    # VMT = volume x length (meters) x meters-to-miles factor
    scaled_family("Tot_VMT", _df["Tot_Vol"]*length_m*meters_to_miles)
    scaled_family("Tot_TNC_VMT", tnc_daily*length_m*meters_to_miles)
    scaled_family("Tot_Non_TNC_VMT", non_tnc_daily*length_m*meters_to_miles)

    # miles / TIME_1 * 60 (presumably TIME_1 is in minutes, giving mph -- TODO confirm)
    derived["Congested_Speed"] = ((length_m*meters_to_miles).divide(_df["TIME_1"]))*60
    derived["Congested_Speed_yr"] = derived["Congested_Speed"]

    total_daily = _df["Tot_Vol"]
    derived["Tot_Vol_yr"] = total_daily*365
    derived["Tot_Vol_mil"] = total_daily.divide(1000000)
    derived["Tot_Vol_mil_yr"] = derived["Tot_Vol_yr"].divide(1000000)
    derived["log_Tot_Vol"] = np.log(total_daily+1)
    derived["log_Tot_Vol_yr"] = np.log(derived["Tot_Vol_yr"]+1)
    derived["log_Tot_Vol_mil"] = np.log(derived["Tot_Vol_mil"]+1)
    derived["log_Tot_Vol_mil_yr"] = np.log(derived["Tot_Vol_mil_yr"]+1)

    count_family("PUDO")
    count_family("OOS")

    return pd.concat([_df, pd.DataFrame(derived, index=_df.index)], axis=1)
def agg_network(_df):
    """Aggregate a TAZ-split road-network frame to one row per TAZ.

    Steps visible in this function:
      1. `reqd_colmns` adds any missing volume columns (zero-filled) to a copy of `_df`.
      2. 'A', 'B', 'A_B', 'FT' and 'TAZ' are cast to str; the columns in `drop_col`
         (which include 'A', 'B', 'A_B' and 'AT') are removed.
      3. Rows are grouped by 'TAZ'. Numeric columns named in `wt_col` get a
         'Tot_Vol'-weighted average, every other numeric column is summed, 'TAZ'
         keeps its first value and the remaining object columns are joined with ", ".
      4. `get_req_fields` appends the derived measures to the grouped frame.

    `_df.drop` raises KeyError if a column named in `drop_col` is absent.
    """
    _df = reqd_colmns(_df.copy())
    _df ["A"] = _df["A"].astype(str)
    _df["B"] = _df["B"].astype(str)
    _df ["A_B"] = _df["A_B"].astype(str)
    _df["FT"] = _df["FT"].astype(str)
    _df["TAZ"] = _df["TAZ"].astype(str)
    # Aggregation helpers; the grouping itself (below) is by 'TAZ'.
    # Weighted mean of a group's values, weights taken from the same rows' 'Tot_Vol'
    # (the lambda looks the weights up in the enclosing `_df` via the group's index).
    wt_avg = lambda x: np.ma.average(x, weights = _df.loc[x.index, "Tot_Vol"])
    # Join the non-empty items of a group with ", "
    concat_agg = lambda ar: ', '.join([item for item in ar if item])

    def agg_func_rdntwrk(df):
        # Build the {column: aggregator} mapping handed to groupby().aggregate().
        # `wt_col` and `str_col` are assigned further down in the enclosing function;
        # they are resolved when this helper is called, which happens after those assignments.
        d = {}
        for col in df.select_dtypes(np.number).columns:
            if col in wt_col:
                d[col] = wt_avg
            else:
                d[col] = "sum"
        for col in df.select_dtypes(object).columns:
            if col in str_col:
                d[col] = "first"
            else:
                d[col] = concat_agg
        return d

    # Numeric columns aggregated with the volume-weighted average
    wt_col = ["SPEED","TIME","CSPD_1", 'VDT_1', 'VHT_1','VC_1',]
    # NOTE(review): `sum_col` is not referenced anywhere in this function; the helper sums
    # every numeric column that is not in `wt_col`.
    sum_col = [ "CAP", "DISTANCE_MILES",'Length_meters', 'Length_miles',
                'V_1',
                'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1',
                'V11_1', 'V12_1',"V13_1",'V14_1', 'V15_1',"V16_1",'V17_1', 'V18_1',"V19_1",
                'VT_1',
                'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1',
                'V11T_1','V12T_1',"V13T_1",'V14T_1', 'V15T_1',"V16T_1",'V17T_1', 'V18T_1',"V19T_1",
                'OOS', 'PUDO',
                'Tot_Vol',"TNC_Tot_Vol",
                "TIME_1",'TIMESEED',
                "Tot_CAP",
                'BUSVOL_AM', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',]

    # Object columns that keep the first value of each group
    str_col = ['TAZ']
    # NOTE(review): `concat_col` is not referenced either; every object column outside
    # `str_col` is concatenated.
    concat_col = ["FT",]
    # Columns removed before grouping (some names appear more than once in the list)
    drop_col = [ 'A', 'B',"USE",'PER_RISE', 'ONEWAY',"TOLL",'PROJ', 'ACTION', 'AB','peak',
                 'TOLLAM_DA', 'TOLLAM_SR2', 'TOLLAM_SR3', 'TOLLPM_DA', 'TOLLPM_SR2', 'TOLLPM_SR3', 'TOLLEA_DA',
                 'TOLLEA_SR2', 'TOLLEA_SR3', 'TOLLMD_DA', 'TOLLMD_SR2', 'TOLLMD_SR3', 'TOLLEV_DA', 'TOLLEV_SR2', 'TOLLEV_SR3',
                 'DTA_EDIT_F', 'TOLLTIME', 'PHASE', 'AMBUSSAVE', 'MDBUSSAVE', 'PMBUSSAVE', 'EVBUSSAVE', 'EABUSSAVE', 'SPDC', 'CAPC',
                 'LANE_AM', 'LANE_OP', 'LANE_PM', 'BUSLANE_AM', 'BUSLANE_OP', 'BUSLANE_PM',
                 'STREETNAME', 'TYPE', 'MTYPE','TSIN',
                 'VALUETOLL_', 'PASSTHRU', 'BUSTPS_AM', 'BUSTPS_OP', 'BUSTPS_PM', 'TSVA', 'BIKE_CLASS', 'PER_RISE', 'ONEWAY',
                 'TOLL',
                 'TIMESEED',"A_B","AT"
                 ]

    _df.drop(drop_col,axis=1,inplace=True)
    # Columns that are neither numeric nor object dtype receive no aggregator in the helper.
    df = _df.groupby(['TAZ'],as_index=False).aggregate(agg_func_rdntwrk(_df.copy())).copy()

    return get_req_fields(df)
def agg_crash(_df):
    """Collapse crash records to one row per TAZ.

    The caller's frame is left untouched: all dropping and casting happens on
    a new frame, so the function can safely be called more than once on the
    same input.

    Parameters
    ----------
    _df : pandas.DataFrame
        Crash records. Must contain every column listed in ``drop_fld`` below
        as well as ``FT``, ``join_TAZ`` and ``TAZ``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``TAZ`` value (cast to ``str``), where
        * every numeric column is summed,
        * ``FT``, ``join_TAZ`` and ``TAZ`` keep the first value of the group,
        * every other object column is the ``", "``-joined list of its
          non-empty values.

    Raises
    ------
    KeyError
        If a column that must be dropped, or one of the key columns, is absent.
    """
    # Record-level attributes that are not carried into the TAZ aggregate.
    drop_fld = [
        'CASE_ID', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID',
        'REPORTING_DISTRICT', 'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC',
        'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS',
        'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION',
        'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT',
        'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE',
        'RAMP_INTERSECTION', 'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY', 'PARTY_COUNT',
        'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION',
        'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION',
        'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE',
        'CHP_ROAD_TYPE', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT',
        'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY', 'ALCOHOL_INVOLVED', 'STWD_VEHTYPE_AT_FAULT',
        'CHP_VEHTYPE_AT_FAULT', 'PRIMARY_RAMP', 'SECONDARY_RAMP', 'LATITUDE', 'LONGITUDE',
        'COUNTY', 'CITY', 'POINT_X', 'POINT_Y', 'PRIMARY_RD_3', 'SECONDARY_RD_3',
    ]
    # Key-like columns: cast to str and represented by their first value per group.
    str_col = ["FT", "join_TAZ", "TAZ"]

    # drop() without inplace returns a new frame, so the caller's data is preserved.
    crash = _df.drop(columns=drop_fld)
    for key_col in str_col:
        crash[key_col] = crash[key_col].astype(str)

    def concat_agg(values):
        # Join the group's values with ", ", skipping empty/falsy entries.
        return ', '.join([item for item in values if item])

    # Per-column aggregation: sum numbers, first() for keys, join other text.
    agg_spec = {}
    for col in crash.select_dtypes(np.number).columns:
        agg_spec[col] = "sum"
    for col in crash.select_dtypes(object).columns:
        agg_spec[col] = "first" if col in str_col else concat_agg

    return crash.groupby(['TAZ'], as_index=False).aggregate(agg_spec)

In [13]:
def perform_merge(_dfrdntwrk,_dfcrash,_dfSFTAZ,crash_yr):
    """Build one TAZ-level table combining zones, road network and crashes for a year.

    Steps, in order:
    1. aggregate the road network and the crash records with ``agg_network``
       and ``agg_crash``;
    2. left-join crashes onto the network (network ``TAZ`` = crash ``join_TAZ``);
    3. left-join that result onto a copy of the TAZ zone layer;
    4. drop unused zone attributes and merge leftovers, replace +/-inf with
       NaN, then fill NaN with 0 (numeric columns) or '' (all other columns);
    5. cast ``TAZ`` to int, keep rows with 0 <= TAZ <= 981, and stamp the year.

    Parameters
    ----------
    _dfrdntwrk : road-network frame handed to ``agg_network``.
    _dfcrash : crash frame handed to ``agg_crash``.
    _dfSFTAZ : TAZ zone layer; it is copied before being modified.
    crash_yr : value written to ``ACCIDENT_YEAR`` and ``Crash_Year``.

    Returns
    -------
    The merged TAZ-level frame described above.
    """
    network_by_taz = agg_network(_dfrdntwrk)
    crash_by_taz = agg_crash(_dfcrash)
    crash_by_taz["ACCIDENT_YEAR"] = crash_yr

    # Both inputs carry a TAZ column, so the join suffixes them; keep the
    # network-side one under the plain name.
    network_crash = pd.merge(
        network_by_taz,
        crash_by_taz,
        how="left",
        left_on="TAZ",
        right_on="join_TAZ",
    )
    network_crash = network_crash.rename(columns={"TAZ_x": "TAZ"})

    zones = _dfSFTAZ.copy()
    zones["TAZ"] = zones["TAZ"].astype(str)
    merged = zones.merge(network_crash, on="TAZ", how="left")

    # Zone-layer attributes that are not needed downstream.
    zone_attrs_to_drop = [
        'SUPERD', 'TAZ1454', 'AREALAND', 'AREAWATR', 'WATRACRE', 'SD', 'COUNTY',
        'SFBLKGRP', 'X_COORD', 'Y_COORD', 'Nhood', 'nhood_num', 'lg_nhood',
        'lg_nhd_num', 'SQ_MILE', 'DIST20',
    ]
    merged = merged.drop(columns=zone_attrs_to_drop)
    merged = merged.replace([np.inf, -np.inf], np.nan)

    # Missing values: 0 for numeric columns, empty string for everything else.
    fill_values = {name: 0 for name in merged.select_dtypes(np.number).columns}
    fill_values.update(
        {name: '' for name in merged.select_dtypes(exclude=np.number).columns}
    )
    merged = merged.fillna(fill_values)

    # Leftover suffixed / helper columns produced by the joins.
    merge_leftovers = ['category_x', 'Unnamed: 0', 'category_y', 'TAZ_y', 'FT_y', 'join_TAZ', 'FT_x']
    merged = merged.drop(columns=merge_leftovers)

    merged["TAZ"] = merged["TAZ"].astype("int")
    # NOTE(review): 0..981 is presumably the valid San Francisco TAZ id range — confirm.
    in_range = merged["TAZ"].between(0, 981)
    merged = merged.loc[in_range]
    merged["ACCIDENT_YEAR"] = crash_yr
    merged["Crash_Year"] = crash_yr

    return merged

In [17]:
def add_column(df, cols=None):
    """Append 2010 -> 2016 difference and relative-change columns.

    For every measure ``c`` in ``cols`` the frame must hold ``c_2010`` and
    ``c_2016``; two columns are appended:

    * ``c_diff``       = ``c_2016 - c_2010``
    * ``c_pct_change`` = ``c_diff / c_2010`` (a fraction, not multiplied by 100)

    Division by a zero 2010 value is not special-cased here, so the result can
    contain inf/NaN for the caller to clean up.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with ``_2010`` / ``_2016`` suffixed columns. Not modified.
    cols : iterable of str, optional
        Measure names (without year suffix). Defaults to the notebook's
        standard list of network and crash measures.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new columns, ordered ``c_diff, c_pct_change``
        per measure in the order given by ``cols``.

    Raises
    ------
    KeyError
        If any required year-suffixed column is absent; the message lists all
        missing names.
    """
    if cols is None:
        cols = [
            'CAP', 'SPEED', 'TIME', 'BUSVOL_AM', 'BUSVOL_MD', 'BUSVOL_PM', 'BUSVOL_EV', 'BUSVOL_EA',
            'V_1', 'TIME_1', 'VC_1', 'CSPD_1', 'VDT_1', 'VHT_1',
            'V1_1', 'V2_1', 'V3_1', 'V4_1', 'V5_1', 'V6_1', 'V7_1', 'V8_1', 'V9_1', 'V10_1', 'V11_1', 'V12_1',
            'VT_1',
            'V1T_1', 'V2T_1', 'V3T_1', 'V4T_1', 'V5T_1', 'V6T_1', 'V7T_1', 'V8T_1', 'V9T_1', 'V10T_1', 'V11T_1', 'V12T_1',
            'Tot_CAP', 'OOS', 'PUDO', 'Tot_Vol', 'TNC_Tot_Vol',
            'V13_1', 'V14_1', 'V15_1', 'V16_1', 'V17_1', 'V18_1', 'V19_1',
            'Tot_TNC_Vol', 'Tot_TNC_Vol_mil', 'Tot_TNC_Vol_yr', 'Tot_TNC_Vol_mil_yr',
            'log_Tot_TNC_Vol', 'log_Tot_TNC_Vol_mil', 'log_Tot_TNC_Vol_mil_yr',
            'Tot_Non_TNC_Vol', 'Tot_Non_TNC_Vol_mil', 'Tot_Non_TNC_Vol_yr', 'Tot_Non_TNC_Vol_mil_yr',
            'log_Tot_Non_TNC_Vol', 'log_Tot_Non_TNC_Vol_mil', 'log_Tot_Non_TNC_Vol_mil_yr',
            'Tot_VMT', 'Tot_VMT_mil', 'Tot_VMT_yr', 'Tot_VMT_mil_yr',
            'log_Tot_VMT', 'log_Tot_VMT_mil', 'log_Tot_VMT_mil_yr',
            'Tot_TNC_VMT', 'Tot_TNC_VMT_mil', 'Tot_TNC_VMT_yr', 'Tot_TNC_VMT_mil_yr',
            'log_Tot_TNC_VMT', 'log_Tot_TNC_VMT_mil', 'log_Tot_TNC_VMT_mil_yr',
            'Tot_Non_TNC_VMT', 'Tot_Non_TNC_VMT_mil', 'Tot_Non_TNC_VMT_yr', 'Tot_Non_TNC_VMT_mil_yr',
            'log_Tot_Non_TNC_VMT', 'log_Tot_Non_TNC_VMT_mil', 'log_Tot_Non_TNC_VMT_mil_yr',
            'Congested_Speed', 'Congested_Speed_yr',
            'Tot_Vol_yr', 'Tot_Vol_mil', 'Tot_Vol_mil_yr',
            'log_Tot_Vol', 'log_Tot_Vol_yr', 'log_Tot_Vol_mil', 'log_Tot_Vol_mil_yr',
            'PUDO_yr', 'PUDO_thousands', 'PUDO_thousands_yr', 'PUDO_mil', 'PUDO_mil_yr',
            'log_PUDO_yr', 'log_PUDO_thousands', 'log_PUDO_thousands_yr', 'log_PUDO_mil', 'log_PUDO_mil_yr',
            'OOS_yr', 'OOS_thousands', 'OOS_thousands_yr', 'OOS_mil', 'OOS_mil_yr',
            'log_OOS_yr', 'log_OOS_thousands', 'log_OOS_thousands_yr', 'log_OOS_mil', 'log_OOS_mil_yr',
            'NUMBER_KILLED', 'NUMBER_INJURED', 'COUNT_SEVERE_INJ', 'COUNT_VISIBLE_INJ', 'COUNT_COMPLAINT_PAIN',
            'COUNT_PED_KILLED', 'COUNT_PED_INJURED', 'COUNT_BICYCLIST_KILLED', 'COUNT_BICYCLIST_INJURED',
            'COUNT_MC_KILLED', 'COUNT_MC_INJURED', 'Total_Crash', 'COUNT_Fatal',
            'COUNT_Severe_Injury', 'COUNT_Visible_Injury', 'COUNT_Other_Injury', 'COUNT_PDO',
            'V13T_1', 'V14T_1', 'V15T_1', 'V16T_1', 'V17T_1', 'V18T_1', 'V19T_1',
        ]
    cols = list(cols)

    # Fail once, up front, with the complete list of absent inputs instead of
    # stopping at the first missing column.
    missing = [
        name
        for col in cols
        for name in (f"{col}_2010", f"{col}_2016")
        if name not in df.columns
    ]
    if missing:
        raise KeyError(f"add_column: missing year-suffixed columns: {missing}")

    # Collect the new columns in a dict and concatenate once, rather than
    # inserting them into the frame one at a time.
    derived = {}
    for col in cols:
        base = df[f"{col}_2010"]
        change = df[f"{col}_2016"] - base
        derived[f"{col}_diff"] = change
        derived[f"{col}_pct_change"] = change.divide(base)
    return pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

def create_pct_change_files(_dfmerged,_dfSFTAZ,cat):
    """Write the 2010-vs-2016 comparison files for one category.

    Three files are written under ``<folder_path>/Feb162022/TAZ``:

    * ``SF_joined_cat_{cat}.csv`` - 2010 and 2016 rows joined side by side on
      TAZ, columns suffixed ``_2010`` / ``_2016`` and sorted by name;
    * ``SF_joined_cat_{cat}_diff_pct_chnge.csv`` - the same table plus the
      ``*_diff`` / ``*_pct_change`` columns from ``add_column``, with inf and
      NaN replaced by 0;
    * ``SF_joined_cat_{cat}_diff_pct_chnge_TAZ_PCS.geojson`` - the TAZ layer
      left-joined with that table.

    Parameters
    ----------
    _dfmerged : pandas.DataFrame
        Stacked per-year table with an ``ACCIDENT_YEAR`` column (2010 / 2016)
        and a ``TAZ`` column. Not modified.
    _dfSFTAZ : geopandas.GeoDataFrame
        TAZ zone layer with a ``TAZ`` column. Not modified: a copy is used so
        the TAZ cast below does not leak into the caller's frame.
    cat : value interpolated into the output file names.

    Returns
    -------
    None
    """
    out_dir = BASE_DIR.parent.joinpath(folder_path, "Feb162022", "TAZ")
    dfSFTAZ = _dfSFTAZ.copy()

    # One frame per year, every column tagged with its year.
    gdSFdb2010 = _dfmerged[_dfmerged["ACCIDENT_YEAR"]==2010].add_suffix("_2010")
    gdSFdb2016 = _dfmerged[_dfmerged["ACCIDENT_YEAR"]==2016].add_suffix("_2016")

    df_joined = pd.merge(gdSFdb2010, gdSFdb2016, left_on="TAZ_2010", right_on="TAZ_2016", how="inner")
    df_joined = df_joined.rename(columns={"TAZ_2010": "TAZ"})
    df_joined["TAZ"] = df_joined["TAZ"].astype(str)
    df_joined = df_joined.sort_index(axis=1)
    df_joined.to_csv(out_dir.joinpath(f"SF_joined_cat_{cat}.csv"))

    df_joined = add_column(df_joined)
    # pct_change divides by the 2010 value, so zero baselines give inf/NaN.
    df_joined = df_joined.replace([np.inf, -np.inf], np.nan).fillna(0)
    pct_csv = out_dir.joinpath(f"SF_joined_cat_{cat}_diff_pct_chnge.csv")
    df_joined.sort_index(axis=1).to_csv(pct_csv)

    # The table is deliberately re-read from disk: the GeoJSON below is built
    # from the CSV representation (including the written index column), so
    # skipping the round trip would change that output.
    df_joined = pd.read_csv(pct_csv)

    # Normalise both join keys to the same string form before merging.
    dfSFTAZ["TAZ"] = dfSFTAZ["TAZ"].astype(int).astype(str)
    df_joined["TAZ"] = df_joined["TAZ"].astype(str)
    dfSFTAZ.merge(df_joined, left_on="TAZ", right_on="TAZ", how="left").to_file(
        out_dir.joinpath(f"SF_joined_cat_{cat}_diff_pct_chnge_TAZ_PCS.geojson"),
        driver='GeoJSON',
        crs = "EPSG:3857",
    )

In [18]:
def perform_manipulation_by_category(_dfrdntwrk2010,_dfrdntwrk2016,_dfcrash2010,_dfcrash2016,_dfSF_TAZ,cat):
    """Run the full TAZ pipeline for one category and write its outputs.

    The 2010 and 2016 inputs are each merged with ``perform_merge``, stacked
    into one GeoDataFrame, and written as
    ``SFmerged_TAZ_cat_{cat}_PCS.geojson`` and (without the geometry column)
    ``SFmerged_TAZ_cat_{cat}_PCS.csv`` under ``<folder_path>/Feb162022/TAZ``.
    The geometry-free table is then passed to ``create_pct_change_files``.

    Parameters
    ----------
    _dfrdntwrk2010, _dfrdntwrk2016 : road-network frames for each year.
    _dfcrash2010, _dfcrash2016 : crash frames for each year.
    _dfSF_TAZ : TAZ zone layer.
    cat : category value, used in the output file names.
    """
    out_dir = BASE_DIR.parent.joinpath(folder_path,"Feb162022","TAZ")

    per_year = [
        perform_merge(_dfrdntwrk2010,_dfcrash2010,_dfSF_TAZ,2010),
        perform_merge(_dfrdntwrk2016,_dfcrash2016,_dfSF_TAZ,2016),
    ]
    stacked = pd.concat(per_year,ignore_index=True)
    # Re-wrap the stacked rows as a GeoDataFrame carrying the 2010 frame's CRS.
    dfSFmerged = gpd.GeoDataFrame(stacked,crs=per_year[0].crs)
    dfSFmerged.to_file(out_dir.joinpath(f"SFmerged_TAZ_cat_{cat}_PCS.geojson"), driver='GeoJSON', crs = "EPSG:3857")

    # Attribute table only: every column except the geometry.
    attributes = dfSFmerged.loc[:,~dfSFmerged.columns.isin(["geometry"])]
    attributes.to_csv(out_dir.joinpath(f"SFmerged_TAZ_cat_{cat}_PCS.csv"))

    create_pct_change_files(attributes,_dfSF_TAZ,cat)

# Run the pipeline once per category, each time on fresh copies of that
# category's rows so the source frames are not altered.
for cat in [1,2,3]:
    frames_for_cat = [
        frame.loc[frame["category"]==cat,:].copy()
        for frame in (
            dfSFRdNtwrk2010_cat_taz,
            dfSFRdNtwrk2016_cat_taz,
            dfSFCrash2010_cat_taz,
            dfSFCrash2016_cat_taz,
        )
    ]
    perform_manipulation_by_category(*frames_for_cat, SF_TAZ.copy(), cat)

  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  pd.Int64Index,
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  pd.Int64Index,
  pd.Int64Index,
