In [None]:
import pandas as pd
import numpy as np
from functools import reduce

# Clean Merged ArcGIS Data

In [None]:
# Load the Excel table exported from the ArcGIS spatial join
arcgis_export_path = "Processed Data/ArcGIS_SpaitialJoin_TableToExcel.xlsx"
df = pd.read_excel(arcgis_export_path)

In [None]:
# Here are the first five lines
df.head()

In [None]:
# What is the shape of the data? # rows and # cols
df.shape

In [None]:
# Here are all the column names
df.columns

In [None]:
# Create a copy of the dataframe
df1 = df.copy()

In [None]:
# Cast the tract GEOID from float64 (e.g. 3.604700e+10) to pandas' nullable Int64
# (e.g. 36005000100); the nullable dtype lets rows without a GEOID stay missing
geoid_nullable_int = df1['CT2020_GEOID'].astype('Int64')
df1['CT2020_GEOID'] = geoid_nullable_int
df1['CT2020_GEOID']

In [None]:
# Sanity check: list the crashes that still have no GEOID.
# A few dozen such records are expected; those crashes are usually on bridges or turnpikes.
location_columns = ['CRASH_DATE','CRASH_TIME','BOROUGH','ZIP_CODE','LATITUDE','LONGITUDE', 'ON_STREET_NAME','CROSS_STREET_NAME']
missing_geoid = df1['CT2020_GEOID'].isnull()
df1[missing_geoid][location_columns]

In [None]:
# Bookkeeping columns added by ArcGIS's "spatial join" operation
ArcGIS_drop_columns = ['OBJECTID', 'Join_Count', 'TARGET_FID']

# Census tract attributes to discard; only 'CT2020_GEOID' is kept from that dataset
census_tract_drop_columns = [
    'CTLabel', 'BoroCode', 'BoroName', 'CT2020', 'BoroCT2020', 'CDEligibil',
    'NTAName', 'NTA2020', 'CDTA2020', 'CDTANAME', 'Shape_Leng',
]

# Crash attributes to discard. Deliberately kept: 'CRASH_DATE', 'ZIP_CODE',
# 'NUMBER_OF_PERSONS_INJURED', 'NUMBER_OF_PERSONS_KILLED',
# 'NUMBER_OF_CYCLIST_INJURED' and 'NUMBER_OF_CYCLIST_KILLED'.
vehicle_slots = range(1, 6)  # the per-vehicle columns are numbered 1 through 5
crash_drop_columns = (
    ['CRASH_TIME', 'BOROUGH', 'LATITUDE', 'LONGITUDE', 'LOCATION',
     'ON_STREET_NAME', 'CROSS_STREET_NAME', 'OFF_STREET_NAME',
     'NUMBER_OF_PEDESTRIANS_INJURED', 'NUMBER_OF_PEDESTRIANS_KILLED',
     'NUMBER_OF_MOTORIST_INJURED', 'NUMBER_OF_MOTORIST_KILLED']
    + ['CONTRIBUTING_FACTOR_VEHICLE_' + str(slot) for slot in vehicle_slots]
    + ['COLLISION_ID']
    + ['VEHICLE_TYPE_CODE_' + str(slot) for slot in vehicle_slots]
)

all_drop_columns = ArcGIS_drop_columns + census_tract_drop_columns + crash_drop_columns
df2 = df1.drop(columns=all_drop_columns)

# Number of columns that were removed
df1.shape[1] - df2.shape[1]

In [None]:
# Here are the first five lines
df2.head()

### Create time-related variables from CRASH DATE

In [None]:
df3 = df2.copy()

In [None]:
# Add a numeric column holding the calendar year of each crash
crash_dates = pd.to_datetime(df3['CRASH_DATE'])
df3['CRASH_YEAR'] = crash_dates.dt.year

In [None]:
# Add a numeric column holding the calendar month (1-12) of each crash
crash_dates = pd.to_datetime(df3['CRASH_DATE'])
df3['CRASH_MONTH'] = crash_dates.dt.month

In [None]:
# Add a monthly Period column (not a number) identifying the year and month of each crash.
# 'M' is the documented pandas period alias for monthly frequency; a lowercase spelling
# is not a documented alias and relies on pandas normalising its case.
df3['CRASH_YEAR-MONTH'] = pd.to_datetime(df3['CRASH_DATE']).dt.to_period('M')
df3['CRASH_YEAR-MONTH']

# Merge All Census Data to the Merged ArcGIS Data

### Merge all census data

In [None]:
# Import the tab-separated ACS 5-year estimate files, one per vintage.
# The 2013-2017 vintage (ACS_2013-2017.txt) is intentionally not loaded.
acs_dir = "Raw Data/Raw Data in txt File for American Community Survey (ACS) 5-Year Estimates/"
df_acs2018 = pd.read_csv(acs_dir + "ACS_2014-2018.txt", sep='\t')
df_acs2019 = pd.read_csv(acs_dir + "ACS_2015-2019.txt", sep='\t')
df_acs2020 = pd.read_csv(acs_dir + "ACS_2016-2020.txt", sep='\t')
df_acs2021 = pd.read_csv(acs_dir + "ACS_2017-2021.txt", sep='\t')

In [None]:
def process_acs_data(dataframe, year):
    '''
    Clean one ACS table and tag its columns with the survey year.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw ACS table. It must contain 'Geo_FIPS', 'Geo_COUNTY' and the columns
        'SE_B12001_002', 'SE_B12001_003' and 'SE_A09003_001' (these three are renamed
        and then dropped, so a KeyError is raised if any is missing). Column labels
        must be strings. The input is not modified.
    year : int or str
        Survey year; it is embedded in the output column names via str(year).

    Returns
    -------
    pandas.DataFrame
        A new frame holding 'CT2020_GEOID' (nullable Int64, the join key shared with the
        census tract data) plus the retained variables, each renamed to 'ACS<year>_<name>'.
    '''
    acs = dataframe.copy()

    # Build the join key under the same name the census tract data uses
    # (census tract data: CT2020_GEOID, ACS: Geo_FIPS)
    acs['CT2020_GEOID'] = acs['Geo_FIPS'].astype('Int64')

    # Preserve the county code as 'borough' before every 'Geo_' column is removed
    acs['borough'] = acs['Geo_COUNTY']

    # Remove all columns that start with "Geo_" (geographic, non-demographic data)
    acs = acs.loc[:, ~acs.columns.str.startswith('Geo_')]

    # Give the variables of interest readable names
    acs = acs.rename(columns={
        'SE_A00001_001': 'ttl_pop',                   # Total Population
        'SE_A00002_002': 'pop_density_per_sq_mil',    # Population Density (Per Sq. Mile)
        'SE_B12001_001': 'pop_25_yr_over',            # Population 25 Years and Over
        'SE_B12001_002': 'educ_less_hs',              # ... Less than High School
        'SE_B12001_003': 'educ_hs',                   # ... High School Diploma
        'SE_B12001_004': 'educ_bs_over',              # ... Bachelor's Degree or Better
        'SE_A14006_001': 'median_household_inc',      # Median Household Income (inflation-adjusted dollars)
        'SE_A09005_001': 'workers_16_yr_over',        # Workers 16 Years and Over
        'SE_A09005_002': 'tranport_mean_car',         # ... Car, Truck, or Van
        'SE_A09005_003': 'tranport_mean_public',      # ... Public Transportation (Includes Taxicab)
        'SE_A09005_005': 'tranport_mean_bike',        # ... Bicycle
        'SE_A09003_001': 'avg_commmute_to_work_min',  # Average Commute to Work (In Min)
    })

    # Every column still starting with "SE_" was not renamed above and is not needed
    acs = acs.loc[:, ~acs.columns.str.startswith('SE_')]

    # Renamed variables that are not carried into the output
    acs = acs.drop(columns=['educ_less_hs', 'educ_hs', 'avg_commmute_to_work_min'])

    # Prefix every column except the join key with the survey year, in a single rename
    prefix = 'ACS' + str(year) + '_'
    year_tagged = {col: prefix + col for col in acs.columns if col != 'CT2020_GEOID'}
    return acs.rename(columns=year_tagged)

In [None]:
# Clean each loaded ACS vintage and tag its columns with the survey year.
# (The 2013-2017 vintage is not processed.)
(df_acs2018_processed,
 df_acs2019_processed,
 df_acs2020_processed,
 df_acs2021_processed) = [
    process_acs_data(raw_acs, acs_year)
    for raw_acs, acs_year in (
        (df_acs2018, 2018),
        (df_acs2019, 2019),
        (df_acs2020, 2020),
        (df_acs2021, 2021),
    )
]

In [None]:
# Attach every processed ACS vintage to the crash records, one join at a time, keyed on
# the tract GEOID. Inner joins keep only rows whose GEOID is matched in each ACS table.
dfs_to_merge = [df3, df_acs2018_processed, df_acs2019_processed, df_acs2020_processed, df_acs2021_processed]
df4 = dfs_to_merge[0]
for acs_frame in dfs_to_merge[1:]:
    df4 = pd.merge(df4, acs_frame, how='inner', on='CT2020_GEOID')
df4.shape

In [None]:
# Combine ACS data depending on the year
def combine_ACS_data(row, variable_name):
    '''
    Pick the ACS value whose survey year equals the crash year of ``row``.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``: ``row`` supplies 'CRASH_YEAR'
    and the year-prefixed columns 'ACS<year>_<variable_name>'.
    For example, for a crash in 2021 and variable_name='ttl_pop' the result is
    row['ACS2021_ttl_pop'].
    Crash years other than 2018, 2019, 2020 and 2021 give None.
    '''
    crash_year = row['CRASH_YEAR']
    for acs_year in (2018, 2019, 2020, 2021):
        if crash_year == acs_year:
            return row['ACS' + str(acs_year) + '_' + variable_name]
    # No ACS vintage is paired with this crash year
    return None

# For every ACS variable, collapse the year-specific columns into a single 'ACS_<variable>'
# column that holds the value matching each row's crash year
acs_variables = [
    'ttl_pop',
    'pop_density_per_sq_mil',
    'pop_25_yr_over',
    'workers_16_yr_over',
    'educ_bs_over',
    'median_household_inc',
    'tranport_mean_car',
    'tranport_mean_public',
    'tranport_mean_bike',
    'borough',
]
for acs_variable in acs_variables:
    df4['ACS_' + acs_variable] = df4.apply(combine_ACS_data, variable_name=acs_variable, axis=1)

# Spot-check a few of the combined columns next to the crash year
has_crash_year = df4['CRASH_YEAR'].notnull()
df4[has_crash_year][['CRASH_YEAR','ACS_ttl_pop','ACS_pop_density_per_sq_mil','ACS_tranport_mean_car']]

In [None]:
df4.head()

In [None]:
df4.columns

In [None]:
# Drop the year-specific ACS columns ('ACS<year>_<variable>') for the years 2018-2021;
# the list is generated year by year, with the variables in a fixed order within each year
acs_years = (2018, 2019, 2020, 2021)
year_specific_variables = [
    'ttl_pop',
    'pop_density_per_sq_mil',
    'pop_25_yr_over',
    'workers_16_yr_over',
    'educ_bs_over',
    'median_household_inc',
    'tranport_mean_car',
    'tranport_mean_public',
    'tranport_mean_bike',
    'borough',
]
drop_columns = [
    'ACS' + str(acs_year) + '_' + acs_variable
    for acs_year in acs_years
    for acs_variable in year_specific_variables
]
df5 = df4.drop(columns=drop_columns)
df5.columns

# Group by Census Tract and Crash Date

In [None]:
df6 = df5.copy()

In [None]:
# # Group by (date, census tract), then sum the death and injury counts and take the median of the ACS numbers

# # Method 1
# agg_func_math = {
#     'NUMBER_OF_PERSONS_INJURED': ['sum'],
#     'NUMBER_OF_PERSONS_KILLED': ['sum'],
#     'NUMBER_OF_CYCLIST_INJURED': ['sum'],
#     'NUMBER_OF_CYCLIST_KILLED': ['sum'],
#     'ACS_ttl_pop': ['median'],
#     'ACS_pop_density_per_sq_mil': ['median'],
#     'ACS_educ_less_hs_pct': ['median'],
#     'ACS_educ_hs_pct': ['median'],
#     'ACS_educ_bs_over_pct': ['median'],
#     'ACS_median_household_inc': ['median'],
#     'ACS_tranport_mean_car_pct': ['median'],
#     'ACS_tranport_mean_public_pct': ['median'],
#     'ACS_tranport_mean_bike_pct': ['median'],
#     'ACS_avg_commmute_to_work_min': ['median'],
#     'ACS_borough': ['median']
# }
# df7 = df6.groupby(['CRASH_DATE','CT2020_GEOID'], as_index=False).agg(agg_func_math)
# df7.columns = df7.columns.droplevel(-1)
# df7

In [None]:
# Aggregate to one row per (year-month, census tract): total the injury and death counts,
# and take the median of each ACS value within the group
crash_count_columns = [
    'NUMBER_OF_PERSONS_INJURED',
    'NUMBER_OF_PERSONS_KILLED',
    'NUMBER_OF_CYCLIST_INJURED',
    'NUMBER_OF_CYCLIST_KILLED',
]
acs_value_columns = [
    'ACS_ttl_pop',
    'ACS_pop_density_per_sq_mil',
    'ACS_pop_25_yr_over',
    'ACS_workers_16_yr_over',
    'ACS_educ_bs_over',
    'ACS_median_household_inc',
    'ACS_tranport_mean_car',
    'ACS_tranport_mean_public',
    'ACS_tranport_mean_bike',
    'ACS_borough',
]
agg_func_math = {column: ['sum'] for column in crash_count_columns}
agg_func_math.update({column: ['median'] for column in acs_value_columns})

grouped_by_month_and_tract = df6.groupby(['CRASH_YEAR-MONTH','CT2020_GEOID'], as_index=False)
df7 = grouped_by_month_and_tract.agg(agg_func_math)
# List-valued aggregations give two-level column labels; drop the last level
# (the aggregation name) so only the column names remain
df7.columns = df7.columns.droplevel(-1)
df7

In [None]:
# Rebuild the numeric crash-year column from the year-month key of the aggregated table
year_month_timestamps = df7['CRASH_YEAR-MONTH'].astype('datetime64[ns]')
df7['CRASH_YEAR'] = pd.to_datetime(year_month_timestamps).dt.year

In [None]:
# Rebuild the numeric crash-month column from the year-month key of the aggregated table
year_month_timestamps = df7['CRASH_YEAR-MONTH'].astype('datetime64[ns]')
df7['CRASH_MONTH'] = pd.to_datetime(year_month_timestamps).dt.month

In [None]:
df7

# Compute Ridability Score (Per Census Tract)

In [None]:
# Import length data: one sheet of road length per census tract, plus one sheet per
# (year, bike lane class) for 2018-2021.
# The workbook is opened once and every sheet is parsed from that single handle,
# instead of re-opening and re-parsing the same file for each of the 13 sheets.
with pd.ExcelFile("Processed Data/Length_Data_for_Ridability_Score_Calculation.xlsx") as length_workbook:
    df_road_length = pd.read_excel(length_workbook, sheet_name='road_by_ct_Statistics')

    df_bikelane2021_class1 = pd.read_excel(length_workbook, sheet_name='bikelane2021_class1_Statistics')
    df_bikelane2021_class2 = pd.read_excel(length_workbook, sheet_name='bikelane2021_class2_Statistics')
    df_bikelane2021_class3 = pd.read_excel(length_workbook, sheet_name='bikelane2021_class3_Statistics')

    df_bikelane2020_class1 = pd.read_excel(length_workbook, sheet_name='bikelane2020_class1_Statistics')
    df_bikelane2020_class2 = pd.read_excel(length_workbook, sheet_name='bikelane2020_class2_Statistics')
    df_bikelane2020_class3 = pd.read_excel(length_workbook, sheet_name='bikelane2020_class3_Statistics')

    df_bikelane2019_class1 = pd.read_excel(length_workbook, sheet_name='bikelane2019_class1_Statistics')
    df_bikelane2019_class2 = pd.read_excel(length_workbook, sheet_name='bikelane2019_class2_Statistics')
    df_bikelane2019_class3 = pd.read_excel(length_workbook, sheet_name='bikelane2019_class3_Statistics')

    df_bikelane2018_class1 = pd.read_excel(length_workbook, sheet_name='bikelane2018_class1_Statistics')
    df_bikelane2018_class2 = pd.read_excel(length_workbook, sheet_name='bikelane2018_class2_Statistics')
    df_bikelane2018_class3 = pd.read_excel(length_workbook, sheet_name='bikelane2018_class3_Statistics')

In [None]:
df_road_length.head()

## Clean, process, merge length data from ArcGIS output

In [None]:
# Remove the 'OBJECTID' and 'FREQUENCY' columns from every length table
unused_stat_columns = ['OBJECTID', 'FREQUENCY']

df_road_length = df_road_length.drop(columns=unused_stat_columns)

df_bikelane2021_class1 = df_bikelane2021_class1.drop(columns=unused_stat_columns)
df_bikelane2021_class2 = df_bikelane2021_class2.drop(columns=unused_stat_columns)
df_bikelane2021_class3 = df_bikelane2021_class3.drop(columns=unused_stat_columns)

df_bikelane2020_class1 = df_bikelane2020_class1.drop(columns=unused_stat_columns)
df_bikelane2020_class2 = df_bikelane2020_class2.drop(columns=unused_stat_columns)
df_bikelane2020_class3 = df_bikelane2020_class3.drop(columns=unused_stat_columns)

df_bikelane2019_class1 = df_bikelane2019_class1.drop(columns=unused_stat_columns)
df_bikelane2019_class2 = df_bikelane2019_class2.drop(columns=unused_stat_columns)
df_bikelane2019_class3 = df_bikelane2019_class3.drop(columns=unused_stat_columns)

df_bikelane2018_class1 = df_bikelane2018_class1.drop(columns=unused_stat_columns)
df_bikelane2018_class2 = df_bikelane2018_class2.drop(columns=unused_stat_columns)
df_bikelane2018_class3 = df_bikelane2018_class3.drop(columns=unused_stat_columns)

In [None]:
df_bikelane2018_class3.head()

In [None]:
# Strip the 'SUM_' prefix and the year from each length column, so that the same bike lane
# class has the same column name ('class1_length', ...) in every year's table
df_road_length = df_road_length.rename(columns={'SUM_road_length': 'road_length'})

df_bikelane2021_class1 = df_bikelane2021_class1.rename(columns={'SUM_bikelane2021_class1_length': 'class1_length'})
df_bikelane2021_class2 = df_bikelane2021_class2.rename(columns={'SUM_bikelane2021_class2_length': 'class2_length'})
df_bikelane2021_class3 = df_bikelane2021_class3.rename(columns={'SUM_bikelane2021_class3_length': 'class3_length'})

df_bikelane2020_class1 = df_bikelane2020_class1.rename(columns={'SUM_bikelane2020_class1_length': 'class1_length'})
df_bikelane2020_class2 = df_bikelane2020_class2.rename(columns={'SUM_bikelane2020_class2_length': 'class2_length'})
df_bikelane2020_class3 = df_bikelane2020_class3.rename(columns={'SUM_bikelane2020_class3_length': 'class3_length'})

df_bikelane2019_class1 = df_bikelane2019_class1.rename(columns={'SUM_bikelane2019_class1_length': 'class1_length'})
df_bikelane2019_class2 = df_bikelane2019_class2.rename(columns={'SUM_bikelane2019_class2_length': 'class2_length'})
df_bikelane2019_class3 = df_bikelane2019_class3.rename(columns={'SUM_bikelane2019_class3_length': 'class3_length'})

df_bikelane2018_class1 = df_bikelane2018_class1.rename(columns={'SUM_bikelane2018_class1_length': 'class1_length'})
df_bikelane2018_class2 = df_bikelane2018_class2.rename(columns={'SUM_bikelane2018_class2_length': 'class2_length'})
df_bikelane2018_class3 = df_bikelane2018_class3.rename(columns={'SUM_bikelane2018_class3_length': 'class3_length'})

In [None]:
# Convert the tract GEOID of every bike lane table to nullable integer (Int64)
# NOTE(review): df_road_length's CT2020_GEOID is not converted here -- confirm its dtype
# is compatible with these Int64 keys when the tables are merged.
bikelane_frames = [
    df_bikelane2021_class1, df_bikelane2021_class2, df_bikelane2021_class3,
    df_bikelane2020_class1, df_bikelane2020_class2, df_bikelane2020_class3,
    df_bikelane2019_class1, df_bikelane2019_class2, df_bikelane2019_class3,
    df_bikelane2018_class1, df_bikelane2018_class2, df_bikelane2018_class3,
]
for bikelane_frame in bikelane_frames:
    bikelane_frame['CT2020_GEOID'] = bikelane_frame['CT2020_GEOID'].astype('Int64')

In [None]:
df_bikelane2019_class3

In [None]:
# Make one independent copy of the road length table per year (2021 down to 2018);
# each copy gets its own year label later
(df_road_length2021,
 df_road_length2020,
 df_road_length2019,
 df_road_length2018) = (df_road_length.copy() for _ in range(4))

In [None]:
# Label every length table with its year in a 'CRASH_YEAR' column
frames_by_year = {
    2021: (df_road_length2021, df_bikelane2021_class1, df_bikelane2021_class2, df_bikelane2021_class3),
    2020: (df_road_length2020, df_bikelane2020_class1, df_bikelane2020_class2, df_bikelane2020_class3),
    2019: (df_road_length2019, df_bikelane2019_class1, df_bikelane2019_class2, df_bikelane2019_class3),
    2018: (df_road_length2018, df_bikelane2018_class1, df_bikelane2018_class2, df_bikelane2018_class3),
}
for length_year, year_frames in frames_by_year.items():
    for year_frame in year_frames:
        year_frame['CRASH_YEAR'] = length_year

In [None]:
df_bikelane2019_class3

In [None]:
# For each year, merge the road length table with the three bike lane class tables
def _merge_lengths_for_year(frames):
    '''Outer-join the given tables left to right on (CT2020_GEOID, CRASH_YEAR).'''
    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, how='outer', on=['CT2020_GEOID','CRASH_YEAR'])
    return merged

dfs_length2021 = [df_road_length2021,df_bikelane2021_class1,df_bikelane2021_class2,df_bikelane2021_class3]
df_length2021_merge = _merge_lengths_for_year(dfs_length2021)

dfs_length2020 = [df_road_length2020,df_bikelane2020_class1,df_bikelane2020_class2,df_bikelane2020_class3]
df_length2020_merge = _merge_lengths_for_year(dfs_length2020)

dfs_length2019 = [df_road_length2019,df_bikelane2019_class1,df_bikelane2019_class2,df_bikelane2019_class3]
df_length2019_merge = _merge_lengths_for_year(dfs_length2019)

dfs_length2018 = [df_road_length2018,df_bikelane2018_class1,df_bikelane2018_class2,df_bikelane2018_class3]
df_length2018_merge = _merge_lengths_for_year(dfs_length2018)

In [None]:
df_length2018_merge.head()

In [None]:
# Stack the four yearly length tables on top of each other (row-wise), newest year first
dfs_to_concat = [df_length2021_merge, df_length2020_merge, df_length2019_merge, df_length2018_merge]
dfs_length_concat = pd.concat(dfs_to_concat, axis='index')
dfs_length_concat

## Merge bike lane & road length data with the crash & census data

In [None]:
df8 = df7.copy()

In [None]:
# Keep every row whose crash year is not 2017
not_2017 = df8['CRASH_YEAR'].ne(2017)
df_non2017 = df8.loc[not_2017]
df_non2017.shape

In [None]:
# Attach the road and bike lane lengths to the crash data by CT2020_GEOID and year.
# A left join keeps every crash row, including rows with no matching length record.
df9 = df_non2017.merge(dfs_length_concat, how='left', on=['CT2020_GEOID','CRASH_YEAR'])
df9

## Compute score

In [None]:
# Replace a missing bike lane length with 0 for each of the three lane classes
for length_column in ('class1_length', 'class2_length', 'class3_length'):
    df9[length_column] = df9[length_column].fillna(0)

In [None]:
# Express each bike lane class, and the remaining roads without a bike lane, as a share of
# the total road length
# NOTE(review): road_length is neither filled nor checked; a missing value gives NaN shares
# and a zero gives inf/NaN -- confirm how such rows should be handled downstream.
road_length = df9['road_length']
for lane_class in ('class1', 'class2', 'class3'):
    df9[lane_class + '_percent'] = df9[lane_class + '_length'] / road_length
length_without_bikelane = road_length - df9['class1_length'] - df9['class2_length'] - df9['class3_length']
df9['no_bikelane_percent'] = length_without_bikelane / road_length

In [None]:
# Compute three candidate ridability scores as weighted sums of the lane shares.
# Weights are (no bike lane, class 1, class 2, class 3):
#   score1: (-1, 2, 1, 0)    score2: (0, 3, 2, 1)    score3: (1, 1000, 100, 10)
no_lane = df9['no_bikelane_percent']
class1, class2, class3 = (
    df9[share_column]
    for share_column in ('class1_percent', 'class2_percent', 'class3_percent')
)
df9['score1'] = no_lane*(-1) + class1*2 + class2*1 + class3*0
df9['score2'] = no_lane*0 + class1*3 + class2*2 + class3*1
df9['score3'] = no_lane*1 + class1*1000 + class2*100 + class3*10


In [None]:
df9

In [None]:
df9.columns

# Data for Thesis

In [None]:
df10 = df9.copy()

In [None]:
# Columns left out of the thesis dataset: the year-month key, the all-person casualty
# counts, the lane shares, and the scores other than 'score1'.
# Every other column (tract id, cyclist counts, ACS values, crash year/month,
# road and bike lane lengths, 'score1') is kept.
drop_columns = [
   'CRASH_YEAR-MONTH',
   'NUMBER_OF_PERSONS_INJURED', 'NUMBER_OF_PERSONS_KILLED',
   'class1_percent', 'class2_percent', 'class3_percent', 'no_bikelane_percent',
   'score2', 'score3',
]
df10 = df10.drop(drop_columns, axis='columns')

In [None]:
df10.columns

In [None]:
# Give the thesis dataset its final column names
thesis_column_names = {
    'CT2020_GEOID': 'census_tract_id',
    'score1': 'ridability_score',
    'NUMBER_OF_CYCLIST_INJURED': 'cyclist_injuries',
    'NUMBER_OF_CYCLIST_KILLED': 'cyclist_death',
    'ACS_ttl_pop': 'ttl_pop',
    'ACS_pop_25_yr_over': 'pop_25yr_educ',
    'ACS_workers_16_yr_over': 'worker_16yr_transport',
    'ACS_pop_density_per_sq_mil': 'pop_density',
    'ACS_median_household_inc': 'income',
    'ACS_educ_bs_over': 'educ',
    'ACS_tranport_mean_car': 'car',
    'ACS_tranport_mean_public': 'public_transportation',
    'ACS_tranport_mean_bike': 'bike',
    'ACS_borough': 'borough',
    'CRASH_YEAR': 'crash_year',
    'CRASH_MONTH': 'crash_month',
}
df10 = df10.rename(columns=thesis_column_names)

In [None]:
# Keep only complete rows: a row with any missing value cannot be used in the regression in STATA
df10 = df10.dropna(axis='index', how='any')

In [None]:
df10.columns

# Export data

In [None]:
# Write the full table (all columns, including all three scores) to CSV without the index
full_output_path = 'Final Data/final_data_w_ridability_score.csv'
df9.to_csv(full_output_path, index=False)

In [None]:
# Write the reduced, renamed thesis table to CSV without the index
thesis_output_path = 'Final Data/final_data_w_ridability_score_for_thesis.csv'
df10.to_csv(thesis_output_path, index=False)