# Montgomery crashes

In [1]:
from datetime import datetime

import pandas as pd

In [2]:
# Load the raw incident export; every later stage derives from this frame.
crashes_raw = pd.read_csv("../data/montgomery_incidents_data.csv")

In [3]:
def describe(column):
    """Return how often each distinct value occurs in `column`.

    Missing values are counted as their own category (dropna=False).
    """
    frequencies = column.value_counts(dropna=False)
    return frequencies

In [4]:
def filter_columns(data):
    """Select the columns used downstream, rename them and normalise dtypes.

    The result contains only the columns listed in the mapping below, in
    that order, under their new names. Every value is first converted to
    str (so missing values become the text 'nan'); the lane and coordinate
    columns are then parsed back into numbers.
    """
    # Source column -> output column. Dict order defines the output order.
    column_names = {
        'Report Number': 'ReportNumber',
        'Local Case Number': 'LocalCaseNumber',
        'Agency Name': 'AgencyName',
        'ACRS Report Type': 'ACRSReportType',
        'Crash Date/Time': 'Datetime',
        'Hit/Run': 'HitRun',
        'Route Type': 'RouteType',
        'Lane Direction': 'LaneDirection',
        'Lane Number': 'LaneNumber',
        'Number of Lanes': 'NumberOfLanes',
        'Road Grade': 'RoadGrade',
        'NonTraffic': 'NonTraffic',
        'Road Name': 'RoadName',
        'Cross-Street Type': 'CrossStreetType',
        'Cross-Street Name': 'CrossStreetName',
        'Off-Road Description': 'OffRoadIncident',
        'At Fault': 'AccidentAtFault',
        'Collision Type': 'CollisionType',
        'Surface Condition': 'SurfaceCondition',
        'Light': 'Light',
        'Traffic Control': 'TrafficControl',
        'Junction': 'Junction',
        'Intersection Type': 'IntersectionType',
        'Road Alignment': 'RoadAlignment',
        'Road Condition': 'RoadCondition',
        'Road Division': 'RoadDivision',
        'Latitude': 'Latitude',
        'Longitude': 'Longitude',
    }
    numeric_columns = ('LaneNumber', 'NumberOfLanes', 'Latitude', 'Longitude')

    selected = data[list(column_names)].rename(columns=column_names).astype(str)
    # pd.to_numeric parses the text 'nan' back into a real NaN.
    for name in numeric_columns:
        selected[name] = pd.to_numeric(selected[name])
    return selected

In [5]:
def change_to_unknown(string):
    """Map every representation of a missing value to the token 'UNKNOWN'.

    Treated as missing:
      * None or a float NaN (a value that was never stringified),
      * an empty or whitespace-only string,
      * the text 'nan' (what astype(str) produces for NaN),
      * 'unknown' in any letter case.
    Surrounding whitespace is ignored when testing for these markers.
    Any other value is returned unchanged, including its whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return 'UNKNOWN'
    stripped = string.strip()
    if stripped == '' or stripped == 'nan' or stripped.lower() == 'unknown':
        return 'UNKNOWN'
    return string

In [6]:
def handle_nans(data):
    """Fill missing values on a copy of `data` and return the copy.

    The two lane columns get 0 where they are NaN; the descriptive text
    columns listed below are passed through change_to_unknown. Datetime,
    HitRun, NonTraffic and OffRoadIncident are left for transform_columns.
    """
    cleaned = data.copy()

    # Numeric lane columns: a missing value is encoded as 0.
    for lane_col in ('LaneNumber', 'NumberOfLanes'):
        cleaned[lane_col] = cleaned[lane_col].fillna(0)

    # Text columns: every missing-value marker becomes 'UNKNOWN'.
    text_columns = [
        'AgencyName', 'ACRSReportType', 'RouteType', 'LaneDirection',
        'RoadGrade', 'RoadName', 'CrossStreetType', 'CrossStreetName',
        'AccidentAtFault', 'CollisionType', 'SurfaceCondition', 'Light',
        'TrafficControl', 'Junction', 'IntersectionType', 'RoadAlignment',
        'RoadCondition', 'RoadDivision',
    ]
    for text_col in text_columns:
        cleaned[text_col] = cleaned[text_col].apply(change_to_unknown)

    return cleaned

In [7]:
def map_to_datetime(date_str):
    """Parse a 'MM/DD/YYYY HH:MM:SS AM/PM' timestamp string.

    Returns a datetime on success. If the text does not match the format
    (ValueError) or the value is not a string at all (TypeError), returns
    pd.NaT rather than an empty string: a single '' in a column forces it
    to object dtype and breaks the `.dt` accessor, whereas NaT keeps the
    column datetime-typed.
    """
    try:
        return datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
    except (ValueError, TypeError):
        return pd.NaT

In [8]:
def transform_columns(data):
    """Convert cleaned text columns to their final types on a copy of `data`.

    * ACRSReportType: the word "Crash" is removed and surrounding
      whitespace stripped, so a value shaped like "Property Damage Crash"
      (assumed raw form - confirm against the source data) becomes
      "Property Damage" with no trailing space.
    * Datetime: parsed with map_to_datetime.
    * HitRun / NonTraffic: True only for the exact text 'Yes'.
    * OffRoadIncident: True whenever a description is present, i.e. the
      value is neither 'nan' nor ''.
    ReportNumber, LocalCaseNumber and AgencyName are left untouched.
    """
    data = data.copy()
    # Nothing to clean in 'ReportNumber', 'LocalCaseNumber', 'AgencyName'; maybe validate?
    # Strip after removing "Crash" so no dangling space is left behind.
    data['ACRSReportType'] = data['ACRSReportType'].apply(lambda x: x.replace("Crash", "").strip())
    # Crash date to datetime format.
    data['Datetime'] = data['Datetime'].apply(map_to_datetime)
    # Yes/No flags to booleans.
    data['HitRun'] = data['HitRun'].apply(lambda x: x == 'Yes')
    data['NonTraffic'] = data['NonTraffic'].apply(lambda x: x == 'Yes')
    # Off-road description to a presence flag.
    data['OffRoadIncident'] = data['OffRoadIncident'].apply(lambda x: not (x == 'nan' or x == ''))
    # TODO: tidy up CollisionType values?
    return data

In [9]:
# Run the cleaning pipeline stage by stage, keeping each intermediate frame
# under its own name so any stage can be inspected or re-run on its own.
crashes_filtered = filter_columns(crashes_raw)
crashes_nonull = handle_nans(crashes_filtered)
crashes = transform_columns(crashes_nonull)

In [10]:
# Preview the first rows of the cleaned crash table.
crashes.head()

Unnamed: 0,ReportNumber,LocalCaseNumber,AgencyName,ACRSReportType,Datetime,HitRun,RouteType,LaneDirection,LaneNumber,NumberOfLanes,...,SurfaceCondition,Light,TrafficControl,Junction,IntersectionType,RoadAlignment,RoadCondition,RoadDivision,Latitude,Longitude
0,MCP2686006F,230031339,Montgomery County Police,Property Damage,2023-06-30 22:55:00,False,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK NO LIGHTS,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.24428,-77.335827
1,MCP30580053,230064814,Montgomery County Police,Property Damage,2023-11-06 23:20:00,True,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK LIGHTS ON,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.117697,-77.183973
2,MCP2760004K,230071388,Montgomery County Police,Property Damage,2023-12-12 20:00:00,True,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK LIGHTS ON,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.148897,-77.209362
3,MCP3230004G,230031335,Montgomery County Police,Property Damage,2023-06-30 20:00:00,False,County,North,1,2,...,DRY,DAYLIGHT,TRAFFIC SIGNAL,INTERSECTION,FOUR-WAY INTERSECTION,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.092024,-77.07057
4,MCP12600013,230031067,Montgomery County Police,Injury,2023-06-29 11:53:00,False,Maryland (State),North,1,1,...,DRY,DAYLIGHT,STOP SIGN,NON INTERSECTION,UNKNOWN,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.277424,-77.324644


## Creating RoadDim

In [11]:
def prepare_roaddim_data(data):
    """Extract the road and cross-street name/type columns from the raw data.

    Returns a new frame with columns RoadName, RouteType, CrossStreetName
    and CrossStreetType, all converted to str and with missing-value
    markers replaced through change_to_unknown.
    """
    renames = {
        'Road Name': 'RoadName',
        'Route Type': 'RouteType',
        'Cross-Street Name': 'CrossStreetName',
        'Cross-Street Type': 'CrossStreetType',
    }
    road_columns = data[list(renames)].rename(columns=renames).astype(str)
    for name in road_columns.columns:
        road_columns[name] = road_columns[name].apply(change_to_unknown)
    return road_columns

In [12]:
def generate_roaddim_key(name_col, type_col):
    """Build the road dimension key '<NAME_WITH_UNDERSCORES>_<FirstWordOfType>'.

    Parameters
    ----------
    name_col : Series of str
        Road names; spaces are replaced with underscores.
    type_col : Series of str
        Route types; only the first whitespace-separated word is used
        (e.g. 'Maryland (State)' contributes 'Maryland').

    Returns
    -------
    Series of str, one key per row.

    An empty or whitespace-only route type has no first word; it falls
    back to 'UNKNOWN' rather than raising IndexError.
    """
    def first_word(route_type):
        words = route_type.split()
        return words[0] if words else 'UNKNOWN'

    name_part = name_col.apply(lambda x: x.replace(' ', '_'))
    type_part = type_col.apply(first_word)
    return name_part + '_' + type_part

In [13]:
# Split the prepared road data into main roads and cross streets; the cross
# streets are renamed to the same (RoadName, RouteType) schema so both
# frames can be stacked into a single dimension.
crashes_road = prepare_roaddim_data(crashes_raw)
roads = crashes_road[['RoadName', 'RouteType']]
crossroads = crashes_road[['CrossStreetName', 'CrossStreetType']].rename(
    columns={'CrossStreetName': 'RoadName', 'CrossStreetType': 'RouteType'}
)

In [14]:
# Stack main roads and cross streets and keep each distinct (name, type)
# pair once. Row labels are carried over from the inputs (no index reset),
# so labels in road_dim are not a 0..n-1 range.
road_dim = pd.concat([roads, crossroads]).drop_duplicates()

In [15]:
# Add the key built from road name and route type; the crash table later
# derives RoadKey / CrossStreetKey with the same function.
road_dim['RoadKey'] = generate_roaddim_key(road_dim['RoadName'], road_dim['RouteType'])

In [16]:
# Preview the road dimension.
road_dim.head()

Unnamed: 0,RoadName,RouteType,RoadKey
0,UNKNOWN,UNKNOWN,UNKNOWN_UNKNOWN
3,BEL PRE RD,County,BEL_PRE_RD_County
4,OLD HUNDRED RD,Maryland (State),OLD_HUNDRED_RD_Maryland
5,MEADOW HALL DR,Municipality,MEADOW_HALL_DR_Municipality
6,LAKEVIEW DR,Other Public Roadway,LAKEVIEW_DR_Other


## Non-motorist data

In [17]:
# Load the raw non-motorist export (rows carry a Report Number and an Injury Severity).
nonmoto_raw = pd.read_csv("../data/montgomery_nonmotorist.csv")

In [18]:
# Inspect the distinct severity labels before collapsing them into classes.
describe(nonmoto_raw['Injury Severity'])

Injury Severity
SUSPECTED MINOR INJURY      2478
POSSIBLE INJURY             1860
SUSPECTED SERIOUS INJURY     631
NO APPARENT INJURY           558
FATAL INJURY                 123
Name: count, dtype: int64

In [19]:
def prepare_nonmoto_data(data):
    """Return the report number and injury severity columns as strings.

    The output has exactly two columns, ReportNumber and InjurySeverity,
    both converted with astype(str).
    """
    renames = {
        'Report Number': 'ReportNumber',
        'Injury Severity': 'InjurySeverity',
    }
    return data[list(renames)].rename(columns=renames).astype(str)

In [20]:
def classify_injury(injury):
    """Collapse a free-text severity label into 'Fatal', 'Injury' or 'No injury'.

    The label is lower-cased and split on single spaces. The word 'fatal'
    takes precedence; otherwise the word 'injury' yields 'Injury' unless
    the word 'no' is also present. Everything else is 'No injury'.
    """
    tokens = set(injury.lower().split(' '))
    if 'fatal' in tokens:
        return 'Fatal'
    if 'injury' in tokens and 'no' not in tokens:
        return 'Injury'
    return 'No injury'

In [21]:
# Reduce the non-motorist data to report number and injury severity.
nonmoto = prepare_nonmoto_data(nonmoto_raw)

In [22]:
# Replace the raw severity label with its class: 'Fatal', 'Injury' or 'No injury'.
nonmoto['InjurySeverity'] = nonmoto['InjurySeverity'].apply(classify_injury)

In [23]:
# 0/1 indicator columns so the per-report counts can be obtained by summing.
nonmoto['Fatal'] = (nonmoto['InjurySeverity'] == 'Fatal').astype(int)
nonmoto['Injury'] = (nonmoto['InjurySeverity'] == 'Injury').astype(int)

In [24]:
# One row per report: number of non-motorists involved, and how many of
# them were injured / killed.
nonmoto_agg = (
    nonmoto
    .groupby('ReportNumber')
    .agg(
        NonMotoristTotal=('InjurySeverity', 'count'),
        NonMotoristInjury=('Injury', 'sum'),
        NonMotoristFatal=('Fatal', 'sum'),
    )
    .reset_index()
)

## Joining tables

In [25]:
# List the crash columns available before building the join keys.
crashes.columns

Index(['ReportNumber', 'LocalCaseNumber', 'AgencyName', 'ACRSReportType',
       'Datetime', 'HitRun', 'RouteType', 'LaneDirection', 'LaneNumber',
       'NumberOfLanes', 'RoadGrade', 'NonTraffic', 'RoadName',
       'CrossStreetType', 'CrossStreetName', 'OffRoadIncident',
       'AccidentAtFault', 'CollisionType', 'SurfaceCondition', 'Light',
       'TrafficControl', 'Junction', 'IntersectionType', 'RoadAlignment',
       'RoadCondition', 'RoadDivision', 'Latitude', 'Longitude'],
      dtype='object')

In [26]:
# Replace the descriptive road / cross-street columns with keys generated by
# the same function used for the road dimension.
crashes_joined = (
    crashes
    .assign(
        RoadKey=generate_roaddim_key(crashes['RoadName'], crashes['RouteType']),
        CrossStreetKey=generate_roaddim_key(crashes['CrossStreetName'], crashes['CrossStreetType']),
    )
    .drop(columns=['RoadName', 'RouteType', 'CrossStreetName', 'CrossStreetType'])
)

In [41]:
# Left join keeps every crash; reports with no non-motorist rows get NaN
# counts from the merge, which are replaced by 0 and cast back to int.
nonmoto_count_columns = ["NonMotoristTotal", "NonMotoristInjury", "NonMotoristFatal"]
crashes_nonmoto = crashes_joined.merge(nonmoto_agg, how='left', on='ReportNumber')
crashes_nonmoto[nonmoto_count_columns] = (
    crashes_nonmoto[nonmoto_count_columns].fillna(0).astype(int)
)

In [42]:
def generate_date_hour_dim_key(data):
    """Build an integer YYYYMMDDHH key from the 'Datetime' column of `data`.

    The column must support the `.dt` accessor; each timestamp is
    formatted as year, month, day and hour digits and the resulting text
    is cast to int.
    """
    timestamps = data['Datetime']
    as_text = timestamps.dt.strftime('%Y%m%d%H')
    return as_text.astype(int)

In [45]:
# Swap the timestamp for its YYYYMMDDHH key into the date-hour dimension.
crashes_nonmoto = (
    crashes_nonmoto
    .assign(DateHourKey=generate_date_hour_dim_key(crashes_nonmoto))
    .drop(columns='Datetime')
)

In [46]:
# Display the joined table with its road, cross-street and date-hour keys.
crashes_nonmoto

Unnamed: 0,ReportNumber,LocalCaseNumber,AgencyName,ACRSReportType,HitRun,LaneDirection,LaneNumber,NumberOfLanes,RoadGrade,NonTraffic,...,RoadCondition,RoadDivision,Latitude,Longitude,RoadKey,CrossStreetKey,NonMotoristTotal,NonMotoristInjury,NonMotoristFatal,DateHourKey
0,MCP2686006F,230031339,Montgomery County Police,Property Damage,False,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,UNKNOWN,39.244280,-77.335827,UNKNOWN_UNKNOWN,UNKNOWN_UNKNOWN,0,0,0,2023063022
1,MCP30580053,230064814,Montgomery County Police,Property Damage,True,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,UNKNOWN,39.117697,-77.183973,UNKNOWN_UNKNOWN,UNKNOWN_UNKNOWN,0,0,0,2023110623
2,MCP2760004K,230071388,Montgomery County Police,Property Damage,True,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,UNKNOWN,39.148897,-77.209362,UNKNOWN_UNKNOWN,UNKNOWN_UNKNOWN,0,0,0,2023121220
3,MCP3230004G,230031335,Montgomery County Police,Property Damage,False,North,1,2,LEVEL,False,...,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.092024,-77.070570,BEL_PRE_RD_County,CONNECTICUT_AVE_County,0,0,0,2023063020
4,MCP12600013,230031067,Montgomery County Police,Injury,False,North,1,1,LEVEL,False,...,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.277424,-77.324644,OLD_HUNDRED_RD_Maryland,RAMP_6_FR_MD_109_EB_TO_IS_270_SB_Ramp,0,0,0,2023062911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97453,MCP13660071,180003770,Montgomery County Police,Injury,False,North,3,3,LEVEL,False,...,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.062794,-77.116372,TWINBROOK_PKWY_County,FISHERS_LA_County,1,1,0,2018012306
97454,MCP28270022,16009955,Montgomery County Police,Property Damage,False,North,3,5,LEVEL,False,...,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.075443,-77.001442,NEW_HAMPSHIRE_AVE_Maryland,E_RANDOLPH_RD_County,0,0,0,2016022817
97455,HA2433001B,23000434,Maryland-National Capital,Property Damage,False,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,UNKNOWN,39.150631,-77.305672,UNKNOWN_UNKNOWN,UNKNOWN_UNKNOWN,0,0,0,2023031009
97456,MCP1128003R,15008142,Montgomery County Police,Property Damage,False,North,1,4,LEVEL,False,...,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.008168,-77.080083,CONNECTICUT_AVE_Maryland,BEACH_DR_Government,0,0,0,2015021909


In [29]:
# Persist the road dimension; index=False leaves the DataFrame index out of the CSV.
road_dim.to_csv("../data/dwh/RoadDim.csv", index=False)