# Montgomery crashes

In [81]:
from datetime import datetime
import hashlib

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as load_wkt
from shapely.geometry import Point

In [2]:
# Load the raw crash incident export; it is the input to filter_columns and prepare_roaddim_data below.
crashes_raw = pd.read_csv("../data/montgomery_incidents_data.csv")

In [3]:
def describe(column):
    """Return how often each distinct value occurs in ``column``, missing values included."""
    frequencies = column.value_counts(dropna=False)
    return frequencies

In [6]:
def filter_columns(data):
    """Select the crash attributes used downstream, rename them and normalise their dtypes.

    Every kept column is first cast to ``str``; the lane columns and the
    coordinates are then parsed back into numbers with ``pd.to_numeric``.

    :param data: raw crash frame containing the source columns listed below
    :return: new frame with the renamed columns, in the order of the mapping
    """
    # source column name -> output column name (insertion order is the output order)
    column_names = {
        'Report Number': 'ReportNumber',
        'Local Case Number': 'LocalCaseNumber',
        'Agency Name': 'AgencyName',
        'ACRS Report Type': 'ACRSReportType',
        'Crash Date/Time': 'Datetime',
        'Hit/Run': 'HitRun',
        'Route Type': 'RouteType',
        'Lane Direction': 'LaneDirection',
        'Lane Number': 'LaneNumber',
        'Number of Lanes': 'NumberOfLanes',
        'Road Grade': 'RoadGrade',
        'NonTraffic': 'NonTraffic',
        'Road Name': 'RoadName',
        'Cross-Street Type': 'CrossStreetType',
        'Cross-Street Name': 'CrossStreetName',
        'Off-Road Description': 'OffRoadIncident',
        'At Fault': 'AccidentAtFault',
        'Collision Type': 'CollisionType',
        'Surface Condition': 'SurfaceCondition',
        'Light': 'Light',
        'Traffic Control': 'TrafficControl',
        'Junction': 'Junction',
        'Intersection Type': 'IntersectionType',
        'Road Alignment': 'RoadAlignment',
        'Road Condition': 'RoadCondition',
        'Road Division': 'RoadDivision',
        'Latitude': 'Latitude',
        'Longitude': 'Longitude',
    }
    numeric_columns = ('LaneNumber', 'NumberOfLanes', 'Latitude', 'Longitude')

    selected = data[list(column_names)].rename(columns=column_names)
    as_text = selected.astype(str)
    # only these four columns go back to a numeric dtype; the rest stay text
    for numeric_name in numeric_columns:
        as_text[numeric_name] = pd.to_numeric(as_text[numeric_name])
    return as_text

In [7]:
def change_to_unknown(string):
    """Collapse placeholder text into the single marker ``'UNKNOWN'``.

    Any casing of "unknown", the exact text ``'nan'`` and the empty string are
    replaced; every other value is returned unchanged.
    """
    is_placeholder = string in ('', 'nan') or string.lower() == 'unknown'
    if is_placeholder:
        return 'UNKNOWN'
    return string

In [8]:
def handle_nans(data):
    """Return a copy of ``data`` with missing values replaced by explicit defaults.

    Missing lane numbers and lane counts become 0, and the descriptive text
    columns are passed through ``change_to_unknown``. ``Datetime``, ``HitRun``,
    ``NonTraffic`` and ``OffRoadIncident`` are left for ``transform_columns``.
    """
    cleaned = data.copy()

    # numeric lane columns: a missing value becomes 0
    for lane_col in ('LaneNumber', 'NumberOfLanes'):
        cleaned[lane_col] = cleaned[lane_col].fillna(0)

    # descriptive text columns: placeholder text collapses to 'UNKNOWN'
    text_columns = (
        'AgencyName', 'ACRSReportType', 'RouteType', 'LaneDirection', 'RoadGrade', 'RoadName',
        'CrossStreetType', 'CrossStreetName', 'AccidentAtFault', 'CollisionType', 'SurfaceCondition',
        'Light', 'TrafficControl', 'Junction', 'IntersectionType', 'RoadAlignment', 'RoadCondition',
        'RoadDivision',
    )
    for text_col in text_columns:
        cleaned[text_col] = cleaned[text_col].apply(change_to_unknown)

    return cleaned

In [9]:
def map_to_datetime(date_str):
    """Parse a crash timestamp written as ``%m/%d/%Y %I:%M:%S %p``.

    :param date_str: timestamp text, e.g. ``"06/30/2023 10:55:00 PM"``
    :return: the parsed ``datetime``, or ``pd.NaT`` when the value cannot be parsed
    """
    try:
        return datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: value is not a string.
        # NaT (rather than "") keeps a column built from these results datetime-typed,
        # so the `.dt` accessor keeps working when some rows are unparseable.
        return pd.NaT

In [10]:
def transform_columns(data):
    """Derive typed fields from the text columns of the cleaned crash frame.

    * ``ACRSReportType``: the word "Crash" is removed and surrounding whitespace
      stripped, so "Property Damage Crash" becomes "Property Damage".
    * ``Datetime``: parsed with ``map_to_datetime``.
    * ``HitRun`` / ``NonTraffic``: ``True`` only where the text is exactly ``'Yes'``.
    * ``OffRoadIncident``: ``True`` wherever a description is present
      (anything other than ``'nan'`` or the empty string).

    :param data: frame whose listed columns hold text values
    :return: transformed copy; the input frame is not modified
    """
    data = data.copy()
    # 'ReportNumber', 'LocalCaseNumber' and 'AgencyName' are passed through as-is (validation is still open)

    # strip() removes the space that is left behind once the trailing "Crash" is dropped
    data['ACRSReportType'] = data['ACRSReportType'].str.replace("Crash", "", regex=False).str.strip()

    data['Datetime'] = data['Datetime'].apply(map_to_datetime)

    # yes/no text -> boolean
    data['HitRun'] = data['HitRun'] == 'Yes'
    data['NonTraffic'] = data['NonTraffic'] == 'Yes'

    # any off-road description at all marks the crash as an off-road incident
    data['OffRoadIncident'] = ~data['OffRoadIncident'].isin(['nan', ''])

    # TODO: tidy up the CollisionType values?
    return data

In [11]:
# Run the crash pipeline in order: select/rename columns -> normalise missing values -> derive typed fields.
crashes_filtered = filter_columns(crashes_raw)
crashes_nonull = handle_nans(crashes_filtered)
crashes = transform_columns(crashes_nonull)

In [12]:
crashes.head()

Unnamed: 0,ReportNumber,LocalCaseNumber,AgencyName,ACRSReportType,Datetime,HitRun,RouteType,LaneDirection,LaneNumber,NumberOfLanes,...,SurfaceCondition,Light,TrafficControl,Junction,IntersectionType,RoadAlignment,RoadCondition,RoadDivision,Latitude,Longitude
0,MCP2686006F,230031339,Montgomery County Police,Property Damage,2023-06-30 22:55:00,False,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK NO LIGHTS,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.24428,-77.335827
1,MCP30580053,230064814,Montgomery County Police,Property Damage,2023-11-06 23:20:00,True,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK LIGHTS ON,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.117697,-77.183973
2,MCP2760004K,230071388,Montgomery County Police,Property Damage,2023-12-12 20:00:00,True,UNKNOWN,UNKNOWN,0,0,...,UNKNOWN,DARK LIGHTS ON,NO CONTROLS,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,39.148897,-77.209362
3,MCP3230004G,230031335,Montgomery County Police,Property Damage,2023-06-30 20:00:00,False,County,North,1,2,...,DRY,DAYLIGHT,TRAFFIC SIGNAL,INTERSECTION,FOUR-WAY INTERSECTION,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.092024,-77.07057
4,MCP12600013,230031067,Montgomery County Police,Injury,2023-06-29 11:53:00,False,Maryland (State),North,1,1,...,DRY,DAYLIGHT,STOP SIGN,NON INTERSECTION,UNKNOWN,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.277424,-77.324644


## Creating RoadDim

In [29]:
def fnv1a_hash_16_digit(s: str) -> int:
    """
    Derive a deterministic integer key of at most 16 decimal digits from a string.

    Follows the FNV-1a pattern: XOR each character's code point into the running
    value, multiply, and keep the result within 64 bits. The final value is
    reduced modulo 10**16.

    NOTE(review): the multiplier 0x1000193 looks like the 32-bit FNV prime while
    the seed is the 64-bit offset basis, so results will presumably not match a
    reference 64-bit FNV-1a. It is kept as is because changing it would change
    every key derived from this function.

    :param s: Input string to hash
    :return: Deterministic integer in the range [0, 10**16)
    """
    multiplier = 0x1000193
    mask_64 = 0xffffffffffffffff
    digest = 0xcbf29ce484222325

    for code_point in map(ord, s):
        digest = ((digest ^ code_point) * multiplier) & mask_64

    return digest % 10**16

In [13]:
def prepare_roaddim_data(data):
    """Extract the four road-related columns from the raw crash frame for RoadDim.

    The columns are renamed, cast to ``str`` and every value is normalised with
    ``change_to_unknown``.

    :param data: raw crash frame with the 'Road Name', 'Route Type',
                 'Cross-Street Name' and 'Cross-Street Type' columns
    :return: new frame with RoadName, RouteType, CrossStreetName, CrossStreetType
    """
    road_column_names = {
        'Road Name': 'RoadName',
        'Route Type': 'RouteType',
        'Cross-Street Name': 'CrossStreetName',
        'Cross-Street Type': 'CrossStreetType',
    }
    road_text = data[list(road_column_names)].rename(columns=road_column_names).astype(str)
    # normalise placeholder text column by column
    return road_text.apply(lambda column: column.apply(change_to_unknown))

In [50]:
def generate_roaddim_key(name_col, type_col):
    """Build the RoadDim key for one road.

    The hashed text is the road name with all spaces removed, followed by the
    first whitespace-separated word of the route type.

    :param name_col: road name text
    :param type_col: route type text, e.g. ``'Maryland (State)'``
    :return: integer key produced by ``fnv1a_hash_16_digit``
    """
    type_words = type_col.split()
    # A blank or whitespace-only type has no first word; fall back to the 'UNKNOWN'
    # marker used throughout this file instead of raising IndexError.
    type_token = type_words[0] if type_words else 'UNKNOWN'
    unique_str = name_col.replace(' ', '') + type_token
    return fnv1a_hash_16_digit(unique_str)

In [71]:
def transform_road_data(data):
    """Build the road dimension from the main-road and cross-street columns.

    Cross streets are stacked under the main roads as additional
    (RoadName, RouteType) rows, exact duplicate pairs are dropped, and a
    ``RoadKey`` is computed for each remaining row.

    :param data: frame with RoadName, RouteType, CrossStreetName, CrossStreetType
    :return: frame with RoadName, RouteType and RoadKey (original row labels are kept)
    """
    main_roads = data[['RoadName', 'RouteType']]
    cross_roads = data[['CrossStreetName', 'CrossStreetType']].rename(
        columns={'CrossStreetName': 'RoadName', 'CrossStreetType': 'RouteType'}
    )
    unique_roads = pd.concat([main_roads, cross_roads]).drop_duplicates()

    def key_for(row):
        return generate_roaddim_key(row.RoadName, row.RouteType)

    unique_roads['RoadKey'] = unique_roads.apply(key_for, axis=1)
    return unique_roads

In [76]:
# Build RoadDim from the raw export: normalise the four road columns, then derive the de-duplicated roads with their keys.
crashes_road = prepare_roaddim_data(crashes_raw)
road_dim = transform_road_data(crashes_road)
road_dim.head()

Unnamed: 0,RoadName,RouteType,RoadKey
0,UNKNOWN,UNKNOWN,7781945108562677
3,BEL PRE RD,County,3811654527549161
4,OLD HUNDRED RD,Maryland (State),2422235381346342
5,MEADOW HALL DR,Municipality,3154753875195093
6,LAKEVIEW DR,Other Public Roadway,2558107378426777


## Non-motorist data

In [64]:
# Load the raw non-motorist export; prepare_nonmoto_data reads its 'Report Number' and 'Injury Severity' columns.
nonmoto_raw = pd.read_csv("../data/montgomery_nonmotorist.csv")

In [77]:
def prepare_nonmoto_data(data):
    """Keep the report number and injury severity of each non-motorist record, as text.

    :param data: raw non-motorist frame with 'Report Number' and 'Injury Severity'
    :return: new two-column frame (ReportNumber, InjurySeverity) cast to ``str``
    """
    column_names = {'Report Number': 'ReportNumber', 'Injury Severity': 'InjurySeverity'}
    selected = data[list(column_names)].rename(columns=column_names)
    return selected.astype(str)

In [78]:
def classify_injury(injury):
    """Reduce a free-text injury severity to 'Fatal', 'Injury' or 'No injury'.

    The text is lower-cased and split on single spaces. The word "fatal" wins,
    then "no" forces 'No injury', then "injury" yields 'Injury'. Anything else
    is treated as 'No injury'.
    """
    tokens = injury.lower().split(' ')
    if 'fatal' in tokens:
        return 'Fatal'
    if 'no' not in tokens and 'injury' in tokens:
        return 'Injury'
    return 'No injury'

In [79]:
def transform_nonmoto_data(data):
    """Aggregate non-motorist records into per-report measures.

    Each record's severity is classified with ``classify_injury``. The records
    are then grouped by ``ReportNumber`` to count all records
    (NonMotoristTotal), the injured (NonMotoristInjury) and the fatalities
    (NonMotoristFatal).

    :param data: frame with ReportNumber and InjurySeverity columns
    :return: one row per ReportNumber with the three measures
    """
    classified = data.copy()
    classified['InjurySeverity'] = classified['InjurySeverity'].apply(classify_injury)

    # one 0/1 indicator column per severity class that gets summed below
    for label in ('Fatal', 'Injury'):
        classified[label] = classified['InjurySeverity'].apply(
            lambda severity, label=label: 1 if severity == label else 0
        )

    per_report = classified.groupby('ReportNumber').agg(
        NonMotoristTotal=('InjurySeverity', 'count'),
        NonMotoristInjury=('Injury', 'sum'),
        NonMotoristFatal=('Fatal', 'sum'),
    )
    return per_report.reset_index()

In [80]:
# Non-motorist pipeline: keep the two needed columns, then aggregate to one row of measures per report.
nonmoto = prepare_nonmoto_data(nonmoto_raw)
nonmoto_agg = transform_nonmoto_data(nonmoto)
nonmoto_agg

Unnamed: 0,ReportNumber,NonMotoristTotal,NonMotoristInjury,NonMotoristFatal
0,DD55020019,1,1,0
1,DD5502001V,1,1,0
2,DD55020020,1,1,0
3,DD55020029,1,1,0
4,DD5502002C,2,2,0
...,...,...,...,...
5376,MCP9442001B,1,1,0
5377,MCP94420020,1,1,0
5378,MCP94420023,1,1,0
5379,MCP94420028,1,1,0


## Joining tables

In [33]:
crashes.columns

Index(['ReportNumber', 'LocalCaseNumber', 'AgencyName', 'ACRSReportType',
       'Datetime', 'HitRun', 'RouteType', 'LaneDirection', 'LaneNumber',
       'NumberOfLanes', 'RoadGrade', 'NonTraffic', 'RoadName',
       'CrossStreetType', 'CrossStreetName', 'OffRoadIncident',
       'AccidentAtFault', 'CollisionType', 'SurfaceCondition', 'Light',
       'TrafficControl', 'Junction', 'IntersectionType', 'RoadAlignment',
       'RoadCondition', 'RoadDivision', 'Latitude', 'Longitude'],
      dtype='object')

In [82]:
# Swap the descriptive road / cross-street columns for their RoadDim keys
road_text_columns = ['RoadName', 'RouteType', 'CrossStreetName', 'CrossStreetType']
crashes_joined = crashes.assign(
    RoadKey=crashes.apply(lambda row: generate_roaddim_key(row.RoadName, row.RouteType), axis=1),
    CrossStreetKey=crashes.apply(lambda row: generate_roaddim_key(row.CrossStreetName, row.CrossStreetType), axis=1),
).drop(columns=road_text_columns)

In [85]:
# Attach the aggregated non-motorist measures; crashes without any non-motorist record get 0
nonmoto_measures = ["NonMotoristTotal", "NonMotoristInjury", "NonMotoristFatal"]
crashes_nonmoto = crashes_joined.merge(nonmoto_agg, how='left', on='ReportNumber')
crashes_nonmoto[nonmoto_measures] = crashes_nonmoto[nonmoto_measures].fillna(0).astype(int)

In [91]:
def generate_date_hour_dim_key(data):
    """Build the integer DateHour key (digits ``YYYYMMDDHH``) from the ``Datetime`` column.

    :param data: frame whose ``Datetime`` column is datetime-typed
    :return: integer Series, e.g. 2023063022 for 2023-06-30 22:xx
    """
    hour_stamps = data['Datetime'].dt.strftime('%Y%m%d%H')
    return hour_stamps.astype(int)

In [92]:
# Replace the raw timestamp with its DateHourKey
crashes_nonmoto = crashes_nonmoto.assign(
    DateHourKey=generate_date_hour_dim_key(crashes_nonmoto)
).drop(columns='Datetime')

In [93]:
# Load the area lookup table, parse each WKT string in 'Geometry' into a shapely geometry,
# and wrap the table as a GeoDataFrame whose active geometry column is 'Geometry'.
area_mapper = pd.read_csv("../data/area_mapper.csv")
area_mapper['Geometry'] = area_mapper['Geometry'].apply(load_wkt)
gdf = gpd.GeoDataFrame(area_mapper, geometry='Geometry')

In [96]:
def map_location(lat, long, gdf):
    """Find the LocationAreaKey of the area whose geometry contains a coordinate.

    :param lat: latitude (any value ``float()`` accepts)
    :param long: longitude (any value ``float()`` accepts)
    :param gdf: frame with a 'Geometry' column supporting ``.contains(point)``
                and a 'LocationAreaKey' column
    :return: key of the first containing area, or 0 when no area contains the point
    :raises KeyError: if ``gdf`` has no 'LocationAreaKey' column (no longer hidden
                      behind the 0 fallback)
    """
    point = Point(float(long), float(lat))  # Point takes (x, y): longitude first
    containing = gdf[gdf['Geometry'].contains(point)]
    if containing.empty:
        # no area contains the point -> 0 is the "unknown area" key
        return 0
    return containing.iloc[0]['LocationAreaKey']

In [95]:
# Add LocationAreaKey: map_location is called once per crash row with that row's Latitude/Longitude
# and the area GeoDataFrame.
crashes_nonmoto['LocationAreaKey'] = crashes_nonmoto.apply(lambda x: map_location(x.Latitude, x.Longitude, gdf), axis=1)

In [97]:
crashes_nonmoto

Unnamed: 0,ReportNumber,LocalCaseNumber,AgencyName,ACRSReportType,HitRun,LaneDirection,LaneNumber,NumberOfLanes,RoadGrade,NonTraffic,...,RoadDivision,Latitude,Longitude,RoadKey,CrossStreetKey,NonMotoristTotal,NonMotoristInjury,NonMotoristFatal,DateHourKey,LocationAreaKey
0,MCP2686006F,230031339,Montgomery County Police,Property Damage,False,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,39.244280,-77.335827,7781945108562677,7781945108562677,0,0,0,2023063022,7728275839257602
1,MCP30580053,230064814,Montgomery County Police,Property Damage,True,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,39.117697,-77.183973,7781945108562677,7781945108562677,0,0,0,2023110623,7718751639140820
2,MCP2760004K,230071388,Montgomery County Police,Property Damage,True,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,39.148897,-77.209362,7781945108562677,7781945108562677,0,0,0,2023121220,7718751639140820
3,MCP3230004G,230031335,Montgomery County Police,Property Damage,False,North,1,2,LEVEL,False,...,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.092024,-77.070570,3811654527549161,6553260825247036,0,0,0,2023063020,7705486039088590
4,MCP12600013,230031067,Montgomery County Police,Injury,False,North,1,1,LEVEL,False,...,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.277424,-77.324644,2422235381346342,8973590314752562,0,0,0,2023062911,7728275839257602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97453,MCP13660071,180003770,Montgomery County Police,Injury,False,North,3,3,LEVEL,False,...,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.062794,-77.116372,6984223247500269,5529293059953432,1,1,0,2018012306,7712482939051552
97454,MCP28270022,16009955,Montgomery County Police,Property Damage,False,North,3,5,LEVEL,False,...,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.075443,-77.001442,4963063311328854,6447998938996376,0,0,0,2016022817,7698062539067210
97455,HA2433001B,23000434,Maryland-National Capital,Property Damage,False,UNKNOWN,0,0,UNKNOWN,True,...,UNKNOWN,39.150631,-77.305672,7781945108562677,7781945108562677,0,0,0,2023031009,7732651739188276
97456,MCP1128003R,15008142,Montgomery County Police,Property Damage,False,North,1,4,LEVEL,False,...,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.008168,-77.080083,5405885102715052,3529109164558297,0,0,0,2015021909,7707527239025656


In [98]:
# Write the assembled fact table to the dwh folder; index=False keeps the row index out of the CSV.
crashes_nonmoto.to_csv("../data/dwh/CrashesFact.csv", index=False)

## API data loader

In [39]:
from datetime import datetime, timedelta
import os
import warnings

from dotenv import load_dotenv
import pandas as pd
from sodapy import Socrata

# Credentials are read from the environment (populated from ../.env), never stored in the notebook
load_dotenv('../.env')

DATASET_ID = "bhju-22kf"
LOOKBACK_DAYS = 7 * 30  # roughly 7 months
FETCH_LIMIT = 10000     # maximum number of rows requested from the API in one call

# Initialize Socrata client
client = Socrata("data.montgomerycountymd.gov",
                 os.getenv('SOTA_TOKEN'),
                 username=os.getenv('SOTA_USER'),
                 password=os.getenv('SOTA_PWD'))

# Time window: from LOOKBACK_DAYS ago up to now
end_date = datetime.now()
start_date = end_date - timedelta(days=LOOKBACK_DAYS)
timestamp_format = '%Y-%m-%dT%H:%M:%S.000'
start_date_str = start_date.strftime(timestamp_format)
end_date_str = end_date.strftime(timestamp_format)

# Where clause restricting crashes to that window
where_clause = f"crash_date_time >= '{start_date_str}' AND crash_date_time <= '{end_date_str}'"

# Fetch results from the API
results = client.get(DATASET_ID, where=where_clause, limit=FETCH_LIMIT)

# A response that fills the limit was most likely cut off; make that visible instead of
# silently continuing with a partial window.
if len(results) >= FETCH_LIMIT:
    warnings.warn(
        f"API returned {len(results)} rows, which reaches the limit of {FETCH_LIMIT}; "
        "the result is probably truncated."
    )

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)



Unnamed: 0,report_number,local_case_number,agency_name,acrs_report_type,crash_date_time,hit_run,route_type,mile_point,mile_point_direction,lane_direction,...,road_alignment,road_condition,road_division,latitude,longitude,geolocation,off_road_description,lane_type,related_non_motorist,non_motorist_substance_abuse
0,MCP29460062,230065530,Montgomery County Police,Property Damage Crash,2023-11-10T09:22:00.000,No,County,0.86,South,South,...,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.20028533,-77.25833317,"{'latitude': '39.20028533', 'longitude': '-77....",,,,
1,MCP235800BZ,230065531,Montgomery County Police,Property Damage Crash,2023-11-10T09:30:00.000,No,County,0.24,North,North,...,STRAIGHT,NO DEFECTS,"TWO-WAY, NOT DIVIDED",38.9599467,-77.09632161,"{'latitude': '38.9599467', 'longitude': '-77.0...",,,,
2,MCP2503001M,230065544,Montgomery County Police,Property Damage Crash,2023-11-10T10:00:00.000,No,County,3.92,West,West,...,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, UNPROTECTED PAINTED MIN 4 FEET",39.04047,-77.15912167,"{'latitude': '39.04047', 'longitude': '-77.159...",,,,
3,MCP3200009T,230065549,Montgomery County Police,Property Damage Crash,2023-11-10T10:09:00.000,No,Other Public Roadway,0.6,East,East,...,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.05074833,-77.1260695,"{'latitude': '39.05074833', 'longitude': '-77....",,,,
4,MCP235800C0,230065563,Montgomery County Police,Property Damage Crash,2023-11-10T11:00:00.000,No,County,0.06,East,East,...,STRAIGHT,NO DEFECTS,"TWO-WAY, NOT DIVIDED",38.94057836,-77.11942867,"{'latitude': '38.94057836', 'longitude': '-77....",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1614,MCP2302000G,230074415,Montgomery County Police,Property Damage Crash,2023-12-31T17:26:00.000,No,County,1.35,North,East,...,STRAIGHT,NO DEFECTS,"TWO-WAY, NOT DIVIDED",39.17416033,-77.20836817,"{'latitude': '39.17416033', 'longitude': '-77....",,,,
1615,EJ78710072,230074429,Gaithersburg Police Depar,Property Damage Crash,2023-12-31T21:19:00.000,No,Maryland (State),3.59,North,West,...,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.14005447,-77.19562112,"{'latitude': '39.14005447', 'longitude': '-77....",,,,
1616,MCP3300002L,230074431,Montgomery County Police,Property Damage Crash,2023-12-31T22:00:00.000,No,Maryland (State),2.23,North,South,...,STRAIGHT,NO DEFECTS,"TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",39.09060018,-77.04483779,"{'latitude': '39.09060018', 'longitude': '-77....",,,,
1617,MCP15540064,230074436,Montgomery County Police,Property Damage Crash,2023-12-31T22:15:00.000,No,County,2.08,East,South,...,CURVE RIGHT,NO DEFECTS,"TWO-WAY, NOT DIVIDED",39.12393557,-77.13479125,"{'latitude': '39.12393557', 'longitude': '-77....",,,,
