In [None]:
import pandas as pd
import os
import shutil
import datetime

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 5, Finished, Available, Finished)

In [None]:
def spark_table_gen(df, table_name, mode='append'):
    """Write a pandas DataFrame to the lakehouse as a Delta table and register it.

    Args:
        df: pandas DataFrame to persist; converted with ``spark.createDataFrame``.
        table_name: Name of the table. Used both as the folder name under
            ``<lakehouse_path>/Tables/`` and as the SQL table name.
        mode: Spark save mode passed to ``DataFrameWriter.mode``
            (defaults to ``'append'``).

    Relies on the notebook-level globals ``spark`` and ``lakehouse_path``,
    which are looked up when the function is called.
    """
    # Single source of truth for where the Delta files live, so the write and
    # the table registration can never point at different directories.
    table_path = lakehouse_path + '/Tables/' + table_name

    spark_df = spark.createDataFrame(df)

    spark_df.write.format('delta').mode(mode).save(table_path)

    # Create a table in the lakehouse that references the existing data.
    # The location must be the table's own Delta folder, not the lakehouse root.
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {table_name}
        USING DELTA
        LOCATION '{table_path}'
    """)

def extract_movement(full_movement):
    """Pull the lane, approach and movement tokens out of a movement string.

    The string is split on single spaces; the 2nd token is returned as the
    lane, the 4th token (with any commas removed) as the approach, and the
    5th token as the movement.

    Args:
        full_movement: Space-separated movement description.

    Returns:
        Tuple ``(lane, approach, movement)`` of strings.

    Raises:
        IndexError: If the string has fewer than five space-separated tokens.
    """
    tokens = full_movement.split(' ')
    return tokens[1], tokens[3].replace(',', ''), tokens[4]

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 7, Finished, Available, Finished)

In [None]:
# Microsoft Fabric lakehouse settings
app_name = "tahoe"

# Root path of the target lakehouse, read from the environment.
# This is None when 'tahoe_lakehouse_path' is not set.
lakehouse_path = os.environ.get('tahoe_lakehouse_path')

# NOTE(review): SparkSession is not imported in the visible cells —
# presumably it is provided by the Fabric runtime; confirm.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 8, Finished, Available, Finished)

In [None]:
# Lakehouse file areas: raw vendor exports are read from the "Unprocessed" tree.
unprocessed_dir = '/lakehouse/default/Files/Unprocessed/'
processed_dir = '/lakehouse/default/Files/Processed/'

# Remaps the approach value on rows whose movement is 'CROSSING'.
vru_approach_dict = {'NB': 'S', 'SB': 'N', 'EB': 'W', 'WB': 'E'}

# Collapses vendor class labels (after underscores are replaced by spaces and
# the text is title-cased) onto the class names used in the fact tables.
class_dict = {
    'Mobility Aid': 'Pedestrian',
    'Person Mobility Device': 'Pedestrian',
    'Motorcycle': 'Passenger Vehicle',
    'Articulated Truck': 'Semi Truck',
    'Single Unit Truck': 'Box Truck',
}

# Translates the vendor severity flag into the severity labels stored on events.
severity_dict = {'High': 'Severe', 'Low': 'Moderate'}

# Placeholder date prepended to time-of-day strings so they parse as timestamps.
default_date = '1900-01-01'

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 9, Finished, Available, Finished)

### Volume Processing

In [None]:
source_dir = os.path.join(unprocessed_dir, 'DERQ/Volumes')

# Select only regular files with a .csv extension. The previous substring
# check ('csv' in name) also matched names such as 'csv_notes.txt' or
# 'export.csv.bak'. Sorting makes the row order of the combined frame
# reproducible, since os.listdir returns entries in arbitrary order.
volume_files = sorted(
    name for name in os.listdir(source_dir)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(source_dir, name))
)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the drop folder is empty.
if not volume_files:
    raise FileNotFoundError(f'No CSV files found in {source_dir}')

dfs = []

for file in volume_files:
    # The full file name is stored as the intersection identifier; no parsing
    # of the name is done here.
    # NOTE(review): confirm whether only the leading part of the name
    # (before the first underscore) is intended.
    intersection_id = file
    # read csv file to df
    file_path = os.path.join(source_dir, file)
    df = pd.read_csv(file_path)
    df['intersection_id'] = intersection_id
    dfs.append(df)

combined_volume = pd.concat(dfs, ignore_index=True)

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 12, Finished, Available, Finished)

In [10]:
# Split timeInterval on ' - ' into a start part ('time') and an end part
# ('end_time'), then anchor the start on the placeholder date so it can be
# stored as a timestamp.
combined_volume[['time', 'end_time']] = (
    combined_volume['timeInterval'].str.split(pat=' - ', expand=True)
)
combined_volume['time'] = pd.to_datetime(f'{default_date} ' + combined_volume['time'])

# Parse the date column and keep only the calendar date.
combined_volume['date'] = pd.to_datetime(combined_volume['date']).dt.date

# Remove the columns that are no longer needed, then give the remaining
# vendor columns their fact-table names. The original 'movement' column is
# dropped before 'movementType' takes over that name.
combined_volume.drop(
    ['timeInterval', 'movement', 'dayOfTheWeek', 'end_time'],
    axis=1,
    inplace=True,
)
combined_volume.rename(
    {'movementType': 'movement', 'count': 'volume'},
    axis='columns',
    inplace=True,
)

# Normalise class labels (underscores -> spaces, title case) and fold them
# onto the shared class names.
combined_volume['class'] = (
    combined_volume['class']
    .str.replace('_', ' ')
    .str.title()
    .replace(class_dict)
)

# Only crossing rows get their approach remapped; approach values missing
# from vru_approach_dict become NaN on those rows.
mask = combined_volume['movement'].eq('CROSSING')
combined_volume.loc[mask, 'approach'] = (
    combined_volume.loc[mask, 'approach'].map(vru_approach_dict)
)

combined_volume

StatementMeta(, 5581e27a-5c98-4d6a-b3cc-986558a4ba82, 14, Finished, Available, Finished)

Unnamed: 0,volume,date,lane,approach,movement,class,intersection_id,time
0,1,2024-10-24,1.0,NB,LT,Passenger Vehicle,CharlestonBlvd8thSt_vehicle_10-01-2024_to_10-3...,1900-01-01 00:00:00
1,1,2024-10-31,1.0,NB,LT,Passenger Vehicle,CharlestonBlvd8thSt_vehicle_10-01-2024_to_10-3...,1900-01-01 00:00:00
2,1,2024-10-24,1.0,NB,TH,Passenger Vehicle,CharlestonBlvd8thSt_vehicle_10-01-2024_to_10-3...,1900-01-01 00:00:00
3,1,2024-10-17,1.0,SB,TH,Passenger Vehicle,CharlestonBlvd8thSt_vehicle_10-01-2024_to_10-3...,1900-01-01 00:00:00
4,1,2024-10-24,1.0,SB,TH,Passenger Vehicle,CharlestonBlvd8thSt_vehicle_10-01-2024_to_10-3...,1900-01-01 00:00:00
...,...,...,...,...,...,...,...,...
448255,1,2024-10-02,,W,CROSSING,Bicycle,CharlestonBlvdFremontStBoulderHwy_vru_10-01-20...,1900-01-01 23:45:00
448256,3,2024-10-02,,E,CROSSING,Pedestrian,CharlestonBlvdFremontStBoulderHwy_vru_10-01-20...,1900-01-01 23:45:00
448257,1,2024-10-23,,E,CROSSING,Pedestrian,CharlestonBlvdFremontStBoulderHwy_vru_10-01-20...,1900-01-01 23:45:00
448258,1,2024-10-09,,E,CROSSING,Pedestrian,CharlestonBlvdFremontStBoulderHwy_vru_10-01-20...,1900-01-01 23:45:00


In [11]:
# Persist the volume rows to the lakehouse. The write appends, so running
# this cell again on the same data adds the same rows a second time.
table_name = 'derq_volume_fact_table'
spark_table_gen(df=combined_volume, table_name=table_name, mode='append')

StatementMeta(, 5581e27a-5c98-4d6a-b3cc-986558a4ba82, 15, Finished, Available, Finished)



### Event Processing

In [None]:
source_dir = os.path.join(unprocessed_dir, 'DERQ/Events')

# Load only regular .csv files, matching the volume loader; previously every
# directory entry (including sub-folders or stray non-CSV files) was passed to
# read_csv. Sorting keeps the combined row order reproducible, since
# os.listdir returns entries in arbitrary order.
event_files = sorted(
    name for name in os.listdir(source_dir)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(source_dir, name))
)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the drop folder is empty.
if not event_files:
    raise FileNotFoundError(f'No CSV files found in {source_dir}')

event_dfs = []

for file_name in event_files:
    file_path = os.path.join(source_dir, file_name)

    # read csv file to df
    df = pd.read_csv(file_path)
    event_dfs.append(df)

combined_events = pd.concat(event_dfs, ignore_index=True)
combined_events.head(5)

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 36, Finished, Available, Finished)

Unnamed: 0,id,event_type,datetime,intersection,detection_area,speed,nearmiss_type,pet,ttc,gap_time,isSevere,involved,direction,movement,latitude,longitude,time_in_red
0,6721373649636700129e1503,Vehicle-VRU Near Miss,10/29/2024 12:27:47 PM,Charleston Blvd. & 15th St.,North Leg,18 mph,Cutoff,,1.9,0.3,Low,"Pedestrian, Passenger Vehicle",NEB,LT,36.159088,-115.132637,
1,67202d2f7a12a6001252358e,Vehicle-VRU Near Miss,10/28/2024 05:32:44 PM,Charleston Blvd. & 15th St.,North Leg,24 mph,Cutoff,,,0.9,High,"Passenger Vehicle, Pedestrian",NB,TH,36.15905,-115.132652,
2,6719b3f4890b010012ee9b1d,Vehicle-VRU Near Miss,10/23/2024 07:41:54 PM,Charleston Blvd. & 15th St.,North Leg,34 mph,Right-of-way,1.8,,,Low,"Passenger Vehicle, Pedestrian",SB,TH,36.159031,-115.132713,
3,67140da8ba63c30012f9b850,Vehicle-VRU Near Miss,10/19/2024 12:51:01 PM,Charleston Blvd. & 15th St.,South Leg,20 mph,Right-of-way,2.2,,,Low,"Passenger Vehicle, Bicyclist",SB,RT,36.158684,-115.132858,
4,6713ee416dc00d0013d4b250,Vehicle-VRU Near Miss,10/19/2024 10:37:03 AM,Charleston Blvd. & 15th St.,South Leg,39 mph,Right-of-way,2.1,,,Low,"Passenger Vehicle, Pedestrian",SB,LT,36.158607,-115.132851,


In [None]:
# Give the vendor columns their fact-table names in one pass.
combined_events.rename(
    columns={'id': 'event_id', 'speed': 'speed_mph', 'isSevere': 'severity'},
    inplace=True,
)

# Parse the event timestamp and derive the calendar date from it.
combined_events['datetime'] = pd.to_datetime(combined_events['datetime'])
combined_events['date'] = combined_events['datetime'].dt.date

# Two time-of-day columns, both anchored on the placeholder date:
#   time_to_second - keeps the seconds
#   time           - seconds forced to ':00' (minute resolution)
combined_events['time_to_second'] = pd.to_datetime(
    default_date + ' ' + combined_events['datetime'].dt.strftime('%H:%M:%S')
)
combined_events['time'] = pd.to_datetime(
    default_date + ' ' + combined_events['datetime'].dt.strftime('%H:%M') + ':00'
)

# The combined timestamp is no longer needed once it has been split.
del combined_events['datetime']

# Keep the first run of digits from the speed text as a float.
combined_events['speed_mph'] = (
    combined_events['speed_mph'].str.extract(r'(\d+)', expand=False).astype(float)
)

# Translate the vendor severity flag; values missing from severity_dict
# become NaN.
combined_events['severity'] = combined_events['severity'].map(severity_dict)

# Illegal crossings are always labelled 'Low'.
# NOTE(review): 'Low' is not one of the labels produced by severity_dict
# ('Severe' / 'Moderate') — confirm it is meant as a third tier.
combined_events.loc[combined_events['event_type'].eq('Illegal Crossing'), 'severity'] = 'Low'

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 37, Finished, Available, Finished)

In [None]:
# Persist the event rows to the lakehouse. The write appends, so running
# this cell again on the same data adds the same rows a second time.
table_name = 'derq_event_fact_table'
spark_table_gen(df=combined_events, table_name=table_name, mode='append')

StatementMeta(, c2872e01-b22b-48a1-85eb-65a28915ee8c, 38, Finished, Available, Finished)

