In [2]:
from pyspark.sql import SparkSession, Window, DataFrame

import os
import sys

# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they do not pick up a different Python from PATH.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Initialize Spark session
spark = (SparkSession.builder.appName("DataProcessingApp")
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.driver.maxResultSize", "4g")
         .getOrCreate())
# WARN still surfaces real problems; DEBUG makes Spark log internal details for
# every action, which buries the output of the cells that follow.
spark.sparkContext.setLogLevel("WARN")

# Read the CSV file into a DataFrame.
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive the column types from the data instead of reading all strings.
df = spark.read.csv('../data/US_Accidents_March23_cleaned.csv', header=True, inferSchema=True)
df.show(5)

+---+--------+-------------------+-------------------+------------------+-------------------+-------+-------+------------+--------------------+--------------+---------------+----------+-----+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+
| ID|Severity|         Start_Time|           End_Time|         Start_Lat|          Start_Lng|End_Lat|End_Lng|Distance(mi)|         Description|        Street|           City|    County|State|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Sig

In [3]:
from pyspark.sql.functions import col, year, month, dayofmonth, dayofweek, hour, minute

# Convert Start_Time to timestamp (if not already)
df = df.withColumn("Start_Time", col("Start_Time").cast("timestamp"))

# Derive calendar and clock features from Start_Time.
# Weekday: Spark's dayofweek() is 1 = Sunday ... 7 = Saturday. (d + 5) % 7
# remaps that to 0 = Monday ... 6 = Sunday. A plain "d - 2" yields the same
# values for Monday-Saturday but turns Sunday into -1.
df = (
    df.withColumn("Year", year(col("Start_Time")))
      .withColumn("Month", month(col("Start_Time")))
      .withColumn("Weekday", ((dayofweek(col("Start_Time")) + 5) % 7).cast("int"))
      .withColumn("Day", dayofmonth(col("Start_Time")))
      .withColumn("Hour", hour(col("Start_Time")))
      .withColumn("Minute", minute(col("Start_Time")))
)
df.show(5)

+---+--------+-------------------+-------------------+------------------+-------------------+-------+-------+------------+--------------------+--------------+---------------+----------+-----+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+----+-----+-------+---+----+------+
| ID|Severity|         Start_Time|           End_Time|         Start_Lat|          Start_Lng|End_Lat|End_Lng|Distance(mi)|         Description|        Street|           City|    County|State|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Statio

In [4]:
# Columns removed before further processing. Start_Time is safe to drop here
# because its components were already extracted into separate columns above.
features_to_drop = [
    "ID",
    "Start_Time",
    "End_Time",
    "End_Lat",
    "End_Lng",
    "Description",
    "Street",
    "County",
    "Zipcode",
    "Weather_Timestamp",
    "Wind_Chill(F)",
    "Turning_Loop",
    "Sunrise_Sunset",
    "Nautical_Twilight",
    "Astronomical_Twilight",
]
# DataFrame.drop ignores names that are not present in the schema.
df = df.drop(*features_to_drop)
df.show(5)

+--------+------------------+-------------------+------------+---------------+-----+--------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+--------------+----+-----+-------+---+----+------+
|Severity|         Start_Lat|          Start_Lng|Distance(mi)|           City|State|Temperature(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Civil_Twilight|Year|Month|Weekday|Day|Hour|Minute|
+--------+------------------+-------------------+------------+---------------+-----+--------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-----