In [1]:
import os
import subprocess

java_home = subprocess.check_output(["/usr/libexec/java_home", "-v", "17"]).strip().decode('utf-8')

# Set JAVA_HOME and PATH
os.environ["JAVA_HOME"] = java_home
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ["PATH"]
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local pyspark-shell"

# Verify JAVA_HOME and Java version
print("JAVA_HOME:", os.environ['JAVA_HOME'])
!java -version

JAVA_HOME: /opt/homebrew/Cellar/openjdk@17/17.0.13/libexec/openjdk.jdk/Contents/Home
openjdk version "17.0.13" 2024-10-15
OpenJDK Runtime Environment Homebrew (build 17.0.13+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.13+0, mixed mode, sharing)


In [28]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('CsvColumnSelection')
    .getOrCreate()
)
# Restrict Spark's log output to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

# Read the crash CSV: the first row is the header and column types are inferred.
df_spark = spark.read.csv(
    'Traffic_Crashes.csv',
    inferSchema=True,
    header=True,
)

                                                                                

In [None]:
# basic analysis: share of crash rows with at least one person killed

# Rows whose number_killed value is positive.
filtered_death = df_spark.filter(df_spark["number_killed"] > 0)

# Counted in the same order as the single-expression form: fatal rows first,
# then all rows.
fatal_rows = filtered_death.count()
all_rows = df_spark.count()

print(f'the fatality rate is {fatal_rows / all_rows}')



the fatality rate is 0.009359466693265506


In [19]:
from pyspark.sql.functions import sum, col

def get_highest_count(df, feature, filter, res):
    """Return the `feature` group with the largest summed `filter` value.

    Groups `df` by the `feature` column, sums the `filter` column within each
    group under the alias `res`, sorts the groups by `res` in descending
    order, and returns the first row.

    Args:
        df: DataFrame to aggregate.
        feature: Name of the column to group by.
        filter: Name of the column to sum per group. The parameter name
            shadows the builtin `filter` inside this function; it is kept so
            existing callers continue to work.
        res: Alias given to the summed column in the result.

    Returns:
        The top row of the sorted aggregate, indexable by `feature` and `res`.
        `first()` returns None when the aggregate has no rows, so callers
        should not index the result of an empty frame.
    """
    # Use the namespaced Spark functions rather than the bare global `sum`:
    # the bare name is resolved at call time and only refers to Spark's
    # aggregate while the notebook-level import is shadowing the builtin.
    from pyspark.sql import functions as F

    totals = df.groupBy(feature).agg(F.sum(filter).alias(res))
    return totals.orderBy(F.col(res).desc()).first()

In [30]:
from pyspark.sql.functions import col

# Street (primary_rd) with the largest summed number_injured.
road_with_most_injuries = get_highest_count(
    df=df_spark,
    feature="primary_rd",
    filter="number_injured",
    res="total_injuries",
)
print(f'the most injuries occured on street: {road_with_most_injuries["primary_rd"]} with {road_with_most_injuries["total_injuries"]} injuries.')

# Road condition (road_cond_1) with the largest summed number_injured.
road_condition_most_injuries = get_highest_count(
    df=df_spark,
    feature="road_cond_1",
    filter="number_injured",
    res="total_injuries",
)
print(f'the most injuries occured with the following road condition: "{road_condition_most_injuries["road_cond_1"]}" with {road_condition_most_injuries["total_injuries"]} injuries')

# Intersection-distance bucket (intersection) with the largest summed number_injured.
distance_intersect_with_most_injuries = get_highest_count(
    df=df_spark,
    feature="intersection",
    filter="number_injured",
    res="total_injuries",
)
print(f'the most injuries had the following distance away from an intersection: {distance_intersect_with_most_injuries["intersection"]} with {distance_intersect_with_most_injuries["total_injuries"]} injuries.')

the most injuries occured on street: MISSION ST with 3255 injuries.
the most injuries occured with the following road condition: "No Unusual Condition" with 71541 injuries
the most injuries had the following distance away from an intersection: Intersection <= 20ft with 50277 injuries.


In [32]:
# Street (primary_rd) with the largest summed number_killed.
road_with_most_deaths = get_highest_count(df_spark, "primary_rd", "number_killed", "total_deaths")
print(
    f'the most deaths occured on street: {road_with_most_deaths["primary_rd"]} with {road_with_most_deaths["total_deaths"]} deaths.'
)

# Road condition (road_cond_1) with the largest summed number_killed.
# The message names the road condition; it previously reused the "on street"
# wording from the block above.
road_cond_with_most_deaths = get_highest_count(
    df_spark, "road_cond_1", "number_killed", "total_deaths"
)
print(
    f'the most deaths occured with the following road condition: {road_cond_with_most_deaths["road_cond_1"]} with {road_cond_with_most_deaths["total_deaths"]} deaths.'
)

# Intersection-distance bucket (intersection) with the largest summed number_killed.
# The message reports deaths; it previously said "injuries" although the
# aggregated column is number_killed.
distance_intersect_with_most_deaths = get_highest_count(
    df_spark, "intersection", "number_killed", "total_deaths"
)
print(
    f'the most deaths had the following distance away from an intersection: {distance_intersect_with_most_deaths["intersection"]} with {distance_intersect_with_most_deaths["total_deaths"]} deaths.'
)

the most deaths occured on street: MISSION ST with 27 deaths.
the most deaths occured on street: No Unusual Condition with 530 deaths.
the most injuries had the following distance away from an intersection: Intersection <= 20ft with 341 injuries.
