In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when, col

In [2]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("CrashRemedyPrediction")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.sql.debug.maxToStringFields", "200")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/08 10:09:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50176)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
  File "/opt/anaconda3/lib/python3.12/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/opt/anaconda3/lib/python3.12/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-pac

In [3]:
import os

# Root folder holding the project's parquet exports. The default is the
# original location; set BIGDATA_PARQUET_DIR to run the notebook on another
# machine without editing this cell.
PARQUET_DIR = os.environ.get(
    "BIGDATA_PARQUET_DIR",
    "/Users/swaifahaque/Documents/BigDataProject/parquet",
)

# Load the two sources: ICBC data and the TAS "no_cf" data.
df_icbc = spark.read.parquet(os.path.join(PARQUET_DIR, "icbc/"))
df_tas = spark.read.parquet(os.path.join(PARQUET_DIR, "TAS/no_cf"))

                                                                                

In [4]:
# Give the ICBC columns source-specific names ahead of the join.
df_icbc = (
    df_icbc
    .withColumnRenamed("municipality", "muni_icbc")
    .withColumnRenamed("region", "region_icbc")
    .withColumnRenamed("month", "icbc_month")
)

# Do the same for the TAS columns.
df_tas = (
    df_tas
    .withColumnRenamed("Municipality", "muni_tas")
    .withColumnRenamed("Month", "tas_month")
    .withColumnRenamed("Year", "tas_year")
    .withColumnRenamed("Region", "region_tas")
)

In [5]:
from pyspark.sql.functions import when, upper, trim
# Normalise municipality names (trimmed, upper-cased) so both sources compare equal.
df_icbc = df_icbc.withColumn("muni_icbc_norm", upper(trim(col("muni_icbc"))))
df_tas = df_tas.withColumn("muni_tas_norm", upper(trim(col("muni_tas"))))

# Match on municipality and year only; month is left out of the key for now.
# NOTE(review): with such a coarse key each ICBC row may pair with many TAS
# rows -- the recorded count further down is ~1.06e9 rows. Confirm that this
# multiplication is intended before trusting downstream statistics.
same_municipality = df_icbc["muni_icbc_norm"] == df_tas["muni_tas_norm"]
same_year = df_icbc["year"] == df_tas["tas_year"].cast("int")

# Left join: every ICBC row is kept even when no TAS row matches.
df_joined = df_icbc.join(df_tas, on=same_municipality & same_year, how="left")

In [6]:

# Name each rule's condition so the labelling chain below reads as a decision list.
crash_config = col("Crash Configuration")
road_condition = col("Road Condition")

wet_rear_end = (crash_config == "Rear End") & (road_condition == "Wet")
pedestrian_turn = (col("Pedestrian Involved") == "Yes") & crash_config.contains("Turn")
cyclist_involved = col("Cyclist Involved") == "Yes"
motorcycle_involved = col("Motorcycle Involved") == "Yes"
icy_road = road_condition == "Icy"
hit_and_run = col("Hit And Run Indicator") == "Yes"

# Rules are evaluated top to bottom: the first matching rule wins, and rows
# matching none of them fall through to "More Patrols".
remedy_label = (
    when(wet_rear_end, "Install Warning Sign")
    .when(pedestrian_turn, "Add Pedestrian Crossing")
    .when(cyclist_involved, "Add Bike Lane")
    .when(motorcycle_involved, "Improve Road Surface")
    .when(icy_road, "Apply Anti-Ice Treatment")
    .when(hit_and_run, "Install CCTV")
    .otherwise("More Patrols")
)

df_model = df_joined.withColumn("remedy", remedy_label)

In [7]:
# Categorical crash attributes used as model inputs; the order here is the
# order in which they are later indexed and assembled.
selected_features = [
    "Crash Configuration", "Cyclist Involved", "Motorcycle Involved",
    "Pedestrian Involved", "Road Condition",
]

In [8]:
# Drop rows missing any model input or the target label.
required_columns = selected_features + ["remedy"]
df_clean = df_model.dropna(subset=required_columns)

In [9]:
# Row count of the cleaned frame (the recorded run printed 1062148288).
df_clean.count()

                                                                                

1062148288

In [10]:
# Continue with a 0.5% random sample (no replacement, fixed seed) of the cleaned rows.
df_sampled = df_clean.sample(
    withReplacement=False,
    fraction=0.005,
    seed=42,
)

In [11]:
# One indexer per categorical feature, writing to "<feature>_idx".
# handleInvalid="keep" is set so values not seen during fitting do not fail the transform.
indexers = [
    StringIndexer(inputCol=feature, outputCol=f"{feature}_idx", handleInvalid="keep")
    for feature in selected_features
]

# The target column "remedy" is indexed into the numeric "label" column.
label_indexer = StringIndexer(
    inputCol="remedy",
    outputCol="label",
    handleInvalid="keep",
)

# Step 5: Assemble the indexed features into a single vector column.
feature_cols = [f"{feature}_idx" for feature in selected_features]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Step 6: Define the classifier.
clf = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxBins=512,
)

In [12]:
# Stage order: feature indexers, label indexer, vector assembler, then the classifier.
pipeline = Pipeline(stages=[*indexers, label_indexer, assembler, clf])

In [13]:
# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = df_sampled.randomSplit(split_weights, seed=42)

# Fit every pipeline stage on the training portion only.
model = pipeline.fit(train_data)

25/04/08 10:09:26 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [14]:
# Apply the fitted pipeline to the held-out split.
preds = model.transform(test_data)

In [15]:
# Accuracy on the held-out split, comparing the "prediction" column with "label".
# NOTE(review): "remedy" is generated by fixed rules over largely the same
# columns that are used as features, so this score mostly reflects how well the
# tree recovers those rules -- confirm that is the intended experiment.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(preds)
print(f"Remedy Prediction Accuracy: {accuracy:.4f}")



Remedy Prediction Accuracy: 0.7793


                                                                                

In [17]:
# The classifier was added last to the pipeline, so it is the final fitted stage.
tree_model = model.stages[-1]

# Pair each importance score with the feature column at the same position
# and print one "<feature>: <score>" line per feature.
importances = tree_model.featureImportances.toArray()
for name, score in zip(feature_cols, importances):
    print(f"{name}: {score:.4f}")

Crash Configuration_idx: 0.2654
Cyclist Involved_idx: 0.3593
Motorcycle Involved_idx: 0.2184
Pedestrian Involved_idx: 0.0001
Road Condition_idx: 0.1568


In [18]:
from pyspark.ml.feature import StringIndexerModel
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# STEP 1: Locate the fitted indexer stage that writes the "label" column
# (first match wins; None when the pipeline has no such stage).
label_indexer_model = next(
    (
        fitted_stage
        for fitted_stage in model.stages
        if isinstance(fitted_stage, StringIndexerModel)
        and fitted_stage.getOutputCol() == "label"
    ),
    None,
)

if label_indexer_model is None:
    raise ValueError("No StringIndexerModel found with outputCol='label'")

# STEP 2: Remedy names ordered by class index (used to turn an index back into a label).
labels = label_indexer_model.labels


In [21]:
def decode_prediction(index):
    """Map a numeric class index back to its remedy label.

    The lookup uses the module-level ``labels`` list.

    Args:
        index: Predicted class index (any value ``int()`` accepts), or ``None``.

    Returns:
        ``labels[int(index)]`` when ``0 <= int(index) < len(labels)``;
        otherwise the string ``"Unknown"`` (also returned for ``None``).
    """
    if index is None:
        return "Unknown"
    position = int(index)
    # Reject negatives explicitly: Python indexing would otherwise wrap around
    # and silently return a label from the end of the list.
    if 0 <= position < len(labels):
        return labels[position]
    return "Unknown"

# Wrap the Python decoder as a Spark UDF returning a string. `decode_udf` was
# referenced below without being defined in any visible cell, which raises
# NameError when the notebook is run top to bottom on a fresh kernel.
decode_udf = udf(decode_prediction, StringType())

# Keep only rows that have coordinates and attach the decoded remedy label.
hotspot_preds = (
    preds.select("latitude", "longitude", "city", "street", "prediction")
    .filter(col("latitude").isNotNull() & col("longitude").isNotNull())
    .withColumn("predicted_remedy", decode_udf(col("prediction")))
)

# STEP 5: Preview
hotspot_preds.show(30, truncate=False)

[Stage 52:>                                                         (0 + 1) / 1]

+---------+-----------+-------------------------------+--------------+----------+--------------------+
|latitude |longitude  |city                           |street        |prediction|predicted_remedy    |
+---------+-----------+-------------------------------+--------------+----------+--------------------+
|49.12274 |-122.823318|66 AVE, SURREY                 |66 AVE        |0.0       |More Patrols        |
|49.12274 |-122.823318|66 AVE, SURREY                 |66 AVE        |0.0       |More Patrols        |
|49.12274 |-122.823318|66 AVE, SURREY                 |66 AVE        |0.0       |More Patrols        |
|49.12274 |-122.823318|66 AVE, SURREY                 |66 AVE        |0.0       |More Patrols        |
|49.12274 |-122.823318|66 AVE, SURREY                 |66 AVE        |0.0       |More Patrols        |
|49.311704|-123.021179|MT SEYMOUR PKY, NORTH VANCOUVER|MT SEYMOUR PKY|4.0       |Improve Road Surface|
|49.192002|-123.116139|BRIDGEPORT RD, RICHMOND        |BRIDGEPORT RD |0.0

                                                                                

25/04/08 16:13:13 WARN TransportChannelHandler: Exception in connection from d207-023-175-169.wireless.sfu.ca/207.23.175.169:50120
java.io.IOException: Operation timed out
	at java.base/sun.nio.ch.SocketDispatcher.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:47)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:340)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:294)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:269)
	at java.base/sun.nio.ch.SocketChannelImpl.implRead(SocketChannelImpl.java:426)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:493)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:254)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:357)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioE