In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName('Infras_Analysis')
    .getOrCreate()
)

# Load the cleaned CSV into a DataFrame; the first row is the header and
# column types are inferred by Spark
df = spark.read.csv(
    "dataset/CleanData/cleandata.csv",
    header=True,
    inferSchema=True,
)

# Parse Start_Time into a timestamp using the "M/d/yyyy H:mm" pattern
start_time_ts = to_timestamp("Start_Time", "M/d/yyyy H:mm")
df = df.withColumn("Start_Time", start_time_ts)

In [2]:
# Replace the numeric Severity codes 1-4 with text labels.
# Values with no matching branch (anything other than 1-4, including null)
# become null because the when-chain has no otherwise().
#
# The cell overwrites Severity in place, so it is guarded on the column type:
# the mapping is applied only while Severity is not yet a string column.
# Re-running the cell therefore leaves the already-labelled column untouched
# instead of applying the numeric comparisons to the text labels.
if dict(df.dtypes)["Severity"] != "string":
    df = df.withColumn(
        "Severity",
        when(col("Severity") == 1, "Low")
        .when(col("Severity") == 2, "Moderate")
        .when(col("Severity") == 3, "High")
        .when(col("Severity") == 4, "Severe")
    )

# Show the resulting schema and the total number of rows
df.printSchema()
df.count()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: string (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Direction: string (nullable =

492326

In [3]:
# Analyse accident severity by road infrastructure.
# Infrastructure flag columns that are converted from boolean to 1/0.
boolean_columns = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", 
                   "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop"]

# Convert every flag column in ONE projection instead of calling withColumn
# once per column in a loop (each withColumn call adds its own projection to
# the plan). Column order and names are preserved; non-flag columns pass
# through unchanged and are backtick-quoted so special characters in their
# names are taken literally.
flag_columns = set(boolean_columns)
df = df.select(*[
    # `== True` builds a Spark Column comparison (not a Python identity test);
    # anything that is not true, including null, falls through to 0.
    when(col(name) == True, 1).otherwise(0).alias(name)  # noqa: E712
    if name in flag_columns
    else col("`" + name + "`")
    for name in df.columns
])

# Preview the processed data: ID, Severity and the 1/0 flag columns
accidents_by_infras = df.select("ID", "Severity", *boolean_columns)
accidents_by_infras.show()

+---------+--------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+------------+
|       ID|Severity|Amenity|Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|
+---------+--------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+------------+
|A-2047758|Moderate|      0|   0|       0|       0|       0|      0|      0|         0|      0|   0|              0|             1|           0|
|A-4694324|Moderate|      0|   0|       0|       0|       0|      0|      0|         0|      0|   0|              0|             0|           0|
|A-5006183|Moderate|      0|   0|       0|       0|       0|      0|      0|         0|      0|   0|              0|             1|           0|
|A-4237356|Moderate|      0|   0|       0|       0|       0|      0|      0|         0|      0|   0|              0|             0

In [4]:
# Export the result as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the DataFrame to a single partition before writing.
output_dir = "dataset/visualize_data/accidents_by_infras"
single_partition = accidents_by_infras.coalesce(1)
single_partition.write.csv(output_dir, header=True, mode="overwrite")

In [5]:
# Stop the Spark session
spark.stop()