In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName('Time_Analysis')
    .getOrCreate()
)

# Load the cleaned dataset: the first row supplies the column names and
# Spark infers each column's type from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dataset/CleanData/cleandata.csv")
)

# Replace Start_Time with a timestamp parsed using the "M/d/yyyy H:mm" pattern
start_time_ts = to_timestamp("Start_Time", "M/d/yyyy H:mm")
df = df.withColumn("Start_Time", start_time_ts)

In [2]:
# Map the numeric Severity codes 1-4 to text labels.
# The when-chain has no otherwise() branch, so any value outside 1-4
# ends up as null in the resulting column.
severity_code = col("Severity")
severity_label = (
    when(severity_code == 1, "Low")
    .when(severity_code == 2, "Moderate")
    .when(severity_code == 3, "High")
    .when(severity_code == 4, "Severe")
)
df = df.withColumn("Severity", severity_label)

# Print the schema, then show the row count as the cell's output
df.printSchema()
df.count()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: string (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Direction: string (nullable =

492326

In [3]:
# Derive the time-bucket columns from Start_Time.
# Note: "Day" is produced by dayofweek(), so it holds a day-of-week number
# rather than the day of the month.
start_time = col("Start_Time")
df = (
    df.withColumn("Hour", hour(start_time))
    .withColumn("Day", dayofweek(start_time))
    .withColumn("Month", month(start_time))
    .withColumn("Year", year(start_time))
)

In [4]:
# Count accidents for every (Year, Month, Day, Hour, Severity) combination
time_severity_keys = ["Year", "Month", "Day", "Hour", "Severity"]
accidents_by_time = df.groupBy(*time_severity_keys).count()
accidents_by_time.show()

+----+-----+---+----+--------+-----+
|Year|Month|Day|Hour|Severity|count|
+----+-----+---+----+--------+-----+
|2016|    9|  3|  10|    High|   21|
|2021|    9|  3|  20|Moderate|   18|
|2017|    7|  6|  13|Moderate|   26|
|2020|    4|  6|   6|Moderate|   39|
|2019|   12|  6|  16|Moderate|   58|
|2021|   11|  3|   6|    High|   10|
|2021|    8|  6|  12|Moderate|   57|
|2018|    9|  3|  15|Moderate|   31|
|2016|    6|  3|  18|    High|    4|
|2017|    1|  3|  13|Moderate|   23|
|2020|   10|  7|   7|Moderate|   37|
|2019|   10|  5|   7|Moderate|  123|
|2020|    2|  5|   3|    High|    2|
|2019|   11|  4|   5|    High|   10|
|2020|    1|  2|  19|Moderate|   19|
|2020|    3|  1|  15|Moderate|   18|
|2021|    4|  3|   9|Moderate|   40|
|2022|    3|  7|   6|    High|    2|
|2022|    4|  4|  18|     Low|    3|
|2018|    1|  3|  18|Moderate|   36|
+----+-----+---+----+--------+-----+
only showing top 20 rows



In [5]:
# Export the aggregated counts as CSV with a header row, replacing any
# previous output; coalesce(1) collapses the result to a single partition
# before writing
(
    accidents_by_time
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("dataset/visualize_data/accidents_by_time")
)

In [6]:
# Stop the Spark session now that the export above has been written
spark.stop()