In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start (or reuse) the Spark session for this analysis.
session_builder = SparkSession.builder.appName('Location_Analysis')
spark = session_builder.getOrCreate()

# Load the cleaned CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("dataset/CleanData/cleandata.csv")

# Convert Start_Time to a timestamp using the "M/d/yyyy H:mm" pattern.
# NOTE(review): confirm the pattern matches how Start_Time is stored in the
# CSV; the file itself is not visible from this notebook.
start_time_parsed = to_timestamp(col("Start_Time"), "M/d/yyyy H:mm")
df = df.withColumn("Start_Time", start_time_parsed)

In [2]:
# Replace the numeric Severity codes (1-4) with readable text labels.
#
# This cell overwrites "Severity" in place, so it has to be safe to run more
# than once. On a second run the column already holds labels; comparing those
# against the integer codes matches no branch (or fails outright under ANSI
# SQL mode), which would wipe out every value. Relabel only while the column
# is not yet a string column.
# NOTE(review): assumes inferSchema loaded the raw codes as a numeric column;
# confirm against the CSV.
severity_dtype = dict(df.dtypes)["Severity"]
if severity_dtype != "string":
    severity_code = col("Severity")
    df = df.withColumn(
        "Severity",
        # No otherwise() branch: codes outside 1-4 become null.
        when(severity_code == 1, "Low")
        .when(severity_code == 2, "Moderate")
        .when(severity_code == 3, "High")
        .when(severity_code == 4, "Severe"),
    )

# Show the resulting schema, then display the row count as the cell output.
df.printSchema()
df.count()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: string (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Direction: string (nullable =

492326

In [3]:
# Count accidents for every (State, City, Severity) combination.
geo_group_keys = ["State", "City", "Severity"]
accidents_by_geolocation = df.groupBy(*geo_group_keys).count()

# Preview the first 20 rows of the aggregated counts.
accidents_by_geolocation.show(n=20)

+-----+-------------+--------+-----+
|State|         City|Severity|count|
+-----+-------------+--------+-----+
|   CA|    San Diego|Moderate| 2757|
|   MN|   Albert Lea|Moderate|   62|
|   CA|  Bakersfield|  Severe|   14|
|   CA|    San Dimas|Moderate|  195|
|   CA|  West Covina|Moderate|  265|
|   CA|        Vista|Moderate|  251|
|   VA|      Henrico|    High|   45|
|   IL|Downers Grove|Moderate|  167|
|   SC|         York|Moderate|   79|
|   MT|       Hysham|Moderate|    1|
|   CA|       Orland|Moderate|   42|
|   GA|      Zebulon|    High|    1|
|   CA|       Oxnard|Moderate|  206|
|   NJ| Little Falls|Moderate|   24|
|   LA|      Convent|Moderate|    1|
|   IL| Vernon Hills|Moderate|   49|
|   CA|      Oakdale|Moderate|   91|
|   MA|  West Newton|Moderate|    9|
|   PA|Port Allegany|Moderate|    2|
|   CA|      Truckee|    High|   11|
+-----+-------------+--------+-----+
only showing top 20 rows



In [4]:
# Export the aggregated counts as CSV with a header row, overwriting any
# existing output at the target path. coalesce(1) reduces the result to a
# single partition before it is written.
csv_writer = accidents_by_geolocation.coalesce(1).write.mode("overwrite").option("header", True)
csv_writer.csv("dataset/visualize_data/accidents_by_geolocation")

In [5]:
# Stop the Spark session once the analysis is finished.
spark.stop()