In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('Weather_Analysis')
    .getOrCreate()
)

# Load the cleaned CSV (first row is the header, column types are inferred)
# and convert Start_Time to a timestamp using the "M/d/yyyy H:mm" pattern.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dataset/CleanData/cleandata.csv")
    .withColumn("Start_Time", to_timestamp(col("Start_Time"), "M/d/yyyy H:mm"))
)

In [2]:
# Replace the numeric Severity code with a text label:
#   1 -> Low, 2 -> Moderate, 3 -> High, 4 -> Severe.
#
# The column is overwritten in place, so this cell has to be safe to re-run.
# Matching on the string form of the value, and accepting both the numeric
# code and the label itself, keeps already-labelled rows unchanged instead of
# comparing text such as "Low" against an integer (which would produce NULL,
# or a cast error under ANSI mode) on a second execution.
# Values that match neither a code nor a label still become NULL, because the
# chain deliberately has no otherwise() branch.
severity_text = col("Severity").cast("string")
df = df.withColumn(
    "Severity",
    when(severity_text.isin("1", "Low"), "Low")
    .when(severity_text.isin("2", "Moderate"), "Moderate")
    .when(severity_text.isin("3", "High"), "High")
    .when(severity_text.isin("4", "Severe"), "Severe")
)

# Show the resulting schema, then display the row count as the cell output.
df.printSchema()
df.count()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: string (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Direction: string (nullable =

492326

In [3]:
# Average weather conditions per accident severity level.
# Maps each source weather column to the alias of its averaged output column.
weather_metrics = {
    "Temperature(F)": "Avg_Temperature",
    "Visibility(mi)": "Avg_Visibility",
    "Wind_Speed(mph)": "Avg_WindSpeed",
}
weather_analysis = df.groupBy("Severity").agg(
    *[
        avg(source_column).alias(output_column)
        for source_column, output_column in weather_metrics.items()
    ]
)
weather_analysis.show()

+--------+------------------+-----------------+------------------+
|Severity|   Avg_Temperature|   Avg_Visibility|     Avg_WindSpeed|
+--------+------------------+-----------------+------------------+
|    High|  62.2155888757205|9.111724321085584| 8.257717716592467|
|     Low| 72.19688657537068|9.520051882824763| 7.077954744936069|
|  Severe|58.231902150944954| 9.09713639947612| 7.654875313408872|
|Moderate| 61.52252127932248|9.082440646316048|7.5660346799014935|
+--------+------------------+-----------------+------------------+



In [4]:
# Export the aggregated result as CSV with a header row, replacing any output
# already present at the target path. coalesce(1) reduces the result to one
# partition before it is written.
(
    weather_analysis
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("dataset/visualize_data/weather_analysis")
)

In [5]:
# Stop the Spark session now that the analysis and export are finished.
spark.stop()