In [0]:
# Read the `flights_small` table from the catalog into a Spark DataFrame.
flights_df = spark.read.table(
    "databricks_airline_performance_data.v01.flights_small"
)

In [0]:
# Print the column names and types of the loaded table.
flights_df.printSchema()

In [0]:
# Preview the first 10 rows of the loaded table.
flights_preview_df = flights_df.limit(10)
display(flights_preview_df)

In [0]:
# Narrow the table to the columns used by the rest of the notebook.
flights_df_required_cols = flights_df.select(
    "Year",
    "Month",
    "DayofMonth",
    "DepTime",
    "FlightNum",
    "ActualElapsedTime",
    "CRSElapsedTime",
    "ArrDelay",
)


In [0]:
# Row count of the projected frame, before any null / "NA" filtering.
flights_df_required_cols.count()

In [0]:
# SQL expressions for the temp view: DepTime, ActualElapsedTime and ArrDelay
# are cast to INT; the remaining columns are passed through unchanged.
flights_temp_exprs = [
    "Year",
    "Month",
    "DayofMonth",
    "CAST(DepTime AS INT) AS DepTime",
    "FlightNum",
    "CAST(ActualElapsedTime AS INT) AS ActualElapsedTime",
    "CRSElapsedTime",
    "CAST(ArrDelay AS INT) AS ArrDelay",
]

# Register the typed projection as the temp view "flights_temp" so it can be
# queried with Spark SQL.
flights_temp_df = flights_df_required_cols.selectExpr(*flights_temp_exprs)
flights_temp_df.createOrReplaceTempView("flights_temp")

In [0]:
# `col` is needed by the filters below; import it in this cell so the notebook
# survives Restart & Run All (it was previously first imported in a later cell).
from pyspark.sql.functions import col

# Build the cleaned frame in ONE expression starting from the projected input,
# so the cell is idempotent and no intermediate result is silently discarded.
#
# 1. dropna: remove rows holding a real null in any of the three listed
#    columns. (Previously each dropna restarted from flights_df_required_cols,
#    so only the last one — DepTime — actually took effect.)
# 2. filter: remove rows where a column holds the literal string "NA", which
#    this notebook treats as a missing-value marker. A comparison against null
#    evaluates to null, so these filters also drop null rows for those columns.
not_null_flights_df = (
    flights_df_required_cols
    .dropna(
        how="any",
        subset=["CRSElapsedTime", "ActualElapsedTime", "DepTime"],
    )
    .filter(col("DepTime") != "NA")
    .filter(col("ActualElapsedTime") != "NA")
    .filter(col("ArrDelay") != "NA")
)

# Sanity check: after the filters above this should display no rows.
display(not_null_flights_df.filter(col("ActualElapsedTime") == "NA").limit(10))

display(not_null_flights_df)



In [0]:
from pyspark.sql.functions import col

# Keep only the rows whose ArrDelay survives a cast to integer: rows where the
# cast result is null are dropped. Only ArrDelay is checked here.
flights_with_valid_data_df1 = not_null_flights_df.filter(
    col("ArrDelay").cast("integer").isNotNull()
)

flights_with_valid_data_df1.printSchema()
# Preview the frame built in this cell, so the displayed rows correspond to the
# schema printed just above.
display(flights_with_valid_data_df1.limit(100))

In [0]:
from pyspark.sql.functions import col

# Replace ArrDelay, ActualElapsedTime and DepTime with integer-typed versions
# of themselves; all other columns are left untouched.
arr_delay_int = col("ArrDelay").cast("integer")
actual_elapsed_int = col("ActualElapsedTime").cast("integer")
dep_time_int = col("DepTime").cast("integer")

clean_flights_df = (
    not_null_flights_df
    .withColumn("ArrDelay", arr_delay_int)
    .withColumn("ActualElapsedTime", actual_elapsed_int)
    .withColumn("DepTime", dep_time_int)
)

# Confirm the three columns now report an integer type.
clean_flights_df.printSchema()

In [0]:
from pyspark.sql.functions import make_timestamp_ntz, substr, lpad, lit, col

# DepTime left-padded with "0" to four characters; characters 1-2 feed the
# hour argument and characters 3-4 feed the minute argument below.
dep_time_padded = lpad(col("DepTime"), 4, "0")
dep_hour = substr(dep_time_padded, lit(1), lit(2))
dep_minute = substr(dep_time_padded, lit(3), lit(2))

# Combine the date parts and departure time into a single FlightDateTime
# column (seconds fixed at 0).
flight_datetime = make_timestamp_ntz(
    col("Year"),
    col("Month"),
    col("DayofMonth"),
    dep_hour,
    dep_minute,
    lit(0),
)

# Add the timestamp, then drop the source columns it was assembled from.
flights_with_datetime_df = (
    clean_flights_df
    .withColumn("FlightDateTime", flight_datetime)
    .drop("Year", "Month", "DayofMonth", "DepTime")
)

# Show up to 10 rows of not_null_flights_df whose DepTime is the literal "NA".
dep_time_na_rows = not_null_flights_df.where(col("DepTime") == "NA")
display(dep_time_na_rows.limit(10))

# Keep only the rows whose DepTime is not the literal "NA", then display them.
not_null_flights_df_depTime = not_null_flights_df.where(col("DepTime") != "NA")
display(not_null_flights_df_depTime)