In [54]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as py
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [55]:
# Obtain the Spark session for this notebook (getOrCreate reuses a live one if present).
spark = (
    SparkSession.builder
    .appName('flights')
    .getOrCreate()
)

In [56]:
def count_null(df):
    """Print a one-row table with the number of missing values in each column.

    For columns whose dtype is "timestamp" or "date" only nulls are counted;
    for every other column a value counts as missing when it is NaN or null.
    The result is displayed with ``show()``; nothing is returned.
    """
    temporal_types = ("timestamp", "date")
    missing_counts = []
    for name, dtype in df.dtypes:
        if dtype in temporal_types:
            # isnan is not applied to date/timestamp columns.
            is_missing = F.col(name).isNull()
        else:
            is_missing = F.isnan(name) | F.col(name).isNull()
        # when() yields a non-null value only for missing rows, so count()
        # tallies exactly those rows.
        missing_counts.append(F.count(F.when(is_missing, name)).alias(name))
    df.select(*missing_counts).show()

In [57]:
def select_features(path="features_analysis.txt"):
    """Read the list of feature (column) names from a text file.

    Each non-blank line contributes its first whitespace-separated token;
    anything after that token on the line is ignored.

    Args:
        path: File to read. Defaults to "features_analysis.txt" in the
            current working directory.

    Returns:
        list[str]: Feature names in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as f:
        # str.split() with no arguments discards leading/trailing whitespace
        # and returns [] for blank lines, so no separate strip/empty check
        # is needed.
        return [tokens[0] for tokens in (line.split() for line in f) if tokens]


In [58]:
def clean_data(df, features):
    """Project ``df`` onto ``features`` and drop incomplete rows.

    Rows are removed with ``na.drop()`` using its default arguments, applied
    after the projection, so only the selected columns are considered.
    """
    return df.select(features).na.drop()

In [59]:
def join_airports(df, airports):
    """Attach airport details for both the origin and the destination.

    ``df`` is joined to ``airports`` twice on the airport code: first with
    ``Origin == IATA``, then with ``Dest == IATA``. After each join the
    airport columns are renamed with an ``ORIGIN_`` / ``DEST_`` prefix and
    the ``IATA``, ``CITY`` and ``COUNTRY`` columns are dropped.

    No ``how`` is passed to ``join``, so the join type is Spark's default.
    """
    # (airport column, name used after the ORIGIN_/DEST_ prefix)
    rename_plan = (
        ("LATITUDE", "LATITUDE"),
        ("LONGITUDE", "LONGITUDE"),
        ("STATE", "STATE"),
        ("AIRPORT", "AIRPORT_FULL_NAME"),
        ("STATE_FULL_NAME", "STATE_FULL_NAME"),
    )
    for code_column, prefix in (("Origin", "ORIGIN"), ("Dest", "DEST")):
        df = df.join(airports, df[code_column] == airports.IATA)
        for airport_column, suffix in rename_plan:
            df = df.withColumnRenamed(airport_column, prefix + "_" + suffix)
        # Dropped before the next pass so the second join does not collide
        # with leftover airport columns.
        df = df.drop("IATA", "CITY", "COUNTRY")
    return df

In [60]:
# Collect the paths of all files with a .csv extension in the data directory.
# NOTE(review): this pattern points at "../data.nosync", while the later cells
# read from and write to "../../data.nosync" — confirm which one is intended.
import glob
files = glob.glob("../data.nosync/*.csv")

In [61]:
from pyspark.sql.functions import (
    date_format,
    col,
    concat_ws,
    to_date,
    unix_timestamp,
)

# Switch the session to the legacy datetime parser/formatter before any of the
# date handling below runs.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

def preprocess_normal_data(path, airports_path="../../csv_files/airports.csv"):
    """Load flight CSV data and return the cleaned, airport-enriched rows.

    Steps:
      1. Read the CSV(s) at ``path`` with a header row and inferred schema.
      2. Keep only rows where ``Diverted == 0``.
      3. Keep the columns listed by ``select_features()`` and drop incomplete
         rows (``clean_data``).
      4. Join origin/destination airport details (``join_airports``).
      5. Add a ``WeekofMonth`` date column (see the comment below).

    Args:
        path: Location of the flight CSV data, passed to ``spark.read.csv``.
        airports_path: Location of the airports CSV. Defaults to the path
            that was previously hard-coded.

    Returns:
        The resulting Spark DataFrame.
    """
    flights = spark.read.csv(path, inferSchema=True, header=True)
    df_normal = flights.filter(flights['Diverted'] == 0)
    df_normal = clean_data(df_normal, select_features())

    airports = spark.read.csv(airports_path, inferSchema=True, header=True)
    df_normal = join_airports(df_normal, airports)

    # Build "<Year>-<Month>-<week of month>" and parse it as 'yyyy-MM-dd', so
    # the week-of-month number ends up in the day field of the resulting date.
    # NOTE(review): presumably this depends on the LEGACY timeParserPolicy set
    # on the session before this definition ('W' pattern, unpadded month) —
    # confirm before changing that setting.
    week_label = concat_ws(
        "-",
        df_normal.Year,
        df_normal.Month,
        date_format(col("FlightDate"), "W").cast("string"),
    )
    df_normal = df_normal.withColumn("WeekofMonth", week_label)
    df_normal = df_normal.withColumn(
        "WeekofMonth",
        to_date(unix_timestamp(col('WeekofMonth'), 'yyyy-MM-dd').cast("timestamp")),
    )
    return df_normal

In [64]:
# Run the preprocessing pipeline on the data directory and discard the "_c0"
# column (drop is a no-op when the column is absent).
df_cleaned = preprocess_normal_data("../../data.nosync").drop("_c0")

                                                                                

In [65]:
# Save the cleaned data as CSV with a header row.
# mode="overwrite" keeps this cell re-runnable: without an explicit mode the
# writer raises an error when the output path already exists.
df_cleaned.write.csv(
    "../../data.nosync/cleaned/cleaned_flights.csv",
    header=True,
    mode="overwrite",
)

                                                                                