## Cleaning The Dataset


Include all the necessary imports


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# None lifts pandas' display limits, so a displayed frame is shown with all of its rows and columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

Read the csv into a pandas dataframe


In [2]:
# Load the flight records; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("Flight_delay.csv")

Drop the unnecessary columns


In [3]:
# Columns that are removed before any feature engineering takes place.
DROPPED_COLUMNS = [
    "ArrTime",
    "Airline",
    "FlightNum",
    "ActualElapsedTime",
    "AirTime",
    "ArrDelay",
    "Org_Airport",
    "Dest_Airport",
    "TaxiIn",
    "TaxiOut",
    "Cancelled",
    "CancellationCode",
    "Diverted",
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "LateAircraftDelay",
    "SecurityDelay",
]

# Rebind `df` to the trimmed frame; a missing column still raises KeyError.
df = df.drop(columns=DROPPED_COLUMNS)

Check for null values


In [4]:
# True if at least one cell anywhere in the frame is null.
df.isnull().to_numpy().any()

False

Get the features and the label


In [5]:
# Name of the target column and the target values themselves.
LABEL_NAME = "DepDelay"
LABEL_COL = df[LABEL_NAME]

# Every column other than the label is treated as a feature.
FEATURES_NAMES = [col for col in df.columns if col != LABEL_NAME]

# Take an explicit copy: the processing functions defined later add, rename and
# drop columns on FEATURES_COLS in place, and doing that on a bare selection of
# `df` makes pandas emit SettingWithCopyWarning.
FEATURES_COLS = df[FEATURES_NAMES].copy()

### Define functions for Data Processing


Define a function to create a column in the dataframe called 'ScheduledDepTime', which has the scheduled departure time of each flight in the dataset.

This is computed by subtracting the departure delay (in minutes) from the actual departure time.


In [6]:
def create_scheduled_dep_time_col():
    """Add 'ScheduledDepTime' to FEATURES_COLS and drop 'DepTime' from it.

    The scheduled departure time is the actual departure time ('DepTime',
    an HHMM clock value) minus the departure delay ('DepDelay', in minutes).
    The result is stored as a zero-padded "HHMM" string.

    Side effects:
        * ``df["DepTime"]`` is replaced by its parsed datetime form.
        * ``FEATURES_COLS`` gains 'ScheduledDepTime' and loses 'DepTime',
          both in place.
    """
    # Normalise 'DepTime' to a 4-character "HHMM" string. "2400" cannot be
    # parsed with %H%M, so it is mapped to "0000" first.
    dep_time_text = df["DepTime"].astype(str).str.zfill(4).replace("2400", "0000")
    df["DepTime"] = pd.to_datetime(dep_time_text, format="%H%M")

    # Vectorised subtraction of the delay; this replaces a row-by-row
    # DataFrame.apply that built one Python timedelta per row.
    scheduled = df["DepTime"] - pd.to_timedelta(df["DepDelay"], unit="min")

    # Only the clock time is kept, so a subtraction that crosses midnight
    # simply wraps around (e.g. 00:05 minus 10 minutes becomes "2355").
    FEATURES_COLS["ScheduledDepTime"] = scheduled.dt.strftime("%H%M")

    # The actual departure time is not used as a feature.
    FEATURES_COLS.drop(columns="DepTime", inplace=True)

Define a function to recompute the scheduled elapsed time of each flight.

This is computed by subtracting the scheduled departure time from the scheduled arrival time.


In [7]:
def _parse_hhmm(clock_values):
    """Parse a Series of HHMM clock values (int or str) into datetimes."""
    # Zero-pad to 4 characters and map "2400" to "0000" so %H%M can parse it.
    as_text = clock_values.astype(str).str.zfill(4).replace("2400", "0000")
    return pd.to_datetime(as_text, format="%H%M")


def upsert_scheduled_elapsed_time():
    """Replace 'CRSElapsedTime' with a recomputed 'ScheduledElapsedTime'.

    Working on FEATURES_COLS in place, this:
      * drops 'CRSElapsedTime',
      * renames 'CRSArrTime' to 'ScheduledArrTime',
      * adds 'ScheduledElapsedTime' = scheduled arrival minus scheduled
        departure, in minutes, wrapped into the range 0..1439,
      * stores 'ScheduledArrTime' and 'ScheduledDepTime' as integers
        (HHMM with leading zeros dropped, e.g. "0030" -> 30).

    Requires a 'ScheduledDepTime' column to already be present.

    NOTE(review): both times are parsed as plain clock values with no
    time-zone handling; if origin and destination lie in different zones the
    result will differ from the real flight duration - confirm this is intended.
    """
    FEATURES_COLS.drop(columns="CRSElapsedTime", inplace=True)
    FEATURES_COLS.rename(columns={"CRSArrTime": "ScheduledArrTime"}, inplace=True)

    arrival = _parse_hhmm(FEATURES_COLS["ScheduledArrTime"])
    departure = _parse_hhmm(FEATURES_COLS["ScheduledDepTime"])

    # Adding one day keeps the difference positive when the arrival clock time
    # is earlier than the departure clock time (arrival after midnight).
    shifted_gap = arrival - departure + pd.Timedelta(days=1)
    gap_minutes = (shifted_gap.dt.total_seconds() / 60).astype(int)

    # The modulo removes the extra day again for same-day arrivals.
    FEATURES_COLS["ScheduledElapsedTime"] = gap_minutes % (24 * 60)

    # Store both clock times as integers in HHMM form.
    FEATURES_COLS["ScheduledArrTime"] = arrival.dt.strftime("%H%M").astype(int)
    FEATURES_COLS["ScheduledDepTime"] = departure.dt.strftime("%H%M").astype(int)

Define a function to expand the 'Date' column into 'Day' and 'Month' columns.


In [8]:
def expand_date_col():
    """Replace the 'Date' column of FEATURES_COLS with 'Day' and 'Month'.

    'Date' is parsed with the day-first format "%d-%m-%Y". Only the day of
    the month and the month number are kept; the year is discarded together
    with the 'Date' column. FEATURES_COLS is modified in place.
    """
    parsed_dates = pd.to_datetime(FEATURES_COLS["Date"], format="%d-%m-%Y")

    # Day of month and month number as separate numeric columns.
    FEATURES_COLS["Day"] = parsed_dates.dt.day
    FEATURES_COLS["Month"] = parsed_dates.dt.month

    # The original date column is no longer needed.
    FEATURES_COLS.drop(columns="Date", inplace=True)

Define a function that rebuilds the dataframe from the label and the processed features, and returns the updated list of feature column names


In [9]:
def update():
    """Return the rebuilt frame and the current feature column names.

    Returns:
        A tuple ``(frame, names)`` where ``frame`` is LABEL_COL followed by all
        columns of FEATURES_COLS (concatenated column-wise) and ``names`` is
        the list of FEATURES_COLS column names.
    """
    combined = pd.concat([LABEL_COL, FEATURES_COLS], axis=1)
    feature_names = list(FEATURES_COLS.columns)
    return combined, feature_names

### Call the functions


In [10]:
# Adds 'ScheduledDepTime' to FEATURES_COLS and removes 'DepTime' from it.
create_scheduled_dep_time_col()

In [11]:
# Run after create_scheduled_dep_time_col(): this step reads the 'ScheduledDepTime' column created there.
upsert_scheduled_elapsed_time()

In [12]:
# Replaces the 'Date' column with numeric 'Day' and 'Month' columns.
expand_date_col()

In [13]:
# Rebuild `df` from the label column plus the processed features, and refresh the feature-name list.
df, FEATURES_NAMES = update()

### Data Statistics and Visualization


Plot the distribution of each column.


In [None]:
# One 50-bin distribution plot per column; iterating a DataFrame yields its column labels.
for column_name in df:
    sns.displot(data=df, x=column_name, bins=50)

Make scatter plots for each column against the 'DepDelay' column.


In [None]:
# One stacked panel per feature: the figure height grows with the number of
# features while the width stays fixed.
n_features = len(FEATURES_NAMES)
fig, axes = plt.subplots(
    nrows=n_features, ncols=1, figsize=(5, n_features * 5), squeeze=False
)

# squeeze=False always yields a 2-D array of axes, so indexing also works when
# there is only a single feature (plt.subplots would otherwise return a bare Axes).
for ax, feature_name in zip(axes[:, 0], FEATURES_NAMES):
    ax.scatter(FEATURES_COLS[feature_name], LABEL_COL)
    ax.set_title(f"{LABEL_NAME} vs {feature_name}")
    ax.set_xlabel(feature_name)
    ax.set_ylabel(LABEL_NAME)

plt.tight_layout()

Print out the correlations between the 'DepDelay' column and all the other numeric columns


In [16]:
# Keep every numeric feature. Selecting by the generic "number" kind (instead
# of listing int64/int32 explicitly) also retains float columns and integer
# columns of other widths.
numeric_features = FEATURES_COLS.select_dtypes(include="number")

# Pearson correlation of each numeric feature with the label column.
correlation = numeric_features.corrwith(LABEL_COL).to_frame(
    f"Correlation with {LABEL_NAME}"
)
correlation

Unnamed: 0,Correlation with DepDelay
DayOfWeek,0.003026
ScheduledArrTime,0.045205
Distance,0.027307
ScheduledDepTime,0.048632
ScheduledElapsedTime,0.03796
Day,0.003587
Month,-0.016151


Show the head of the updated dataframe


In [17]:
# Preview the first rows of the rebuilt frame (label column first, then the features).
df.head()

Unnamed: 0,DepDelay,DayOfWeek,ScheduledArrTime,UniqueCarrier,TailNum,Origin,Dest,Distance,ScheduledDepTime,ScheduledElapsedTime,Day,Month
0,34,4,1925,WN,N464WN,IND,BWI,515,1755,90,3,1
1,67,4,1940,WN,N763SW,IND,LAS,1591,1830,70,3,1
2,94,4,1725,WN,N334SW,IND,MCO,828,1510,135,3,1
3,27,4,1625,WN,N286WN,IND,PHX,1489,1425,120,3,1
4,28,4,1510,WN,N674AA,IND,TPA,838,1255,135,3,1
