## Cleaning The Dataset


Include all the necessary imports


In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

%matplotlib inline

# Remove pandas' display truncation so every row and column of a frame is rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

Read the csv into a pandas dataframe


In [120]:
# Load the raw flight records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Flight_delay.csv")

Drop the unnecessary columns


In [121]:
# Columns that the rest of this notebook does not use; they are removed from df in place.
columns_to_drop = [
    "ArrTime",
    "Airline",
    "FlightNum",
    "ActualElapsedTime",
    "AirTime",
    "ArrDelay",
    "Org_Airport",
    "Dest_Airport",
    "TaxiIn",
    "TaxiOut",
    "Cancelled",
    "CancellationCode",
    "Diverted",
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "LateAircraftDelay",
    "SecurityDelay",
]
df.drop(labels=columns_to_drop, axis="columns", inplace=True)

Check for null values


In [122]:
# True if at least one cell anywhere in df is null.
df.isnull().to_numpy().any()

False

### Define functions for Data Processing


Define a function to create a column in the dataframe called 'ScheduledDepTime', which has the scheduled departure time of each flight in the dataset.

This is computed by subtracting the departure delay (in minutes) from the actual departure time.


In [123]:
def create_scheduled_dep_time_col(frame=None):
    """Add a 'ScheduledDepTime' column and drop 'DepTime', modifying the frame in place.

    'DepTime' is read as an HHMM clock time (zero-padded to four digits, with
    '2400' treated as '0000') and 'DepDelay' as a number of minutes. The
    scheduled departure is ``DepTime - DepDelay``, stored as a zero-padded
    'HHMM' string. Only the hour and minute are kept, so a result that crosses
    midnight wraps around to the previous day's clock time (e.g. 0005 with a
    10 minute delay becomes '2355').

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding the 'DepTime' and 'DepDelay' columns. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    KeyError
        If 'DepTime' or 'DepDelay' is missing (for example when the function
        is run a second time on the same frame).
    """
    if frame is None:
        frame = df

    # Normalise the clock values to 4-character strings before parsing.
    dep_time_text = frame["DepTime"].astype(str).str.zfill(4).replace("2400", "0000")
    actual_departure = pd.to_datetime(dep_time_text, format="%H%M")

    # Vectorized subtraction of the delay; avoids a Python-level call per row.
    delay = pd.to_timedelta(frame["DepDelay"], unit="min")
    scheduled_departure = actual_departure - delay

    # Back to the HHMM representation; the date part is discarded here.
    frame["ScheduledDepTime"] = scheduled_departure.dt.strftime("%H%M")

    # 'DepTime' is no longer needed once the scheduled time has been derived.
    frame.drop(columns="DepTime", inplace=True)

Define a function to recompute the scheduled elapsed time of each flight.

This is computed by subtracting the scheduled departure time from the scheduled arrival time.


In [124]:
def upsert_scheduled_elapsed_time(frame=None):
    """Recompute the scheduled elapsed time (in minutes), modifying the frame in place.

    Steps performed:

    * drop 'CRSElapsedTime' if it is present,
    * rename 'CRSArrTime' to 'ScheduledArrTime' if it is present,
    * set 'ScheduledElapsedTime' to ``ScheduledArrTime - ScheduledDepTime`` in
      whole minutes, wrapped into the range [0, 1440) so an arrival clock time
      earlier than the departure clock time counts as the next day,
    * store 'ScheduledArrTime' and 'ScheduledDepTime' as integers in HHMM form.

    The difference is taken between the clock values exactly as stored; no
    timezone adjustment is made here.

    The function can be called again on a frame it has already processed and
    produces the same result.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding 'ScheduledDepTime' and either 'CRSArrTime' or
        'ScheduledArrTime'. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        frame = df

    def _parse_hhmm(values):
        """Parse HHMM clock values (int or str) to datetimes, treating '2400' as '0000'."""
        text = values.astype(str).str.zfill(4).replace("2400", "0000")
        return pd.to_datetime(text, format="%H%M")

    # errors="ignore" keeps a repeated call from failing once the column is gone.
    frame.drop(columns="CRSElapsedTime", errors="ignore", inplace=True)
    frame.rename(columns={"CRSArrTime": "ScheduledArrTime"}, inplace=True)

    arrival = _parse_hhmm(frame["ScheduledArrTime"])
    departure = _parse_hhmm(frame["ScheduledDepTime"])

    # The modulo maps negative differences (arrival after midnight) into 0..1439.
    minutes_per_day = 24 * 60
    elapsed_minutes = ((arrival - departure).dt.total_seconds() / 60).astype(int)
    frame["ScheduledElapsedTime"] = elapsed_minutes % minutes_per_day

    # Store both clock columns as HHMM integers (leading zeros are lost, e.g. 0030 -> 30).
    frame["ScheduledArrTime"] = arrival.dt.strftime("%H%M").astype(int)
    frame["ScheduledDepTime"] = departure.dt.strftime("%H%M").astype(int)

Define a function to expand the 'Date' column into 'Day' and 'Month' columns.


In [125]:
def expand_date_col(frame=None):
    """Replace the 'Date' column with numeric 'Day' and 'Month' columns, in place.

    'Date' is parsed with the day-first format ``%d-%m-%Y``. The year is not
    kept: only the day of month and the month number are extracted.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame holding the 'Date' column. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    KeyError
        If 'Date' is missing (for example when the function is run a second
        time on the same frame).
    """
    if frame is None:
        frame = df

    parsed_dates = pd.to_datetime(frame["Date"], format="%d-%m-%Y")

    # Create the Day and Month columns
    frame["Day"] = parsed_dates.dt.day
    frame["Month"] = parsed_dates.dt.month

    # Drop the 'Date' column
    frame.drop(columns="Date", inplace=True)

### Call the functions


In [126]:
# Adds 'ScheduledDepTime' to df and removes 'DepTime'.
create_scheduled_dep_time_col()

In [127]:
# Must run after create_scheduled_dep_time_col(): it reads df["ScheduledDepTime"].
upsert_scheduled_elapsed_time()

In [128]:
# Replaces the 'Date' column of df with 'Day' and 'Month' columns.
expand_date_col()

### Data Visualization


Plot the distribution of each column.


In [None]:
# Draw one 50-bin distribution figure per column; iterating a DataFrame yields its column labels.
for column in df:
    sns.displot(df, x=column, bins=50)

Make scatter plots for each column against the 'DepDelay' column.


In [None]:
y_column = "DepDelay"
x_columns = [col for col in df.columns if col != y_column]

# One stacked panel per predictor column; the figure height grows with the panel count.
# squeeze=False makes subplots() always return a 2-D array, so indexing also works
# when only a single predictor column is left. ravel() flattens it back to 1-D.
fig, axes = plt.subplots(
    nrows=len(x_columns),
    ncols=1,
    figsize=(5, len(x_columns) * 5),
    squeeze=False,
)
axes = axes.ravel()

for ax, x_column in zip(axes, x_columns):
    ax.scatter(df[x_column], df[y_column])
    ax.set_title(f"{y_column} vs {x_column}")
    # Label both axes so each panel can be read on its own.
    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)

plt.tight_layout()

Show the head of the updated dataframe


In [131]:
# Preview the first rows of df after the processing steps above.
df.head()

Unnamed: 0,DayOfWeek,ScheduledArrTime,UniqueCarrier,TailNum,DepDelay,Origin,Dest,Distance,ScheduledDepTime,ScheduledElapsedTime,Day,Month
0,4,1925,WN,N464WN,34,IND,BWI,515,1755,90,3,1
1,4,1940,WN,N763SW,67,IND,LAS,1591,1830,70,3,1
2,4,1725,WN,N334SW,94,IND,MCO,828,1510,135,3,1
3,4,1625,WN,N286WN,27,IND,PHX,1489,1425,120,3,1
4,4,1510,WN,N674AA,28,IND,TPA,838,1255,135,3,1
