In [0]:
#Goals
#Analyze airline data to uncover operational trends and delay patterns.
#Help improve airline/airport performance through visual insights.
#Serve stakeholders: airlines, airports, analysts, travelers.
#KPIs
#Average Departure/Arrival Delay (min)
#Cancellation Rate %
#Delay Causes (Carrier, Weather, NAS, Late Aircraft, Security)
#Top Busiest/Most Delayed Routes & Airports
#Seasonal/Hourly Trends
#Workflow
'''Load & Understand Data
Clean Data & Engineer Features (e.g., Route, Season, Delay Flags)
Univariate Analysis Distributions of key metrics
Bivariate Analysis Relationships between variables
Delay Cause Analysis Why delays happen
Route & Airport Analysis Identify hotspots
Seasonal & Cancellation Trends Time-based patterns
Build Interactive Dashboard/Report'''

In [0]:
import pandas as pd
# Load the raw flight-delay extract into `df`.
# '#N/A' is passed explicitly as a missing-value marker so those cells are read as NaN.
df = pd.read_csv(
    "/Volumes/workspace/default/airlines/Flight_delay1.csv",
    na_values=['#N/A'],
)
# Preview only the first rows: rendering the entire frame bloats the saved
# notebook output, and a short head is enough to check the schema.
display(df.head(10))

In [0]:
# Structural overview of the raw frame.
# Only the last expression of a cell is rendered automatically, so every
# intermediate value is printed explicitly; otherwise it is evaluated and
# silently discarded.
df.info()
print(f"Shape (rows, columns): {df.shape}")
print(f"Total number of cells: {df.size}")
print(f"Columns: {list(df.columns)}")
print(df.dtypes)
# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

In [0]:
# Data-quality check on the raw frame:
# 1) number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)
# 2) number of rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(num_duplicates)

In [0]:
# Draw a reproducible 1% sample of the raw frame and shrink its memory footprint.
# NOTE(review): sample_df is not referenced by the later cells visible here —
# confirm whether it is still needed.
sample_df = df.sample(frac=0.01, random_state=42)
print(sample_df.shape)

# Downcast floating-point columns to the smallest float type that holds the values.
for col in sample_df.select_dtypes(include=['float']).columns:
    sample_df[col] = pd.to_numeric(sample_df[col], downcast='float')

# Downcast integer columns. The generic 'integer' selector matches every integer
# width, instead of depending on which single width the 'int' alias resolves to
# on the current platform.
for col in sample_df.select_dtypes(include=['integer']).columns:
    sample_df[col] = pd.to_numeric(sample_df[col], downcast='integer')

# Convert low-cardinality text columns (fewer than 50% distinct values) to
# 'category'. The row count is checked first: a very small input can produce an
# empty sample, and the ratio below would then divide by zero.
num_total = len(sample_df)
if num_total > 0:
    for col in sample_df.select_dtypes(include=['object']).columns:
        num_unique = sample_df[col].nunique()
        if num_unique / num_total < 0.5:
            sample_df[col] = sample_df[col].astype('category')

In [0]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
# by the cleaning steps that follow.
df_pd = df.copy(deep=True)

In [0]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each.
df_pd = df_pd.drop_duplicates(keep='first')

In [0]:
# Derive the Cancelled flag from CancellationCode: a missing code is treated
# as 'N'; 'N' maps to 0 and any other code maps to 1.
# The whole Cancelled column is (re)written from the code, not only its empty cells.
cancellation_codes = df_pd['CancellationCode'].fillna('N')
df_pd['Cancelled'] = cancellation_codes.apply(lambda code: int(code != 'N'))

In [0]:
# Count the rows whose Org_Airport / Dest_Airport name is missing.
# Counted on df_pd (the de-duplicated working copy) rather than on the raw df,
# so the figures are directly comparable with any later count taken on df_pd.
print(df_pd["Org_Airport"].isnull().sum())
print(df_pd["Dest_Airport"].isnull().sum())

In [0]:
# Load the airport codes CSV (IATA code -> airport name lookup table)
airport_df = pd.read_csv("/Volumes/workspace/default/airlines/airport_codes_mapping.csv")

# Convert CSV to dictionary for mapping
# (if a code appears more than once, the last row wins)
airport_dict = dict(zip(airport_df["IATA_Code"], airport_df["Airport_Name"]))

# Fill ONLY the missing values in Org_Airport and Dest_Airport.
# Assigning the mapped Series directly would overwrite every existing name and
# turn it into NaN whenever its code is absent from the lookup table; fillna
# keeps the names already present and uses the lookup only for the gaps.
df_pd["Org_Airport"] = df_pd["Org_Airport"].fillna(df_pd["Origin"].map(airport_dict))
df_pd["Dest_Airport"] = df_pd["Dest_Airport"].fillna(df_pd["Dest"].map(airport_dict))

In [0]:
# Re-count the missing airport names on the working copy: origin first,
# then destination.
for airport_col in ("Org_Airport", "Dest_Airport"):
    print(df_pd[airport_col].isnull().sum())

In [0]:
# Parse the Date column into pandas datetimes so the .dt accessor can be used on it.
# NOTE(review): no explicit format is passed, so pandas infers it — confirm
# day-first vs month-first against the raw file.
parsed_dates = pd.to_datetime(df_pd['Date'])
df_pd['Date'] = parsed_dates

In [0]:
# --- Calendar features taken from the Date column ---
flight_date = df_pd["Date"]
df_pd["Month"] = flight_date.dt.month
df_pd["DayOfWeek"] = flight_date.dt.dayofweek  # Monday=0, Sunday=6

# --- Clock times ---
# Both columns are parsed as "HH:MM"; any value that does not match becomes NaT
# instead of raising.
# NOTE(review): assumes ArrTime/DepTime are stored as "HH:MM" text — confirm
# against the raw data, otherwise every value is coerced to NaT.
clock_options = {"format": "%H:%M", "errors": "coerce"}
arr_dt = pd.to_datetime(df_pd["ArrTime"], **clock_options)
dep_dt = pd.to_datetime(df_pd["DepTime"], **clock_options)

# "Hour" holds the whole number of hours between the departure and arrival
# clock times: the fractional part is truncated toward zero, an arrival time
# earlier than the departure time yields a negative value, and rows where
# either time failed to parse are stored as 0.
# NOTE(review): the column name suggests an hour-of-day — confirm that an
# elapsed-hours value is what downstream analysis expects.
elapsed_hours = (arr_dt - dep_dt).dt.total_seconds() / 3600
df_pd["Hour"] = elapsed_hours.fillna(0).astype(int)

# Route label "<origin airport name>-<destination airport name>"; it is NaN
# when either name is missing.
df_pd["Route"] = df_pd["Org_Airport"] + "-" + df_pd["Dest_Airport"]

display(df_pd.head(10))

In [0]:
# Persist the cleaned frame as CSV.
# NOTE(review): to_csv writes the row index as the first column by default —
# confirm downstream consumers expect it.
cleaned_csv_path = "/Volumes/workspace/default/airlines/Flight_delay_cleaned.csv"
df_pd.to_csv(cleaned_csv_path)

In [0]:
# I did not include the cell outputs because the file was getting bigger than 25 MB,
# but I have included them in the PDF.