In [40]:
# 01_imports.py
import os
import zipfile
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)


In [41]:
# 02_inspect_zip.py
zip_path = "/content/flights_sample_100k.csv.zip"
with zipfile.ZipFile(zip_path, 'r') as z:
    print("Files in zip:", z.namelist())



Files in zip: ['flights_sample_100k.csv']


In [42]:
# 03_load_csv_head.py
csv_name = "flights_sample_100k.csv"  # change if the filename differs
zip_path = "/content/flights_sample_100k.csv.zip"
with zipfile.ZipFile(zip_path) as z:
    with z.open(csv_name) as f:
        df = pd.read_csv(f, nrows=1000)  # quick preview sample
df.info()
df.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FL_DATE                  1000 non-null   object 
 1   AIRLINE                  1000 non-null   object 
 2   AIRLINE_DOT              1000 non-null   object 
 3   AIRLINE_CODE             1000 non-null   object 
 4   DOT_CODE                 1000 non-null   int64  
 5   FL_NUMBER                1000 non-null   int64  
 6   ORIGIN                   1000 non-null   object 
 7   ORIGIN_CITY              1000 non-null   object 
 8   DEST                     1000 non-null   object 
 9   DEST_CITY                1000 non-null   object 
 10  CRS_DEP_TIME             1000 non-null   int64  
 11  DEP_TIME                 974 non-null    float64
 12  DEP_DELAY                974 non-null    float64
 13  TAXI_OUT                 973 non-null    float64
 14  WHEELS_OFF               

Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-03-01,Allegiant Air,Allegiant Air: G4,G4,20368,1668,PGD,"Punta Gorda, FL",SPI,"Springfield, IL",630,620.0,-10.0,9.0,629.0,731.0,7.0,810,738.0,-32.0,0.0,,0.0,160.0,138.0,122.0,994.0,,,,,
1,2021-02-16,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2437,DFW,"Dallas/Fort Worth, TX",LAX,"Los Angeles, CA",1329,,,,,,,1500,,,1.0,B,0.0,211.0,,,1235.0,,,,,
2,2022-04-12,PSA Airlines Inc.,PSA Airlines Inc.: OH,OH,20397,5560,EWN,"New Bern/Morehead/Beaufort, NC",CLT,"Charlotte, NC",625,618.0,-7.0,16.0,634.0,725.0,11.0,744,736.0,-8.0,0.0,,0.0,79.0,78.0,51.0,221.0,,,,,


In [43]:
# 04_load_csv_with_dtypes.py
# Provide dtype hints for common columns to reduce memory usage.
dtype_hints = {
    'YEAR': 'int16',
    'MONTH': 'int8',
    'DAY': 'int8',
    'DAY_OF_WEEK': 'int8',
    'OP_CARRIER': 'category',
    'ORIGIN': 'category',
    'DEST': 'category',
    'TAIL_NUM': 'category',
    'FL_NUM': 'int32'
    # extend as appropriate for your CSV's column names
}

zip_path = "/content/flights_sample_100k.csv.zip"
with zipfile.ZipFile(zip_path) as z:
    with z.open(csv_name) as f:
        df = pd.read_csv(f, dtype=dtype_hints, low_memory=True)
print("Loaded shape:", df.shape)
df.info()


Loaded shape: (100000, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   FL_DATE                  100000 non-null  object  
 1   AIRLINE                  100000 non-null  object  
 2   AIRLINE_DOT              100000 non-null  object  
 3   AIRLINE_CODE             100000 non-null  object  
 4   DOT_CODE                 100000 non-null  int64   
 5   FL_NUMBER                100000 non-null  int64   
 6   ORIGIN                   100000 non-null  category
 7   ORIGIN_CITY              100000 non-null  object  
 8   DEST                     100000 non-null  category
 9   DEST_CITY                100000 non-null  object  
 10  CRS_DEP_TIME             100000 non-null  int64   
 11  DEP_TIME                 97424 non-null   float64 
 12  DEP_DELAY                97423 non-null   float64 
 13  TAXI_OUT          

In [44]:
# 05_missing_and_stats.py
missing = df.isna().sum().sort_values(ascending=False)
summary = df.describe(include='all').T
print("Top missing columns:\n", missing.head(20))
summary.loc[:, ['count','mean','std']].head(20)


Top missing columns:
 CANCELLATION_CODE          97373
DELAY_DUE_LATE_AIRCRAFT    82008
DELAY_DUE_CARRIER          82008
DELAY_DUE_SECURITY         82008
DELAY_DUE_NAS              82008
DELAY_DUE_WEATHER          82008
ARR_DELAY                   2852
ELAPSED_TIME                2852
AIR_TIME                    2852
WHEELS_ON                   2655
ARR_TIME                    2655
TAXI_IN                     2655
WHEELS_OFF                  2618
TAXI_OUT                    2618
DEP_DELAY                   2577
DEP_TIME                    2576
DEST_CITY                      0
DEST                           0
CRS_DEP_TIME                   0
AIRLINE_CODE                   0
dtype: int64


Unnamed: 0,count,mean,std
FL_DATE,100000.0,,
AIRLINE,100000.0,,
AIRLINE_DOT,100000.0,,
AIRLINE_CODE,100000.0,,
DOT_CODE,100000.0,19977.25777,377.223161
FL_NUMBER,100000.0,2511.91066,1745.63292
ORIGIN,100000.0,,
ORIGIN_CITY,100000.0,,
DEST,100000.0,,
DEST_CITY,100000.0,,


In [45]:
# 06_parse_datetimes.py
# Typical flight datasets have CRS_DEP_TIME, DEP_TIME, etc. which are HHMM integers.
# We'll create a safe function to parse 'CRS_DEP_TIME' and date parts into datetimes.

def hhmm_to_time(hhmm):
    # accepts ints/floats/strings like 0, 30, 2359, '005', etc.
    try:
        hhmm = int(hhmm)
    except (ValueError, TypeError):
        return np.nan
    hh = hhmm // 100
    mm = hhmm % 100
    if hh >= 24 or mm >= 60:
        return np.nan
    return f"{hh:02d}:{mm:02d}"

# Example using YEAR, MONTH, DAY, CRS_DEP_TIME to build scheduled departure
if {'YEAR','MONTH','DAY','CRS_DEP_TIME'}.issubset(df.columns):
    sched_time_str = df['CRS_DEP_TIME'].apply(hhmm_to_time)
    df['SCHEDULED_DEP'] = pd.to_datetime(df['YEAR'].astype(str) + '-' + df['MONTH'].astype(str).str.zfill(2) + '-' + df['DAY'].astype(str).str.zfill(2) + ' ' + sched_time_str, errors='coerce')
    print("SCHEDULED_DEP example:", df['SCHEDULED_DEP'].head())
else:
    print("Required columns for scheduled dep not present; please adjust column names.")


Required columns for scheduled dep not present; please adjust column names.


In [93]:
# 07_create_time_features_fixed.py
# Note: Functionality moved to cell 7784c2dd for better workflow.
# This modification is to make this cell runnable in isolation if needed.

def hhmm_to_time_str(hhmm):
    # accepts ints/floats/strings like 0, 30, 2359, '005', etc.
    if pd.isna(hhmm): return None
    try:
        hhmm = int(hhmm)
    except (ValueError, TypeError):
        return None
    hh = hhmm // 100
    mm = hhmm % 100
    if hh >= 24 or mm >= 60:
        return None
    return f"{hh:02d}:{mm:02d}"

# Re-create necessary columns if they don't exist
if 'SCHEDULED_DEP' not in df.columns and {'CRS_DEP_TIME','FL_DATE'}.issubset(df.columns):
    sched_times = df['CRS_DEP_TIME'].apply(hhmm_to_time_str)
    df['SCHEDULED_DEP'] = pd.to_datetime(
        df['FL_DATE'].astype(str) + ' ' +
        sched_times, errors='coerce'
    )

# Now extract features if SCHEDULED_DEP is available
if 'SCHEDULED_DEP' in df.columns:
    df['month'] = df['SCHEDULED_DEP'].dt.month.astype('Int8')
    df['day_of_week'] = df['SCHEDULED_DEP'].dt.dayofweek.astype('Int8')
    df['hour'] = df['SCHEDULED_DEP'].dt.hour.astype('Int8')
    print(df[['SCHEDULED_DEP','month','day_of_week','hour']].head())
else:
    # Fallback if SCHEDULED_DEP couldn't be created
    print("SCHEDULED_DEP could not be created. Cannot extract time features.")
    # Attempt to print existing time columns if they exist from other processing
    cols_to_print = []
    if 'month' in df.columns: cols_to_print.append('month')
    if 'day_of_week' in df.columns: cols_to_print.append('day_of_week')
    if 'hour' in df.columns: cols_to_print.append('hour')
    if cols_to_print:
         print(df[cols_to_print].head())
    else:
         print("No time feature columns ('month', 'day_of_week', 'hour') found.")

            SCHEDULED_DEP  month  day_of_week  hour
39142 2019-02-15 17:53:00      2            4    17
67447 2019-04-15 17:58:00      4            0    17
24280 2019-07-29 06:10:00      7            0     6
67413 2019-10-14 13:03:00     10            0    13
71243 2020-02-05 17:42:00      2            2    17


In [47]:
# 07_create_datetime_features.py
# Consolidate datetime parsing and feature creation

def hhmm_to_time_str(hhmm):
    # accepts ints/floats/strings like 0, 30, 2359, '005', etc.
    if pd.isna(hhmm): return None
    try:
        hhmm = int(hhmm)
    except (ValueError, TypeError):
        return None
    hh = hhmm // 100
    mm = hhmm % 100
    if hh >= 24 or mm >= 60:
        return None
    return f"{hh:02d}:{mm:02d}"

# Ensure CRS_DEP_TIME and FL_DATE exist in dataset
if {'CRS_DEP_TIME','FL_DATE'}.issubset(df.columns):
    # Scheduled departure datetime
    sched_times = df['CRS_DEP_TIME'].apply(hhmm_to_time_str)
    df['SCHEDULED_DEP'] = pd.to_datetime(
        df['FL_DATE'].astype(str) + ' ' +
        sched_times, errors='coerce'
    )

    # Extract month, day of week, and hour from SCHEDULED_DEP
    df['month'] = df['SCHEDULED_DEP'].dt.month.astype('Int8')
    df['day_of_week'] = df['SCHEDULED_DEP'].dt.dayofweek.astype('Int8')
    df['hour'] = df['SCHEDULED_DEP'].dt.hour.astype('Int8')

    print(df[['FL_DATE','CRS_DEP_TIME','SCHEDULED_DEP','month','day_of_week','hour']].head())
else:
    print("Required columns for datetime feature creation (FL_DATE, CRS_DEP_TIME) not found.")

      FL_DATE  CRS_DEP_TIME       SCHEDULED_DEP  month  day_of_week  hour
0  2019-03-01           630 2019-03-01 06:30:00      3            4     6
1  2021-02-16          1329 2021-02-16 13:29:00      2            1    13
2  2022-04-12           625 2022-04-12 06:25:00      4            1     6
3  2021-10-13          1715 2021-10-13 17:15:00     10            2    17
4  2022-06-05           535 2022-06-05 05:35:00      6            6     5


In [48]:
# 08_create_route.py
if {'ORIGIN','DEST'}.issubset(df.columns):
    df['route'] = df['ORIGIN'].astype(str) + "_" + df['DEST'].astype(str)
    df['route_id'] = df['route'].astype('category').cat.codes.astype('int32')
    print(df[['ORIGIN','DEST','route','route_id']].head())
else:
    print("ORIGIN/DEST columns missing; adjust column names.")


  ORIGIN DEST    route  route_id
0    PGD  SPI  PGD_SPI      4660
1    DFW  LAX  DFW_LAX      1738
2    EWN  CLT  EWN_CLT      2058
3    ABQ  DEN  ABQ_DEN        17
4    PIT  STL  PIT_STL      4964


In [49]:
# 09_handle_delays.py
# We will treat arrival/departure delays: ARR_DELAY, DEP_DELAY commonly present.
delay_cols = []
for c in ['ARR_DELAY','DEP_DELAY','ARR_DELAY_NEW','DEP_DELAY_NEW','ARR_DELAY_MINUTES']:
    if c in df.columns:
        delay_cols.append(c)

# Fill numeric nulls with 0 where appropriate and create flags
for c in delay_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    df[c].fillna(0, inplace=True)
# Flag for significant delay (>=15 min)
if 'ARR_DELAY' in df.columns:
    df['is_arr_delay_15'] = (df['ARR_DELAY'] >= 15).astype('int8')
if 'DEP_DELAY' in df.columns:
    df['is_dep_delay_15'] = (df['DEP_DELAY'] >= 15).astype('int8')

print("Delay columns processed:", delay_cols)


Delay columns processed: ['ARR_DELAY', 'DEP_DELAY']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(0, inplace=True)


In [50]:
# 10_cancellations.py
# Many datasets have CANCELED (0/1) and CANCELLATION_CODE (A,B,C,D)
if 'CANCELLED' in df.columns or 'CANCELED' in df.columns:
    cancel_col = 'CANCELLED' if 'CANCELLED' in df.columns else 'CANCELED'
    df['is_cancelled'] = df[cancel_col].fillna(0).astype('int8')
else:
    # sometimes DEP_DELAY == NaN and large indicator; but don't infer without column
    df['is_cancelled'] = 0

if 'CANCELLATION_CODE' in df.columns:
    df['cancellation_code'] = df['CANCELLATION_CODE'].astype('category')
else:
    # create a placeholder column
    df['cancellation_code'] = pd.Categorical(np.nan)
print("Cancelled count:", df['is_cancelled'].sum())


Cancelled count: 2627


In [51]:
# 11_time_of_day_bins.py
def time_of_day(hour):
    if pd.isna(hour):
        return 'unknown'
    hour = int(hour)
    if 5 <= hour < 12:
        return 'morning'
    if 12 <= hour < 17:
        return 'afternoon'
    if 17 <= hour < 21:
        return 'evening'
    return 'night'

# Extract hour from CRS_DEP_TIME
if 'CRS_DEP_TIME' in df.columns:
    df['hour'] = (df['CRS_DEP_TIME'] // 100).astype('Int8')
    df['time_of_day'] = df['hour'].apply(lambda h: time_of_day(h) if not pd.isna(h) else 'unknown')
    df['time_of_day'] = df['time_of_day'].astype('category')
    print(df['time_of_day'].value_counts(dropna=False))
else:
    print("CRS_DEP_TIME column not found; cannot create time of day features.")

time_of_day
morning      41243
afternoon    30086
evening      22114
night         6557
Name: count, dtype: int64


In [52]:
# 12_downcast_numerics.py
def downcast_nums(df_in):
    for col in df_in.select_dtypes(include=['int64','float64']).columns:
        col_min = df_in[col].min(skipna=True)
        col_max = df_in[col].max(skipna=True)
        if pd.api.types.is_integer_dtype(df_in[col].dropna()):
            df_in[col] = pd.to_numeric(df_in[col], downcast='integer')
        else:
            df_in[col] = pd.to_numeric(df_in[col], downcast='float')
    return df_in

df = downcast_nums(df)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 43 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   FL_DATE                  100000 non-null  object        
 1   AIRLINE                  100000 non-null  object        
 2   AIRLINE_DOT              100000 non-null  object        
 3   AIRLINE_CODE             100000 non-null  object        
 4   DOT_CODE                 100000 non-null  int16         
 5   FL_NUMBER                100000 non-null  int16         
 6   ORIGIN                   100000 non-null  category      
 7   ORIGIN_CITY              100000 non-null  object        
 8   DEST                     100000 non-null  category      
 9   DEST_CITY                100000 non-null  object        
 10  CRS_DEP_TIME             100000 non-null  int16         
 11  DEP_TIME                 97424 non-null   float32       
 12  DEP_DELAY        

In [53]:
# 13_drop_duplicates.py
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Dropped {before-after} duplicate rows. New shape: {df.shape}")


Dropped 0 duplicate rows. New shape: (100000, 43)


In [54]:
# 14_compute_delay_minutes_fixed.py

# Ensure CRS_DEP_TIME and DEP_TIME exist in dataset
if {'CRS_DEP_TIME','DEP_TIME'}.issubset(df.columns):
    # Convert both scheduled and actual hhmm integers into time strings
    def hhmm_to_time_str(hhmm):
        if pd.isna(hhmm): return None
        try:
            hhmm = int(hhmm)
        except (ValueError, TypeError):
            return None
        hh = hhmm // 100
        mm = hhmm % 100
        if hh >= 24 or mm >= 60:
            return None
        return f"{hh:02d}:{mm:02d}"

    # Scheduled departure datetime
    sched_times = df['CRS_DEP_TIME'].apply(hhmm_to_time_str)
    df['SCHEDULED_DEP'] = pd.to_datetime(
        df['FL_DATE'].astype(str) + ' ' +
        sched_times, errors='coerce'
    )

    # Actual departure datetime
    actual_times = df['DEP_TIME'].apply(hhmm_to_time_str)
    df['ACTUAL_DEP'] = pd.to_datetime(
        df['FL_DATE'].astype(str) + ' ' +
        actual_times, errors='coerce'
    )

    # Calculate difference
    df['dep_delay_minutes_calc'] = (df['ACTUAL_DEP'] - df['SCHEDULED_DEP']).dt.total_seconds() / 60

    # Fill missing DEP_DELAY if needed
    if 'DEP_DELAY' in df.columns:
        df['DEP_DELAY'] = df['DEP_DELAY'].fillna(df['dep_delay_minutes_calc']).astype('float32')

    # Show a preview
    print(df[['FL_DATE','SCHEDULED_DEP','ACTUAL_DEP','DEP_DELAY','dep_delay_minutes_calc']].head())
else:
    print("Columns CRS_DEP_TIME or DEP_TIME not found in dataset")

      FL_DATE       SCHEDULED_DEP          ACTUAL_DEP  DEP_DELAY  dep_delay_minutes_calc
0  2019-03-01 2019-03-01 06:30:00 2019-03-01 06:20:00      -10.0                   -10.0
1  2021-02-16 2021-02-16 13:29:00                 NaT        0.0                     NaN
2  2022-04-12 2022-04-12 06:25:00 2022-04-12 06:18:00       -7.0                    -7.0
3  2021-10-13 2021-10-13 17:15:00 2021-10-13 17:40:00       25.0                    25.0
4  2022-06-05 2022-06-05 05:35:00 2022-06-05 05:35:00        0.0                     0.0


In [55]:
# 15_route_rolling_delay.py
# Sort by route and scheduled departure to compute rolling metrics
if 'route' in df.columns and 'SCHEDULED_DEP' in df.columns and 'ARR_DELAY' in df.columns:
    df = df.sort_values(['route','SCHEDULED_DEP'])
    # rolling mean of last 5 flights on same route
    df['route_delay_roll5'] = df.groupby('route')['ARR_DELAY'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True).astype('float32')
    print("route_delay_roll5 example:\n", df[['route','SCHEDULED_DEP','ARR_DELAY','route_delay_roll5']].head(10))
else:
    print("Required columns for route rolling delay calculation not found.")

route_delay_roll5 example:
          route       SCHEDULED_DEP  ARR_DELAY  route_delay_roll5
39142  ABE_ATL 2019-02-15 17:53:00      -18.0         -18.000000
67447  ABE_ATL 2019-04-15 17:58:00      -21.0         -19.500000
24280  ABE_ATL 2019-07-29 06:10:00       -2.0         -13.666667
67413  ABE_ATL 2019-10-14 13:03:00      -25.0         -16.500000
71243  ABE_ATL 2020-02-05 17:42:00      -27.0         -18.600000
45383  ABE_ATL 2021-05-21 17:21:00      -24.0         -19.799999
29849  ABE_ATL 2021-08-27 13:17:00       -6.0         -16.799999
92002  ABE_ATL 2021-12-26 12:50:00      -10.0         -18.400000
38520  ABE_ATL 2022-02-11 06:40:00      -16.0         -16.600000
90125  ABE_ATL 2022-04-27 14:00:00      -17.0         -14.600000


In [56]:
# 16_categorical_encoding.py
# Example: frequency encoding for ORIGIN and DEST (simple numeric feature)
for col in ['ORIGIN','DEST','OP_CARRIER']:
    if col in df.columns:
        freq = df[col].value_counts(normalize=True)
        df[f'{col}_freq'] = df[col].map(freq).astype('float32')
# Example label encoding via category codes for model-ready features
for col in ['ORIGIN','DEST','OP_CARRIER','time_of_day']:
    if col in df.columns:
        df[f'{col}_code'] = df[col].astype('category').cat.codes.astype('int32')
print("Added frequency and code features for origin/dest/carrier")


Added frequency and code features for origin/dest/carrier


In [57]:
# 17_qa_checks.py
# Check negative scheduled elapsed times, implausible hours, and flights spanning > 24h
anomalies = {}
if 'ARR_DELAY' in df.columns:
    anomalies['neg_arr_delay_count'] = (df['ARR_DELAY'] < -1000).sum()  # unrealistic extreme
if 'hour' in df.columns:
    anomalies['invalid_hours'] = df[~df['hour'].between(0,23, inclusive='both')].shape[0]
if 'dep_delay_minutes_calc' in df.columns:
    anomalies['huge_calc_delays'] = (df['dep_delay_minutes_calc'].abs() > 24*60).sum()
print("Anomalies summary:", anomalies)

Anomalies summary: {'neg_arr_delay_count': np.int64(0), 'invalid_hours': 0, 'huge_calc_delays': np.int64(0)}


In [58]:
# 18_create_sample_for_dev.py
sample_df = df.sample(frac=0.01, random_state=42)  # 1% sample for fast plotting/iteration
print("Sample shape:", sample_df.shape)
print(sample_df.head(5).to_string())
# If you want to persist locally for reuse:
# sample_df.to_parquet("/mnt/data/flights_sample_1pct.parquet", index=False)


Sample shape: (1000, 51)
          FL_DATE                 AIRLINE                 AIRLINE_DOT AIRLINE_CODE  DOT_CODE  FL_NUMBER ORIGIN         ORIGIN_CITY DEST              DEST_CITY  CRS_DEP_TIME  DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  WHEELS_ON  TAXI_IN  CRS_ARR_TIME  ARR_TIME  ARR_DELAY  CANCELLED CANCELLATION_CODE  DIVERTED  CRS_ELAPSED_TIME  ELAPSED_TIME  AIR_TIME  DISTANCE  DELAY_DUE_CARRIER  DELAY_DUE_WEATHER  DELAY_DUE_NAS  DELAY_DUE_SECURITY  DELAY_DUE_LATE_AIRCRAFT       SCHEDULED_DEP  month  day_of_week  hour    route  route_id  is_arr_delay_15  is_dep_delay_15  is_cancelled cancellation_code time_of_day          ACTUAL_DEP  dep_delay_minutes_calc  route_delay_roll5  ORIGIN_freq  DEST_freq  ORIGIN_code  DEST_code  time_of_day_code
4280   2021-08-10   United Air Lines Inc.   United Air Lines Inc.: UA           UA     19977       1693    PDX        Portland, OR  ORD            Chicago, IL           650       NaN        0.0       NaN         NaN        NaN      NaN        

In [59]:
# 19_save_cleaned.py
import os
clean_path = "/mnt/data/flights_cleaned_week2.parquet"
# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(clean_path), exist_ok=True)
df.to_parquet(clean_path, index=False)
print("Saved cleaned parquet to:", clean_path)

Saved cleaned parquet to: /mnt/data/flights_cleaned_week2.parquet


In [94]:
from google.colab import files

files.download('/mnt/data/flights_cleaned_week2.parquet')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [60]:
# 20_summary_functions.py
def top_n_counts(df, col, n=10):
    if col in df.columns:
        return df[col].value_counts().head(n)
    return pd.Series([])

# Example usage:
print("Top origin airports:\n", top_n_counts(df, 'ORIGIN', 10))
print("Top carriers:\n", top_n_counts(df, 'OP_CARRIER', 10))


Top origin airports:
 ORIGIN
ATL    5099
DFW    4444
ORD    4053
DEN    3886
CLT    3110
LAX    2866
PHX    2460
SEA    2451
LAS    2391
IAH    2180
Name: count, dtype: int64
Top carriers:
 Series([], dtype: object)


In [61]:
# 21_carrier_avg_delays.py
if {'OP_CARRIER','ARR_DELAY'}.issubset(df.columns):
    carrier_delay = df.groupby('OP_CARRIER')['ARR_DELAY'].mean().sort_values(ascending=False)
    print("Carrier-level average arrival delay:\n", carrier_delay.head(10))


In [62]:
# 22_route_delay_stats.py
if {'route','ARR_DELAY'}.issubset(df.columns):
    route_stats = df.groupby('route')['ARR_DELAY'].agg(['mean','count']).sort_values('mean', ascending=False)
    print(route_stats.head(10))


                mean  count
route                      
FCA_LGA  1520.000000      1
RFD_SFB   595.000000      2
BUF_MSP   474.000000      2
SMF_HOU   408.000000      1
AZA_GTF   339.000000      1
RIC_PHL   324.000000      1
EWR_VPS   315.000000      1
GSO_PHL   307.333344      3
GTF_AZA   279.000000      1
MLI_PIE   278.333344      3


In [63]:
# 23_carrier_cancellations.py
if {'OP_CARRIER','is_cancelled'}.issubset(df.columns):
    cancel_rate = df.groupby('OP_CARRIER')['is_cancelled'].mean().sort_values(ascending=False)
    print("Cancellation rates:\n", cancel_rate.head(10))


In [64]:
# 24_cancel_reasons.py
if 'cancellation_code' in df.columns:
    print("Cancellation reasons:\n", df['cancellation_code'].value_counts(dropna=False))


Cancellation reasons:
 cancellation_code
NaN    97373
B        930
D        827
A        671
C        199
Name: count, dtype: int64


In [65]:
# 25_hourly_delays.py
if {'hour','ARR_DELAY'}.issubset(df.columns):
    hour_delay = df.groupby('hour')['ARR_DELAY'].mean()
    print("Avg arrival delay by hour:\n", hour_delay)


Avg arrival delay by hour:
 hour
0      9.527950
1      9.531646
2     11.741936
3     -1.333333
4      0.666667
5     -2.851904
6     -2.173096
7     -1.046254
8     -1.279639
9     -0.602420
10     1.226007
11     0.709528
12     1.510026
13     5.112010
14     5.468344
15     6.451538
16     9.145147
17     8.319843
18    11.313490
19    11.137645
20     9.306370
21    10.226727
22    10.949877
23     5.970706
Name: ARR_DELAY, dtype: float32


In [66]:
# 26_dow_delays.py
if {'day_of_week','ARR_DELAY'}.issubset(df.columns):
    dow_delay = df.groupby('day_of_week')['ARR_DELAY'].mean()
    print("Avg arrival delay by weekday (0=Mon):\n", dow_delay)

Avg arrival delay by weekday (0=Mon):
 day_of_week
0    5.355211
1    1.716929
2    2.103893
3    5.383103
4    5.501360
5    4.168449
6    5.070957
Name: ARR_DELAY, dtype: float32


In [67]:
# 27_top_origins.py
if 'ORIGIN' in df.columns:
    print("Top origin airports:\n", df['ORIGIN'].value_counts().head(10))


Top origin airports:
 ORIGIN
ATL    5099
DFW    4444
ORD    4053
DEN    3886
CLT    3110
LAX    2866
PHX    2460
SEA    2451
LAS    2391
IAH    2180
Name: count, dtype: int64


In [68]:
# 28_top_destinations.py
if 'DEST' in df.columns:
    print("Top destination airports:\n", df['DEST'].value_counts().head(10))


Top destination airports:
 DEST
ATL    5095
DFW    4239
ORD    4157
DEN    3956
CLT    3269
LAX    2804
PHX    2628
LAS    2413
SEA    2323
MCO    2171
Name: count, dtype: int64


In [69]:
# 29_seasonal_counts.py
if 'month' in df.columns:
    month_counts = df['month'].value_counts().sort_index()
    print("Flights per month:\n", month_counts)

Flights per month:
 month
1     8903
2     8246
3     9733
4     8434
5     8393
6     8688
7     9553
8     9444
9     7044
10    7286
11    7054
12    7222
Name: count, dtype: Int64


In [70]:
# 30_dow_counts.py
if 'day_of_week' in df.columns:
    dow_counts = df['day_of_week'].value_counts().sort_index()
    print("Flights per weekday (0=Mon):\n", dow_counts)


Flights per weekday (0=Mon):
 day_of_week
0    14825
1    13781
2    14024
3    14855
4    15071
5    12900
6    14544
Name: count, dtype: Int64


In [71]:
# 31_flights_per_carrier.py
if 'OP_CARRIER' in df.columns:
    carrier_counts = df['OP_CARRIER'].value_counts()
    print("Flights per carrier:\n", carrier_counts.head(10))


In [72]:
# 32_avg_distance_by_carrier.py
if {'OP_CARRIER','DISTANCE'}.issubset(df.columns):
    dist_carrier = df.groupby('OP_CARRIER')['DISTANCE'].mean()
    print("Average distance by carrier:\n", dist_carrier.head(10))


In [73]:
# 33_avg_distance_by_route.py
if {'route','DISTANCE'}.issubset(df.columns):
    dist_route = df.groupby('route')['DISTANCE'].mean().sort_values(ascending=False)
    print("Longest average routes:\n", dist_route.head(10))


Longest average routes:
 route
GUM_SFO    5812.0
BOS_HNL    5095.0
HNL_BOS    5095.0
HNL_JFK    4983.0
JFK_HNL    4983.0
HNL_EWR    4962.0
EWR_HNL    4962.0
OGG_EWR    4904.0
EWR_OGG    4904.0
IAD_HNL    4817.0
Name: DISTANCE, dtype: float32


In [74]:
# 34_dep_arr_correlation.py
if {'DEP_DELAY','ARR_DELAY'}.issubset(df.columns):
    corr = df[['DEP_DELAY','ARR_DELAY']].corr()
    print("Correlation between departure and arrival delays:\n", corr)


Correlation between departure and arrival delays:
            DEP_DELAY  ARR_DELAY
DEP_DELAY   1.000000   0.958144
ARR_DELAY   0.958144   1.000000


In [75]:
# 35_median_carrier_delay.py
if {'OP_CARRIER','ARR_DELAY'}.issubset(df.columns):
    med_delays = df.groupby('OP_CARRIER')['ARR_DELAY'].median()
    print("Median delays per carrier:\n", med_delays.head(10))


In [76]:
# 36_pct_delay_carrier.py
if {'OP_CARRIER','is_arr_delay_15'}.issubset(df.columns):
    pct_delay = df.groupby('OP_CARRIER')['is_arr_delay_15'].mean() * 100
    print("Pct flights delayed >=15min per carrier:\n", pct_delay.head(10))


In [77]:
# 37_pct_delay_route.py
if {'route','is_arr_delay_15'}.issubset(df.columns):
    pct_delay_route = df.groupby('route')['is_arr_delay_15'].mean().sort_values(ascending=False) * 100
    print("Pct delayed flights by route:\n", pct_delay_route.head(10))


Pct delayed flights by route:
 route
EWR_ANC    100.0
VPS_DCA    100.0
TVC_DFW    100.0
TVC_PIE    100.0
VPS_DAY    100.0
TYS_LAS    100.0
UIN_ORD    100.0
VPS_AUS    100.0
VPS_BWI    100.0
EWR_ACK    100.0
Name: is_arr_delay_15, dtype: float64


In [78]:
# 38_tod_counts.py
if 'time_of_day' in df.columns:
    print("Flights by time of day:\n", df['time_of_day'].value_counts())


Flights by time of day:
 time_of_day
morning      41243
afternoon    30086
evening      22114
night         6557
Name: count, dtype: int64


In [79]:
# 39_tod_avg_delays.py
if {'time_of_day','ARR_DELAY'}.issubset(df.columns):
    tod_delay = df.groupby('time_of_day')['ARR_DELAY'].mean()
    print("Average arrival delay by time of day:\n", tod_delay)


Average arrival delay by time of day:
 time_of_day
afternoon    5.489563
evening      9.994122
morning     -0.691366
night        9.959433
Name: ARR_DELAY, dtype: float32


  tod_delay = df.groupby('time_of_day')['ARR_DELAY'].mean()


In [80]:
# 40_busiest_routes.py
if 'route' in df.columns:
    busiest = df['route'].value_counts().head(10)
    print("Busiest routes:\n", busiest)


Busiest routes:
 route
LAX_SFO    206
SFO_LAX    188
DCA_BOS    159
JFK_LAX    156
LAX_LAS    156
OGG_HNL    152
ORD_LGA    149
LAS_LAX    148
LAX_JFK    147
LGA_ORD    145
Name: count, dtype: int64


In [81]:
# 41_longest_shortest_flights.py
if 'DISTANCE' in df.columns:
    print("Longest flights:\n", df.nlargest(5,'DISTANCE')[['ORIGIN','DEST','DISTANCE']])
    print("Shortest flights:\n", df.nsmallest(5,'DISTANCE')[['ORIGIN','DEST','DISTANCE']])


Longest flights:
       ORIGIN DEST  DISTANCE
11857    GUM  SFO    5812.0
71369    BOS  HNL    5095.0
83079    BOS  HNL    5095.0
56484    HNL  BOS    5095.0
38169    HNL  BOS    5095.0
Shortest flights:
       ORIGIN DEST  DISTANCE
40928    EGE  ASE      29.0
48209    HHH  SAV      30.0
51211    PSG  WRG      31.0
64848    PSG  WRG      31.0
21308    PSG  WRG      31.0


In [82]:
# 42_delay_distribution.py
if 'ARR_DELAY' in df.columns:
    print(df['ARR_DELAY'].describe(percentiles=[0.25,0.5,0.75,0.9,0.95]))


count    100000.000000
mean          4.229590
std          50.626984
min         -88.000000
25%         -15.000000
50%          -6.000000
75%           6.000000
90%          35.000000
95%          69.000000
max        1520.000000
Name: ARR_DELAY, dtype: float64


In [83]:
# 43_extreme_delays.py
if 'ARR_DELAY' in df.columns:
    extremes = df.loc[df['ARR_DELAY'] > 300, ['AIRLINE_CODE','ORIGIN','DEST','ARR_DELAY']]
    print("Flights with >5hr arrival delay:\n", extremes.head(10))

Flights with >5hr arrival delay:
       AIRLINE_CODE ORIGIN DEST  ARR_DELAY
7461            G4    ABE  SFB      365.0
86175           G4    ABQ  LAS      396.0
549             OO    ABQ  PHX      312.0
36058           OO    ABR  MSP      663.0
32160           MQ    AEX  DFW      527.0
46991           DL    ALB  ATL     1032.0
13666           B6    ATL  BOS      338.0
40997           DL    ATL  DCA      813.0
15138           DL    ATL  DFW      388.0
61802           DL    ATL  EWR      471.0


In [84]:
# 44_early_arrivals.py
if 'ARR_DELAY' in df.columns:
    early = df.loc[df['ARR_DELAY'] < -30, ['AIRLINE_CODE','ORIGIN','DEST','ARR_DELAY']]
    print("Flights with >30 min early arrival:\n", early.head(10))

Flights with >30 min early arrival:
       AIRLINE_CODE ORIGIN DEST  ARR_DELAY
69777           OH    ABE  CLT      -35.0
46543           OH    ABE  CLT      -42.0
8749            OH    ABE  CLT      -59.0
95125           OH    ABE  CLT      -40.0
79576           OH    ABE  CLT      -39.0
15102           OH    ABE  CLT      -33.0
99222           OH    ABE  CLT      -35.0
80654           OH    ABE  CLT      -39.0
81337           OO    ABE  DTW      -40.0
52297           OO    ABI  IAH      -31.0


In [85]:
# 45_cancelled_by_carrier.py
if {'OP_CARRIER','is_cancelled'}.issubset(df.columns):
    cancelled = df.groupby('OP_CARRIER')['is_cancelled'].sum().sort_values(ascending=False)
    print("Cancelled flights per carrier:\n", cancelled)


In [86]:
# 46_cancelled_by_route.py
if {'route','is_cancelled'}.issubset(df.columns):
    cancelled_route = df.groupby('route')['is_cancelled'].sum().sort_values(ascending=False)
    print("Cancelled flights per route:\n", cancelled_route.head(10))


Cancelled flights per route:
 route
ORD_DCA    10
ATL_EWR     9
JFK_BOS     9
DCA_BOS     8
BOS_DCA     8
CLT_LGA     7
SFO_PHX     6
ATL_DFW     6
HNL_LIH     6
SEA_LAX     6
Name: is_cancelled, dtype: int8


In [87]:
# 47_yearly_counts.py
if 'YEAR' in df.columns:
    yearly = df['YEAR'].value_counts().sort_index()
    print("Flights per year:\n", yearly)


In [88]:
# 48_delay_variance_carrier.py
if {'OP_CARRIER','ARR_DELAY'}.issubset(df.columns):
    delay_var = df.groupby('OP_CARRIER')['ARR_DELAY'].var()
    print("Delay variance per carrier:\n", delay_var.head(10))


In [89]:
# 49_correlation_matrix.py
num_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df[num_cols].corr()
print("Correlation matrix (numeric cols):\n", corr_matrix.head())


Correlation matrix (numeric cols):
               DOT_CODE  FL_NUMBER  CRS_DEP_TIME  DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  WHEELS_ON   TAXI_IN  CRS_ARR_TIME  ARR_TIME  ARR_DELAY  CANCELLED  DIVERTED  CRS_ELAPSED_TIME  \
DOT_CODE      1.000000   0.434009      0.008372  0.000594  -0.001638  0.212435    0.005992   0.001904  0.131198      0.006898  0.004755   0.015615  -0.006599  0.008983         -0.072968   
FL_NUMBER     0.434009   1.000000     -0.000997  0.003558  -0.019413  0.092893    0.012758   0.008256 -0.009250      0.003626  0.012697  -0.014336   0.003427  0.005722         -0.326262   
CRS_DEP_TIME  0.008372  -0.000997      1.000000  0.952157   0.092001  0.001815    0.924222   0.639261 -0.031784      0.694837  0.605173   0.089028   0.016066  0.007223         -0.013009   
DEP_TIME      0.000594   0.003558      0.952157  1.000000   0.113765  0.009905    0.970251   0.667582 -0.027446      0.697959  0.632695   0.110942   0.015065  0.009409         -0.018072   
DEP_DELAY    -0.001

In [90]:
# 50_save_summaries.py
summary_dict = {
    'carrier_avg_delay': carrier_delay.head(10).to_dict() if 'carrier_delay' in locals() else {},
    'route_stats': route_stats.head(10).to_dict() if 'route_stats' in locals() else {}
}
print("Summary checkpoint:\n", summary_dict)


Summary checkpoint:
 {'carrier_avg_delay': {}, 'route_stats': {'mean': {'FCA_LGA': 1520.0, 'RFD_SFB': 595.0, 'BUF_MSP': 474.0, 'SMF_HOU': 408.0, 'AZA_GTF': 339.0, 'RIC_PHL': 324.0, 'EWR_VPS': 315.0, 'GSO_PHL': 307.3333435058594, 'GTF_AZA': 279.0, 'MLI_PIE': 278.3333435058594}, 'count': {'FCA_LGA': 1, 'RFD_SFB': 2, 'BUF_MSP': 2, 'SMF_HOU': 1, 'AZA_GTF': 1, 'RIC_PHL': 1, 'EWR_VPS': 1, 'GSO_PHL': 3, 'GTF_AZA': 1, 'MLI_PIE': 3}}}
