In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Folder holding the raw input files. It is walked recursively to find the files to
# load, and the combined CSV is written back into it at the end of the notebook.
path_to_data = r"../data/raw/t100 segment"

In [3]:
# This function reduces the memory usage of a DataFrame by downcasting numeric types
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and the same object is returned.

    Per-column rules:
      * ``bool`` columns are cast to ``int8``.
      * Integer columns, and other numeric columns whose every value is already a
        whole number, go through ``pd.to_numeric(..., downcast="integer")``.
      * All remaining numeric columns go through ``pd.to_numeric(..., downcast="float")``.
      * Non-numeric columns (object/string, datetime, categorical, ...) are left
        untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the percentage by which the (shallow) memory usage
        reported by ``DataFrame.memory_usage()`` was reduced.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        series = df[col]
        dtype_name = series.dtype.name
        if dtype_name == "bool":
            df[col] = series.astype("int8")
            continue
        # Only genuinely numeric columns may reach pd.to_numeric: datetimes would be
        # turned into integer nanoseconds and string/categorical columns would raise.
        if not pd.api.types.is_numeric_dtype(series.dtype):
            continue
        # A float column containing NaN fails the whole-number test (NaN != NaN), so
        # it takes the float branch instead of an impossible integer cast.
        if dtype_name.startswith("int") or (series.round() == series).all():
            df[col] = pd.to_numeric(series, downcast = "integer")
        else:
            df[col] = pd.to_numeric(series, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        print("{:.1f}% compressed".format(100 * (start_memory - end_memory) / start_memory))

    return df

In [4]:
# Collect every .asc data file under path_to_data, including all subfolders.
# os.walk yields entries in an arbitrary, filesystem-dependent order, so the paths are
# sorted to keep the load order (and with it the row order of the combined frame and
# the seeded sample drawn from it) the same on every machine.
csv_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.asc')
)
csv_files

['../data/raw/t100 segment\\_201801\\dd.db28ds.wac.201702.201801.asc',
 '../data/raw/t100 segment\\_201802\\dd.db28ds.wac.201703.201802.asc',
 '../data/raw/t100 segment\\_201803\\dd.db28ds.wac.201704.201803.asc',
 '../data/raw/t100 segment\\_201804\\dd.db28ds.wac.201705.201804.asc',
 '../data/raw/t100 segment\\_201805\\dd.db28ds.wac.201706.201805.asc',
 '../data/raw/t100 segment\\_201806\\dd.db28ds.wac.201707.201806.asc',
 '../data/raw/t100 segment\\_201807\\dd.db28ds.wac.201708.201807.asc',
 '../data/raw/t100 segment\\_201808\\dd.db28ds.wac.201709.201808.asc',
 '../data/raw/t100 segment\\_201809\\dd.db28ds.wac.201710.201809.asc',
 '../data/raw/t100 segment\\_201810\\dd.db28ds.wac.201711.201810.asc',
 '../data/raw/t100 segment\\_201811\\dd.db28ds.wac.201712.201811.asc',
 '../data/raw/t100 segment\\_201812\\dd.db28ds.wac.201801.201812.asc',
 '../data/raw/t100 segment\\_201901\\dd.db28ds.wac.201802.201901.asc',
 '../data/raw/t100 segment\\_201902\\dd.db28ds.wac.201803.201902.asc',
 '../d

In [5]:
def load_data(file):
    """Read one pipe-delimited, header-less file and tag each row with its origin.

    The file is decoded as latin-1 and parsed without a header row, so the data
    columns are labelled with integers. A ``source_file`` column holding ``file``
    is appended as the last column of the returned DataFrame.
    """
    read_options = dict(low_memory=False, encoding='latin-1', delimiter='|', header=None)
    return pd.read_csv(file, **read_options).assign(source_file=file)

In [6]:
# Load every file and combine the results into one DataFrame.
# The per-file frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies the whole accumulated frame for every file.
if not csv_files:
    raise FileNotFoundError("csv_files is empty - no input files were found to load")

frames = []
for file in csv_files:
    print(f"Loading {file}")
    frames.append(load_data(file))

df_main = pd.concat(frames, ignore_index=True)
del frames  # drop the per-file copies so only the combined frame stays in memory

df_main = downcast(df_main, verbose = True)
df_main

Loading ../data/raw/t100 segment\_201802\dd.db28ds.wac.201703.201802.asc
Loading ../data/raw/t100 segment\_201803\dd.db28ds.wac.201704.201803.asc
Loading ../data/raw/t100 segment\_201804\dd.db28ds.wac.201705.201804.asc
Loading ../data/raw/t100 segment\_201805\dd.db28ds.wac.201706.201805.asc
Loading ../data/raw/t100 segment\_201806\dd.db28ds.wac.201707.201806.asc
Loading ../data/raw/t100 segment\_201807\dd.db28ds.wac.201708.201807.asc
Loading ../data/raw/t100 segment\_201808\dd.db28ds.wac.201709.201808.asc
Loading ../data/raw/t100 segment\_201809\dd.db28ds.wac.201710.201809.asc
Loading ../data/raw/t100 segment\_201810\dd.db28ds.wac.201711.201810.asc
Loading ../data/raw/t100 segment\_201811\dd.db28ds.wac.201712.201811.asc
Loading ../data/raw/t100 segment\_201812\dd.db28ds.wac.201801.201812.asc
Loading ../data/raw/t100 segment\_201901\dd.db28ds.wac.201802.201901.asc
Loading ../data/raw/t100 segment\_201902\dd.db28ds.wac.201803.201902.asc
Loading ../data/raw/t100 segment\_201903\dd.db28ds.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,source_file,28
0,2017,5,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",ANC,30299,1,"ANCHORAGE,ALASKA,USA",...,47000,0,0,148,0,96,84,10.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...,
1,2017,4,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,2422,9,0,0,0,64,60,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...,
2,2017,4,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,7266,27,11,884,0,208,196,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...,
3,2017,9,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,18960,72,27,1203,0,448,416,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...,
4,2017,9,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,7266,27,10,1017,0,238,226,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32482339,2024,9,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,3450,8,1,0,0,54,36,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...,
32482340,2024,3,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,6900,16,5,0,0,108,84,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...,
32482341,2024,8,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,3450,8,2,0,0,84,54,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...,
32482342,2024,6,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,6900,16,2,0,0,102,78,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...,


In [7]:
# Add column names.
# The data columns are renamed by their integer label (0-27) instead of by position,
# so the result does not depend on where 'source_file' and the trailing column 28
# end up after concatenation, and re-running this cell is a harmless no-op.
column_names = ['year', 'month',
                'orig_alpha_code', 'orig_num_code', 'orig_wac', 'orig_city',
                'dest_alpha_code', 'dest_num_code', 'dest_wac', 'dest_city',
                'carrier_alpha_code', 'carrier_entity_code', 'oai_group_code',
                'distance', 'service_class',
                'aircraft_group', 'aircraft_type', 'aircraft_config',
                'departures_performed', 'departures_scheduled',
                'avail_capacity', 'avail_seats', 'num_passengers',
                'freight', 'mail', 'ramp_minutes', 'airborne_minutes', 'carrier_wac']

# inplace keeps the original cell's behaviour of not copying the very large frame
df_main.rename(columns=dict(enumerate(column_names)), inplace=True)

# Column 28 is the extra trailing column that the original cell labelled 'end' and
# dropped; ignore it when it is absent (e.g. on a re-run).
df_main.drop(columns=[28], errors='ignore', inplace=True)

missing_columns = [name for name in column_names if name not in df_main.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing after renaming: {missing_columns}")

In [8]:
# Preview the combined frame now that its columns are named
df_main

Unnamed: 0,year,month,orig_alpha_code,orig_num_code,orig_wac,orig_city,dest_alpha_code,dest_num_code,dest_wac,dest_city,...,departures_scheduled,avail_capacity,avail_seats,num_passengers,freight,mail,ramp_minutes,airborne_minutes,carrier_wac,source_file
0,2017,5,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",ANC,30299,1,"ANCHORAGE,ALASKA,USA",...,0,47000,0,0,148,0,96,84,10.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...
1,2017,4,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,0,2422,9,0,0,0,64,60,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...
2,2017,4,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,0,7266,27,11,884,0,208,196,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...
3,2017,9,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,0,18960,72,27,1203,0,448,416,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...
4,2017,9,05A,30005,1,"LITTLE SQUAW,ALASKA,USA",FAI,31517,1,"FAIRBANKS,ALASKA,USA",...,0,7266,27,10,1017,0,238,226,1.0,../data/raw/t100 segment\_201801\dd.db28ds.wac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32482339,2024,9,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,0,3450,8,1,0,0,54,36,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...
32482340,2024,3,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,0,6900,16,5,0,0,108,84,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...
32482341,2024,8,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,0,3450,8,2,0,0,84,54,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...
32482342,2024,6,ZXU,36353,15,"North Kingstown, RI",TEB,35167,21,"Teterboro, NJ",...,0,6900,16,2,0,0,102,78,10.0,../data/raw/t100 segment\_202501\db28seg.dd.wa...


In [10]:
# Write a 1,000-row random sample of the combined frame to disk; the fixed
# random_state makes the draw repeatable for a given df_main.
sample_path = r"../data/processed/t100_segment_sample.csv"
df_sample = df_main.sample(n=1000, random_state=1234)
df_sample.to_csv(sample_path, index=False)

In [17]:
# Show the 50 rows with the highest passenger counts.
# NOTE(review): the displayed rows repeat the same record once per source_file, so
# the input files appear to overlap in the months they cover - confirm, and consider
# de-duplicating before aggregating or exporting.
df_main.sort_values(by=['num_passengers'], ascending=False).head(n=50)

Unnamed: 0,year,month,orig_alpha_code,orig_num_code,orig_wac,orig_city,dest_alpha_code,dest_num_code,dest_wac,dest_city,...,departures_scheduled,avail_capacity,avail_seats,num_passengers,freight,mail,ramp_minutes,airborne_minutes,carrier_wac,source_file
3088748,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201809\dd.db28ds.wac...
5725862,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201904\dd.db28ds.wac...
3834021,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201811\dd.db28ds.wac...
6108961,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201905\dd.db28ds.wac...
5344078,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201903\dd.db28ds.wac...
2717852,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201808\dd.db28ds.wac...
4585789,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201901\dd.db28ds.wac...
4208942,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201812\dd.db28ds.wac...
6493019,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201906\dd.db28ds.wac...
3460741,2018,7,HNL,32134,2,"HONOLULU,HAWAII,USA",OGG,33830,2,"KAHULUI,HAWAII,USA",...,849,26884800,108672,95777,492915,0,35812,18548,10.0,../data/raw/t100 segment\_201810\dd.db28ds.wac...


In [18]:
# Number of rows per carrier code in the combined frame
df_main['carrier_alpha_code'].value_counts()

carrier_alpha_code
WN     3431862
UA     2739176
DL     2591235
OO     2081390
AA     1991400
        ...   
EE         234
30Q         86
3BQ         24
29Q         15
8D           6
Name: count, Length: 154, dtype: int64

In [19]:
# Persist the full combined frame as a single CSV (without the index column)
# inside the raw-data folder.
combined_path = f'{path_to_data}/t100_segment_combined.csv'
df_main.to_csv(combined_path, index=False)