In [2]:
import pandas as pd
import numpy as np
import os
import gzip

In [3]:
def iff_read(path):
    """
    Read each line to determine the maximum number of columns in dataset and propagate numbered column names for dataset such that it can be read in by read_csv()
    Return numbered column names
    """
    if '.gz' in path:
        with gzip.open(path, 'rt') as temp_f:
            # Get number of columns in each line
            col_count = [ len(l.split(",")) for l in temp_f.readlines() ]
    else:
        with open(path, 'r') as temp_f:
            # Get number of columns in each line
            col_count = [ len(l.split(",")) for l in temp_f.readlines() ]

    # Generate column names  (names will be 0, 1, 2, ..., maximum columns - 1)
    column_names = [i for i in range(0, max(col_count))]
    return column_names

def split_clean(df):
    """
    Split dataframe by recordType and drop null values from dataframe
    Return three dataframes, header(2), track point(3), and flight plan(4) records
    """
    # Get all column names
    rec2_col = ['recType', 'recTime', 'fltKey', 'bcnCode', 'cId', 'source', 'msgType', 'AcId', 'recTypeCat', 'acType', 'Orig', 'Dest', 'OpsType', 'estOrig', 'estDest', 'modeSCode']
    rec3_col = ['recType', 'recTime', 'fltKey', 'bcnCode', 'cId', 'source', 'msgType', 'AcId', 'recTypeCat', 'coord1', 'coord2', 'alt', 'significance', 'coord1Accur', 'coord2Accur', 'altAccur', 'groundSpeed', 'course', 'rateOfClimb', 'altQualifier', 'altIndicator', 'trackPtStatus', 'leaderDir', 'scratchPad', 'msawInhibited', 'assignedAltString', 'controllingFac', 'controllingSec', 'receivingFac', 'receivingSec', 'activeContr', 'primaryContr', 'kybrdSubset', 'kybrdSymbol', 'adsCode', 'opsType', 'airportCode', 'trackNumber', 'tptReturnType', 'modeSCode', 'sensorTrack', 'spi', 'dvs', 'dupM3a', 'tid']
    rec4_col = ['recType', 'recTime', 'fltKey', 'bcnCode', 'cId', 'source', 'msgType', 'AcId', 'recTypeCat', 'acType', 'Orig', 'Dest','altCode', 'alt', 'maxAlt', 'assignedAltString', 'requestedAltString', 'route', 'esTime', 'fltCat', 'perfCat', 'opsType', 'equipList', 'coordinationTime', 'coordinationTimeType', 'leaderDir', 'scratchPad1', 'scratchPad2', 'fixPairScratchPad', 'prefDepArrRoute', 'prefDepRoute', 'prefArrRoute', 'coordinationPoint', 'coordinationPointType', 'trackNumber', 'modeSCode']
    rectype_cols = {2: len(rec2_col), 3: len(rec3_col), 4: len(rec4_col)}
    # Extract based on record type
    rec2 = df.loc[df[0] == 2, 0:rectype_cols[2]-1]
    rec3 = df.loc[df[0] == 3, 0:rectype_cols[3]-1]
    rec4 = df.loc[df[0] == 4, 0:rectype_cols[4]-1]

    # Rename header to correct column names
    rec2.set_axis(rec2_col, axis=1, inplace=True)
    rec3.set_axis(rec3_col, axis=1, inplace=True)
    rec4.set_axis(rec4_col, axis=1, inplace=True)

    # Replace any ? as NaN
    rec2.replace('?', np.nan, inplace=True)
    rec3.replace('?', np.nan, inplace=True)
    rec4.replace('?', np.nan, inplace=True)
    
    # print(f"Shape of each df: {rec2.shape[1]}, {rec3.shape[1]}, {rec4.shape[1]}")

    # Drop columns that only have null values
    rec2.dropna(axis=1, how='all', inplace=True)
    rec3.dropna(axis=1, how='all', inplace=True)
    rec4.dropna(axis=1, how='all', inplace=True)

    # print(f"After dropping nulls: {rec2.shape[1]}, {rec3.shape[1]}, {rec4.shape[1]}")

    return rec2, rec3, rec4

def read_gzip(path):
    """
    Unzip .gz files
    """
    with gzip.open(path, 'r') as temp_f:
        return pd.read_csv(temp_f)
    
def null_val(df):
    """
    Replace ? with NaN and drop any columns that only contain null values
    """
    # Replace any ? as NaN
    df.replace('?', np.nan, inplace=True)
    
    # print(f"Shape of each df: {df.shape[1]}")

    # Drop columns that only have null values
    df.dropna(axis=1, how='all', inplace=True)

    # print(f"After dropping nulls: {df.shape[1]}")
    return df

def sameval_cols(df):
    """
    Display columns if all values in the column are the same
    """
    nunique = df.nunique()
    cols_to_drop = nunique[nunique == 1].index
    print(cols_to_drop)
    return

def drop_sameval_cols(df):
    """
    Remove columns if all values in the column are the same
    """
    nunique = df.nunique()
    cols_to_drop = nunique[nunique == 1].index
    df.drop(cols_to_drop, axis=1, inplace=True)
    return df

In [4]:
# List out directory names
dir = ['IFF_2022_Final', 'EV_2022_Final', 'RD_2022_Final']

## IFF Data

### Read in data and split based on record type: Header records, track point rercords, and flight plan records.

In [5]:
# Obtain all files in the directory
iff_list = os.listdir(path=dir[0])

# Read in downloaded data
iff_master_df = {}
for i in range(len(iff_list)):
    path = dir[0] + '/' + iff_list[i]
    # Key will be the month and value will be dataframe
    iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
print(len(iff_master_df))

  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))
  iff_master_df[i+1] = pd.read_csv(path, header=None, skiprows=3, names=iff_read(path))


12


In [6]:
# Split the data by recordType
iff_splt_df = {}
for i in range(len(iff_master_df)):
    # Key will be the month and value will be a tuple of 3 dataframe
    iff_splt_df[i+1] = split_clean(iff_master_df[i+1])
print(iff_splt_df)

{1: (        recType       recTime  fltKey  bcnCode   source     AcId  recTypeCat  \
0             2  1.664612e+09   36037   7032.0      LAX  AAL2548           1   
3             2  1.664612e+09   36038   1145.0      LAX  JBU1223           1   
6             2  1.664613e+09   36049   4713.0      LAX   N883CE           1   
11            2  1.664613e+09   36053   3766.0      LAX  AAL2468           1   
14            2  1.664611e+09   36054    656.0  SMRADSB   ACA779           1   
...         ...           ...     ...      ...      ...      ...         ...   
796906        2  1.664696e+09   38659   1200.0      LAX     UNKN           1   
797444        2  1.664696e+09   38660   7250.0      LAX   EVA011           1   
798092        2  1.664697e+09   38661      NaN      LAX      O69           1   
798421        2  1.664697e+09   38662      NaN      LAX     UNKN           1   
798633        2  1.664697e+09   38664   1057.0     ADSB   AAL362           1   

       acType Orig Dest OpsType es

### Merge 12 months based on recordTypes

In [7]:
# For header records
header_ls = [iff_splt_df[i][0] for i in range(1, 13)]
iff_header_df = pd.concat(header_ls)
iff_header_df.shape

(28142, 14)

In [8]:
iff_header_df.head()

Unnamed: 0,recType,recTime,fltKey,bcnCode,source,AcId,recTypeCat,acType,Orig,Dest,OpsType,estOrig,estDest,modeSCode
0,2,1664612000.0,36037,7032.0,LAX,AAL2548,1,A319,,,,,,
3,2,1664612000.0,36038,1145.0,LAX,JBU1223,1,A321,,,,,,
6,2,1664613000.0,36049,4713.0,LAX,N883CE,1,F2TH,,,,,,ac2931
11,2,1664613000.0,36053,3766.0,LAX,AAL2468,1,A321,,,,,,
14,2,1664611000.0,36054,656.0,SMRADSB,ACA779,1,A333,,LAX,A,,LAX,c0582b


In [9]:
# For track point records
track_ls = [iff_splt_df[i][1] for i in range(1, 13)]
iff_track_df = pd.concat(track_ls)
iff_track_df.shape

(10169040, 22)

In [10]:
iff_track_df.head()

Unnamed: 0,recType,recTime,fltKey,bcnCode,source,msgType,AcId,recTypeCat,coord1,coord2,...,coord1Accur,coord2Accur,groundSpeed,course,rateOfClimb,trackPtStatus,scratchPad,airportCode,trackNumber,modeSCode
22,3,1664611000.0,36054,656.0,LAX+ASDEX,0xE02,ACA779,1,33.93542,-118.54539,...,0.5,0.5,164.0,82,-1500.0,0,unassigned,LAX,106.0,c0582b
23,3,1664611000.0,36054,656.0,LAX+ASDEX,0xE02,ACA779,1,33.93552,-118.54451,...,0.5,0.5,164.0,82,-1500.0,0,unassigned,LAX,106.0,c0582b
24,3,1664611000.0,36054,656.0,LAX+ASDEX,0xE02,ACA779,1,33.93558,-118.54363,...,0.5,0.5,164.0,85,2280.0,0,unassigned,LAX,106.0,c0582b
25,3,1664611000.0,36054,656.0,LAX+ASDEX,0xE02,ACA779,1,33.93567,-118.54275,...,0.5,0.5,163.0,83,-2640.0,0,unassigned,LAX,106.0,c0582b
26,3,1664611000.0,36054,656.0,LAX+ASDEX,0xE02,ACA779,1,33.9358,-118.54183,...,0.5,0.5,166.0,80,-2280.0,0,unassigned,LAX,106.0,c0582b


In [11]:
# For flight plan  records
flt_ls = [iff_splt_df[i][2] for i in range(1, 13)]
iff_flt_df = pd.concat(flt_ls)
iff_flt_df.shape

(188111, 18)

In [12]:
iff_flt_df.head()

Unnamed: 0,recType,recTime,fltKey,bcnCode,source,msgType,AcId,recTypeCat,acType,altCode,fltCat,perfCat,opsType,equipList,scratchPad1,scratchPad2,trackNumber,modeSCode
1,4,1664612000.0,36037,7032.0,ASR9,0xE02,AAL2548,1,A319,N,U,J,U,F/,,,272.0,
2,4,1664612000.0,36037,7032.0,LAX,0xE02,AAL2548,1,A319,N,U,J,U,F/,,,272.0,
4,4,1664612000.0,36038,1145.0,ASR9,0xE02,JBU1223,1,A321,N,U,J,U,F/,,,1234.0,
5,4,1664612000.0,36038,1145.0,LAX,0xE02,JBU1223,1,A321,N,U,J,U,F/,,,1234.0,
7,4,1664613000.0,36049,4713.0,ASR9,0xE02,N883CE,1,F2TH,N,U,J,U,H/,,,407.0,


In [13]:
# Identify columns that have same values in all datasets
sameval_cols(iff_header_df)
sameval_cols(iff_track_df)
sameval_cols(iff_flt_df)

Index(['recType', 'recTypeCat'], dtype='object')
Index(['recType', 'source', 'msgType', 'recTypeCat', 'trackPtStatus'], dtype='object')
Index(['recType', 'msgType', 'recTypeCat', 'altCode', 'fltCat', 'opsType'], dtype='object')


In [None]:
# Examine columns with same values
iff_header_df[['recType', 'recTypeCat']].head()

In [None]:
# Examine columns with same values
iff_track_df[['recType', 'source', 'msgType', 'recTypeCat', 'trackPtStatus']].head()

In [None]:
# Examine columns with same values
iff_flt_df[['recType', 'msgType', 'recTypeCat', 'altCode', 'fltCat', 'opsType']].head()

In [14]:
# Columns have uninteresting data, can remove these columns apply
iff_head = drop_sameval_cols(iff_header_df)
iff_trk = drop_sameval_cols(iff_track_df)
iff_flt = drop_sameval_cols(iff_flt_df)
print(iff_head.shape, iff_trk.shape, iff_flt.shape)

(28142, 12) (10169040, 17) (188111, 12)


### Save data to CSV

In [15]:
iff_head.to_csv('Sherlock_data/IFF_Header.csv', index=False)
iff_trk.to_csv('Sherlock_data/IFF_TrackPt.csv', index=False)
iff_flt.to_csv('Sherlock_data/IFF_Flight.csv', index=False)

## EV Data

In [16]:
# Obtain all files in designated directory
ev_list = os.listdir(dir[1])

In [17]:
# Read in EV datasets from the designated directory
ev_master_df = {}
for i in range(len(ev_list)):
    path = dir[1] + '/' + ev_list[i]
    # Key will be the month and value will be dataframe
    ev_master_df[i+1] = null_val(read_gzip(path))
    
print(len(ev_master_df))

12


In [18]:
ev_master_df[1].head()

Unnamed: 0,lKey,cKey,SysName,StartDate,StartTime,tMidnightSecs,tStartSecs,tStopSecs,tStart,tStop,...,cEv,vEv,rEv,DTD,FlD,DDT,FlT,EvNumInfo,EvCharInfo,EVVersion
0,32475,LAX+ASDEX_20220903_080418_32475,lax+asdex,09/03/2022,07:38:58,1662163000.0,1662191000.0,1662192000.0,27538.0,29058.0,...,83,161.0,-2280,0.0,0.0,0.0,0.0,0.0,,2
1,32475,LAX+ASDEX_20220903_080418_32475,lax+asdex,09/03/2022,07:38:58,1662163000.0,1662191000.0,1662192000.0,27538.0,29058.0,...,83,161.0,-2280,0.0,0.0,0.0,0.0,0.0,,2
2,32475,LAX+ASDEX_20220903_080418_32475,lax+asdex,09/03/2022,07:38:58,1662163000.0,1662191000.0,1662192000.0,27538.0,29058.0,...,83,161.0,-2280,0.0,0.0,0.0,0.0,0.0,,2
3,32475,LAX+ASDEX_20220903_080418_32475,lax+asdex,09/03/2022,07:38:58,1662163000.0,1662191000.0,1662192000.0,27538.0,29058.0,...,83,161.0,-2280,0.0,0.0,0.0,0.0,0.0,,2
4,32475,LAX+ASDEX_20220903_080418_32475,lax+asdex,09/03/2022,07:38:58,1662163000.0,1662191000.0,1662192000.0,27538.0,29058.0,...,82,172.1,-780,0.0,0.0,0.0,0.0,0.001,06R,2


In [19]:
# Combine 12 months data
ev = [ev_master_df[i] for i in range(1, 13)]
ev_df = pd.concat(ev)
ev_df.shape

(608047, 32)

In [20]:
# Check if there are any columns that are all 0
ev_df.columns[(ev_df == 0).all()]

Index([], dtype='object')

In [21]:
# Identify columns that have same values
sameval_cols(ev_df)

Index(['SysName', 'EVVersion'], dtype='object')


In [22]:
# Examine columns with same values
ev_df[['SysName', 'EVVersion']].head()

Unnamed: 0,SysName,EVVersion
0,lax+asdex,2
1,lax+asdex,2
2,lax+asdex,2
3,lax+asdex,2
4,lax+asdex,2


In [23]:
ev_df = drop_sameval_cols(ev_df)
print(ev_df.shape)

(608047, 30)


### Save to CSV

In [24]:
ev_df.to_csv('Sherlock_data/EV.csv', index=False)

## RD Data

In [25]:
rd_list = os.listdir(dir[2])

In [26]:
# Read in RD datasets from the designated directoyr
rd_master_df = {}
for i in range(len(rd_list)):
    path = dir[2] + '/' + rd_list[i]
    # Key will be the month and value will be dataframe
    rd_master_df[i+1] = null_val(read_gzip(path))
print(len(rd_master_df))

12


In [27]:
rd_master_df[1].head()

Unnamed: 0,lKey,cKey,SysName,StartDate,StartTime,tMidnightSecs,tStartSecs,tStopSecs,tStart,tStop,...,FIRs,Centers,TRACONs,Sectors,SUAs,SpAirsp,LclAirsp,ClssAirsp,equipList,RDVersion
0,4,LAX+ASDEX_20220806_080024_4,lax+asdex,08/06/2022,07:35:42,1659744000.0,1659771000.0,1659773000.0,27342.0,28824.0,...,NONE,OUTSIDE,NONE,NONE,OUTSIDE,NONE,NONE,NONE,F/,2
1,5,LAX+ASDEX_20220806_080141_5,lax+asdex,08/06/2022,07:38:14,1659744000.0,1659771000.0,1659773000.0,27494.0,28901.0,...,NONE,OUTSIDE,NONE,NONE,OUTSIDE,NONE,NONE,NONE,C/,2
2,6,LAX+ASDEX_20220806_080342_6,lax+asdex,08/06/2022,07:53:42,1659744000.0,1659772000.0,1659773000.0,28422.0,29022.0,...,NONE,OUTSIDE,NONE,NONE,OUTSIDE,NONE,NONE,NONE,F/,2
3,8,LAX+ASDEX_20220806_080523_8,lax+asdex,08/06/2022,07:55:37,1659744000.0,1659773000.0,1659773000.0,28537.0,29123.0,...,NONE,OUTSIDE,NONE,NONE,OUTSIDE,NONE,NONE,NONE,,2
4,9,LAX+ASDEX_20220806_081108_9,lax+asdex,08/06/2022,07:49:25,1659744000.0,1659772000.0,1659773000.0,28165.0,29468.0,...,NONE,OUTSIDE,NONE,NONE,OUTSIDE,NONE,NONE,NONE,B/,2


In [28]:
# Combine 12 months data
rd = [rd_master_df[i] for i in range(1, 13)]
rd_df = pd.concat(rd)
rd_df.shape

(24381, 60)

In [29]:
# Check if there are any columns that are all 'NONE'
rd_df.columns[(rd_df == "NONE").all()]

Index(['Systems', 'FIRs', 'TRACONs', 'Sectors', 'SpAirsp', 'LclAirsp',
       'ClssAirsp'],
      dtype='object')

In [30]:
# Drop columns with only 'NONE' in data
rd_df.replace('NONE', np.nan, inplace=True)

# print(f"Shape of each df: {rec2.shape[1]}, {rec3.shape[1]}, {rec4.shape[1]}")

# Drop columns that only have null values
rd_df.dropna(axis=1, how='all', inplace=True)
rd_df.shape

(24381, 53)

In [31]:
# Identify columns that have same values
sameval_cols(rd_df)

Index(['SysName', 'FltIdx', 'Cid', 'tTOC', 'tTOD', 'FFXName', 'FFXDist',
       'tFFX', 'ATACtrDep', 'ATAOutDep', 'ATACtrArr', 'ATATcnArr',
       'ETAThdArrO', 'ETAThdArrI', 'tOAGDep', 'tOAGArr', 'tOUT', 'tTAXI',
       'tIN', 'Centers', 'SUAs', 'RDVersion'],
      dtype='object')


In [32]:
# Examine columns with same values
pd.set_option('display.max_columns', None)
rd_df[['SysName', 'FltIdx', 'Cid', 'tTOC', 'tTOD', 'FFXName', 'FFXDist',
       'tFFX', 'ATACtrDep', 'ATAOutDep', 'ATACtrArr', 'ATATcnArr',
       'ETAThdArrO', 'ETAThdArrI', 'tOAGDep', 'tOAGArr', 'tOUT', 'tTAXI',
       'tIN', 'Centers', 'SUAs', 'RDVersion']].head()

Unnamed: 0,SysName,FltIdx,Cid,tTOC,tTOD,FFXName,FFXDist,tFFX,ATACtrDep,ATAOutDep,ATACtrArr,ATATcnArr,ETAThdArrO,ETAThdArrI,tOAGDep,tOAGArr,tOUT,tTAXI,tIN,Centers,SUAs,RDVersion
0,lax+asdex,0,-99,-9999999.0,-9999999.0,NO_FIX,-99999.898,-9999999.0,-9999999.0,-9999999.0,-9999999.0,-9999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OUTSIDE,OUTSIDE,2
1,lax+asdex,0,-99,-9999999.0,-9999999.0,NO_FIX,-99999.898,-9999999.0,-9999999.0,-9999999.0,-9999999.0,-9999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OUTSIDE,OUTSIDE,2
2,lax+asdex,0,-99,-9999999.0,-9999999.0,NO_FIX,-99999.898,-9999999.0,-9999999.0,-9999999.0,-9999999.0,-9999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OUTSIDE,OUTSIDE,2
3,lax+asdex,0,-99,-9999999.0,-9999999.0,NO_FIX,-99999.898,-9999999.0,-9999999.0,-9999999.0,-9999999.0,-9999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OUTSIDE,OUTSIDE,2
4,lax+asdex,0,-99,-9999999.0,-9999999.0,NO_FIX,-99999.898,-9999999.0,-9999999.0,-9999999.0,-9999999.0,-9999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,OUTSIDE,OUTSIDE,2


In [33]:
rd_df = drop_sameval_cols(rd_df)
print(rd_df.shape)

(24381, 31)


### Save to CSV

In [34]:
rd_df.to_csv('Sherlock_data/RD.csv', index=False)