In [14]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [15]:
path_to_data = r"../data/raw/ontime"

In [16]:
# This function reduces the memory usage of a DataFrame by downcasting numeric types
def downcast(df, verbose = True):
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == "object":
            pass
        elif dtype_name == "bool":
            df[col] = df[col].astype("int8")
        elif dtype_name.startswith("int") or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast = "integer")
        else:
            df[col] = pd.to_numeric(df[col], downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        print("{:.1f}% compressed".format(100 * (start_memory - end_memory) / start_memory))

    return df

In [17]:
df = pd.read_csv(f'{path_to_data}/_2018_1/_2018_1.csv', low_memory=False)


In [18]:
df

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,...,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2018,1,1,14,7,2018-01-14,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
1,2018,1,1,15,1,2018-01-15,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
2,2018,1,1,16,2,2018-01-16,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
3,2018,1,1,17,3,2018-01-17,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
4,2018,1,1,18,4,2018-01-18,UA,UA_CODESHARE,19977,UA,...,,,,,,,,,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621393,2018,1,1,23,2,2018-01-23,AA,AA,19805,AA,...,,,,,,,,,N,
621394,2018,1,1,24,3,2018-01-24,AA,AA,19805,AA,...,,,,,,,,,N,
621395,2018,1,1,25,4,2018-01-25,AA,AA,19805,AA,...,,,,,,,,,N,
621396,2018,1,1,26,5,2018-01-26,AA,AA,19805,AA,...,,,,,,,,,N,


In [19]:
list(df.columns)

['Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'FlightDate',
 'Marketing_Airline_Network',
 'Operated_or_Branded_Code_Share_Partners',
 'DOT_ID_Marketing_Airline',
 'IATA_Code_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'Originally_Scheduled_Code_Share_Airline',
 'DOT_ID_Originally_Scheduled_Code_Share_Airline',
 'IATA_Code_Originally_Scheduled_Code_Share_Airline',
 'Flight_Num_Originally_Scheduled_Code_Share_Airline',
 'Operating_Airline ',
 'DOT_ID_Operating_Airline',
 'IATA_Code_Operating_Airline',
 'Tail_Number',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'Origin',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'Dest',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'CRSDepTime',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'DepartureDelayGroups',
 '

In [20]:
df = df[[
 'FlightDate',
 'Marketing_Airline_Network',
 'DOT_ID_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'DOT_ID_Originally_Scheduled_Code_Share_Airline',
 'Flight_Num_Originally_Scheduled_Code_Share_Airline',
 'DOT_ID_Operating_Airline',
 'Tail_Number',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginCityMarketID',
 'OriginState',
 'DestAirportID',
 'DestCityMarketID',
 'DestState',
 'DepTime',
 'DepDelay',
 'WheelsOff',
 'WheelsOn',
 'ArrTime',
 'ArrDelay',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'ActualElapsedTime',
 'Flights',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'Duplicate']]

In [21]:
df = downcast(df, verbose = True)

48.1% compressed


In [22]:
df

Unnamed: 0,FlightDate,Marketing_Airline_Network,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Originally_Scheduled_Code_Share_Airline,Flight_Num_Originally_Scheduled_Code_Share_Airline,DOT_ID_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,...,Diverted,ActualElapsedTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Duplicate
0,2018-01-14,UA,19977,4443,,,20366,N14902,4443,11618,...,0,151.0,1,628,,,,,,N
1,2018-01-15,UA,19977,4443,,,20366,N12167,4443,11618,...,0,141.0,1,628,41.0,0.0,0.0,0.0,0.0,N
2,2018-01-16,UA,19977,4443,,,20366,N11109,4443,11618,...,0,131.0,1,628,67.0,0.0,0.0,0.0,2.0,N
3,2018-01-17,UA,19977,4443,,,20366,N16546,4443,11618,...,0,138.0,1,628,,,,,,N
4,2018-01-18,UA,19977,4443,,,20366,N11165,4443,11618,...,0,134.0,1,628,,,,,,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621393,2018-01-23,AA,19805,407,,,19805,N170US,407,14100,...,0,191.0,1,1013,,,,,,N
621394,2018-01-24,AA,19805,407,,,19805,N979UY,407,14100,...,0,179.0,1,1013,,,,,,N
621395,2018-01-25,AA,19805,407,,,19805,N182UW,407,14100,...,0,173.0,1,1013,,,,,,N
621396,2018-01-26,AA,19805,407,,,19805,N580UW,407,14100,...,0,155.0,1,1013,,,,,,N


In [24]:
# Get the list of all csv files in path_to_data and all subfolders
csv_files = []
for root, dirs, files in os.walk(path_to_data):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))
csv_files

['../data/raw/ontime\\_2018_1\\_2018_1.csv',
 '../data/raw/ontime\\_2018_10\\_2018_10.csv',
 '../data/raw/ontime\\_2018_11\\_2018_11.csv',
 '../data/raw/ontime\\_2018_12\\_2018_12.csv',
 '../data/raw/ontime\\_2018_2\\_2018_2.csv',
 '../data/raw/ontime\\_2018_3\\_2018_3.csv',
 '../data/raw/ontime\\_2018_4\\_2018_4.csv',
 '../data/raw/ontime\\_2018_5\\_2018_5.csv',
 '../data/raw/ontime\\_2018_6\\_2018_6.csv',
 '../data/raw/ontime\\_2018_7\\_2018_7.csv',
 '../data/raw/ontime\\_2018_8\\_2018_8.csv',
 '../data/raw/ontime\\_2018_9\\_2018_9.csv',
 '../data/raw/ontime\\_2019_1\\_2019_1.csv',
 '../data/raw/ontime\\_2019_10\\_2019_10.csv',
 '../data/raw/ontime\\_2019_11\\_2019_11.csv',
 '../data/raw/ontime\\_2019_12\\_2019_12.csv',
 '../data/raw/ontime\\_2019_2\\_2019_2.csv',
 '../data/raw/ontime\\_2019_3\\_2019_3.csv',
 '../data/raw/ontime\\_2019_4\\_2019_4.csv',
 '../data/raw/ontime\\_2019_5\\_2019_5.csv',
 '../data/raw/ontime\\_2019_6\\_2019_6.csv',
 '../data/raw/ontime\\_2019_7\\_2019_7.csv'

In [None]:
def load_data(file):
    df = pd.read_csv(file, low_memory=False)
    df = df[[
     'FlightDate',
     'Marketing_Airline_Network',
     'DOT_ID_Marketing_Airline',
     'Flight_Number_Marketing_Airline',
     'DOT_ID_Originally_Scheduled_Code_Share_Airline',
     'Flight_Num_Originally_Scheduled_Code_Share_Airline',
     'DOT_ID_Operating_Airline',
     'Tail_Number',
     'Flight_Number_Operating_Airline',
     'OriginAirportID',
     'OriginCityMarketID',
     'OriginState',
     'DestAirportID',
     'DestCityMarketID',
     'DestState',
     'DepTime',
     'DepDelay',
     'WheelsOff',
     'WheelsOn',
     'ArrTime',
     'ArrDelay',
     'Cancelled',
     'CancellationCode',
     'Diverted',
     'ActualElapsedTime',
     'Flights',
     'Distance',
     'CarrierDelay',
     'WeatherDelay',
     'NASDelay',
     'SecurityDelay',
     'LateAircraftDelay']]
    return df

In [None]:
# Loop through the files, limiting columns and appending to a df
for file in csv_files:
    if file == csv_files[0]:
        df_main = load_data(file)
    else:
        df_temp = load_data(file)
        df_main = pd.concat([df_main, df_temp], ignore_index=True)

df_main = downcast(df_main, verbose = True)
df_main