Extracts 200k data points from the training data: 100k randomly sampled flights from each of January 2018 and January 2019.

In [1]:
import os
import re
import sys

import pandas as pd
import psycopg2

In [2]:
# Data dictionary for the flights table, written as a markdown bullet list.
# Bullets flagged with "Y" right after the dash are the columns selected for
# this extraction; that flag is what the regex in the following cell looks for.
my_vars = """
- Y **fl_date**: Flight Date (yyyy-mm-dd)
- Y **mkt_unique_carrier**: Unique Marketing Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **branded_code_share**: Reporting Carrier Operated or Branded Code Share Partners
- Y **mkt_carrier**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- Y **mkt_carrier_fl_num**: Flight Number
- Y **op_unique_carrier**: Unique Scheduled Operating Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users,for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- Y **tail_num**: Tail Number
- Y **op_carrier_fl_num**: Flight Number
- Y **origin_airport_id**: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- Y **origin**: Origin Airport
- Y **origin_city_name**: Origin Airport, City Name
- Y **dest_airport_id**: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- Y **dest**: Destination Airport
- Y **dest_city_name**: Destination Airport, City Name
- Y **crs_dep_time**: CRS Departure Time (local time: hhmm)
- **dep_time**: Actual Departure Time (local time: hhmm)
- Y **dep_delay**: Difference in minutes between scheduled and actual departure time. Early departures show negative numbers.	
- **taxi_out**: Taxi Out Time, in Minutes
- **wheels_off**: Wheels Off Time (local time: hhmm)
- **wheels_on**: Wheels On Time (local time: hhmm)
- **taxi_in**: 	Taxi In Time, in Minutes
- Y **crs_arr_time**: CRS Arrival Time (local time: hhmm)
- Y **arr_time**: Actual Arrival Time (local time: hhmm)
- Y **arr_delay**: Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers.
- **cancelled**: Cancelled Flight Indicator (1=Yes)
- **cancellation_code**: Specifies The Reason For Cancellation
- **diverted**: Diverted Flight Indicator (1=Yes)
- **dup**: Duplicate flag marked Y if the flight is swapped based on Form-3A data
- Y **crs_elapsed_time**: CRS Elapsed Time of Flight, in Minutes
- **actual_elapsed_time**: Elapsed Time of Flight, in Minutes
- Y **air_time**: Flight Time, in Minutes
- Y **flights**: Number of Flights
- Y **distance**: Distance between airports (miles)
- **carrier_delay**: Carrier Delay, in Minutes
- **weather_delay**: Weather Delay, in Minutes
- **nas_delay**: National Air System Delay, in Minutes
- **security_delay**: Security Delay, in Minutes
- **late_aircraft_delay**: Late Aircraft Delay, in Minutes
- Y **first_dep_time**: First Gate Departure Time at Origin Airport
- **total_add_gtime**: Total Ground Time Away from Gate for Gate Return or Cancelled Flight
- **longest_add_gtime**: Longest Time Away from Gate for Gate Return or Cancelled Flight
"""

In [3]:
# Each selected feature is a bullet of the form "- Y **name**: description".
# The pattern is greedy, but `.` never crosses a newline, so every match stops
# at the last `*` on its own line (e.g. "- Y **fl_date**").
# A raw string is used so `\*` reaches the regex engine untouched; in a plain
# string literal it is an invalid Python escape sequence and triggers a warning.
# (`re` is already imported in the first cell, so it is not re-imported here.)
feat_name = r'- Y.+\*'
feat_list = re.findall(feat_name, my_vars)

In [4]:
# Reduce every match (e.g. "- Y **fl_date**") to the bare column name.
# The literal prefix "- Y **" and the trailing asterisks are removed with
# anchored alternatives; a character class such as [- Y **] would instead
# delete every '-', 'Y', ' ' or '*' wherever it occurs inside the name.
feat_list_clean = [re.sub(r"^- Y \*\*|\*+$", "", feat) for feat in feat_list]

In [5]:
# Connection settings passed to psycopg2.connect().
# Each value is read from the matching PG* environment variable so that real
# credentials never have to be typed into (and saved with) the notebook.
# When a variable is not set, the original placeholder string is kept, which
# must then be replaced by hand before connecting.
param_dic = {
    'host': os.environ.get('PGHOST', '<enter host here>'),
    'database': os.environ.get('PGDATABASE', '<enter database>'),
    'user': os.environ.get('PGUSER', '<user>'),
    'port': os.environ.get('PGPORT', '<port>'),
    'password': os.environ.get('PGPASSWORD', '<password>')
}

In [6]:
def connect(param_dic):
    """Connect to the PostgreSQL database server.

    Parameters
    ----------
    param_dic : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With status 1 when the connection attempt raises; the error is
        printed first. (``sys`` must be imported at the top of the notebook.)
    """
    print('Connecting to the PostgreSQL database...')
    try:
        conn = psycopg2.connect(**param_dic)
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so catching
        # Exception alone covers everything the original tuple did.
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn

def postgres_to_df(conn, select_query, column_names):
    """Run a SELECT statement and return its rows as a pandas DataFrame.

    Parameters
    ----------
    conn : DB-API connection
        Open connection; only ``cursor()`` and ``rollback()`` are used.
    select_query : str
        SQL text handed to ``cursor.execute`` as-is.
    column_names : list of str
        Labels for the resulting DataFrame columns, in SELECT order.

    Returns
    -------
    pandas.DataFrame or int
        The fetched rows on success. If executing the query raises, the error
        is printed and the integer ``1`` is returned instead, so callers
        should check the result before treating it as a DataFrame.
    """
    # Use the connection that was passed in, not a notebook-level global.
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            print(f"Error: {error}")
            # Leave the connection usable for the next query: PostgreSQL keeps
            # a transaction aborted after a failed statement until rollback.
            conn.rollback()
            return 1
        # list of tuples, one per row
        res_tuples = cursor.fetchall()
    finally:
        # Close the cursor on every path, including a failing fetch.
        cursor.close()

    return pd.DataFrame(res_tuples, columns=column_names)

In [7]:
# Open the database connection that the sampling queries below are run on.
con = connect(param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [8]:
# Columns pulled from the `flights` table. This list is the single source of
# truth: it drives both the SELECT clause and the DataFrame column labels, so
# the two cannot drift out of order.
col_names = ['fl_date', 'mkt_unique_carrier', 'mkt_carrier',
'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id',
'dest', 'dest_city_name', 'crs_dep_time', 'dep_delay', 'crs_arr_time',
'arr_time', 'arr_delay', 'crs_elapsed_time', 'air_time', 'flights', 'distance', 'first_dep_time']

# Rows sampled per month (two months -> 200k rows in total).
SAMPLE_SIZE = 100000


def build_sample_query(month_prefix, limit=SAMPLE_SIZE, columns=col_names):
    """Return the SQL that draws a random sample of flights for one month.

    Parameters
    ----------
    month_prefix : str
        'yyyy-mm' prefix matched against ``fl_date`` with LIKE.
        NOTE(review): LIKE needs ``fl_date`` to be stored as text — confirm.
    limit : int
        Maximum number of rows returned.
    columns : list of str
        Column names placed in the SELECT clause, in order.

    The values are interpolated directly into the SQL text, so only pass
    trusted, hard-coded arguments. ``ORDER BY RANDOM()`` is unseeded, so each
    execution returns a different sample.
    """
    return (
        f"SELECT {', '.join(columns)}\n"
        "FROM flights\n"
        f"WHERE fl_date LIKE '{month_prefix}%'\n"
        "ORDER BY RANDOM()\n"
        f"LIMIT {limit};"
    )


query1 = build_sample_query('2019-01')
query2 = build_sample_query('2018-01')

In [9]:
# January 2019 sample (query1). postgres_to_df returns 1 instead of a
# DataFrame when the query fails, so check the printed output before moving on.
df_201901 = postgres_to_df(con, query1, col_names)

In [10]:
# January 2018 sample (query2). postgres_to_df returns 1 instead of a
# DataFrame when the query fails, so check the printed output before moving on.
df_201801 = postgres_to_df(con, query2, col_names)

In [11]:
# Stack the two monthly samples, January 2018 first and January 2019 second.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each sample's own row labels.
df = pd.concat([df_201801, df_201901], axis=0, ignore_index=True)

Processing Data

In [12]:
# Parse the flight date into datetimes and derive the weekday name from it.
flight_dates = pd.to_datetime(df['fl_date'])
df['fl_date'] = flight_dates
df['fl_day'] = flight_dates.dt.strftime("%A")

In [13]:
# Split the "<city>,<state>" names into separate city and state columns.
# n=1 splits on the first comma only, so a name containing a second comma
# still yields exactly the two columns being assigned.
# The state part keeps whatever whitespace followed the comma.
df[['origin_city','origin_state']] = df['origin_city_name'].str.split(',', n=1, expand=True)
df[['dest_city','dest_state']] = df['dest_city_name'].str.split(',', n=1, expand=True)

In [14]:
def _hhmm_to_nearest_hour(hhmm):
    """Convert CRS clock times (hhmm) into the nearest whole hour.

    Parameters
    ----------
    hhmm : pandas.Series
        Scheduled times as hhmm values; the leading zeros may be missing.

    Returns
    -------
    pandas.Series
        Integer hour of the time after rounding to the nearest hour.
    """
    # pad front with 0 so every value has four digits
    padded = hhmm.astype(str).str.zfill(4)
    # '2400' is not a valid %H%M time; treat it as midnight ('0000').
    # The result is assigned rather than replaced in place: an in-place
    # replace on a selected column does not reliably write back to the frame.
    padded = padded.replace('2400', '0000')
    # NOTE(review): rounding (not flooring) sends e.g. 23:45 to hour 0 —
    # confirm that is intended.
    # 'h' is the current spelling of the hourly alias ('H' is deprecated).
    return pd.to_datetime(padded, format="%H%M").dt.round('h').dt.hour


# Scheduled departure / arrival hour derived from the CRS times.
df['crs_dep_hour'] = _hhmm_to_nearest_hour(df['crs_dep_time'])
df['crs_arr_hour'] = _hhmm_to_nearest_hour(df['crs_arr_time'])

In [15]:
# Bucket each scheduled hour into one of four six-hour, left-closed bands
# and store the label as a categorical time-of-day column.
tod_cut_options = dict(
    bins=[0,6,12,18,24],
    labels=['overnight','morning','afternoon','evening'],
    right=False,
    include_lowest=True,
)
for hour_col, tod_col in (('crs_dep_hour', 'dep_tod'), ('crs_arr_hour', 'arr_tod')):
    df[tod_col] = pd.cut(df[hour_col], **tod_cut_options)

In [16]:
# ISO calendar week number of each flight date
iso_parts = df['fl_date'].dt.isocalendar()
df['week_num'] = iso_parts['week']

In [17]:
# Keep only the features used downstream, with the target (arr_delay) last.
# .copy() makes the result an independent frame, so the column added to it in
# a later cell is not an assignment into a slice of the wider frame.
df = df[['mkt_unique_carrier','op_unique_carrier','origin',
'origin_city','dest','dest_city',
'crs_elapsed_time','crs_dep_hour','dep_tod',
'crs_arr_hour','arr_tod','distance',
'fl_date','fl_day','week_num','arr_delay']].copy()

In [18]:
# Load the passenger table that is merged onto the flights below.
# The path is relative to the current working directory, so it may need
# adjusting depending on where the notebook is run from.
# The first CSV column is used as the row index.
df_pass = pd.read_csv('data/processed/toJoin_passenger.csv', index_col=[0])

In [19]:
# Composite join key "<operating carrier>-<origin>-<dest>"; it is matched
# against the passenger table's identifier1 column in the merge that follows.
df['identifier']= df['op_unique_carrier'] + "-" + df['origin'] + "-" + df['dest']

In [20]:
# Left-join passenger figures onto every flight via the composite route key,
# then discard the right-hand copy of that key.
flights_with_passengers = df.merge(
    df_pass,
    how='left',
    left_on='identifier',
    right_on='identifier1',
)
df_fl_pass = flights_with_passengers.drop(columns='identifier1')

In [21]:
# Columns the passenger merge duplicated (the *_y copies) plus its carrier code.
joined_cols_toDrop = ['distance_y', 'dest_y', 'origin_y', 'unique_carrier']
df_fl_pass = (
    df_fl_pass
    .dropna()                          # rows with any missing value go first
    .drop(columns=joined_cols_toDrop)  # then the redundant right-hand columns
    .rename(columns={'origin_x':'origin','dest_x':'dest','distance_x':'distance'})  # restore original names
)

In [22]:
# Final column order for the flight + passenger frame, target (arr_delay) last.
flight_passenger_cols = [
    'identifier','mkt_unique_carrier', 'op_unique_carrier',
    'origin','origin_city','dest',
    'dest_city','crs_elapsed_time', 'crs_dep_hour',
    'dep_tod','crs_arr_hour', 'arr_tod',
    'distance','fl_date', 'fl_day', 'week_num',
    'departures_performed', 'payload', 'passengers','freight',
    'air_time', 'distance_group','arr_delay',
]
df_fl_pass = df_fl_pass[flight_passenger_cols]

Merge with Fuel

In [23]:
# Load the fuel table that is merged onto the flights by carrier below.
# The path is relative to the working directory; the first CSV column is used
# as the row index.
df_fuel = pd.read_csv('data/processed/toJoin_fuel.csv', index_col=[0])

In [24]:
# Left-join the fuel figures onto every flight by operating carrier, then
# discard the fuel table's own carrier column.
flights_with_fuel = df_fl_pass.merge(
    df_fuel,
    how='left',
    left_on='op_unique_carrier',
    right_on='unique_carrier',
)
df_fpf = flights_with_fuel.drop(columns='unique_carrier')

In [25]:
# Discard every row that still has a missing value after the fuel merge.
df_fpf = df_fpf.dropna()

Merge with Weather

In [26]:
# Load the weather table that is merged onto the flights below.
# The path is relative to the working directory; the first CSV column is used
# as the row index.
df_weather = pd.read_csv('data/processed/weather20182019_cleaned.csv', index_col=[0])

In [27]:
# Join key for the weather merge: "<flight date>-<origin airport>".
# NOTE(review): the date is rendered by astype(str); presumably this has to
# match the format of the weather table's date_orig_id — verify.
df_fpf['weather_id']=df_fpf['fl_date'].astype(str) + "-" + df_fpf['origin']

In [40]:
# Left-join the weather observations onto every flight via the date/origin key.
df_flw = df_fpf.merge(
    df_weather,
    how='left',
    left_on='weather_id',
    right_on='date_orig_id',
)

In [41]:
# Fill gaps left by the weather merge from other flights that share the same
# origin city and date: forward-fill first, then back-fill, within each group.
# The built-in grouped fills are applied to the non-key columns only, so the
# frame keeps its flat row index and its grouping columns whatever the pandas
# version (groupby.apply with a frame-returning function does not guarantee
# either).
fill_keys = ['origin_city', 'fl_date']
fill_cols = [col for col in df_flw.columns if col not in fill_keys]
df_flw[fill_cols] = df_flw.groupby(fill_keys)[fill_cols].ffill()
df_flw[fill_cols] = df_flw.groupby(fill_keys)[fill_cols].bfill()

In [42]:
# Discard the rows that still contain a missing value.
df_flw = df_flw.dropna()

In [43]:
# Final feature set for the training file, target (arr_delay) last.
# .copy() makes the result an independent frame, so the delay columns added
# to it in the following cells are not assignments into a slice.
df_flw = df_flw[['mkt_unique_carrier', 'origin','origin_city',
    'dest', 'dest_city', 'crs_elapsed_time','crs_dep_hour',
    'dep_tod', 'crs_arr_hour', 'arr_tod', 'distance', 'distance_group',
    'fl_date', 'fl_day', 'week_num', 'departures_performed', 'payload',
    'passengers', 'freight', 'sdomt_gallons', 'tdomt_gallons','sdomt_cost',
    'tdomt_cost', 'AWND','PRCP', 'SNOW', 'SNWD', 'TAVG','arr_delay']].copy()

In [44]:
# Start from the raw arrival delay; the negative (early-arrival) values are
# set to zero in a later cell.
df_flw['arr_delay_pos'] = df_flw['arr_delay']

In [45]:
# Preview the first rows of the assembled frame.
df_flw.head()

Unnamed: 0,mkt_unique_carrier,origin,origin_city,dest,dest_city,crs_elapsed_time,crs_dep_hour,dep_tod,crs_arr_hour,arr_tod,...,tdomt_gallons,sdomt_cost,tdomt_cost,AWND,PRCP,SNOW,SNWD,TAVG,arr_delay,arr_delay_pos
0,UA,ORD,Chicago,MHT,Manchester,140.0,11,morning,14,afternoon,...,6214610.8,12885113.2,12885113.2,17.0,0.0,0.0,0.0,46.0,18.0,18.0
1,WN,AUS,Austin,DAL,Dallas,55.0,22,evening,23,evening,...,156532592.0,293397830.2,293803944.4,13.0,0.0,0.0,0.0,33.0,-11.0,-11.0
2,AA,DFW,Dallas/Fort Worth,IND,Indianapolis,120.0,15,afternoon,18,evening,...,163140817.6,264949334.8,265791415.6,15.0,0.0,0.0,0.0,38.0,7.0,7.0
4,AS,SEA,Seattle,PDX,Portland,45.0,15,afternoon,16,afternoon,...,0.0,0.0,0.0,38.0,15.0,0.0,0.0,83.0,-4.0,-4.0
5,DL,LGA,New York,STL,St. Louis,190.0,19,evening,21,evening,...,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,-25.0,-25.0


In [46]:
# Early arrivals (negative delay) count as zero delay.
is_early = df_flw['arr_delay_pos'] < 0
df_flw['arr_delay_pos'] = df_flw['arr_delay_pos'].mask(is_early, 0)

In [49]:
# Seed the binary delay flag from the non-negative delay; positive values are
# collapsed to 1 in the next cell.
df_flw['delay_binary']= df_flw['arr_delay_pos']

In [50]:
# Any positive delay becomes 1; zero stays 0.
was_delayed = df_flw['delay_binary'] > 0
df_flw['delay_binary'] = df_flw['delay_binary'].mask(was_delayed, 1)

In [52]:
# Write the finished training set to a CSV in the current working directory.
# The row index is written too (to_csv default), so it comes back as an extra
# leading column unless the file is read with index_col=0.
df_flw.to_csv('train_flights_complete_raw200.csv')