In [1]:
# Data dictionary for the flights table, kept as markdown-style text.
# Entries marked with a leading "Y" (i.e. lines starting "- Y **name**") are the
# ones picked up by the "- Y" regular expression applied to this string below.
my_vars = """
- Y **fl_date**: Flight Date (yyyy-mm-dd)
- Y **mkt_unique_carrier**: Unique Marketing Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **branded_code_share**: Reporting Carrier Operated or Branded Code Share Partners
- Y **mkt_carrier**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- Y **mkt_carrier_fl_num**: Flight Number
- Y **op_unique_carrier**: Unique Scheduled Operating Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users,for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- Y **tail_num**: Tail Number
- Y **op_carrier_fl_num**: Flight Number
- Y **origin_airport_id**: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- Y **origin**: Origin Airport
- Y **origin_city_name**: Origin Airport, City Name
- Y **dest_airport_id**: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- Y **dest**: Destination Airport
- Y **dest_city_name**: Destination Airport, City Name
- Y **crs_dep_time**: CRS Departure Time (local time: hhmm)
- **dep_time**: Actual Departure Time (local time: hhmm)
- Y **dep_delay**: Difference in minutes between scheduled and actual departure time. Early departures show negative numbers.	
- **taxi_out**: Taxi Out Time, in Minutes
- **wheels_off**: Wheels Off Time (local time: hhmm)
- **wheels_on**: Wheels On Time (local time: hhmm)
- **taxi_in**: 	Taxi In Time, in Minutes
- Y **crs_arr_time**: CRS Arrival Time (local time: hhmm)
- Y **arr_time**: Actual Arrival Time (local time: hhmm)
- Y **arr_delay**: Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers.
- **cancelled**: Cancelled Flight Indicator (1=Yes)
- **cancellation_code**: Specifies The Reason For Cancellation
- **diverted**: Diverted Flight Indicator (1=Yes)
- **dup**: Duplicate flag marked Y if the flight is swapped based on Form-3A data
- Y **crs_elapsed_time**: CRS Elapsed Time of Flight, in Minutes
- **actual_elapsed_time**: Elapsed Time of Flight, in Minutes
- Y **air_time**: Flight Time, in Minutes
- Y **flights**: Number of Flights
- Y **distance**: Distance between airports (miles)
- **carrier_delay**: Carrier Delay, in Minutes
- **weather_delay**: Weather Delay, in Minutes
- **nas_delay**: National Air System Delay, in Minutes
- **security_delay**: Security Delay, in Minutes
- **late_aircraft_delay**: Late Aircraft Delay, in Minutes
- Y **first_dep_time**: First Gate Departure Time at Origin Airport
- **total_add_gtime**: Total Ground Time Away from Gate for Gate Return or Cancelled Flight
- **longest_add_gtime**: Longest Time Away from Gate for Gate Return or Cancelled Flight
"""

In [2]:
import re
# Matches each "- Y **name**" entry of the data dictionary: `.` does not cross
# newlines and `.+` is greedy, so the match runs to the last `*` on the line,
# which is the closing `**` of the field name.
# Written as a raw string so that `\*` reaches the regex engine as an escaped
# asterisk instead of being an invalid Python string escape sequence.
feat_name = r'- Y.+\*'
feat_list = re.findall(feat_name, my_vars)

In [3]:
# Strip only the leading "- Y **" marker and the trailing "**" from each match.
# A character class such as [- Y **] would instead delete every '-', ' ', 'Y'
# and '*' anywhere in the text, corrupting any field name that contains one of
# those characters.
feat_list_clean = [re.sub(r"^- Y \*\*|\*\*$", "", feat) for feat in feat_list]

In [4]:
# Show the cleaned field names extracted from the "Y"-marked dictionary entries.
print(feat_list_clean)

['fl_date', 'mkt_unique_carrier', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_delay', 'crs_arr_time', 'arr_time', 'arr_delay', 'crs_elapsed_time', 'air_time', 'flights', 'distance', 'first_dep_time']


#### Extracting from PostgreSQL

In [5]:
import os
import sys

import pandas as pd
import psycopg2

In [6]:
# Keyword arguments for psycopg2.connect().
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) the notebook.
# The fallbacks are the values that were originally hard-coded here, so the
# notebook behaves exactly as before when no variable is set.
param_dic = {
    'host': os.environ.get(
        'FLIGHTS_DB_HOST',
        'lhl-data-bootcamp.crzjul5qln0e.ca-central-1.rds.amazonaws.com',
    ),
    'database': os.environ.get('FLIGHTS_DB_NAME', 'mid_term_project'),
    'user': os.environ.get('FLIGHTS_DB_USER', 'lhl_student'),
    'port': os.environ.get('FLIGHTS_DB_PORT', '5432'),
    'password': os.environ.get('FLIGHTS_DB_PASSWORD', 'lhl_student'),
}

In [7]:
def connect(param_dic):
    """Connect to the PostgreSQL database server.

    Args:
        param_dic: Mapping of keyword arguments forwarded unchanged to
            ``psycopg2.connect`` (e.g. host, database, user, port, password).

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        SystemExit: With exit status 1 if the connection attempt fails for any
            reason; the underlying error is printed first.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        # connect to the PostgreSQL server
        conn = psycopg2.connect(**param_dic)
    except Exception as error:
        # `Exception` already covers psycopg2's error classes, so one clause
        # is enough. Report the cause, then stop instead of continuing
        # without a usable connection.
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn

In [8]:
# Open the database connection that is passed to the query cells below;
# connect() prints its progress and exits on failure.
con = connect(param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [9]:
def postgres_to_df(conn, select_query, column_names):
    """
    Transforms a SELECT query into a pandas dataframe

    Args:
        conn: Open database connection on which the query is executed.
        select_query: SQL text handed to ``cursor.execute``.
        column_names: Labels for the DataFrame columns, in the same order as
            the columns returned by the query.

    Returns:
        A DataFrame with one row per fetched record, or the integer ``1`` if
        executing the query fails (the error is printed). The ``1`` sentinel
        is kept for backward compatibility with existing callers.
    """
    # Use the connection that was passed in rather than a notebook-level
    # global, so the function works with any connection object.
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            print(f"Error: {error}")
            # With psycopg2 a failed statement leaves the open transaction in
            # an aborted state; roll back so the connection stays usable for
            # the next query.
            conn.rollback()
            return 1

        # list of tuples
        res_tuples = cursor.fetchall()
    finally:
        # Always release the cursor, including when fetching fails.
        cursor.close()

    # return to dataframe
    return pd.DataFrame(res_tuples, columns=column_names)

In [10]:
# Columns pulled from the `flights` table. This single list drives both the
# SELECT clause of the queries and the DataFrame column labels, so the two
# cannot drift apart.
col_names = [
    'fl_date', 'mkt_unique_carrier', 'mkt_carrier',
    'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
    'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id',
    'dest', 'dest_city_name', 'crs_dep_time', 'dep_delay', 'crs_arr_time',
    'arr_time', 'arr_delay', 'crs_elapsed_time', 'air_time', 'flights',
    'distance', 'first_dep_time',
]


def build_monthly_sample_query(year_month, limit=25000):
    """Build the SQL that draws a random sample of one month's flights.

    Args:
        year_month: Date prefix such as '2019-01'; rows are kept when
            ``fl_date`` starts with it (``LIKE '<year_month>%'``).
        limit: Maximum number of rows to return.

    Returns:
        The SQL text as a string.

    Note:
        The values are interpolated directly into the SQL text, so only pass
        trusted, notebook-defined constants. ``ORDER BY RANDOM()`` means each
        execution returns a different sample.
    """
    return (
        f"SELECT {', '.join(col_names)}\n"
        "FROM flights\n"
        f"WHERE fl_date LIKE '{year_month}%'\n"
        "ORDER BY RANDOM()\n"
        f"LIMIT {limit};"
    )


query1 = build_monthly_sample_query('2019-01')
query2 = build_monthly_sample_query('2018-01')

In [11]:
# Run query1 (January 2019 sample) and label the result with col_names.
df_201901 = postgres_to_df(con, query1, col_names)

In [12]:
# Run query2 (January 2018 sample) and label the result with col_names.
df_201801 = postgres_to_df(con, query2, col_names)

In [13]:
# Stack the 2019 and 2018 January samples into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based index, leaving duplicate labels in the combined frame
# (so label-based lookups such as .loc[0] would return two rows).
df_Jan_flights = pd.concat([df_201901, df_201801], ignore_index=True)

In [14]:
# Sanity check on the combined sample: (number of rows, number of columns).
df_Jan_flights.shape

(50000, 23)

In [16]:
# Write the combined sample to df_jan_flights.csv in the current working directory.
# NOTE(review): to_csv writes the DataFrame index as an extra, unnamed first
# column by default — confirm that downstream readers expect it.
df_Jan_flights.to_csv('df_jan_flights.csv')