In [1]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [2]:
# Make the data dictionary holder
data_dict = {
    "time_delta": [],
    "soc_delta": [],
    "soh": [],
    "average_altitude": [],
    "ground_speed": [],
    "activity": [],
    "average_power": [],
    "id": [],
    "unique_data_identifier": [],
    "temperature": [],
    "visibility": [],
    "wind_speed": []
}

# Create a increasing int variable to keep track of the Unique Data Identifier
unique_identifier = 0

In [3]:
def data_parser(parsing_dataframe):

    global unique_identifier

    # Get the current exercise
    current_exercise = parsing_dataframe.iloc[0, parsing_dataframe.columns.get_loc('activity')]
    id = parsing_dataframe.iloc[0, parsing_dataframe.columns.get_loc('flight_id')]
    max_soc, min_soc = 0, 101
    power_list = []
    ground_speed_list = []
    alt_list = []
    soh_list = []
    min_time, max_time = 100, 0

    # iterate over all the rows
    for index, row in parsing_dataframe.iterrows():

        # Get the data needed from the rows. Append the power
        new_exercise = row["activity"]
        soc = row["soc"]
        alt_list.append(row["pressure_alt"])
        ground_speed_list.append(row["ground_speed"])
        power_list.append(row["motor_power"])
        soh_list.append(round(row["soh"]))
        time = row["time_min"]
        outside_temp = row["temperature"]
        visibility = row["visibility"]
        wind_speed = row["wind_speed"]

        # If the exercise changes or if the rows end.
        if current_exercise != new_exercise or index == len(parsing_dataframe) - 1:

            # Set the values 
            data_dict["time_delta"].append(round(max_time - min_time, 2))
            data_dict["soc_delta"].append(round(max_soc - min_soc, 2))
            data_dict["soh"].append(min(soh_list))
            data_dict["average_altitude"].append(round(alt_list[-1] - alt_list[0], 2))
            data_dict["ground_speed"].append(round(sum(ground_speed_list)/len(ground_speed_list), 2))
            data_dict["activity"].append(current_exercise)
            data_dict["average_power"].append(round(sum(power_list)/len(power_list), 2))
            data_dict["temperature"].append(outside_temp)
            data_dict["visibility"].append(visibility)
            data_dict["wind_speed"].append(wind_speed)
            data_dict["id"].append(id)
            data_dict["unique_data_identifier"].append(unique_identifier)
            unique_identifier = unique_identifier + 1

            # Reset all the values
            max_soc, min_soc = soc, soc
            power_list.clear()
            soh_list.clear()
            ground_speed_list.clear()
            alt_list.clear()
            max_time, min_time = time, time
        
        # SOC
        if soc >= max_soc:
            max_soc = soc
        if soc <= min_soc:
            min_soc = soc

        # TIME
        if time >= max_time:
            max_time = time
        if time <= min_time:
            min_time = time
        
        # Change current exercise
        current_exercise = new_exercise


In [4]:
# Flight IDs to include
# query the list of flight ids excluding these six which are already labelled
flight_ids = db_connect.get_flight_ids()
flight_ids = flight_ids['id'].to_list()
ids_avoid = [5190, 5192, 5109]

# remove the manually_labelled_ids from the list 
flight_ids = [id for id in flight_ids if id not in ids_avoid]

# Fetch data for specified flight IDs
# Remove NA values from each dataframe in the list and put it through the data parser
for ids in flight_ids:
    print(f"On flight: {ids}")
    frame = db_connect.connect_flight_for_ml_data_prescription(ids)
    frame = frame[frame["activity"] != "NA"].reset_index().dropna()
    if len(frame) > 0:
        data_parser(frame)
    else:
        print(ids)

# Concatenate data frames and shuffle the data
# all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)
all_data = pd.DataFrame(data_dict).dropna()

all_data.to_csv("ml_model_outputs/all_data.csv", index=False)

On flight: 5367
On flight: 5362
On flight: 5205
On flight: 5127
On flight: 5117
On flight: 5116
On flight: 5074
On flight: 5071
On flight: 5072
On flight: 5194
5194
On flight: 5096
On flight: 5034
On flight: 5021
On flight: 5019
On flight: 5025
5025
On flight: 5023
5023
On flight: 5013
On flight: 4999
On flight: 4994
On flight: 4987
On flight: 4983
On flight: 4931
On flight: 4917
On flight: 4910
On flight: 4908
On flight: 4903
On flight: 4868
On flight: 4842
On flight: 4845
On flight: 4795
On flight: 4792
On flight: 4790
On flight: 4785
On flight: 4780
On flight: 4862
On flight: 4871
On flight: 4620
On flight: 4622
On flight: 4633
On flight: 4636
On flight: 4669
On flight: 4766
On flight: 4783
On flight: 4802
On flight: 4850
On flight: 4853
On flight: 4857
On flight: 4860
On flight: 4901
On flight: 4904
On flight: 4906
On flight: 4909
On flight: 4915
On flight: 4919
On flight: 4921
On flight: 4923
On flight: 4925
On flight: 4927
On flight: 4929
On flight: 4936
On flight: 4938
On flight

In [5]:
print(all_data["activity"].unique())

['takeoff' 'climb' 'cruise' 'slow flight' 'power off stall' 'steep turns'
 'descent' 'landing' 'pre-flight' 'HASEL' 'steep turn' 'post-flight'
 'power on stall']


In [7]:
all_data.loc[all_data["activity"] == "pre-flight", :]

Unnamed: 0,time_delta,soc_delta,soh,average_altitude,ground_speed,activity,average_power,id,unique_data_identifier,temperature,visibility,wind_speed
33,17.06,4.0,84,178.61,4.38,pre-flight,1.44,5362,33,50.0,9.0,14
158,12.2,4.0,80,307.99,2.65,pre-flight,1.77,5116,158,48.2,9.0,3
416,11.1,2.0,96,305.28,2.84,pre-flight,2.3,4868,416,60.8,9.0,0
433,14.04,2.0,98,302.33,1.11,pre-flight,1.53,4842,433,51.8,9.0,2
876,13.78,1.0,97,307.43,0.0,pre-flight,1.61,4636,876,57.2,9.0,0
1407,9.8,3.0,90,309.08,1.24,pre-flight,3.18,4925,1407,51.8,9.0,7
1613,15.08,3.0,86,312.79,4.98,pre-flight,2.41,4978,1613,50.0,9.0,8
