This notebook is in its testing phase. It uses a linear regression ML model to predict an SOC value for the flights.

In [None]:
# Import the querying module
from flight_querying import query_flights
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Create the query helper that later cells use to list flight ids and to fetch
# per-flight data frames.
# NOTE(review): presumably this also opens the database connection — confirm in flight_querying.
db_connect = query_flights()

In [None]:
# Column-oriented accumulator: each key maps to a list that data_parser appends
# to, one entry per parsed segment. It is turned into a DataFrame later on.
data_dict = {
    column_name: []
    for column_name in (
        "Time Delta",
        "SOC Delta",
        "Activity",
        "Average Power",
        "Id",
        "Unique Data Identifier",
        "temperature",
        "Visibility",
        "Wind Speed",
    )
}

# Monotonically increasing counter that supplies the "Unique Data Identifier"
# value for each appended segment.
unique_identifier = 0

In [None]:
def data_parser(parsing_dataframe):
    """Summarise one flight's rows into per-activity segments.

    A segment is a maximal run of consecutive rows sharing the same
    ``activity`` value. For every segment, one entry is appended to each list
    in the module-level ``data_dict`` and the module-level
    ``unique_identifier`` counter is advanced by one.

    Per segment the following is recorded:
      * "Time Delta":    max(time) - min(time), rounded to 2 decimals
      * "SOC Delta":     max(soc) - min(soc)
      * "Activity":      the segment's activity value
      * "Average Power": mean of the segment's power values, rounded to 2 decimals
      * "temperature", "Visibility", "Wind Speed": values from the segment's last row
      * "Id":            the ``id`` value of the frame's first row
      * "Unique Data Identifier": current value of ``unique_identifier``

    Args:
        parsing_dataframe: DataFrame with the columns ``id``, ``activity``,
            ``soc``, ``power``, ``time``, ``temperature``, ``visibility`` and
            ``wind_speed``. Rows are processed in their existing order; the
            index values are not used.

    Returns:
        None. Results are written to the globals ``data_dict`` and
        ``unique_identifier``.
    """
    global unique_identifier

    # Nothing to summarise; also avoids an IndexError on the .iloc[0] below.
    if len(parsing_dataframe) == 0:
        return

    flight_id = parsing_dataframe.iloc[0, parsing_dataframe.columns.get_loc('id')]

    # Give every run of identical consecutive activities its own integer label:
    # a row starts a new run whenever its activity differs from the previous row.
    activity = parsing_dataframe["activity"]
    segment_labels = (activity != activity.shift()).cumsum().to_numpy()

    # Grouping by a positional array (not by index) means the statistics of a
    # segment are computed from exactly that segment's rows, regardless of how
    # the frame is indexed. sort=False keeps the segments in row order.
    for _, segment in parsing_dataframe.groupby(segment_labels, sort=False):
        segment_time = segment["time"]
        segment_soc = segment["soc"]
        segment_power = segment["power"].tolist()

        data_dict["Time Delta"].append(round(segment_time.max() - segment_time.min(), 2))
        data_dict["SOC Delta"].append(segment_soc.max() - segment_soc.min())
        data_dict["Activity"].append(segment["activity"].iloc[0])
        data_dict["Average Power"].append(round(sum(segment_power) / len(segment_power), 2))
        data_dict["temperature"].append(segment["temperature"].iloc[-1])
        data_dict["Visibility"].append(segment["visibility"].iloc[-1])
        data_dict["Wind Speed"].append(segment["wind_speed"].iloc[-1])
        data_dict["Id"].append(flight_id)
        data_dict["Unique Data Identifier"].append(unique_identifier)
        unique_identifier += 1


In [None]:
# Build the modelling dataset: fetch every flight id, drop the excluded ones,
# parse each remaining flight into activity segments, and collect the result
# into a single DataFrame.

# All flight ids returned by the query helper.
flight_ids = db_connect.get_flight_ids()
flight_ids = flight_ids['id'].to_list()

# Flights to leave out of the dataset.
# NOTE(review): described elsewhere as already (manually) labelled flights — confirm.
ids_avoid = [5190, 5192, 5109]

# Remove the excluded ids from the list.
excluded_ids = set(ids_avoid)
flight_ids = [flight_id for flight_id in flight_ids if flight_id not in excluded_ids]
print(flight_ids)

# Start from empty accumulators so that re-running this cell rebuilds the
# dataset instead of appending a second copy of every segment.
for column_values in data_dict.values():
    column_values.clear()
unique_identifier = 0

# Fetch each flight, drop rows whose activity is the string "NA", and feed the
# remaining rows to data_parser (which appends to data_dict).
for flight_id in flight_ids:
    frame = db_connect.connect_flight_for_ml_data_prescription(flight_id)
    frame = frame[frame["activity"] != "NA"].reset_index(drop=True)
    if len(frame) > 0:
        data_parser(frame)
    else:
        # No usable rows for this flight: print its id so it can be inspected.
        print(flight_id)

# One row per parsed segment; segments with any missing value are discarded.
all_data = pd.DataFrame(data_dict).dropna()

In [None]:
# Split the data into train and test sets: 30% of the rows are held out for
# testing (test_size=0.3) and random_state pins the shuffle so the split is
# the same on every run.
train_data, test_data = train_test_split(all_data, test_size=0.3, random_state=43)

In [None]:
# Preview the first rows of the assembled dataset.
all_data.head()

In [None]:
# Report the number of rows in each split. The labels name the variables that
# are actually measured (the train and test frames).
print(f"Length train_data = {len(train_data)} \n Length test_data = {len(test_data)}")

One-hot encoding of the Activity column

In [None]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(df, col, pre):
    """Return indicator columns for ``df[col]`` together with the row identifier.

    Each distinct value of ``df[col]`` becomes one indicator column named
    ``<col>_<pre>_<value>``. The ``'Unique Data Identifier'`` column of ``df``
    is copied onto the result so it can be merged back onto ``df``.

    Args:
        df: Source DataFrame; must contain ``col`` and 'Unique Data Identifier'.
        col: Name of the column to encode.
        pre: Prefix passed to ``pd.get_dummies`` for the indicator columns.

    Returns:
        DataFrame with the indicator columns followed by
        'Unique Data Identifier', sharing ``df``'s index.
    """
    dummies = pd.get_dummies(df[col], prefix=pre).add_prefix(col + "_")
    dummies['Unique Data Identifier'] = df['Unique Data Identifier']
    return dummies

In [None]:
def _encode_split(split_data):
    """Turn one data split into a (features, target) pair.

    The "Activity" column is one-hot encoded and merged back on
    "Unique Data Identifier"; "SOC Delta" becomes the target array, and the
    target, identifier and raw activity columns are removed from the features.
    """
    encoded = one_hot(split_data, "Activity", 'is')
    merged = pd.merge(split_data, encoded, on=["Unique Data Identifier"])
    target = merged["SOC Delta"].to_numpy()
    features = merged.drop(columns=["SOC Delta", "Id", "Unique Data Identifier", "Activity"])
    return features, target


# Encode train and test data with the same procedure.
final_train_x, final_train_y = _encode_split(train_data)
final_test_x, final_test_y = _encode_split(test_data)

# The two splits are encoded independently, so an activity that occurs in only
# one of them yields a column the other lacks. Align the test features to the
# training columns (same set, same order): indicator columns missing from the
# test split are added as all-False, and columns unseen in training are dropped.
final_test_x = final_test_x.reindex(columns=final_train_x.columns, fill_value=False)

In [None]:
# Display the encoded training feature frame.
final_train_x

In [None]:
# Sanity check: row counts of the one-hot frames built for the test and train splits.
print(f"Length test_encoded = {len(test_encoded)} \n Length train_encoded = {len(train_encoded)}")

In [None]:
# Sanity check: the training features and targets should have the same length.
print(f"Length train_x = {len(final_train_x)} \n Length train_y = {len(final_train_y)}")

In [None]:
# Sanity check: the test features and targets should have the same length.
print(f"Length test_x = {len(final_test_x)} \n Length test_y = {len(final_test_y)}")

Machine Learning Model Implementation

In [None]:
# import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the linear regression estimator and fit it on the training split in
# one step (fit returns the fitted estimator itself).
regression_model = LinearRegression().fit(final_train_x, final_train_y)
# Leave the fitted estimator as the cell's last expression so it is displayed.
regression_model

In [None]:
# Give the test features exactly the columns the model was fitted on, in the
# same order. Activity indicator columns that are absent from the test split
# (e.g. no "power on stall" rows were sampled into it) are added as all-False;
# columns the model never saw during fitting are dropped.
final_test_x = final_test_x.reindex(columns=regression_model.feature_names_in_, fill_value=False)

In [None]:
# Make predictions using the testing set; y_pred is compared against
# final_test_y in the evaluation cells below.
y_pred = regression_model.predict(final_test_x)

In [None]:
# Print the model score on the test set (for scikit-learn regressors, score()
# returns the coefficient of determination R^2).
print(regression_model.score(final_test_x, final_test_y))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error between the actual and predicted test targets.
test_mse = mean_squared_error(final_test_y, y_pred)
print(f"Mean squared error: {test_mse:.5f}")

# Coefficient of determination (1 means perfect prediction).
test_r2 = r2_score(final_test_y, y_pred)
print(f"Coefficient of determination: {test_r2:.2f}")

In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual target values on the test set. Axis ticks are kept so
# the scale of the values (and of the errors) can be read off the figure.
fig, ax = plt.subplots()
ax.scatter(final_test_y, y_pred, color="black")

# Identity line: a perfect prediction would place every point on it.
axis_low = min(final_test_y.min(), y_pred.min())
axis_high = max(final_test_y.max(), y_pred.max())
ax.plot([axis_low, axis_high], [axis_low, axis_high], linestyle="--", color="gray")

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Predicted vs. actual SOC Delta (test set)")

plt.show()

In [None]:
# Table pairing every fitted feature name with its learned coefficient,
# displayed in ascending order of weight.
coeff = pd.DataFrame({
    'Feature': regression_model.feature_names_in_,
    'Weight': regression_model.coef_,
})
coeff.sort_values('Weight')

In [None]:
import joblib

from pathlib import Path

# Save the model to a file
model_filename = 'ML_model_outputs/prescription_linreg_model.joblib'

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(regression_model, model_filename)

print(f"Model saved to {model_filename}")