This is a program which is in testing phase. We will use a linear regression ML model to give an SOC number for the flights.

In [1]:
# Import the querying module
from flight_querying import query_flights
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [2]:
# Make the data dictionary holder
data_dict = {
    "Time Delta": [],
    "SOC Delta": [],
    "Activity": [],
    "Average Power": [],
    "Id": [],
    "Unique Data Identifier": []
}

# Create a increasing int variable to keep track of the Unique Data Identifier
unique_identifier = 0

In [3]:
def data_parser(parsing_dataframe):

    global unique_identifier

    # Get the current exercise
    current_exercise = parsing_dataframe.iloc[0, parsing_dataframe.columns.get_loc('activity')]
    id = parsing_dataframe.iloc[0, parsing_dataframe.columns.get_loc('id')]
    max_soc, min_soc = 0, 101
    power_list = [0]
    min_time, max_time = 100, 0

    # iterate over all the rows
    for index, row in parsing_dataframe.iterrows():

        # Get the data needed from the rows. Append the power
        new_exercise = row["activity"]
        soc = row["soc"]
        power_list.append(row["power"])
        time = row["time"]

        # If the exercise changes or if the rows end.
        if current_exercise != new_exercise or index == len(parsing_dataframe) - 1:

            # Set the values 
            data_dict["Time Delta"].append(round(max_time - min_time, 2))
            data_dict["SOC Delta"].append(max_soc - min_soc)
            data_dict["Activity"].append(current_exercise)
            data_dict["Average Power"].append(round(sum(power_list)/len(power_list), 2))
            data_dict["Id"].append(id)
            data_dict["Unique Data Identifier"].append(unique_identifier)
            unique_identifier = unique_identifier + 1

            # Reset all the values
            max_soc, min_soc = soc, soc
            power_list.clear()
            max_time, min_time = time, time
        
        # SOC
        if soc >= max_soc:
            max_soc = soc
        if soc <= min_soc:
            min_soc = soc

        # TIME
        if time >= max_time:
            max_time = time
        if time <= min_time:
            min_time = time
        
        # Change current exercise
        current_exercise = new_exercise

In [4]:
# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034]

# Fetch data for specified flight IDs
# Remove NA values from each dataframe in the list and put it through the data parser
for ids in flight_ids:
    frame = db_connect.connect_flight_for_ml_data_prescription(ids)
    frame = frame[frame["activity"] != "NA"].reset_index()
    data_parser(frame)

# Concatenate data frames and shuffle the data
# all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)
all_data = pd.DataFrame(data_dict)

In [5]:
# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.3, random_state=42)

In [6]:
all_data.head()

Unnamed: 0,Time Delta,SOC Delta,Activity,Average Power,Id,Unique Data Identifier
0,1.0,3,takeoff,53.39,4620,0
1,0.9,4,climb,51.92,4620,1
2,1.3,2,cruise,15.35,4620,2
3,1.2,0,landing,0.08,4620,3
4,0.7,3,takeoff,53.31,4620,4


In [7]:
print(f"Length train_x = {len(train_data)} \n Length train_y = {len(test_data)}")

Length train_x = 67 
 Length train_y = 29


One-Hot-Encoding of the Operations columns

In [8]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(df, col, pre):
  encoded = pd.get_dummies(df[col], prefix=pre)
  for column in encoded:
    encoded = encoded.rename(columns={column: col + "_" + column})
  encoded['Unique Data Identifier'] = df['Unique Data Identifier']
  return encoded

In [9]:
# Encode Train data
train_encoded = one_hot(train_data, "Activity", 'is')
final_train_x = pd.merge(train_data, train_encoded, on=["Unique Data Identifier"])
final_train_y = final_train_x["SOC Delta"].to_numpy()
final_train_x = final_train_x.drop(columns=["SOC Delta", "Id", "Unique Data Identifier", "Activity"])

# Encode Test data
test_encoded = one_hot(test_data, "Activity", 'is')
final_test_x = pd.merge(test_data, test_encoded, on=["Unique Data Identifier"])
final_test_y = final_test_x["SOC Delta"].to_numpy()
final_test_x = final_test_x.drop(columns=["SOC Delta", "Id", "Unique Data Identifier", "Activity"])

In [10]:
final_train_x

Unnamed: 0,Time Delta,Average Power,Activity_is_climb,Activity_is_cruise,Activity_is_descent,Activity_is_landing,Activity_is_power off stall,Activity_is_power on stall,Activity_is_slow flight,Activity_is_steep turns,Activity_is_takeoff
0,0.7,29.97,False,False,False,False,False,False,False,True,False
1,2.3,43.48,True,False,False,False,False,False,False,False,False
2,1.1,44.94,True,False,False,False,False,False,False,False,False
3,0.8,0.02,False,False,True,False,False,False,False,False,False
4,1.0,44.63,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
62,0.7,44.04,True,False,False,False,False,False,False,False,False
63,3.4,17.77,False,False,False,False,False,False,True,False,False
64,1.5,20.41,False,True,False,False,False,False,False,False,False
65,0.7,53.09,False,False,False,False,False,False,False,False,True


In [11]:
print(f"Length test_encoded = {len(test_encoded)} \n Length train_encoded = {len(train_encoded)}")

Length test_encoded = 29 
 Length train_encoded = 67


In [12]:
print(f"Length train_x = {len(final_train_x)} \n Length train_y = {len(final_train_y)}")

Length train_x = 67 
 Length train_y = 67


In [13]:
print(f"Length test_x = {len(final_test_x)} \n Length test_y = {len(final_test_y)}")

Length test_x = 29 
 Length test_y = 29


Machine Learning Model Implementation

In [15]:
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Standardize the features (optional but recommended for neural networks)
scaler = StandardScaler()
final_train_x = scaler.fit_transform(final_train_x)
final_test_x = scaler.transform(final_test_x)


# Create a Sequential model (a linear stack of layers)
model = Sequential()

# Add Dense layers to the model
# You can customize the number of units, activation function, and other parameters
model.add(Dense(units=64, activation='relu', input_dim=final_train_x.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Use 'categorical_crossentropy' for multi-class classification

# Train the models
model.fit(final_train_x, final_train_y, epochs=10, batch_size=32, validation_data=(final_test_x, final_test_y))

# Evaluate the model
y_pred = model.predict_classes(final_test_x)
accuracy = accuracy_score(final_test_y, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display additional evaluation metrics
print("Classification Report:\n", classification_report(final_test_y, y_pred))
print("Confusion Matrix:\n", confusion_matrix(final_test_y, y_pred))




AttributeError: partially initialized module 'keras.src' has no attribute 'utils' (most likely due to a circular import)

In [None]:
# print model score
print(model.score(final_test_x, final_test_y))

In [None]:
coeff = pd.DataFrame(list(zip(model.feature_names_in_, model.coef_)), columns = ['Feature', 'Weight'])
coeff.sort_values('Weight')