In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
from os import listdir, getenv
import sqlalchemy as sa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [55]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [56]:
# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034, 4842, 4868, 4925, 4978, 5362, 5116, 4636]

# Fetch data for specified flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames and shuffle the data
all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)

In [57]:
# Count and output rows with NaN, NULL, or NAT values
rows_with_missing_values = all_data[all_data.isnull().any(axis=1)]

if len(rows_with_missing_values) < 50:
    # If the number of rows is less than 50, delete all the rows
    num_deleted_rows = len(all_data)
    all_data = all_data.dropna()
    num_deleted_rows -= len(all_data)
    print(f"All rows with missing values deleted. \nNumber of deleted rows: {num_deleted_rows}")
else:
    # Print information about rows with missing values
    print("Rows with missing values:")
    print(rows_with_missing_values)
    print("Number of rows with missing values:", len(rows_with_missing_values))

All rows with missing values deleted. 
Number of deleted rows: 9


In [58]:
#multi-nomial encoding 
le = LabelEncoder()
all_data['y'] = le.fit_transform(all_data['exercise'])
print(all_data.head())

        id   time   soc    motor_rpm     voltage  motor_power  \
498   5021   4.98  99.0   873.750000  394.091667     3.166667   
4446  4636  29.64  70.0  1013.083333  375.983333     0.000000   
3447  4842  34.46  56.5  1333.166667  364.016667     0.000000   
4046  4620  40.46  46.5  1093.083333  359.070833     0.000000   
4707  5034  47.06  37.5  1178.833333  352.129167     0.000000   

      pressure_altitude  ground_speed     pitch       roll exercise  \
498          304.041400     10.000000  2.999971  -0.905222       NA   
4446         348.601364     58.850000 -2.520616   1.510147  landing   
3447         623.244909     72.908333  0.104003  -0.929798  landing   
4046         355.233358     70.400000 -1.486922  11.933450  landing   
4707         502.636706     68.400000 -4.504937 -15.716479  landing   

            ias   soh  stall_warn_active      torque     heading          qng  \
498    0.000000  92.5                0.0  131.000000   67.050858  1019.150024   
4446  56.895023  97.

In [59]:
# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.3, random_state=42)

# Set up train data
train_y = train_data["y"].to_numpy()
train_x = train_data.drop(columns=["exercise","id", "y"])

# Set up test data
test_y = test_data["y"].to_numpy()
test_x = test_data.drop(columns=["exercise", "id","y"])

In [60]:
print(f"Length train_x = {len(train_x)} \n Length train_y = {len(train_y)}")
print(f"Length test_x = {len(test_x)} \n Length test_y = {len(test_y)}")

Length train_x = 40133 
 Length train_y = 40133
Length test_x = 17200 
 Length test_y = 17200


In [61]:
train_x

Unnamed: 0,time,soc,motor_rpm,voltage,motor_power,pressure_altitude,ground_speed,pitch,roll,ias,soh,stall_warn_active,torque,heading,qng
1095,10.94,100.000000,777.666667,396.691667,2.000000,306.593015,7.841667,2.754949,-0.733044,0.000000,97.0,0.0,98.833333,121.265028,1017.099976
2578,25.78,75.333333,2272.416667,360.708333,46.333333,465.191798,73.533333,7.219226,15.309145,72.961285,97.5,0.0,741.916667,304.339076,1024.229980
2919,29.18,58.500000,1854.916667,361.870833,14.333333,725.304535,90.000000,-1.931712,-2.865744,85.319283,91.5,0.0,282.833333,352.354138,1021.169983
3169,31.68,60.500000,1942.916667,360.608333,22.666667,615.241837,52.800000,2.993648,10.376646,75.077793,97.0,0.0,430.000000,96.917704,1017.099976
1171,11.70,99.000000,687.250000,396.500000,2.000000,306.702573,10.366667,2.097166,0.092034,0.000000,97.0,0.0,81.000000,126.898046,1017.099976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742,17.42,73.500000,2005.818182,372.177273,19.000000,1091.359952,80.400000,0.687564,0.219287,88.477906,90.0,0.0,354.000000,350.829714,1025.250000
5322,53.22,35.500000,745.000000,350.679167,2.000000,311.336670,12.400000,2.350095,-0.243921,15.935500,92.5,0.0,98.000000,243.830684,1019.489990
2778,27.78,68.000000,1236.333333,373.066667,0.000000,575.854925,54.850000,1.118294,7.262523,70.397090,97.0,0.0,4.750000,123.608419,1017.099976
5079,50.78,35.500000,0.000000,351.950000,0.000000,312.436854,0.000000,2.803851,0.575909,0.000000,92.5,0.0,2.000000,358.101319,1019.489990


In [62]:
train_y

array([1, 2, 4, ..., 3, 1, 5])

In [63]:
test_x

Unnamed: 0,time,soc,motor_rpm,voltage,motor_power,pressure_altitude,ground_speed,pitch,roll,ias,soh,stall_warn_active,torque,heading,qng
3025,30.24,57.500000,1343.000000,367.587500,0.000000,595.235723,85.300000,-4.316911,-0.344227,77.517000,91.5,0.0,1.000000,328.695365,1021.169983
4120,41.20,45.708333,2371.833333,339.783333,54.916667,321.966888,75.166667,5.332282,1.988865,72.250975,97.0,0.0,835.083333,309.107452,1017.099976
743,7.42,99.000000,249.916667,395.125000,0.000000,265.162165,12.875000,6.336984,-0.644123,0.000000,91.5,0.0,30.583333,355.294452,1021.169983
1369,13.68,98.000000,125.750000,394.420833,0.000000,316.042686,0.000000,3.654774,-1.529262,0.000000,85.5,0.0,7.000000,0.041950,1009.969971
1130,22.60,65.500000,2382.250000,354.570833,46.833333,1009.420705,81.466667,5.890179,-5.889064,78.173767,95.5,0.0,714.250000,180.186327,1019.469971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,4.36,99.000000,59.250000,397.300000,0.000000,270.639303,0.000000,1.957933,-1.677783,0.000000,91.5,0.0,2.000000,79.759343,1021.169983
30,0.60,100.000000,0.000000,396.600000,0.000000,339.653725,0.000000,2.415114,-0.769009,0.000000,90.5,0.0,0.000000,1.048787,1026.270020
768,7.68,97.000000,2435.500000,361.058333,68.166667,375.527051,71.000000,11.224504,-0.516619,65.888892,95.5,0.0,977.750000,65.417192,1018.809998
1821,36.42,43.500000,2173.333333,342.100000,30.666667,617.937770,76.800000,0.695694,8.019137,81.726079,95.5,0.0,519.000000,108.401785,1019.469971


In [64]:
test_y

array([ 4, 13,  1, ..., 13,  3,  5])

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Scale the data
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

# Create and train the Logistic Regression model
clf = LogisticRegression(penalty='l2', multi_class='multinomial', solver='saga', max_iter=2000)
clf.fit(train_x_scaled, train_y)

In [66]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

# Print the results
print('Training accuracy:', clf.score(train_x_scaled, train_y))
print('Test accuracy:', clf.score(test_x_scaled, test_y))

# Cross-validation
cv_scores = cross_val_score(clf, train_x_scaled, train_y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
print('Cross-Validation Scores:', cv_scores)
print('Mean KFold Cross-Validation Accuracy:', cv_scores.mean())

# Classification Report
test_predictions = clf.predict(test_x_scaled)
print('\nClassification Report:')
print(classification_report(test_y, test_predictions))



Training accuracy: 0.7575561258814442
Test accuracy: 0.7530813953488372
Cross-Validation Scores: [0.75657157 0.75756821 0.75432914 0.75953152 0.75355096]
Mean KFold Cross-Validation Accuracy: 0.7563102798674954

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.77      0.84       109
           1       0.64      0.53      0.58      3195
           2       0.81      0.83      0.82      1743
           3       0.80      0.90      0.84      3895
           4       0.70      0.54      0.61       789
           5       0.88      0.90      0.89      2638
           6       0.76      0.64      0.70       750
           7       0.68      0.64      0.66       100
           8       0.62      0.49      0.55        37
           9       0.64      0.77      0.70      2214
          10       0.58      0.51      0.54       365
          11       1.00      0.47      0.64        15
          12       0.61      0.38      0.47       344
        

In [67]:
# Assuming 'le' is your LabelEncoder instance
inverse_encoded_classes = le.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

# Display the actual values of the encoded labels
print("Actual values of encoded labels:")
print(inverse_encoded_classes)

Actual values of encoded labels:
['HASEL' 'NA' 'climb' 'cruise' 'descent' 'landing' 'post-flight'
 'power off stall' 'power on stall' 'pre-flight' 'slow flight'
 'steep turn' 'steep turns' 'takeoff']


## Saving & Loading The Model

In [68]:
import joblib

# Assuming you've trained and named your Logistic Regression model 'logreg_model'

# Save the model to a file
model_filename = 'ML_model_outputs/multinomial_logreg_model.joblib'
joblib.dump(clf, model_filename)

print(f"Model saved to {model_filename}")


Model saved to ML_model_outputs/multinomial_logreg_model.joblib


In [69]:
loaded_model = joblib.load(model_filename)