In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
from os import listdir, getenv
import sqlalchemy as sa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [3]:
# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034, 4636, 4842, 4868, 4925, 4978, 5362, 5116]

# Fetch data for specified flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames and shuffle the data
all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)

In [4]:
# Count and output rows with NaN, NULL, or NAT values
rows_with_missing_values = all_data[all_data.isnull().any(axis=1)]

if len(rows_with_missing_values) < 50:
    # If the number of rows is less than 50, delete all the rows
    num_deleted_rows = len(all_data)
    all_data = all_data.dropna()
    num_deleted_rows -= len(all_data)
    print(f"All rows with missing values deleted. \nNumber of deleted rows: {num_deleted_rows}")
else:
    # Print information about rows with missing values
    print("Rows with missing values:")
    print(rows_with_missing_values)
    print("Number of rows with missing values:", len(rows_with_missing_values))

All rows with missing values deleted. 
Number of deleted rows: 9


In [5]:
#multi-nomial encoding 
le = LabelEncoder()
all_data['y'] = le.fit_transform(all_data['exercise'])
print(all_data.head())

        id   time    soc    motor_rpm     voltage  motor_power  \
498   5021   4.98   99.0   873.750000  394.091667     3.166667   
370   5362   7.40  100.0   949.083333  395.050000     4.083333   
3447  4636  22.98   79.5  1199.916667  384.133333     0.000000   
4046  4620  40.46   46.5  1093.083333  359.070833     0.000000   
4707  5034  47.06   37.5  1178.833333  352.129167     0.000000   

      pressure_altitude  ground_speed     pitch       roll    exercise  \
498          304.041400     10.000000  2.999971  -0.905222          NA   
370          168.743037      4.158333  4.043552  -0.620736  pre-flight   
3447         504.425913      0.000000 -2.209355  -8.617147     landing   
4046         355.233358     70.400000 -1.486922  11.933450     landing   
4707         502.636706     68.400000 -4.504937 -15.716479     landing   

            ias   soh  stall_warn_active      torque     heading          qng  \
498    0.000000  92.5                0.0  131.000000   67.050858  1019.150024

In [6]:
# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.3, random_state=42)

# Set up train data
train_y = train_data["y"].to_numpy()
train_x = train_data.drop(columns=["exercise","id", "y"])

# Set up test data
test_y = test_data["y"].to_numpy()
test_x = test_data.drop(columns=["exercise", "id","y"])

In [7]:
print(f"Length train_x = {len(train_x)} \n Length train_y = {len(train_y)}")
print(f"Length test_x = {len(test_x)} \n Length test_y = {len(test_y)}")

Length train_x = 40133 
 Length train_y = 40133
Length test_x = 17200 
 Length test_y = 17200


In [8]:
train_x

Unnamed: 0,time,soc,motor_rpm,voltage,motor_power,pressure_altitude,ground_speed,pitch,roll,ias,soh,stall_warn_active,torque,heading,qng
1095,10.94,100.0,777.666667,396.691667,2.000000,306.593015,7.841667,2.754949,-0.733044,0.000000,97.0,0.0,98.833333,121.265028,1017.099976
724,14.48,83.0,2333.333333,365.991667,44.833333,876.171677,77.700000,6.828285,-3.524173,76.308932,95.5,0.0,708.000000,113.587958,1019.469971
5742,57.42,34.5,0.000000,351.150000,0.000000,308.010104,0.000000,3.127305,-0.330953,0.000000,92.5,0.0,2.000000,0.341260,1019.489990
4762,31.74,64.5,2373.083333,354.045833,49.333333,578.309224,81.725000,5.780713,-2.433567,76.085411,97.0,0.0,761.000000,9.962976,1008.260010
1171,11.70,99.0,687.250000,396.500000,2.000000,306.702573,10.366667,2.097166,0.092034,0.000000,97.0,0.0,81.000000,126.898046,1017.099976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742,17.42,73.5,2005.818182,372.177273,19.000000,1091.359952,80.400000,0.687564,0.219287,88.477906,90.0,0.0,354.000000,350.829714,1025.250000
5322,53.22,35.5,745.000000,350.679167,2.000000,311.336670,12.400000,2.350095,-0.243921,15.935500,92.5,0.0,98.000000,243.830684,1019.489990
2778,27.78,68.0,1236.333333,373.066667,0.000000,575.854925,54.850000,1.118294,7.262523,70.397090,97.0,0.0,4.750000,123.608419,1017.099976
3773,37.72,53.5,2204.909091,352.622727,27.454545,796.579612,94.018182,0.836357,1.408471,91.112127,92.5,0.0,456.000000,297.672531,1017.450012


In [9]:
train_y

array([1, 2, 1, ..., 3, 3, 4])

In [10]:
test_x

Unnamed: 0,time,soc,motor_rpm,voltage,motor_power,pressure_altitude,ground_speed,pitch,roll,ias,soh,stall_warn_active,torque,heading,qng
3025,30.24,57.5,1343.000000,367.587500,0.000000,595.235723,85.300000,-4.316911,-0.344227,77.517000,91.5,0.0,1.000000,328.695365,1021.169983
3420,22.80,79.5,1164.500000,383.841667,0.000000,538.314112,0.000000,-3.715154,-13.229540,63.960253,97.0,0.0,5.000000,274.279171,1008.260010
743,7.42,99.0,249.916667,395.125000,0.000000,265.162165,12.875000,6.336984,-0.644123,0.000000,91.5,0.0,30.583333,355.294452,1021.169983
2900,29.00,67.5,1070.416667,373.337500,0.000000,545.729645,53.900000,-2.917674,14.574056,59.357223,97.5,0.0,0.000000,114.131770,1024.229980
6494,43.28,42.5,1932.250000,347.162500,19.583333,609.588018,80.133333,0.921897,-3.105788,77.850981,97.0,0.0,372.000000,299.706055,1008.260010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,4.36,99.0,59.250000,397.300000,0.000000,270.639303,0.000000,1.957933,-1.677783,0.000000,91.5,0.0,2.000000,79.759343,1021.169983
7616,50.76,29.5,1037.500000,345.241667,0.000000,474.125600,56.775000,-3.205537,-3.603134,58.105361,97.0,0.0,8.916667,239.148015,1008.260010
4679,46.78,24.5,370.000000,341.775000,0.000000,303.294825,11.725000,2.478494,-0.834605,0.000000,95.5,0.0,0.000000,153.296165,1018.809998
7185,47.90,34.0,2532.000000,325.387500,63.000000,632.627909,71.475000,6.709490,-6.089270,78.267352,97.0,0.0,916.500000,45.900908,1008.260010


In [11]:
test_y

array([4, 5, 1, ..., 1, 2, 9])

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

# Create and train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(train_x_scaled, train_y)


In [13]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

# Print the results
print('Training accuracy:', clf.score(train_x_scaled, train_y))
print('Test accuracy:', clf.score(test_x_scaled, test_y))

# Cross-validation
cv_scores = cross_val_score(clf, train_x_scaled, train_y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
print('Cross-Validation Scores:', cv_scores)
print('Mean KFold Cross-Validation Accuracy:', cv_scores.mean())

# Classification Report
test_predictions = clf.predict(test_x_scaled)
print('\nClassification Report:')
print(classification_report(test_y, test_predictions))



Training accuracy: 0.9998255799466773
Test accuracy: 0.9948837209302326
Cross-Validation Scores: [0.99352186 0.9938956  0.99352186 0.99377025 0.99401944]
Mean KFold Cross-Validation Accuracy: 0.9937458026581171

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       112
           1       0.99      0.99      0.99      3164
           2       1.00      0.99      1.00      1758
           3       1.00      1.00      1.00      3901
           4       0.99      0.99      0.99       793
           5       1.00      1.00      1.00      2613
           6       1.00      1.00      1.00       742
           7       0.95      0.99      0.97       116
           8       0.93      0.93      0.93        29
           9       1.00      1.00      1.00      2216
          10       0.99      0.98      0.98       378
          11       0.93      1.00      0.97        14
          12       1.00      0.99      0.99       355
        

In [14]:
# Assuming 'le' is your LabelEncoder instance
inverse_encoded_classes = le.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

# Display the actual values of the encoded labels
print("Actual values of encoded labels:")
print(inverse_encoded_classes)

Actual values of encoded labels:
['HASEL' 'NA' 'climb' 'cruise' 'descent' 'landing' 'post-flight'
 'power off stall' 'power on stall' 'pre-flight' 'slow flight'
 'steep turn' 'steep turns' 'takeoff']


## Saving & Loading The Model

In [15]:
import joblib

# Save the model to a file
model_filename = 'ML_model_outputs/label_randomforest_model.joblib'
joblib.dump(clf, model_filename)

print(f"Model saved to {model_filename}")


Model saved to ML_model_outputs/multinomial_logreg_model.joblib


In [16]:
loaded_model = joblib.load(model_filename)