In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
from os import listdir, getenv
import sqlalchemy as sa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [3]:
# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034]

# Fetch data for specified flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames and shuffle the data
all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42).dropna()

In [4]:
#multi-nomial encoding 
le = LabelEncoder()
all_data['y'] = le.fit_transform(all_data['exercise'])
all_data.head()

Unnamed: 0,id,time,soc,cell_temperature,motor_rpm,motor_power,motor_temperature,indicated_air_speed,pressure_altitude,ground_speed,outside_air_temperature,inverter_temperature,pitch,roll,exercise,y
1214,4929,24.28,66.5,17.5,1961.583333,23.0,43.742027,75.025423,783.050156,79.4,14.0,33.38094,0.017464,0.466837,cruise,2
1920,4940,38.4,42.5,23.0,1103.666667,0.0,41.338358,74.469576,431.576482,53.45,22.5,33.106204,-3.551373,9.770128,,0
4214,4940,39.64,42.5,23.0,351.083333,0.0,39.889447,35.253217,317.79422,18.416667,23.0,33.762506,2.064351,-1.367186,landing,4
583,5021,11.66,92.5,18.0,2282.5,47.166667,44.556083,75.003683,549.130814,45.983333,11.0,35.388976,7.095505,-14.472788,climb,1
2790,5034,0.82,100.0,18.0,0.0,0.0,15.123765,0.0,323.046374,0.0,16.0,19.881319,2.607737,0.512978,,0


In [5]:
print("Encoded classes:", le.classes_)
labels=['NA', 'climb', 'cruise', 'descent', 'landing', 'power off stall',
 'power on stall', 'slow flight', 'steep turns', 'takeoff']
print("Encoded labels:", le.transform(labels))

Encoded classes: ['NA' 'climb' 'cruise' 'descent' 'landing' 'power off stall'
 'power on stall' 'slow flight' 'steep turns' 'takeoff']
Encoded labels: [0 1 2 3 4 5 6 7 8 9]


In [6]:
# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.1, random_state=42)

# Set up train data
train_y = train_data["y"].to_numpy()
train_x = train_data.drop(columns=["exercise","id", "y"])

# Set up test data
test_y = test_data["y"].to_numpy()
test_x = test_data.drop(columns=["exercise", "id","y"])

In [7]:
print(f"Length train_x = {len(train_x)} \n Length train_y = {len(train_y)}")
print(f"Length test_x = {len(test_x)} \n Length test_y = {len(test_y)}")

Length train_x = 26532 
 Length train_y = 26532
Length test_x = 2948 
 Length test_y = 2948


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [9]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Make predictions on the test set
predictions = model.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display additional classification metrics
print("Classification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1071
           1       0.82      0.70      0.76       296
           2       0.65      0.91      0.76       723
           3       0.40      0.20      0.27       219
           4       0.70      0.72      0.71       297
           5       0.67      0.67      0.67        27
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00        72
           8       0.59      0.18      0.27       112
           9       0.89      0.88      0.89       124

    accuracy                           0.75      2948
   macro avg       0.56      0.51      0.52      2948
weighted avg       0.74      0.75      0.73      2948



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Saving & Loading The Model

In [11]:
import joblib

# Assuming you've trained and named your Logistic Regression model 'logreg_model'

# Save the model to a file
model_filename = 'ML_model_outputs/multinomial_logreg_model.joblib'
joblib.dump(model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to ML_model_outputs/multinomial_logreg_model.joblib


# Labelling all flights with the model created above

In [12]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [13]:
# import model to label new data
import joblib
model_filename = 'ML_model_outputs/multinomial_logreg_model.joblib'
model = joblib.load(model_filename)

In [14]:
# query the list of flight ids excluding these six which are already labelled
flight_ids = db_connect.get_flight_ids()
flight_ids = flight_ids['id'].to_list()
manually_labelled_ids = [4620, 4929, 4940, 5019, 5021, 5034]

# remove the manually_labelled_ids from the list 
flight_ids = [id for id in flight_ids if id not in manually_labelled_ids]

In [15]:
# Fetch data for specified flight IDs # first id in list is 5367
data_frames = [db_connect.get_flightdata_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames
x = pd.concat(data_frames, axis=0).dropna()

In [16]:
# drop id column for model prediction
id_column = x[['id']]
x = x.drop(columns=["id"])

In [17]:
# Make predictions on the test set
predictions = model.predict(x)

In [18]:
# insert the predicted values back into the x dataframe
x['activity'] = predictions
x['flight_id'] = id_column

In [19]:
# replace values in activity column with their string activity mapping
labels = ['NA', 'climb', 'cruise', 'descent', 'landing', 'power off stall',
          'power on stall', 'slow flight', 'steep turns', 'takeoff']
x['activity'] = x['activity'].map(lambda x: labels[x])

In [20]:
# trim all columns except for the ones in flight_activities table
flight_activities_data = x[['flight_id', 'time', 'activity']]
flight_activities_data = flight_activities_data.rename(columns={"time": "time_min"})
flight_activities_data.head()

# push the updated data to the flight_activities table
engine = db_connect.connect()
flight_activities_data.to_sql('flight_activities', engine, if_exists='append', index=False)
engine.dispose()