In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
from os import listdir, getenv
import sqlalchemy as sa
from sklearn.model_selection import train_test_split


In [2]:
# Import the querying module
from flight_querying import query_flights
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [3]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(df, col, pre):
  encoded = pd.get_dummies(df[col], prefix=pre)
  for column in encoded:
    encoded = encoded.rename(columns={column: col + "_" + column})
  encoded['time'] = df['time']
  return encoded

In [4]:
from sklearn.preprocessing import LabelEncoder

# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034]

# Fetch data for specified flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames and shuffle the data
all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)


le = LabelEncoder()
y = le.fit_transform(all_data['exercise'])


# # Encode Train data
# encoded_exercises = one_hot(all_data, "exercise", 'is')
# all_data = pd.merge(all_data, encoded_exercises, on=["time"])

# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.5, random_state=42)

# Set up train data
train_y = train_data["exercise"].to_numpy()
train_x = train_data.drop(columns=["exercise","id"])

# Set up test data
test_y = test_data["exercise"].to_numpy()
test_x = test_data.drop(columns=["exercise", "id"])

In [5]:
print(f"Length train_x = {len(train_x)} \n Length train_y = {len(train_y)}")
print(f"Length test_x = {len(test_x)} \n Length test_y = {len(test_y)}")

Length train_x = 88034 
 Length train_y = 88034
Length test_x = 88035 
 Length test_y = 88035


In [6]:
train_x

Unnamed: 0,time,soc,cell_temperature,motor_rpm,motor_power,motor_temperature,indicated_air_speed,pressure_altitude,ground_speed,outside_air_temperature,inverter_temperature,pitch,roll
11328,18.941536,71,26,1170,0,55.990517,73.456477,1064.456299,71.1,20.0,35.395370,3.194920,-0.420180
29142,48.778612,35,12,0,0,28.022661,0.000000,312.330933,1.4,11.5,28.439835,2.829932,0.566098
394,0.702049,100,18,0,0,15.110507,0.000000,323.038788,0.0,16.0,19.757511,2.603294,0.496127
5719,9.546005,97,23,1173,7,31.950922,20.799833,314.835693,0.0,26.5,36.270409,4.369888,-0.269310
24572,41.143868,30,15,2212,47,45.553085,64.535853,465.452820,56.2,9.0,33.728527,10.099205,-2.519308
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16770,28.050174,55,24,2002,30,51.174332,60.076112,1041.126953,56.7,19.0,37.341141,8.963028,5.702071
27359,45.801679,24,15,911,4,36.045177,28.383542,305.340088,29.6,8.5,26.639986,1.607027,-0.711601
15210,25.483659,65,15,1192,0,38.398270,67.290296,913.715210,69.0,8.0,26.348877,0.111682,1.196560
21544,36.081695,42,13,1212,4,30.368723,55.789655,334.411774,56.4,8.5,24.150352,-0.575589,0.760856


In [7]:
train_y

array(['power off stall', 'NA', 'NA', ..., 'descent', 'landing',
       'descent'], dtype=object)

In [8]:
test_x

Unnamed: 0,time,soc,cell_temperature,motor_rpm,motor_power,motor_temperature,indicated_air_speed,pressure_altitude,ground_speed,outside_air_temperature,inverter_temperature,pitch,roll
26643,44.594837,42,17,2269,47,50.245621,71.349741,558.984497,62.5,13.0,36.259029,9.137207,-11.866501
6693,11.252713,86,18,1944,15,48.322350,93.548783,797.952881,90.4,13.5,34.349075,-4.697657,-1.730736
14120,23.614891,63,25,1794,19,52.025509,69.271434,1003.969177,61.2,20.0,36.626617,4.640319,-9.478028
13019,21.828115,64,16,1968,14,43.627144,89.614788,974.033386,71.6,6.0,29.081947,-3.466079,0.295602
15663,26.264315,63,18,1935,19,43.925282,83.062707,770.048950,85.5,14.0,32.955479,-0.364615,0.711140
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11477,19.249717,68,16,1273,0,46.733337,72.397562,1057.953369,96.1,6.0,29.068392,-1.226546,1.611414
7603,12.776225,83,18,1881,20,45.908138,72.049480,782.161255,74.7,13.5,33.726181,1.692268,-0.576937
1237,2.126370,99,15,0,0,12.843011,0.000000,281.591583,0.0,13.0,16.394247,1.857302,-0.208379
23866,39.963802,33,15,1230,5,37.421116,55.967126,343.133942,56.3,9.0,27.995983,-0.523597,-6.504545


In [9]:
test_y

array(['climb', 'cruise', 'cruise', ..., 'NA', 'landing', 'climb'],
      dtype=object)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Assuming you have already prepared your training data (numeric_train_x, train_y) and testing data (test_x, test_y)
#X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [12]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Make predictions on the test set
predictions = model.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display additional classification metrics
print("Classification Report:")
print(classification_report(test_y, predictions))

Accuracy: 0.76
Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

             NA       0.90      0.87      0.89     31679
          climb       0.78      0.77      0.77      8797
         cruise       0.66      0.91      0.76     21272
        descent       0.55      0.32      0.40      6898
        landing       0.70      0.73      0.72      9104
power off stall       0.69      0.77      0.72       896
 power on stall       0.00      0.00      0.00       308
    slow flight       0.00      0.00      0.00      2206
    steep turns       0.65      0.23      0.34      3220
        takeoff       0.88      0.81      0.84      3655

       accuracy                           0.76     88035
      macro avg       0.58      0.54      0.55     88035
   weighted avg       0.75      0.76      0.74     88035



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
import joblib

# Assuming you've trained and named your Logistic Regression model 'logreg_model'

# Save the model to a file
model_filename = 'multinomial_logreg_model.joblib'
joblib.dump(model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to multinomial_logreg_model.joblib


In [15]:
loaded_model = joblib.load(model_filename)

In [16]:
# Flight IDs to include
flight_ids = db_connect.get_ids()
# Extract 'flight_id' column and convert it to a Python list
flight_ids = flight_ids['flight_id'].tolist()
print(flight_ids)

[4785, 4790, 4927, 4994, 5194, 4919, 5039, 4783, 4853, 4975, 4857, 5034, 4766, 4792, 4860, 4978, 4850, 5025, 5096, 4992, 5367, 4669, 4780, 4901, 4999, 5127, 4636, 4923, 4795, 4909, 4842, 4906, 5205, 4915, 5117, 4620, 4938, 4871, 4622, 4910, 4931, 4940, 5074, 4862, 4633, 4908]


In [17]:
# Fetch data for all flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label_predictions(flight_id)for flight_id in flight_ids]

In [18]:
new_data = pd.concat(data_frames, ignore_index=True)
print(new_data.head())

   time    id exercise  soc  cell_temperature  motor_rpm  motor_power  \
0   NaN  None     None    0                 0          0            0   
1   NaN  None     None  100                 0          0            0   
2   NaN  None     None  100                 0          0            0   
3   NaN  None     None  100                 0          0            0   
4   NaN  None     None  100                17          0            0   

   motor_temperature  indicated_air_speed  pressure_altitude  ground_speed  \
0                0.0                  0.0                0.0           0.0   
1                0.0                  0.0                0.0           0.0   
2                0.0                  0.0                0.0           0.0   
3                0.0                  0.0                0.0           0.0   
4                0.0                  0.0                0.0           0.0   

   outside_air_temperature  inverter_temperature  pitch  roll  
0                      0.0  

In [19]:
unique_ids = new_data['id'].unique()
print(unique_ids)

[None 5034 4620 4940]


In [20]:
# Replace 'TBD' with NaN in the 'exercise' column
# new_data['exercise'].replace('TBD', np.nan, inplace=True)
# new_data.dropna(subset=['exercise'], inplace=True)
print(all_data.tail())

            time    id     exercise  soc  cell_temperature  motor_rpm  \
11024  18.484516  5021       cruise   76                17       1558   
23996  40.180919  5019      landing   33                15        938   
23077  38.635601  5021       cruise   41                14       1887   
3641    6.129829  5034           NA   97                18        473   
13103  21.961361  5021  slow flight   71                16       1746   

       motor_power  motor_temperature  indicated_air_speed  pressure_altitude  \
11024           12          42.908356            62.738670         926.222717   
23996            0          36.616276            55.210531         319.026276   
23077           19          38.492405            76.084989        1075.602783   
3641             0          25.043539             0.000000         306.998199   
13103           21          40.090439            54.212506         945.704651   

       ground_speed  outside_air_temperature  inverter_temperature      pi

In [21]:
# Print Features (X) used during training
print("Features (X):", train_x.columns)

Features (X): Index(['time', 'soc', 'cell_temperature', 'motor_rpm', 'motor_power',
       'motor_temperature', 'indicated_air_speed', 'pressure_altitude',
       'ground_speed', 'outside_air_temperature', 'inverter_temperature',
       'pitch', 'roll'],
      dtype='object')


In [22]:
# new_data.drop(columns=['exercise','id'], inplace=True)
print("Encoded Features:", new_data.columns)
print(new_data.head())

Encoded Features: Index(['time', 'id', 'exercise', 'soc', 'cell_temperature', 'motor_rpm',
       'motor_power', 'motor_temperature', 'indicated_air_speed',
       'pressure_altitude', 'ground_speed', 'outside_air_temperature',
       'inverter_temperature', 'pitch', 'roll'],
      dtype='object')
   time    id exercise  soc  cell_temperature  motor_rpm  motor_power  \
0   NaN  None     None    0                 0          0            0   
1   NaN  None     None  100                 0          0            0   
2   NaN  None     None  100                 0          0            0   
3   NaN  None     None  100                 0          0            0   
4   NaN  None     None  100                17          0            0   

   motor_temperature  indicated_air_speed  pressure_altitude  ground_speed  \
0                0.0                  0.0                0.0           0.0   
1                0.0                  0.0                0.0           0.0   
2                0.0        

In [23]:
# Encode target variable
le = LabelEncoder()
y_new_data = le.fit_transform(new_data['exercise'])

print(y_new_data)

[9 9 9 ... 9 9 9]


In [24]:
# Predict with multinomial logistic regression model
predictions = model.predict(new_data.drop(columns=['exercise','id']))
print(predictions)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Replace the values in the 'exercise' column with the predicted values
new_data['exercise'] = predictions

# Print the updated 'new_data' table
print(new_data.tail())

            time    id exercise soc cell_temperature motor_rpm motor_power  \
90787  44.629874  4940       NA  41               23         0           0   
90788  44.631800  4940       NA  41               23         0           0   
90789  44.633208  4940       NA  41               23         0           0   
90790  44.635448  4940       NA  41               23         0           0   
90791  44.636539  4940       NA  41               23         0           0   

       motor_temperature  indicated_air_speed  pressure_altitude  \
90787          40.495037            34.067734         321.677246   
90788          40.495037            34.067734         321.677246   
90789          40.495037            34.067734         321.677246   
90790          40.495037            34.067734         321.677246   
90791          40.495037            34.067734         321.677246   

       ground_speed  outside_air_temperature  inverter_temperature    pitch  \
90787           0.0                     24.

In [None]:
# Create a temporary DataFrame with IDs and predicted values
temp_predictions = pd.DataFrame({'flight_id': new_data['id'], 'activity': predictions, 'time_min': new_data['time']})

In [None]:
engine = db_connect.connect()

In [None]:
temp_predictions.to_sql('temp_predictions', con=engine, index=False, if_exists='replace')


792

In [None]:
print(temp_predictions.head())

  flight_id activity  time_min
0      5034       NA  0.000000
1      5034       NA  0.031966
2      5034       NA  0.033909
3      5034       NA  0.035297
4      5034       NA  0.036966


In [None]:
from sqlalchemy import create_engine, text
update_query = """
    UPDATE flight_activities
    SET activity = temp_predictions.activity
    FROM temp_predictions
    WHERE flight_activities.flight_id = temp_predictions.flight_id
    AND flight_activities.time_min = temp_predictions.time_min;
"""

# Establish a connection and execute the query
with engine.connect() as connection:
    connection.execute(text(update_query))

# Drop the temporary table
drop_temp_table_query = "DROP TABLE IF EXISTS temp_predictions;"
with engine.connect() as connection:
    connection.execute(text(drop_temp_table_query))