This is a program which is in testing phase. We will use a linear regression ML model to give an SOC number for the flights.

In [1]:
# Import the querying module
from flight_querying import query_flights
from sklearn.model_selection import train_test_split
import pandas as pd

# Set up and retrieve the data from the database.
db_connect = query_flights()

In [2]:
# Flight IDs to include
flight_ids = [4620, 4929, 4940, 5019, 5021, 5034]

# Fetch data for specified flight IDs
data_frames = [db_connect.connect_flight_for_ml_data_label(flight_id) for flight_id in flight_ids]

# Concatenate data frames and shuffle the data
all_data = pd.concat(data_frames, axis=0).sample(frac=1, random_state=42)

# Split the data into train and test sets
train_data, test_data = train_test_split(all_data, test_size=0.5, random_state=42)

# Set up train data
train_y = train_data["soc"].to_numpy()
train_x = train_data.drop(columns=["soc"])

# Set up test data
test_y = test_data["soc"].to_numpy()
test_x = test_data.drop(columns=["soc"])

In [3]:
print(f"Length train_x = {len(train_x)} \n Length train_y = {len(train_y)}")

Length train_x = 176069 
 Length train_y = 176069


In [4]:
print(f"Length test_x = {len(test_x)} \n Length test_y = {len(test_y)}")

Length test_x = 176069 
 Length test_y = 176069


One-Hot-Encoding of the Operations columns

In [5]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(df, col, pre):
  encoded = pd.get_dummies(df[col], prefix=pre)
  for column in encoded:
    encoded = encoded.rename(columns={column: col + "_" + column})
  encoded['time'] = df['time']
  encoded["id"] = df["id"]
  return encoded

In [8]:
# Encode Train data
train_encoded = one_hot(train_x, "exercise", 'is')
final_train_x = pd.merge(train_x, train_encoded, on=["time", "id"])
final_train_x = final_train_x.drop(columns=["time", "id", "exercise"])

# Encode Test data
test_encoded = one_hot(test_x, "exercise", 'is')
final_test_x = pd.merge(test_x, test_encoded, on=["time","id"])
final_test_x = final_test_x.drop(columns=["time", "id", "exercise"])

In [9]:
final_train_x

Unnamed: 0,environment_temperature,dewpoint,humidity,wind_speed,visibility,cell_temperature,motor_rpm,motor_power,motor_temperature,indicated_air_speed,...,exercise_is_NA,exercise_is_climb,exercise_is_cruise,exercise_is_descent,exercise_is_landing,exercise_is_power off stall,exercise_is_power on stall,exercise_is_slow flight,exercise_is_steep turns,exercise_is_takeoff
0,44.6,41.0,87.09,2,9.0,17,709,2,17.992535,0.000000,...,True,False,False,False,False,False,False,False,False,False
1,75.2,57.2,53.55,3,9.0,21,0,0,25.748734,21.597849,...,True,False,False,False,False,False,False,False,False,False
2,75.2,57.2,53.55,3,9.0,21,0,0,25.748734,21.597849,...,True,False,False,False,False,False,False,False,False,False
3,75.2,57.2,53.55,4,9.0,21,0,0,25.748734,21.597849,...,True,False,False,False,False,False,False,False,False,False
4,75.2,57.2,53.55,4,9.0,21,0,0,25.748734,21.597849,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264176,66.2,59.0,77.60,4,9.0,25,1924,21,58.877289,75.679705,...,False,False,True,False,False,False,False,False,False,False
264177,44.6,41.0,87.09,3,9.0,14,1901,21,41.061993,74.738262,...,False,False,True,False,False,False,False,False,False,False
264178,55.4,44.6,66.91,9,9.0,17,2079,35,40.179089,65.504970,...,True,False,False,False,False,False,False,False,False,False
264179,44.6,41.0,87.09,3,9.0,17,2229,44,41.405380,72.401552,...,False,True,False,False,False,False,False,False,False,False


In [10]:
print(f"Length test_encoded = {len(test_encoded)} \n Length train_encoded = {len(train_encoded)}")

Length test_encoded = 176069 
 Length train_encoded = 176069


In [11]:
print(f"Length train_x = {len(final_train_x)} \n Length train_y = {len(train_y)}")

Length train_x = 264181 
 Length train_y = 176069


In [12]:
print(f"Length test_x = {len(final_test_x)} \n Length test_y = {len(test_y)}")

Length test_x = 264181 
 Length test_y = 176069


In [None]:
train_encoded

In [None]:
test_encoded

Machine Learning Model Implementation

In [None]:
# import sklearn
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report

# Set model
regression_model = LinearRegression()

# Fit model
regression_model.fit(final_train_x, train_y)

In [None]:
# print model score
print(regression_model.score(final_test_x, test_y))

In [None]:
coeff = pd.DataFrame(list(zip(regression_model.feature_names_in_, regression_model.coef_)), columns = ['Feature', 'Weight'])
coeff.sort_values('Weight')

In [None]:
# Make predictions on the test set
predictions = regression_model.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display additional classification metrics
print("Classification Report:")
print(classification_report(test_y, predictions))