In [2]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("Aviation_KPIs_Dataset.csv")

In [4]:
df.head()

Unnamed: 0,Flight Number,Scheduled Departure Time,Actual Departure Time,Delay (Minutes),Aircraft Utilization (Hours/Day),Turnaround Time (Minutes),Load Factor (%),Fleet Availability (%),Maintenance Downtime (Hours),Fuel Efficiency (ASK),Revenue (USD),Operating Cost (USD),Net Profit Margin (%),Ancillary Revenue (USD),Debt-to-Equity Ratio,Revenue per ASK,Cost per ASK,Profit (USD)
0,FL885,2024-05-20 11:51:21,2024-07-10 02:38:54,50,12.36,115,79.18,96.24,9.21,4.15,10953.75,27847.59,17.9,1058.68,2.46,2639.46,6710.26,-16893.84
1,FL930,2024-01-23 06:56:23,2024-07-07 02:53:44,27,14.53,83,98.59,80.49,1.55,3.93,31597.25,1564.41,7.7,2941.32,2.89,8040.01,398.07,30032.84
2,FL478,2024-05-30 09:18:39,2024-12-05 01:00:54,108,10.73,87,67.44,97.9,2.06,4.26,13700.27,30494.88,5.84,1920.45,1.34,3216.03,7158.42,-16794.61
3,FL637,2024-08-15 05:21:47,2024-09-21 13:23:42,64,15.1,99,69.01,80.17,6.85,4.13,39913.01,13444.09,24.09,3167.79,0.6,9664.17,3255.23,26468.92
4,FL318,2024-07-25 15:29:58,2024-03-21 15:05:54,30,13.46,114,50.36,82.23,7.3,4.97,11531.48,34668.98,7.18,619.8,0.84,2320.22,6975.65,-23137.5


In [5]:
df.columns

Index(['Flight Number', 'Scheduled Departure Time', 'Actual Departure Time',
       'Delay (Minutes)', 'Aircraft Utilization (Hours/Day)',
       'Turnaround Time (Minutes)', 'Load Factor (%)',
       'Fleet Availability (%)', 'Maintenance Downtime (Hours)',
       'Fuel Efficiency (ASK)', 'Revenue (USD)', 'Operating Cost (USD)',
       'Net Profit Margin (%)', 'Ancillary Revenue (USD)',
       'Debt-to-Equity Ratio', 'Revenue per ASK', 'Cost per ASK',
       'Profit (USD)'],
      dtype='object')

In [6]:
# Drop irrelevant columns
df.drop(columns=['Flight Number', 'Scheduled Departure Time', 'Actual Departure Time'], inplace=True)


In [7]:
# Handle missing values
df.fillna(df.median(), inplace=True)

In [10]:
# Feature and target split
x = df.drop(columns=['Profit (USD)'])  # Features
y = df['Profit (USD)']  # Target

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=43)

In [13]:
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42
)

In [14]:
rf.fit(x_train,y_train)

In [16]:
xgb = XGBRegressor(
    n_estimators = 100,
    max_depth = 6,
    learning_rate = 0.1,
    subsample = 0.8,
    colsample_bytree =0.8,
    reg_alpha = 0.1,
    reg_lambada = 1.0,
    random_state = 42
)

In [24]:
xgb = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    eval_metric='rmse',  # Move eval_metric here
    random_state=42
)




In [26]:
xgb.fit(
    x_train, y_train,
    eval_set=[(x_test, y_test)]
)


[0]	validation_0-rmse:17032.48318
[1]	validation_0-rmse:15414.52016
[2]	validation_0-rmse:13904.40443
[3]	validation_0-rmse:12543.85086
[4]	validation_0-rmse:11370.61128
[5]	validation_0-rmse:10260.05275
[6]	validation_0-rmse:9258.95729
[7]	validation_0-rmse:8357.20990
[8]	validation_0-rmse:7541.29486
[9]	validation_0-rmse:7214.42756
[10]	validation_0-rmse:6512.09251
[11]	validation_0-rmse:5901.49971
[12]	validation_0-rmse:5327.69708
[13]	validation_0-rmse:4828.40678
[14]	validation_0-rmse:4362.07781
[15]	validation_0-rmse:3941.21972
[16]	validation_0-rmse:3561.90730
[17]	validation_0-rmse:3426.05475
[18]	validation_0-rmse:3117.26856
[19]	validation_0-rmse:2831.85499
[20]	validation_0-rmse:2576.30927
[21]	validation_0-rmse:2333.30285
[22]	validation_0-rmse:2114.76027
[23]	validation_0-rmse:1928.80358
[24]	validation_0-rmse:1751.52938
[25]	validation_0-rmse:1592.86076
[26]	validation_0-rmse:1454.79072
[27]	validation_0-rmse:1326.55483
[28]	validation_0-rmse:1213.37299
[29]	validation_0-

In [27]:
# Predictions
rf_preds = rf.predict(x_test)
xgb_preds = xgb.predict(x_test)

In [28]:
# Metrics
for name, preds in zip(["Random Forest", "XGBoost"], [rf_preds, xgb_preds]):
    print(f"{name} Performance:")
    print(f"MAE: {mean_absolute_error(y_test, preds):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.2f}")
    print(f"R2 Score: {r2_score(y_test, preds):.4f}\n")


Random Forest Performance:
MAE: 196.48
RMSE: 252.35
R2 Score: 0.9998

XGBoost Performance:
MAE: 226.88
RMSE: 291.02
R2 Score: 0.9997



In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Models
models = {"Random Forest": rf, "XGBoost": xgb}

# Evaluation
for name, model in models.items():
    # Predictions
    train_preds = model.predict(x_train)
    test_preds = model.predict(x_test)

    # Training Metrics
    train_mae = mean_absolute_error(y_train, train_preds)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    train_r2 = r2_score(y_train, train_preds)

    # Testing Metrics
    test_mae = mean_absolute_error(y_test, test_preds)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    test_r2 = r2_score(y_test, test_preds)

    print(f"\n{name} Performance:")
    print(f"Training -> MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R2: {train_r2:.4f}")
    print(f"Testing  -> MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R2: {test_r2:.4f}")

    # Overfitting Check
    if train_r2 > 0.90 and test_r2 < 0.75:
        print(f" {name} might be **overfitting**! Consider regularization or reducing complexity.")
    elif train_r2 < 0.60 and test_r2 < 0.60:
        print(f" {name} might be **underfitting**! Consider increasing model complexity.")
    else:
        print(f" {name} seems well-balanced.")



Random Forest Performance:
Training -> MAE: 182.88, RMSE: 234.67, R2: 0.9998
Testing  -> MAE: 196.48, RMSE: 252.35, R2: 0.9998
 Random Forest seems well-balanced.

XGBoost Performance:
Training -> MAE: 217.24, RMSE: 278.68, R2: 0.9998
Testing  -> MAE: 226.88, RMSE: 291.02, R2: 0.9997
 XGBoost seems well-balanced.
