In [66]:
import pandas as pd
import numpy as np

In [67]:
# Load the pre-computed feature table and preview its first rows.
features_csv = '../data/clean/opensky_asia_features.csv'
df = pd.read_csv(features_csv)
df.head()

Unnamed: 0,timestamp,icao24,callsign,latitude,longitude,baro_altitude,velocity,vertical_rate,true_track,on_ground,track_sin,track_cos,delta_time,prev_lat,prev_lon,distance_delta,prev_velocity,acceleration,prev_track,turn_rate,climb_phase,speed_bucket
0,2026-02-13 16:55:00,0100a2,MSR5060,30.8654,30.4872,9197.34,212.92,7.15,321.97,False,-0.616074,0.787688,17.0,30.8237,30.5251,5881.473473,213.33,-0.024118,322.05,-0.004706,1,cruise
1,2026-02-13 16:55:05,0100a2,MSR5060,30.8687,30.4843,9212.58,212.92,7.15,321.97,False,-0.616074,0.787688,5.0,30.8654,30.4872,459.631153,212.92,0.0,321.97,0.0,1,cruise
2,2026-02-13 16:55:34,0100a2,MSR5060,30.9144,30.4426,9273.54,219.83,0.0,321.84,False,-0.61786,0.786288,29.0,30.8687,30.4843,6454.113841,212.92,0.238276,321.97,-0.004483,-1,cruise
3,2026-02-13 16:55:47,0100a2,MSR5060,30.9368,30.4223,9349.74,220.64,7.8,322.01,False,-0.615524,0.788118,13.0,30.9144,30.4426,3154.898925,219.83,0.062308,321.84,0.013077,1,cruise
4,2026-02-13 16:55:57,0100a2,MSR5060,30.9527,30.4078,9433.56,219.51,9.75,321.95,False,-0.616349,0.787473,10.0,30.9368,30.4223,2244.560185,220.64,-0.113,322.01,-0.006,1,cruise


In [68]:
# Show the column labels of the loaded frame.
df.columns

Index(['timestamp', 'icao24', 'callsign', 'latitude', 'longitude',
       'baro_altitude', 'velocity', 'vertical_rate', 'true_track', 'on_ground',
       'track_sin', 'track_cos', 'delta_time', 'prev_lat', 'prev_lon',
       'distance_delta', 'prev_velocity', 'acceleration', 'prev_track',
       'turn_rate', 'climb_phase', 'speed_bucket'],
      dtype='object')

In [69]:
# Distinct values of the speed_bucket column.
df["speed_bucket"].unique()

array(['cruise', 'fast', 'slow'], dtype=object)

First, we sort the dataframe by each aircraft's icao24 identifier and then by timestamp, so that every aircraft's rows are in chronological order.

In [70]:
# Order rows per aircraft, then by time, so the per-aircraft shifts below see consecutive samples.
sort_keys = ["icao24", "timestamp"]
df = df.sort_values(by=sort_keys)

Now, we derive lag features which will be useful in our climb phase detection.

In [71]:
# Add lagged copies of acceleration and distance_delta, shifted within each aircraft
# so that a lag never reaches across two different icao24 groups.
by_aircraft = df.groupby("icao24")
lag_sources = {"accel": "acceleration", "dist": "distance_delta"}
for lag in (1, 2, 3):
    for prefix, source_col in lag_sources.items():
        df[f"{prefix}_lag_{lag}"] = by_aircraft[source_col].shift(lag)

Next, we select the feature columns, leaving out any column that would tell our model directly about the climb phase.

In [72]:
# Instantaneous features taken directly from the feature table.
base_features = [
    "baro_altitude",
    "velocity",
    "distance_delta",
    "acceleration",
    "turn_rate",
    "delta_time",
    "track_sin",
    "track_cos",
]

# Lagged features created by the lag loop: accel_lag_1..3 followed by dist_lag_1..3.
lag_features = [
    f"{prefix}_lag_{lag}"
    for prefix in ("accel", "dist")
    for lag in (1, 2, 3)
]

feature_cols = base_features + lag_features

# Column the classifiers are trained to predict.
target_col = "climb_phase"
# Keep only the features, the target and the grouping key, then drop incomplete rows
# (the lag columns are NaN for the first samples of each aircraft).
model_columns = [*feature_cols, target_col, "icao24"]
df_model = df.loc[:, model_columns].dropna()

In [73]:
# Preview the modelling frame (features, target and icao24).
df_model.head()

Unnamed: 0,baro_altitude,velocity,distance_delta,acceleration,turn_rate,delta_time,track_sin,track_cos,accel_lag_1,accel_lag_2,accel_lag_3,dist_lag_1,dist_lag_2,dist_lag_3,climb_phase,icao24
3,9349.74,220.64,3154.898925,0.062308,0.013077,13.0,-0.615524,0.788118,0.238276,0.0,-0.024118,6454.113841,459.631153,5881.473473,1,0100a2
4,9433.56,219.51,2244.560185,-0.113,-0.006,10.0,-0.616349,0.787473,0.062308,0.238276,0.0,3154.898925,6454.113841,459.631153,1,0100a2
5,9464.04,219.51,649.638854,0.0,0.0,3.0,-0.616349,0.787473,-0.113,0.062308,0.238276,2244.560185,3154.898925,6454.113841,1,0100a2
6,9593.58,216.53,2621.831223,-0.1192,-0.004,25.0,-0.617722,0.786396,0.0,-0.113,0.062308,649.638854,2244.560185,3154.898925,1,0100a2
7,9776.46,215.41,4473.650931,-0.086154,-0.004615,13.0,-0.618546,0.785749,-0.1192,0.0,-0.113,2621.831223,649.638854,2244.560185,1,0100a2


Since a single aircraft (identified by its icao24 code) can appear in many rows, we use GroupShuffleSplit instead of train_test_split so that all rows of an aircraft end up on the same side of the split.

In [74]:
from sklearn.model_selection import GroupShuffleSplit

# Split by aircraft: icao24 is the group key, 20% of the groups go to validation.
groups = df_model["icao24"]
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df_model, groups=groups))

# The split yields positional indices, hence .iloc.
train_rows = df_model.iloc[train_idx]
val_rows = df_model.iloc[val_idx]

X_train, y_train = train_rows[feature_cols], train_rows[target_col]
X_val, y_val = val_rows[feature_cols], val_rows[target_col]

Here, we will just scale our data.

In [75]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

### Logistic Regression:

In [76]:
from sklearn.linear_model import LogisticRegression

# Class-weighted logistic regression trained on the standardized features.
model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [77]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the logistic-regression model on the scaled validation features.
y_pred = model.predict(X_val_scaled)
report_text = classification_report(y_val, y_pred)
confusion = confusion_matrix(y_val, y_pred)
print(report_text)
print(confusion)

              precision    recall  f1-score   support

          -1       0.94      0.80      0.87    162977
           0       0.32      0.42      0.36       751
           1       0.41      0.72      0.52     30843

    accuracy                           0.79    194571
   macro avg       0.56      0.65      0.58    194571
weighted avg       0.85      0.79      0.81    194571

[[130766    532  31679]
 [     6    313    432]
 [  8371    131  22341]]


Logistic Regression reaches 79% accuracy. Let's look at the weight of each coefficient.

In [78]:
# Per-feature weights of one class of the fitted logistic regression, largest first.
# NOTE(review): coef_[1] is selected by position, i.e. it belongs to model.classes_[1];
# with the labels -1/0/1 that is presumably class 0, not class 1 -- confirm this is
# the class you intend to inspect.
class_weights = model.coef_[1]
coef_df = pd.DataFrame({"feature": feature_cols, "weight": class_weights})
coef_df = coef_df.sort_values(by="weight", ascending=False)
coef_df

Unnamed: 0,feature,weight
6,track_sin,0.768927
11,dist_lag_1,0.048907
2,distance_delta,0.027664
13,dist_lag_3,-0.008164
12,dist_lag_2,-0.012985
7,track_cos,-0.017797
5,delta_time,-0.03319
4,turn_rate,-0.039399
8,accel_lag_1,-0.118917
10,accel_lag_3,-0.195899


It seems that track_sin has the largest positive weight.

### Random Forest Classifier

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Class-weighted random forest, fit on the unscaled feature frames.
rf_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_leaf": 50,
    "class_weight": "balanced",
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# classification_report is the function imported in the logistic-regression evaluation cell.
y_pred = rf.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

          -1       0.96      0.91      0.93    162977
           0       0.79      0.38      0.52       751
           1       0.63      0.79      0.70     30843

    accuracy                           0.89    194571
   macro avg       0.79      0.70      0.72    194571
weighted avg       0.90      0.89      0.90    194571



### XGBoost

In [84]:
# Distinct target labels; their count is passed to XGBoost as num_class.
# NOTE(review): the saved output shows 0/1/2, so this cell was last executed after
# the label remapping cell that follows it -- re-check with Restart & Run All.
unique_labels = np.unique(y_train)  
y_train.unique()

array([2, 0, 1])

In [86]:
# Re-encode the phase labels -1/0/1 as 0/1/2 for the XGBoost cells.
# The remap is guarded so that re-running this cell is harmless: an unguarded second
# .map() would send the already-encoded label 2 to NaN, because 2 is not a key of
# the mapping.
xgb_label_map = {-1: 0, 0: 1, 1: 2}
if (y_train < 0).any() or (y_val < 0).any():
    y_train = y_train.map(xgb_label_map)
    y_val = y_val.map(xgb_label_map)

In [85]:
import xgboost as xgb

# Gradient-boosted trees on the unscaled features, using the 0/1/2 labels
# produced by the remapping cell.
# `use_label_encoder` was dropped: the installed xgboost reports it as unused
# ("Parameters: { "use_label_encoder" } are not used.").
# NOTE: this rebinds `model`, which previously held the logistic-regression estimator.
model = xgb.XGBClassifier(
    n_estimators = 100,
    max_depth = 3,
    learning_rate = 0.1,
    objective = 'multi:softmax',
    num_class = len(unique_labels),
    eval_metric = 'mlogloss',
    random_state = 42  # same seed as the split and the random forest
)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [87]:
from sklearn.metrics import accuracy_score

# Evaluate the XGBoost model on the validation split (labels are in the 0/1/2 encoding).
y_pred = model.predict(X_val)
xgb_reports = (
    ("Accuracy:", accuracy_score),
    ("Classification Report: \n", classification_report),
    ("Confusion Matrix: \n", confusion_matrix),
)
for heading, metric in xgb_reports:
    print(heading, metric(y_val, y_pred))

Accuracy: 0.9117237409480344
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.98      0.95    162977
           1       0.99      0.38      0.55       751
           2       0.83      0.57      0.68     30843

    accuracy                           0.91    194571
   macro avg       0.91      0.64      0.73    194571
weighted avg       0.91      0.91      0.90    194571

Confusion Matrix: 
 [[159442      4   3531]
 [   463    287      1]
 [ 13177      0  17666]]
