In [4]:
import os, joblib, numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Repository root as seen from the notebook's working directory.
ROOT = Path("..")

# Input locations live under the repository root.
DATA_DIR = ROOT.joinpath("data")
DATASET_DIR = ROOT.joinpath("dataset")

# NOTE: pathlib's .parent is purely lexical, so Path("..").parent is Path(".").
# Artifacts therefore land in ./models and ./reports, relative to the current
# working directory (next to the notebook), not in <ROOT>/models.
MODELS_DIR = ROOT.parent.joinpath("models")
REPORTS_DIR = ROOT.parent.joinpath("reports")
FIG_DIR = REPORTS_DIR.joinpath("figures")
METRICS_DIR = REPORTS_DIR.joinpath("metrics")

# Create every output folder up front so later cells can write unconditionally.
for p in (MODELS_DIR, REPORTS_DIR, FIG_DIR, METRICS_DIR):
    p.mkdir(parents=True, exist_ok=True)


In [5]:
# Locations of the pre-computed splits and of the full processed log table.
train_csv, val_csv, test_csv = (DATA_DIR / f"{split}.csv" for split in ("train", "val", "test"))
processed_csv = DATASET_DIR / "processed_logs.csv"

split_paths = (train_csv, val_csv, test_csv)
have_saved_splits = all(path.exists() for path in split_paths)

if not have_saved_splits:
    print("train/val/test not found — loading processed_logs.csv and creating splits")
    df = pd.read_csv(processed_csv)

    if "flight_id" not in df.columns:
        # Row-level 70/15/15 split; stratified on "action" when that column
        # exists (df.get returns None otherwise, which disables stratification).
        train_df, temp = train_test_split(df, test_size=0.3, random_state=42, stratify=df.get("action"))
        val_df, test_df = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp.get("action"))
    else:
        # Split by flight so that rows of one flight never straddle two splits.
        flights = df["flight_id"].unique()
        train_flights, rest = train_test_split(flights, test_size=0.3, random_state=42)
        val_flights, test_flights = train_test_split(rest, test_size=0.5, random_state=42)
        flight_of_row = df["flight_id"]
        train_df = df[flight_of_row.isin(train_flights)].reset_index(drop=True)
        val_df = df[flight_of_row.isin(val_flights)].reset_index(drop=True)
        test_df = df[flight_of_row.isin(test_flights)].reset_index(drop=True)
else:
    print("Loading pre-saved splits from", DATA_DIR)
    train_df, val_df, test_df = (pd.read_csv(path) for path in split_paths)

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))


Loading pre-saved splits from ../data
Train/Val/Test sizes: 64 30 30


In [6]:
def get_feature_cols(df, min_scaled_cols=6):
    """Choose the model input columns available in ``df``.

    Columns whose label ends in ``"_s"`` are preferred and returned unchanged
    when at least ``min_scaled_cols`` of them exist. Otherwise the function
    falls back to a fixed list of raw column names, keeping only those that
    are present in ``df`` (in the order of that list).

    Parameters
    ----------
    df : object exposing ``.columns`` (e.g. a pandas DataFrame)
    min_scaled_cols : int, default 6
        Minimum number of ``"_s"`` columns required to use them.

    Returns
    -------
    (list, bool)
        The selected column labels and a flag that is True when the ``"_s"``
        columns were chosen.
    """
    # str() keeps non-string column labels (e.g. integers) from raising here.
    s_cols = [c for c in df.columns if str(c).endswith("_s")]
    if len(s_cols) >= min_scaled_cols:
        print("Using scaled feature columns (found _s):", len(s_cols))
        return s_cols, True

    fallback = ["confidence", "x", "y", "w", "h", "bbox_area", "aspect_ratio",
                "height", "temp", "speed_x", "speed_y", "speed_z", "flight_time", "battery"]
    present = [c for c in fallback if c in df.columns]
    print("Using fallback feature columns:", present)
    return present, False

# Resolve the model inputs once, from the training split; the bare name on the
# last line makes the notebook display the chosen column list.
feature_cols, already_scaled = get_feature_cols(train_df)
feature_cols


Using scaled feature columns (found _s): 13


['detected_s',
 'x_s',
 'y_s',
 'w_s',
 'h_s',
 'bbox_area_s',
 'aspect_ratio_s',
 'height_s',
 'speed_x_s',
 'speed_y_s',
 'speed_z_s',
 'dx_s',
 'dy_s']

In [7]:

def prepare_Xy(df, feature_cols):
    """Return ``(features, labels)`` copied out of ``df``.

    ``features`` holds the ``feature_cols`` columns. ``labels`` is the
    ``"action"`` column when it exists, otherwise ``"action_encoded"``.
    Both are copies, so modifying them leaves ``df`` untouched.
    """
    label_col = "action_encoded"
    if "action" in df.columns:
        label_col = "action"
    features = df[feature_cols].copy()
    labels = df[label_col].copy()
    return features, labels

X_train_raw, y_train = prepare_Xy(train_df, feature_cols)
X_val_raw, y_val     = prepare_Xy(val_df, feature_cols)
X_test_raw, y_test   = prepare_Xy(test_df, feature_cols)

# Replace missing feature values with 0.0 in every case (pre-scaled or not), so
# training sees the same NaN handling that predict_from_row applies at
# inference time. Frames without NaNs are unaffected.
X_train_raw = X_train_raw.fillna(0.0)
X_val_raw = X_val_raw.fillna(0.0)
X_test_raw = X_test_raw.fillna(0.0)

# Pre-scaled (_s) columns are used as-is; otherwise reuse the scaler stored
# under MODELS_DIR, or fit a new one on the training split only and save it.
scaler_path = MODELS_DIR / "scaler.joblib"
if already_scaled:
    print("Columns already scaled; skipping scaler.")
    X_train = X_train_raw.values
    X_val   = X_val_raw.values
    X_test  = X_test_raw.values
else:
    if scaler_path.exists():
        # NOTE(review): a scaler left over from an earlier run is trusted
        # as-is; confirm it was fitted on the same feature columns.
        scaler = joblib.load(scaler_path)
        print("Loaded existing scaler:", scaler_path)
    else:
        # StandardScaler is already imported in the notebook's import cell.
        scaler = StandardScaler()
        scaler.fit(X_train_raw)
        joblib.dump(scaler, scaler_path)
        print("Fitted new scaler and saved to", scaler_path)
    X_train = scaler.transform(X_train_raw)
    X_val   = scaler.transform(X_val_raw)
    X_test  = scaler.transform(X_test_raw)

# Label encoder: make sure the targets are integer codes, and keep the encoder
# on disk so predictions can be mapped back to class names.
le_path = MODELS_DIR / "label_encoder.joblib"
if le_path.exists():
    le = joblib.load(le_path)
    # Integer-typed labels are taken to be encoded already and are used
    # unchanged; anything else goes through the loaded encoder.
    if y_train.dtype.kind in "iu":  # already ints
        y_train_enc = y_train.astype(int).values
        y_val_enc   = y_val.astype(int).values
        y_test_enc  = y_test.astype(int).values
    else:
        y_train_enc = le.transform(y_train)
        y_val_enc   = le.transform(y_val)
        y_test_enc  = le.transform(y_test)
    print("Loaded label encoder")
else:
    le = LabelEncoder()
    # The encoder is fitted on the training labels exactly as they are stored
    # (no cast to str). Validation/test labels are only transformed, so a
    # class never seen in training makes transform() raise.
    y_train_enc = le.fit_transform(y_train)
    y_val_enc   = le.transform(y_val)
    y_test_enc  = le.transform(y_test)
    joblib.dump(le, le_path)
    print("Fitted and saved label encoder to", le_path)

print("Final shapes (X_train, y_train):", X_train.shape, y_train_enc.shape)


Columns already scaled; skipping scaler.
Fitted and saved label encoder to models/label_encoder.joblib
Final shapes (X_train, y_train): (64, 13) (64,)


In [8]:
# Fit three baseline classifiers on the training split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# XGBoost is optional: any failure while importing it switches the third model
# to scikit-learn's histogram gradient boosting instead.
try:
    from xgboost import XGBClassifier
except Exception as e:
    print("XGBoost not available (fallback will be used):", e)
    xgb_available = False
    from sklearn.ensemble import HistGradientBoostingClassifier
else:
    xgb_available = True

# fit() returns the estimator itself, so construction and training chain.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train_enc)
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train_enc)

# Insertion order is the order the models are evaluated and saved in later.
models = {"LogisticRegression": lr, "RandomForest": rf}

if not xgb_available:
    hgb = HistGradientBoostingClassifier(random_state=42).fit(X_train, y_train_enc)
    models["HistGB"] = hgb
else:
    xgb = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6,
                        use_label_encoder=False, eval_metric="mlogloss", random_state=42)
    xgb.fit(X_train, y_train_enc)
    models["XGBoost"] = xgb

print("Models trained:", list(models))


XGBoost not available (fallback will be used): No module named 'xgboost'
Models trained: ['LogisticRegression', 'RandomForest', 'HistGB']


In [9]:
# Helper to evaluate & save results
def evaluate_and_report(model, X, y_true, le, name, out_prefix="val"):
    """Score ``model`` on ``(X, y_true)``, print a report and save artifacts.

    Prints accuracy, weighted F1 and macro F1 followed by the per-class
    classification report, writes a confusion-matrix image to ``FIG_DIR`` and
    the report table (CSV) to ``METRICS_DIR``; both file names are prefixed
    with ``out_prefix`` and ``name``.

    The report and the confusion matrix are computed over every class known to
    ``le`` (codes ``0 .. len(le.classes_) - 1``), so a split that happens to
    miss a class still yields rows/columns aligned with the class names
    instead of failing on a label/target_names length mismatch.
    Assumes ``y_true`` holds those integer codes, as produced by ``le``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : feature matrix passed to ``model.predict``
    y_true : encoded ground-truth labels
    le : fitted label encoder; ``le.classes_`` supplies the class names
    name : model name used in titles, printed output and file names
    out_prefix : str, default "val"
        Tag of the evaluated split, used in titles and file names.

    Returns
    -------
    dict
        Keys ``acc``, ``f1_weighted``, ``f1_macro``, ``report`` (the report as
        a nested dict) and ``cm_path`` (path of the saved figure, as str).
    """
    class_ids = np.arange(len(le.classes_))
    # target_names / tick labels must be strings even if classes_ are numeric.
    class_names = [str(c) for c in le.classes_]

    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same scores as the default but without the
    # undefined-metric warnings for classes that are never predicted.
    # NOTE: these summary scores average over the labels present in
    # y_true/y_pred, exactly as before.
    f1_w = f1_score(y_true, y_pred, average="weighted", zero_division=0)
    f1_m = f1_score(y_true, y_pred, average="macro", zero_division=0)
    print(f"{name} — acc: {acc:.4f}, f1(weighted): {f1_w:.4f}, f1(macro): {f1_m:.4f}")

    # Classification report: text for the notebook, dict for the CSV/return.
    report_kwargs = dict(labels=class_ids, target_names=class_names, zero_division=0)
    report = classification_report(y_true, y_pred, output_dict=False, **report_kwargs)
    print(report)
    rep = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)

    # Confusion matrix plot; explicit labels keep the matrix aligned with ticks.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(cm, interpolation="nearest")
    fig.colorbar(image, ax=ax)
    ax.set_xticks(class_ids)
    ax.set_xticklabels(class_names, rotation=45)
    ax.set_yticks(class_ids)
    ax.set_yticklabels(class_names)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{name} Confusion ({out_prefix})")
    fig.tight_layout()
    fig_path = FIG_DIR / f"{out_prefix}_{name}_confusion.png"
    fig.savefig(fig_path)
    plt.close(fig)  # release the figure; only the saved file is needed

    # Save metrics to csv (one row per class plus the aggregate rows).
    pd.DataFrame(rep).transpose().to_csv(METRICS_DIR / f"{out_prefix}_{name}_report.csv")
    return {"acc": acc, "f1_weighted": f1_w, "f1_macro": f1_m, "report": rep, "cm_path": str(fig_path)}

# Score every trained candidate on the validation split, keyed by model name.
val_results = {
    name: evaluate_and_report(model, X_val, y_val_enc, le, name, out_prefix="val")
    for name, model in models.items()
}


LogisticRegression — acc: 0.8333, f1(weighted): 0.8182, f1(macro): 0.3030
              precision    recall  f1-score   support

move_forward       0.89      0.93      0.91        27
   move_left       0.00      0.00      0.00         2
  move_right       0.00      0.00      0.00         1

    accuracy                           0.83        30
   macro avg       0.30      0.31      0.30        30
weighted avg       0.80      0.83      0.82        30

RandomForest — acc: 1.0000, f1(weighted): 1.0000, f1(macro): 1.0000
              precision    recall  f1-score   support

move_forward       1.00      1.00      1.00        27
   move_left       1.00      1.00      1.00         2
  move_right       1.00      1.00      1.00         1

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


HistGB — acc: 0.7000, f1(weighted): 0.7578, f1(macro): 0.4759
              precision    recall  f1-score   support

move_forward       0.95      0.70      0.81        27
   move_left       0.20      0.50      0.29         2
  move_right       0.20      1.00      0.33         1

    accuracy                           0.70        30
   macro avg       0.45      0.73      0.48        30
weighted avg       0.87      0.70      0.76        30



In [10]:
# Pick the candidate with the highest weighted F1 on the validation split
# (the first one wins on ties, following the insertion order of val_results).
best_name, best_entry = max(val_results.items(), key=lambda item: item[1]["f1_weighted"])
best_score = best_entry["f1_weighted"]
print("Best on validation:", best_name, "f1_weighted=", best_score)

# Persist every model under its lower-cased name ...
for name, model in models.items():
    path = MODELS_DIR / (name.lower() + ".joblib")
    joblib.dump(model, path)
    print("Saved", name, "->", path)

# ... and the winner once more under a fixed, name-independent file name.
best_model = models[best_name]
best_path = MODELS_DIR / "best_model.joblib"
joblib.dump(best_model, best_path)
print("Saved best model to", best_path)


Best on validation: RandomForest f1_weighted= 1.0
Saved LogisticRegression -> models/logisticregression.joblib
Saved RandomForest -> models/randomforest.joblib
Saved HistGB -> models/histgb.joblib
Saved best model to models/best_model.joblib


In [11]:
# Final check: score the selected model on the held-out test split.
test_metrics = evaluate_and_report(best_model, X_test, y_test_enc, le, best_name, out_prefix="test")

# Feature importances exist only for some estimators (tree ensembles do).
if not hasattr(best_model, "feature_importances_"):
    print("Best model has no feature_importances_ attribute (e.g., LogisticRegression).")
else:
    fi = best_model.feature_importances_
    feat_names = feature_cols
    # Drop the "_s" suffix of scaled columns for nicer axis labels.
    feat_labels = [c[:-2] if c.endswith("_s") else c for c in feat_names]
    idx = np.argsort(fi)[::-1]  # most important first
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(np.array(feat_labels)[idx], fi[idx])
    ax.set_title(f"Feature importances ({best_name})")
    fig.tight_layout()
    fig.savefig(FIG_DIR / f"feature_importance_{best_name}.png")
    plt.close(fig)
    print("Saved feature importance plot.")

# Store the test-set summary as JSON next to the per-model reports.
import json
with (METRICS_DIR / "best_model_test_metrics.json").open("w") as fh:
    json.dump(test_metrics, fh, indent=2)
print("Saved test metrics.")


RandomForest — acc: 0.9333, f1(weighted): 0.9282, f1(macro): 0.9182
              precision    recall  f1-score   support

move_forward       0.91      1.00      0.95        21
   move_left       1.00      0.67      0.80         6
  move_right       1.00      1.00      1.00         3

    accuracy                           0.93        30
   macro avg       0.97      0.89      0.92        30
weighted avg       0.94      0.93      0.93        30

Saved feature importance plot.
Saved test metrics.


In [12]:
# Save the objects and metadata that are needed at inference time.
joblib.dump(le, MODELS_DIR / "label_encoder.joblib")  # probably already saved

# The scaler (when one is used) was already written to scaler_path by the
# preprocessing cell; loading it and dumping it back to the very same file was
# a no-op, so only its presence is checked here.
if not already_scaled and not scaler_path.exists():
    raise FileNotFoundError(f"Expected a fitted scaler at {scaler_path}")

# best_model is already saved; the dict below only records where things are.
# "already_scaled" tells the consumer whether feature_columns are pre-scaled
# inputs, in which case the scaler must not be applied again. "scaler_path" is
# always recorded so existing readers of this dict keep working.
pipeline = {
    "scaler_path": str(MODELS_DIR / "scaler.joblib"),
    "label_encoder_path": str(MODELS_DIR / "label_encoder.joblib"),
    "model_path": str(MODELS_DIR / "best_model.joblib"),
    "feature_columns": feature_cols,
    "already_scaled": bool(already_scaled),
}
joblib.dump(pipeline, MODELS_DIR / "inference_pipeline.joblib")
print("Saved inference pipeline metadata to models/inference_pipeline.joblib")


Saved inference pipeline metadata to models/inference_pipeline.joblib


In [13]:
# Optional: quick example of RandomizedSearchCV for RF.
# The snippet is disabled by wrapping it in a bare triple-quoted string (an
# expression statement with no effect); delete the two quote lines to run it.
# It searches over random-forest hyper-parameters with 3-fold CV scored by
# weighted F1 and saves the best estimator as rf_tuned.joblib under MODELS_DIR.
"""
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    "n_estimators": [100,200,300,500],
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": [1,2,4],
    "max_features": ["sqrt","log2", 0.5]
}
rs = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_dist,
                        n_iter=20, scoring="f1_weighted", cv=3, n_jobs=-1, random_state=42, verbose=2)
rs.fit(X_train, y_train_enc)
print("RandomizedSearch best params:", rs.best_params_)
joblib.dump(rs.best_estimator_, MODELS_DIR / "rf_tuned.joblib")
"""
# Only this line executes when the cell is run.
print("Tuning snippet provided (commented out). Run if you want to tune.")


Tuning snippet provided (commented out). Run if you want to tune.


In [22]:

import joblib, numpy as np, pandas as pd
# Load the metadata from the directory the artifacts were written to. The paths
# stored inside the dict are relative to the same working directory.
# NOTE: joblib.load unpickles; only load artifacts produced by this notebook.
pipeline = joblib.load(MODELS_DIR / "inference_pipeline.joblib")

model = joblib.load(pipeline["model_path"])
le = joblib.load(pipeline["label_encoder_path"])
feat_cols = pipeline["feature_columns"]

# Pre-scaled feature columns must not be passed through the scaler again.
# Metadata without an "already_scaled" entry falls back to the naming rule
# used by get_feature_cols: the scaled selection consists solely of "_s"
# columns, while the raw fallback list contains none.
features_prescaled = pipeline.get(
    "already_scaled",
    bool(feat_cols) and all(str(c).endswith("_s") for c in feat_cols),
)
scaler = None if features_prescaled else joblib.load(pipeline["scaler_path"])

def predict_from_row(row_dict):
    """Predict the action label for one observation.

    ``row_dict`` must contain every key in ``feat_cols`` (a missing key raises
    ``KeyError``); missing values are replaced by 0.0. When the pipeline uses
    raw features the values are standardised with the stored scaler; when it
    uses pre-scaled features they are fed to the model unchanged.

    Returns the decoded class label as stored in the label encoder.
    """
    df = pd.DataFrame([row_dict])[feat_cols].fillna(0.0)
    X = df.values if scaler is None else scaler.transform(df.values)
    pred = model.predict(X)
    return le.inverse_transform(pred)[0]

print("Example usage: call predict_from_row with a dict containing features in", feat_cols)


Example usage: call predict_from_row with a dict containing features in ['detected_s', 'x_s', 'y_s', 'w_s', 'h_s', 'bbox_area_s', 'aspect_ratio_s', 'height_s', 'speed_x_s', 'speed_y_s', 'speed_z_s', 'dx_s', 'dy_s']
