In [36]:
# Import packages
from datetime import datetime

import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

In [37]:
# Read the pre-split feature matrices (train / validation / test) from disk
X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_train.csv',
        '../data/processed/X_val.csv',
        '../data/processed/X_test.csv',
    )
)

# Read the matching target files in the same split order
y_train, y_val, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/y_train.csv',
        '../data/processed/y_val.csv',
        '../data/processed/y_test.csv',
    )
)


In [38]:
from joblib import load

In [39]:
# Display the training feature frame loaded from X_train.csv
X_train

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff,medianTravelDistance
0,BOS,DTW,coach,coach,no_stop,no_stop,1,6,16,23,42,3,24,49,670.0
1,DEN,MIA,coach,coach,no_stop,no_stop,1,6,1,5,55,2,22,31,1834.0
2,DEN,LAX,coach,no_stop,no_stop,no_stop,0,5,12,13,0,3,19,3,939.0
3,CLT,SFO,coach,coach,no_stop,no_stop,1,6,11,21,15,5,23,53,2536.0
4,BOS,DFW,coach,coach,no_stop,no_stop,1,6,13,15,5,0,24,32,1565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6278770,DEN,ORD,coach,coach,no_stop,no_stop,1,4,19,17,0,1,16,1,1034.0
6278771,CLT,LGA,coach,coach,no_stop,no_stop,1,6,15,11,25,2,24,47,548.0
6278772,DEN,LGA,coach,coach,no_stop,no_stop,1,5,26,5,55,3,21,37,1627.0
6278773,CLT,DTW,coach,coach,no_stop,no_stop,1,6,9,15,42,3,23,49,832.0


In [40]:
# NOTE(review): `preprocessor` is not defined in any cell visible in this notebook.
# Presumably it was loaded with joblib `load(...)` in a cell that has since been removed;
# restore that step before this cell, otherwise a fresh-kernel run raises NameError here.
preprocessor

In [41]:
from xgboost import XGBRegressor

# Gradient-boosted regressor with fixed hyperparameters and a fixed seed
xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=9,
    min_child_weight=9,
    subsample=0.7,
    colsample_bytree=0.5,
    random_state=42,
)

# Chain the feature preprocessor and the regressor so they are fit and applied together
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])


In [42]:
# Fit the whole pipeline (preprocessor step, then the XGBRegressor step) on the training split.
# y_train comes straight from pd.read_csv, so it is passed as a DataFrame rather than a 1-D array.
xgb_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [43]:
# Serialize the fitted pipeline (preprocessor + model) to disk with joblib
from joblib import dump
dump(xgb_pipeline, '../models/xgb_pipeline.joblib')

['../models/xgb_pipeline.joblib']

## Evaluating Model

In [44]:
# Predict on the training and validation splits with the fitted pipeline
preds_train = xgb_pipeline.predict(X_train)
preds_val = xgb_pipeline.predict(X_val)


In [45]:
# Predict on the test split with the same fitted pipeline
preds_test = xgb_pipeline.predict(X_test)

In [46]:
from my_krml_ratana.models.performance import print_regressor_scores

In [47]:
# Report the regression scores for the training split first, then the validation split
for split_name, split_preds, split_actuals in (
    ('Training', preds_train, y_train),
    ('Validating', preds_val, y_val),
):
    print_regressor_scores(y_preds=split_preds, y_actuals=split_actuals, set_name=split_name)

RMSE Training: 85.38433129097044
MAE Training: 57.37425445618766
RMSE Validating: 85.91520760783447
MAE Validating: 57.6430764663865


In [48]:
# Report the regression scores for the test split
print_regressor_scores(y_preds=preds_test, y_actuals=y_test, set_name='Testing')

RMSE Testing: 86.21781490111088
MAE Testing: 57.6862820508008


## Test app input

In [49]:
# Route lookup table; get_distance filters it on startingAirport / destinationAirport
# and reads the medianTravelDistance column.
median_traveldistance = pd.read_csv('../data/external/median_travel_distance.csv')

In [50]:
#Match median travel distance
def get_distance(median_traveldistance, start, destination):
    """
    Look up the median travel distance for a route.

    Parameters
    ----------
    median_traveldistance : pandas.DataFrame
        Lookup table with the columns 'startingAirport', 'destinationAirport'
        and 'medianTravelDistance'.
    start : str
        Value to match against the 'startingAirport' column.
    destination : str
        Value to match against the 'destinationAirport' column.

    Returns
    -------
    The 'medianTravelDistance' value of the first matching row, or None when
    no row matches the (start, destination) pair.
    """
    route_mask = (
        (median_traveldistance['startingAirport'] == start)
        & (median_traveldistance['destinationAirport'] == destination)
    )
    matches = median_traveldistance.loc[route_mask, 'medianTravelDistance']

    # Boolean filtering never yields None; an unknown route yields an EMPTY
    # selection, so emptiness is what must be checked before indexing.
    if matches.empty:
        return None
    return matches.values[0]

In [51]:
### RATANA PROCESS FLIGHT DATA ###
def ratana_process_flight_data(
    date: str,
    time: str,
    startingAirport: str,
    destinationAirport: str,
    n_stops: int,
    Cabin_Leg1: str,
    Cabin_Leg2: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg3: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg4: str = "no_stop" # Set default parameter for no stop
):
    """
    Convert raw flight-search inputs into a single-row feature DataFrame.

    Reads the module-level ``median_traveldistance`` table (via ``get_distance``)
    and the current wall-clock time (``datetime.now()``), so the ``date_diff``
    feature depends on when the function is called.

    Parameters
    ----------
    date : str
        Departure date in ``YYYY-MM-DD`` format.
    time : str
        Departure time in dash-separated ``HH-MM-SS`` format (e.g. ``"15-30-00"``).
    startingAirport, destinationAirport : str
        Airport codes used as features and as the route key for the distance lookup.
    n_stops : int
        Number of stops. For 0, 1 or 2 stops the unused trailing cabin legs are
        forced to ``"no_stop"`` regardless of what was passed in.
    Cabin_Leg1 .. Cabin_Leg4 : str
        Cabin type per leg; legs 2-4 default to ``"no_stop"``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns month, day, hour, minute, day_of_week,
        week_of_year, date_diff, startingAirport, destinationAirport, n_stops,
        medianTravelDistance, Cabin_Leg1..Cabin_Leg4.

    Raises
    ------
    ValueError
        If ``date``/``time`` do not match the expected formats.
    """

    # Check and adjust cabin types based on the number of stops
    if n_stops == 0:
        Cabin_Leg2 = Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 1:
        Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 2:
        Cabin_Leg4 = "no_stop"

    # Parse the departure timestamp from the separate date and time strings
    combined_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H-%M-%S")

    # Use the current date as the reference flight date
    reference_flight_date = datetime.now()

    # Whole days between now and departure (timedelta.days floors partial days)
    date_diff = (combined_datetime - reference_flight_date).days

    # Get the travel distance based on airport combinations
    travel_distance = get_distance(median_traveldistance, startingAirport, destinationAirport)

    # Create the feature dictionary
    features = {
        "month": combined_datetime.month,
        "day": combined_datetime.day,
        "hour": combined_datetime.hour,
        "minute": combined_datetime.minute,
        "day_of_week": combined_datetime.weekday(),  # 0 = Monday ... 6 = Sunday
        "week_of_year": combined_datetime.isocalendar()[1],  # ISO week number
        "date_diff": date_diff,
        "startingAirport": startingAirport,
        "destinationAirport": destinationAirport,
        "n_stops": n_stops,
        "medianTravelDistance": travel_distance,
        "Cabin_Leg1": Cabin_Leg1,
        "Cabin_Leg2": Cabin_Leg2,
        "Cabin_Leg3": Cabin_Leg3,
        "Cabin_Leg4": Cabin_Leg4
    }
    # Convert the features dictionary into a Dataframe
    features_df = pd.DataFrame([features])

    # Specify the desired data types
    features_df = features_df.astype({
        "month": "int64",
        "day": "int64",
        "hour": "int64",
        "minute": "int64",
        "day_of_week": "int64",
        "week_of_year": "int64",
        "date_diff": "int64",
        "startingAirport": "string",
        "destinationAirport": "string",
        "n_stops": "int64",
        # Must be a dtype, not the looked-up value: passing the distance itself
        # only worked because a NumPy scalar happens to expose `.dtype`.
        "medianTravelDistance": "float64",
        "Cabin_Leg1": "string",
        "Cabin_Leg2": "string",
        "Cabin_Leg3": "string",
        "Cabin_Leg4": "string"
    })
    return features_df



In [52]:
# Example usage
flight_data = ratana_process_flight_data(
    date="2024-11-25",
    time="15-30-00",
    startingAirport="ATL",
    destinationAirport="LAX",
    n_stops=0,  # non-stop flight, so only Cabin_Leg1 applies (legs 2-4 become "no_stop")
    Cabin_Leg1="coach"
)


In [53]:
# Display the single-row feature frame built by ratana_process_flight_data
flight_data

Unnamed: 0,month,day,hour,minute,day_of_week,week_of_year,date_diff,startingAirport,destinationAirport,n_stops,medianTravelDistance,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4
0,11,25,15,30,0,48,17,ATL,LAX,0,2034.0,coach,no_stop,no_stop,no_stop


In [54]:
# Run the fitted pipeline on the app-style input row
xgb_pipeline.predict(flight_data)

array([529.8064], dtype=float32)

In [55]:
from datetime import datetime