In [1]:
pip install -i https://test.pypi.org/simple/ my_krml_ratana

Looking in indexes: https://test.pypi.org/simple/Note: you may need to restart the kernel to use updated packages.



In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split


## Import Dataset

In [3]:
df_1 = pd.read_csv('../data/interim/df_set_1.csv')

In [4]:
df_1

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,segmentsDepartureTimeRaw,segmentsCabinCode,totalFare
0,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T12:57:00.000-04:00,coach,248.6
1,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T06:30:00.000-04:00,coach,248.6
2,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T11:35:00.000-04:00,coach,248.6
3,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T13:59:00.000-04:00,coach,248.6
4,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T09:59:00.000-04:00,coach,248.6
...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T09:45:00.000-06:00||2022-06-04T12:4...,coach||coach,506.6
3488205,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T20:1...,coach||coach,562.2
3488206,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T21:3...,coach||coach,562.2
3488207,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T15:4...,coach||coach,586.6


## Explore Dataset

In [5]:
df_1['segmentsCabinCode'].unique()

array(['coach', 'coach||coach', 'coach||coach||coach||coach',
       'coach||coach||coach', 'first', 'first||coach', 'coach||first',
       'premium coach||coach', 'premium coach',
       'premium coach||premium coach', 'coach||premium coach',
       'coach||coach||premium coach', 'coach||business',
       'premium coach||coach||coach', 'first||coach||coach',
       'coach||coach||coach||premium coach', 'first||first',
       'premium coach||premium coach||coach',
       'coach||coach||coach||first', 'coach||coach||business',
       'business||coach', 'coach||first||first', 'coach||coach||first',
       'first||coach||first', 'coach||first||coach',
       'business||coach||coach', 'coach||business||business',
       'coach||premium coach||coach', 'coach||business||coach',
       'business||business||coach', 'business', 'business||first',
       'first||first||first', 'premium coach||business||coach',
       'business||business', 'first||first||coach', 'first||business',
       'first||

## Split segments

In [6]:
df_copy = df_1.copy()

In [7]:
# Split segmentsDepartureTimeRaw to get the first leg depature time

segments = df_copy['segmentsDepartureTimeRaw'].str.split(r'\|\|', n=1, expand=True)


In [8]:
segments

Unnamed: 0,0,1
0,2022-04-17T12:57:00.000-04:00,
1,2022-04-17T06:30:00.000-04:00,
2,2022-04-17T11:35:00.000-04:00,
3,2022-04-17T13:59:00.000-04:00,
4,2022-04-17T09:59:00.000-04:00,
...,...,...
3488204,2022-06-04T09:45:00.000-06:00,2022-06-04T12:45:00.000-07:00
3488205,2022-06-04T12:25:00.000-06:00,2022-06-04T20:10:00.000-07:00
3488206,2022-06-04T12:25:00.000-06:00,2022-06-04T21:35:00.000-07:00
3488207,2022-06-04T12:25:00.000-06:00,2022-06-04T15:45:00.000-07:00


In [9]:
df_copy['DepatureTime'] = segments[0]

In [10]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,segmentsDepartureTimeRaw,segmentsCabinCode,totalFare,DepatureTime
0,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T12:57:00.000-04:00,coach,248.6,2022-04-17T12:57:00.000-04:00
1,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T06:30:00.000-04:00,coach,248.6,2022-04-17T06:30:00.000-04:00
2,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T11:35:00.000-04:00,coach,248.6,2022-04-17T11:35:00.000-04:00
3,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T13:59:00.000-04:00,coach,248.6,2022-04-17T13:59:00.000-04:00
4,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T09:59:00.000-04:00,coach,248.6,2022-04-17T09:59:00.000-04:00
...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T09:45:00.000-06:00||2022-06-04T12:4...,coach||coach,506.6,2022-06-04T09:45:00.000-06:00
3488205,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T20:1...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00
3488206,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T21:3...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00
3488207,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T15:4...,coach||coach,586.6,2022-06-04T12:25:00.000-06:00


In [11]:
# Split different segmentsCabinCode
segments = df_copy['segmentsCabinCode'].str.split(r'\|\|', n=4, expand=True)


In [12]:
segments

Unnamed: 0,0,1,2,3
0,coach,,,
1,coach,,,
2,coach,,,
3,coach,,,
4,coach,,,
...,...,...,...,...
3488204,coach,coach,,
3488205,coach,coach,,
3488206,coach,coach,,
3488207,coach,coach,,


In [13]:
df_copy[['Cabin_Leg1', 'Cabin_Leg2','Cabin_Leg3', 'Cabin_Leg4']] = segments

In [14]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,segmentsDepartureTimeRaw,segmentsCabinCode,totalFare,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4
0,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T12:57:00.000-04:00,coach,248.6,2022-04-17T12:57:00.000-04:00,coach,,,
1,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T06:30:00.000-04:00,coach,248.6,2022-04-17T06:30:00.000-04:00,coach,,,
2,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T11:35:00.000-04:00,coach,248.6,2022-04-17T11:35:00.000-04:00,coach,,,
3,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T13:59:00.000-04:00,coach,248.6,2022-04-17T13:59:00.000-04:00,coach,,,
4,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T09:59:00.000-04:00,coach,248.6,2022-04-17T09:59:00.000-04:00,coach,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T09:45:00.000-06:00||2022-06-04T12:4...,coach||coach,506.6,2022-06-04T09:45:00.000-06:00,coach,coach,,
3488205,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T20:1...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,,
3488206,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T21:3...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,,
3488207,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T15:4...,coach||coach,586.6,2022-06-04T12:25:00.000-06:00,coach,coach,,


In [15]:
# Create Number of Stops feature
stop_counts = df_copy['segmentsCabinCode'].str.count(r'\|\|')

In [16]:
df_copy['n_stops'] = stop_counts

In [17]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,segmentsDepartureTimeRaw,segmentsCabinCode,totalFare,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops
0,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T12:57:00.000-04:00,coach,248.6,2022-04-17T12:57:00.000-04:00,coach,,,,0
1,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T06:30:00.000-04:00,coach,248.6,2022-04-17T06:30:00.000-04:00,coach,,,,0
2,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T11:35:00.000-04:00,coach,248.6,2022-04-17T11:35:00.000-04:00,coach,,,,0
3,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T13:59:00.000-04:00,coach,248.6,2022-04-17T13:59:00.000-04:00,coach,,,,0
4,ATL,BOS,2022-04-17,2022-04-16,2022-04-17T09:59:00.000-04:00,coach,248.6,2022-04-17T09:59:00.000-04:00,coach,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T09:45:00.000-06:00||2022-06-04T12:4...,coach||coach,506.6,2022-06-04T09:45:00.000-06:00,coach,coach,,,1
3488205,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T20:1...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,,,1
3488206,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T21:3...,coach||coach,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,,,1
3488207,DEN,SFO,2022-06-04,2022-05-19,2022-06-04T12:25:00.000-06:00||2022-06-04T15:4...,coach||coach,586.6,2022-06-04T12:25:00.000-06:00,coach,coach,,,1


In [18]:
# Replace None with 'no_stop'
df_copy['Cabin_Leg1'] = df_copy['Cabin_Leg1'].fillna('no_stop')
df_copy['Cabin_Leg2'] = df_copy['Cabin_Leg2'].fillna('no_stop')
df_copy['Cabin_Leg3'] = df_copy['Cabin_Leg3'].fillna('no_stop')
df_copy['Cabin_Leg4'] = df_copy['Cabin_Leg4'].fillna('no_stop')

In [19]:
# Drop the segmentsDepartureTimeRaw and segmentsCabinCode
df_copy = df_copy.drop(['segmentsDepartureTimeRaw', 'segmentsCabinCode'], axis = 1)

In [20]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,totalFare,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops
0,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17T12:57:00.000-04:00,coach,no_stop,no_stop,no_stop,0
1,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17T06:30:00.000-04:00,coach,no_stop,no_stop,no_stop,0
2,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17T11:35:00.000-04:00,coach,no_stop,no_stop,no_stop,0
3,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17T13:59:00.000-04:00,coach,no_stop,no_stop,no_stop,0
4,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17T09:59:00.000-04:00,coach,no_stop,no_stop,no_stop,0
...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,506.6,2022-06-04T09:45:00.000-06:00,coach,coach,no_stop,no_stop,1
3488205,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1
3488206,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1
3488207,DEN,SFO,2022-06-04,2022-05-19,586.6,2022-06-04T12:25:00.000-06:00,coach,coach,no_stop,no_stop,1


## Feature Engineering

In [21]:
# Convert DepatureTime to usable features
df_copy['DepatureTime'] = pd.to_datetime(df_copy['DepatureTime'], utc = True)

In [22]:
df_copy['DepatureTime']

0         2022-04-17 16:57:00+00:00
1         2022-04-17 10:30:00+00:00
2         2022-04-17 15:35:00+00:00
3         2022-04-17 17:59:00+00:00
4         2022-04-17 13:59:00+00:00
                     ...           
3488204   2022-06-04 15:45:00+00:00
3488205   2022-06-04 18:25:00+00:00
3488206   2022-06-04 18:25:00+00:00
3488207   2022-06-04 18:25:00+00:00
3488208   2022-06-04 21:42:00+00:00
Name: DepatureTime, Length: 3488209, dtype: datetime64[ns, UTC]

In [23]:
# Extract features from the 'DepatureTime' column
df_copy['month'] = df_copy['DepatureTime'].dt.month
df_copy['day'] = df_copy['DepatureTime'].dt.day
df_copy['hour'] = df_copy['DepatureTime'].dt.hour
df_copy['minute'] = df_copy['DepatureTime'].dt.minute
df_copy['day_of_week'] = df_copy['DepatureTime'].dt.dayofweek  # Monday=0, Sunday=6
df_copy['week_of_year'] = df_copy['DepatureTime'].dt.isocalendar().week  # Week of the year

In [24]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,totalFare,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year
0,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 16:57:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15
1,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 10:30:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15
2,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 15:35:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15
3,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 17:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15
4,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 13:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,506.6,2022-06-04 15:45:00+00:00,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22
3488205,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22
3488206,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22
3488207,DEN,SFO,2022-06-04,2022-05-19,586.6,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22


### Create days between search and flight feature

In [25]:
df_copy['flightDate'] = pd.to_datetime(df_copy['flightDate'])
df_copy['searchDate'] = pd.to_datetime(df_copy['searchDate'])

In [26]:
df_copy['date_diff'] = (df_copy['flightDate'] - df_copy['searchDate']).dt.days

In [27]:
X = df_copy.drop(['totalFare', 'flightDate', 'searchDate', 'DepatureTime'], axis = 1)

In [28]:
y = df_copy['totalFare']

In [29]:
X

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff
0,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1
1,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1
2,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1
3,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1
4,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16
3488205,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488206,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488207,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16


In [30]:
y

0          248.6
1          248.6
2          248.6
3          248.6
4          248.6
           ...  
3488204    506.6
3488205    562.2
3488206    562.2
3488207    586.6
3488208    636.6
Name: totalFare, Length: 3488209, dtype: float64

## Scaling and Pipeline

In [31]:
'''
from sklearn.base import BaseEstimator, TransformerMixin

# Custom Transformer for DateTime Features
class DateTimeEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()  # Avoid modifying the original dataframe

        # Cyclical encoding for month, day, hour, day_of_week, and week_of_year
        X['month_sin'] = np.sin(2 * np.pi * X['month'] / 12)
        X['month_cos'] = np.cos(2 * np.pi * X['month'] / 12)

        X['day_sin'] = np.sin(2 * np.pi * X['day'] / 31)
        X['day_cos'] = np.cos(2 * np.pi * X['day'] / 31)

        X['hour_sin'] = np.sin(2 * np.pi * X['hour'] / 24)
        X['hour_cos'] = np.cos(2 * np.pi * X['hour'] / 24)

        X['minute_scaled'] = X['minute'] / 60

        X['day_of_week_sin'] = np.sin(2 * np.pi * X['day_of_week'] / 7)
        X['day_of_week_cos'] = np.cos(2 * np.pi * X['day_of_week'] / 7)

        X['week_of_year_sin'] = np.sin(2 * np.pi * X['week_of_year'] / 52)
        X['week_of_year_cos'] = np.cos(2 * np.pi * X['week_of_year'] / 52)

        # Drop original columns
        X = X.drop(['month', 'day', 'hour', 'minute', 'day_of_week', 'week_of_year'], axis=1)
        
        return X
'''

"\nfrom sklearn.base import BaseEstimator, TransformerMixin\n\n# Custom Transformer for DateTime Features\nclass DateTimeEncoder(BaseEstimator, TransformerMixin):\n    def __init__(self):\n        pass\n    \n    def fit(self, X, y=None):\n        return self\n    \n    def transform(self, X):\n        X = X.copy()  # Avoid modifying the original dataframe\n\n        # Cyclical encoding for month, day, hour, day_of_week, and week_of_year\n        X['month_sin'] = np.sin(2 * np.pi * X['month'] / 12)\n        X['month_cos'] = np.cos(2 * np.pi * X['month'] / 12)\n\n        X['day_sin'] = np.sin(2 * np.pi * X['day'] / 31)\n        X['day_cos'] = np.cos(2 * np.pi * X['day'] / 31)\n\n        X['hour_sin'] = np.sin(2 * np.pi * X['hour'] / 24)\n        X['hour_cos'] = np.cos(2 * np.pi * X['hour'] / 24)\n\n        X['minute_scaled'] = X['minute'] / 60\n\n        X['day_of_week_sin'] = np.sin(2 * np.pi * X['day_of_week'] / 7)\n        X['day_of_week_cos'] = np.cos(2 * np.pi * X['day_of_week'] / 

In [32]:
from sklearn.compose import ColumnTransformer

In [33]:
from sklearn.linear_model import LinearRegression

In [34]:
# Define the order of categories
# Define the order of categories for each cabin type
cabin_order = [
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg1
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg2
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg3
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg4
]


In [35]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [36]:
# Define preprocessing step
preprocessor = ColumnTransformer(
    transformers=[
        ('cabins', OrdinalEncoder(categories=cabin_order), ['Cabin_Leg1', 'Cabin_Leg2', 'Cabin_Leg3', 'Cabin_Leg4']), # Ordinal Encode the 'Cabin' columns
        ('ohe', OneHotEncoder(), ['startingAirport', 'destinationAirport']), # OneHotEncode starting and destination
        ('standard', StandardScaler(), ['date_diff', 'month', 'day','hour','minute', 'day_of_week','week_of_year'])
    ],
    remainder='passthrough'  # drop any other columns not specified
)

In [37]:
# Create a pipline with the preprcessor and model
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), # Step 1: preprocess features
    ('model', LinearRegression()), # Step 2: train model
])


## Data Splitting

In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Train Model

In [39]:
# Fit the pipeline on the training data
linear_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [40]:
# Make predictions on the test data
predictions = linear_pipeline.predict(X_test)


In [41]:
predictions

array([211.14104748, 231.42535305, 319.45656776, ..., 270.98466778,
       312.46115685, 396.53018475])

## Model Assessment 

### Base Model 

In [42]:
from my_krml_ratana.models.null import NullRegressor

In [43]:
base_model = NullRegressor()

In [44]:
y_base = base_model.fit_predict(y_train)

In [45]:
from my_krml_ratana.models.performance import print_regressor_scores

print_regressor_scores(y_preds=y_base, y_actuals=y_train, set_name='Base')

RMSE Base: 190.0412716469115
MAE Base: 140.90661793500814


### Assess Linear model

In [46]:
preds_train = linear_pipeline.predict(X_train)
preds_test = linear_pipeline.predict(X_test)

In [47]:
print_regressor_scores(y_preds=preds_train, y_actuals=y_train, set_name='Training')

RMSE Training: 148.8058316095679
MAE Training: 106.69453555223518


In [48]:
print_regressor_scores(y_preds=preds_test, y_actuals=y_test, set_name='Test')

RMSE Test: 148.36740808566228
MAE Test: 106.64128194071712


## Save Model

In [49]:
from joblib import dump

In [50]:
# Save basic model for deployment
dump(linear_pipeline,  '../models/linear_reg_pipeline1.joblib')

['../models/linear_reg_pipeline1.joblib']

## Test Sample obs for APP

In [51]:
from datetime import datetime
from typing import Dict

In [52]:
# Test potential input

def process_flight_data(
    date: str,
    time: str,
    startingAirport: str,
    destinationAirport: str,
    n_stops: int,
    Cabin_Leg1: str,
    Cabin_Leg2: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg3: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg4: str = "no_stop" # Set default parameter for no stop
):
    """
    Function to process flight data to inputs for modelling
    """
    
    # Check and adjust cabin types based on the number of stops
    if n_stops == 0:
        Cabin_Leg2 = Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 1:
        Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 2:
        Cabin_Leg4 = "no_stop"
    
    # Extract date and time features (simplified here)
    combined_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H:%M:%S")

    # Use the current date as the reference flight date
    reference_flight_date = datetime.now()

    # Calculate the date_diff
    date_diff = (combined_datetime - reference_flight_date).days

    # Create the feature dictionary
    features = {
        "month": combined_datetime.month,
        "day": combined_datetime.day,
        "hour": combined_datetime.hour,
        "minute": combined_datetime.minute,
        "day_of_week": combined_datetime.weekday(),  # e.g., Monday
        "week_of_year": combined_datetime.isocalendar()[1],  # ISO week number
        "date_diff": date_diff,
        "startingAirport": startingAirport,
        "destinationAirport": destinationAirport,
        "n_stops": n_stops,
        "Cabin_Leg1": Cabin_Leg1,
        "Cabin_Leg2": Cabin_Leg2,
        "Cabin_Leg3": Cabin_Leg3,
        "Cabin_Leg4": Cabin_Leg4
    }
    # Convert the features dictionary into a Dataframe
    features_df = pd.DataFrame([features])

    # Specify the desired data types
    features_df = features_df.astype({
        "month": "int64",
        "day": "int64",
        "hour": "int64",
        "minute": "int64",
        "day_of_week": "int64",
        "week_of_year": "int64",
        "date_diff": "int64",
        "startingAirport": "string",
        "destinationAirport": "string",
        "n_stops": "int64",
        "Cabin_Leg1": "string",
        "Cabin_Leg2": "string",
        "Cabin_Leg3": "string",
        "Cabin_Leg4": "string"
    })
    return features_df




In [53]:
# Example usage
flight_data = process_flight_data(
    date="2024-11-25",
    time="15:30:00",
    startingAirport="ATL",
    destinationAirport="LAX",
    n_stops=0,  # 2 stops, so 3 legs
    Cabin_Leg1="coach"
)



In [54]:
flight_data

Unnamed: 0,month,day,hour,minute,day_of_week,week_of_year,date_diff,startingAirport,destinationAirport,n_stops,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4
0,11,25,15,30,0,48,24,ATL,LAX,0,coach,no_stop,no_stop,no_stop


In [55]:
result = linear_pipeline.predict(flight_data)

In [56]:
result

array([753.47974968])

In [57]:
X

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff
0,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1
1,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1
2,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1
3,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1
4,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16
3488205,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488206,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488207,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16


In [58]:
X_test

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff
1299189,BOS,EWR,coach,no_stop,no_stop,no_stop,0,5,31,22,15,1,22,32
1719162,BOS,MIA,coach,no_stop,no_stop,no_stop,0,5,21,20,22,5,20,10
3423406,DEN,LGA,coach,coach,no_stop,no_stop,1,6,14,19,25,1,24,32
663927,ATL,MIA,coach,coach,no_stop,no_stop,1,7,2,17,40,5,26,56
1108789,BOS,EWR,coach,coach,no_stop,no_stop,1,5,21,18,28,5,20,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880843,CLT,ATL,coach,no_stop,no_stop,no_stop,0,5,6,11,36,4,18,19
710873,ATL,BOS,coach,coach,no_stop,no_stop,1,5,28,17,40,5,21,19
1197849,BOS,ATL,coach,coach,no_stop,no_stop,1,5,16,18,28,0,20,20
1951057,CLT,PHL,coach,coach,no_stop,no_stop,1,5,20,11,20,4,20,31


In [59]:
y_test

1299189     43.60
1719162     88.60
3423406    579.01
663927     701.60
1108789    360.10
            ...  
1880843    197.10
710873     442.19
1197849    126.10
1951057    405.10
303806     384.60
Name: totalFare, Length: 697642, dtype: float64

In [60]:
X

Unnamed: 0,startingAirport,destinationAirport,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff
0,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1
1,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1
2,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1
3,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1
4,ATL,BOS,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16
3488205,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488206,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488207,DEN,SFO,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16


In [61]:
df_copy

Unnamed: 0,startingAirport,destinationAirport,flightDate,searchDate,totalFare,DepatureTime,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4,n_stops,month,day,hour,minute,day_of_week,week_of_year,date_diff
0,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 16:57:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,16,57,6,15,1
1,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 10:30:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,10,30,6,15,1
2,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 15:35:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,15,35,6,15,1
3,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 17:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,17,59,6,15,1
4,ATL,BOS,2022-04-17,2022-04-16,248.6,2022-04-17 13:59:00+00:00,coach,no_stop,no_stop,no_stop,0,4,17,13,59,6,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488204,DEN,SFO,2022-06-04,2022-05-19,506.6,2022-06-04 15:45:00+00:00,coach,coach,no_stop,no_stop,1,6,4,15,45,5,22,16
3488205,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488206,DEN,SFO,2022-06-04,2022-05-19,562.2,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16
3488207,DEN,SFO,2022-06-04,2022-05-19,586.6,2022-06-04 18:25:00+00:00,coach,coach,no_stop,no_stop,1,6,4,18,25,5,22,16


In [3]:
from datetime import datetime
# Import packages
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

In [28]:
median_traveldistance = pd.read_csv('../data/external/median_travel_distance.csv')

In [5]:
#Match median travel distance
def get_distance(median_traveldistance, start, destination):
    df = median_traveldistance[(median_traveldistance['startingAirport']==start) &
                               (median_traveldistance['destinationAirport']==destination)]

    if df is not None:
        distance = df['medianTravelDistance'].values[0]
        return distance
    return None

In [29]:
get_distance(median_traveldistance, 'ATL','BOS')

np.float64(947.0)

In [6]:
### RATANA PROCESS FLIGHT DATA ###
def ratana_process_flight_data(
    date: str,
    time: str,
    startingAirport: str,
    destinationAirport: str,
    n_stops: int,
    Cabin_Leg1: str,
    Cabin_Leg2: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg3: str = "no_stop", # Set default parameter for no stop
    Cabin_Leg4: str = "no_stop" # Set default parameter for no stop
):
    """
    Function to process flight data to inputs for modelling
    """
    
    # Check and adjust cabin types based on the number of stops
    if n_stops == 0:
        Cabin_Leg2 = Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 1:
        Cabin_Leg3 = Cabin_Leg4 = "no_stop"
    elif n_stops == 2:
        Cabin_Leg4 = "no_stop"
    
    # Extract date and time features (simplified here)
    combined_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H-%M-%S")

    # Use the current date as the reference flight date
    reference_flight_date = datetime.now()

    # Calculate the date_diff
    date_diff = (combined_datetime - reference_flight_date).days

    # Get the travel distance based on airport combinations
    travel_distance = get_distance(median_traveldistance, startingAirport, destinationAirport)

    # Create the feature dictionary
    features = {
        "month": combined_datetime.month,
        "day": combined_datetime.day,
        "hour": combined_datetime.hour,
        "minute": combined_datetime.minute,
        "day_of_week": combined_datetime.weekday(),  # e.g., Monday
        "week_of_year": combined_datetime.isocalendar()[1],  # ISO week number
        "date_diff": date_diff,
        "startingAirport": startingAirport,
        "destinationAirport": destinationAirport,
        "n_stops": n_stops,
        "medianTravelDistance": travel_distance,
        "Cabin_Leg1": Cabin_Leg1,
        "Cabin_Leg2": Cabin_Leg2,
        "Cabin_Leg3": Cabin_Leg3,
        "Cabin_Leg4": Cabin_Leg4
    }
    # Convert the features dictionary into a Dataframe
    features_df = pd.DataFrame([features])

    # Specify the desired data types
    features_df = features_df.astype({
        "month": "int64",
        "day": "int64",
        "hour": "int64",
        "minute": "int64",
        "day_of_week": "int64",
        "week_of_year": "int64",
        "date_diff": "int64",
        "startingAirport": "string",
        "destinationAirport": "string",
        "n_stops": "int64",
        "medianTravelDistance": travel_distance,
        "Cabin_Leg1": "string",
        "Cabin_Leg2": "string",
        "Cabin_Leg3": "string",
        "Cabin_Leg4": "string"
        
    })
    return features_df



In [30]:
# Example function call with sample data
date = "2024-11-10"  # Example future date
time = "14-30-00"  # Example time
startingAirport = "ATL"
destinationAirport = "BOS"
n_stops = 1
cabin_Leg1 = "Economy"
cabin_Leg2 = "Business"
cabin_Leg3 = "no_stop"
cabin_Leg4 = "no_stop"

In [26]:
flight_data

Unnamed: 0,month,day,hour,minute,day_of_week,week_of_year,date_diff,startingAirport,destinationAirport,n_stops,medianTravelDistance,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4
0,11,25,15,30,0,48,17,ATL,LAX,0,2034.0,coach,no_stop,no_stop,no_stop


In [31]:
obs = ratana_process_flight_data(date, time, startingAirport, destinationAirport, n_stops,
                                 cabin_Leg1, cabin_Leg2, cabin_Leg3, cabin_Leg4)

        

In [32]:
obs

Unnamed: 0,month,day,hour,minute,day_of_week,week_of_year,date_diff,startingAirport,destinationAirport,n_stops,medianTravelDistance,Cabin_Leg1,Cabin_Leg2,Cabin_Leg3,Cabin_Leg4
0,11,10,14,30,6,45,2,ATL,BOS,1,947.0,Economy,Business,no_stop,no_stop


In [33]:
obs.dtypes

month                            int64
day                              int64
hour                             int64
minute                           int64
day_of_week                      int64
week_of_year                     int64
date_diff                        int64
startingAirport         string[python]
destinationAirport      string[python]
n_stops                          int64
medianTravelDistance           float64
Cabin_Leg1              string[python]
Cabin_Leg2              string[python]
Cabin_Leg3              string[python]
Cabin_Leg4              string[python]
dtype: object

In [10]:
from joblib import load

In [11]:
xgb_pipeline = load('../models/xgb_pipeline.joblib')

In [17]:
answer = xgb_pipeline.predict(flight_data).tolist()

In [18]:
answer

[529.806396484375]