In [2]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the processed feature (X_*) and target (y_*) splits from CSV.
# The files are read in the same order as the names on the left-hand side.
X_train, X_val, X_test, y_train, y_val, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_train.csv',
        '../data/processed/X_val.csv',
        '../data/processed/X_test.csv',
        '../data/processed/y_train.csv',
        '../data/processed/y_val.csv',
        '../data/processed/y_test.csv',
    )
)


In [4]:
# Category ranking for the ordinal encoding of the cabin columns, lowest to highest.
# One independent copy of the ranking per column: Cabin_Leg1 .. Cabin_Leg4.
cabin_order = [
    ['no_stop', 'coach', 'premium coach', 'business', 'first']
    for _ in range(4)
]


In [8]:
from sklearn.compose import ColumnTransformer

In [18]:
# Define preprocessing step: each transformer is applied to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal-encode the four 'Cabin' columns using the explicit ranking in cabin_order
        ('cabins', OrdinalEncoder(categories=cabin_order), ['Cabin_Leg1', 'Cabin_Leg2', 'Cabin_Leg3', 'Cabin_Leg4']),
        # One-hot encode the starting and destination airports
        ('ohe', OneHotEncoder(), ['startingAirport', 'destinationAirport']),
        # Standardize the listed numeric columns
        ('standard', StandardScaler(), ['date_diff', 'month', 'day', 'hour', 'minute', 'day_of_week', 'week_of_year', 'medianTravelDistance']),
    ],
    remainder='passthrough'  # keep any other columns not specified above, unchanged (they are NOT dropped)
)

In [27]:
from joblib import dump

In [29]:
# Fit the preprocessor on the training features before persisting it.
# When the notebook is run top to bottom, the pipeline that fits this object is
# only trained in a later cell, so without this explicit fit the saved file
# would contain an unfitted transformer that cannot be used for transform().
preprocessor.fit(X_train)

# Save (dump) the fitted preprocessor to a file
dump(preprocessor, '../models/preprocessor/preprocessor.pkl')

['../models/preprocessor/preprocessor.pkl']

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
# Create a pipeline that chains the preprocessor with a linear regression model:
#   1. 'preprocessor' - transform the input features
#   2. 'model'        - fit / predict with LinearRegression
linear_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('model', LinearRegression())]
)


In [21]:
# Train the full pipeline (preprocessing + regression) on the training split
linear_pipeline.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [22]:
# Predict on the training, validation and test splits, in that order
preds_train, preds_val, preds_test = (
    linear_pipeline.predict(features)
    for features in (X_train, X_val, X_test)
)

In [23]:
from my_krml_ratana.models.performance import print_regressor_scores

In [24]:
# Report RMSE and MAE on the training split
print_regressor_scores(
    set_name='Training',
    y_actuals=y_train,
    y_preds=preds_train,
)

RMSE Training: 137.99953365888217
MAE Training: 97.677377575661


In [26]:
# Report RMSE and MAE on the validation split
print_regressor_scores(
    set_name='Validating',
    y_actuals=y_val,
    y_preds=preds_val,
)

RMSE Validating: 138.10956163415597
MAE Validating: 97.71144375128803


In [25]:
# Report RMSE and MAE on the test split
print_regressor_scores(
    set_name='Testing',
    y_actuals=y_test,
    y_preds=preds_test,
)

RMSE Testing: 138.3562452097703
MAE Testing: 97.73302373681766
