## 1. Import Libraries

In [5]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting numpy (from xgboost)
  Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-2.1.1-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, xgboost
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.2

In [6]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.1-py2.py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.1/364.1 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.1


In [8]:
import os

import boto3

import pickle

import xgboost as xgb

import warnings

import numpy as np

import pandas as pd

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder,
    StandardScaler
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
    )
import matplotlib.pyplot as plt

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter, 
    HyperparameterTuner
)


Matplotlib is building the font cache; this may take a moment.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## 2. Display Settings

In [9]:
pd.set_option('display.max_columns', None) # display all the columns

In [10]:
 # skicit transformer by default return numpy array, so if want output as dataframe below code will be done
sklearn.set_config(transform_output='pandas')

In [11]:
warnings.filterwarnings('ignore')

## 3. Read Datasets

In [12]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-18,Banglore,Delhi,17:00:00,19:50:00,170,0.0,No Info,6961
1,Jet Airways,2019-04-24,Banglore,Delhi,08:20:00,11:20:00,180,0.0,In-flight meal not included,4544
2,Jet Airways,2019-05-03,Banglore,Delhi,19:50:00,22:50:00,180,0.0,No Info,7229
3,Air India,2019-06-15,Banglore,Delhi,17:00:00,19:45:00,165,0.0,No Info,7171
4,Air India,2019-04-09,Delhi,Cochin,07:00:00,19:15:00,735,1.0,No Info,6674
...,...,...,...,...,...,...,...,...,...,...
635,Indigo,2019-05-03,Chennai,Kolkata,14:45:00,17:05:00,140,0.0,No Info,3858
636,Multiple Carriers,2019-03-21,Delhi,Cochin,11:40:00,21:00:00,560,1.0,No Info,7215
637,Indigo,2019-06-03,Mumbai,Hyderabad,12:00:00,13:30:00,90,0.0,No Info,2754
638,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093


In [13]:
val = pd.read_csv('val.csv')
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-01,Banglore,Delhi,08:30:00,11:20:00,170,0.0,No Info,4823
1,Jet Airways,2019-06-03,Delhi,Cochin,19:45:00,19:00:00,1395,1.0,In-flight meal not included,10262
2,Indigo,2019-04-15,Banglore,Delhi,10:10:00,13:00:00,170,0.0,No Info,4423
3,Indigo,2019-05-06,Kolkata,Banglore,17:35:00,22:15:00,280,1.0,No Info,5699
4,Spicejet,2019-06-06,Chennai,Kolkata,08:20:00,10:35:00,135,0.0,No check-in baggage included,3543
...,...,...,...,...,...,...,...,...,...,...
155,Air Asia,2019-03-21,Delhi,Cochin,20:10:00,07:10:00,660,1.0,No Info,6151
156,Air India,2019-05-27,Delhi,Cochin,07:00:00,19:15:00,735,1.0,No Info,9929
157,Multiple Carriers,2019-06-09,Delhi,Cochin,11:25:00,19:00:00,455,1.0,No Info,11200
158,Jet Airways,2019-03-21,Banglore,New Delhi,11:40:00,21:20:00,580,1.0,In-flight meal not included,7832


In [14]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-09,Kolkata,Banglore,18:55:00,04:40:00,585,1.0,In-flight meal not included,9663
1,Air India,2019-06-01,Banglore,Delhi,10:00:00,12:45:00,165,0.0,No Info,6961
2,Goair,2019-04-09,Banglore,Delhi,20:55:00,23:40:00,165,0.0,No Info,4239
3,Indigo,2019-06-21,Delhi,Cochin,05:35:00,08:50:00,195,0.0,No Info,5000
4,Vistara,2019-06-03,Banglore,Delhi,17:00:00,19:35:00,155,0.0,No Info,4878
...,...,...,...,...,...,...,...,...,...,...
195,Air India,2019-03-01,Delhi,Cochin,04:00:00,07:40:00,1660,1.0,No Info,28322
196,Spicejet,2019-05-21,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No check-in baggage included,3573
197,Jet Airways,2019-06-12,Kolkata,Banglore,16:30:00,04:40:00,730,1.0,In-flight meal not included,6224
198,Jet Airways,2019-05-09,Delhi,Cochin,19:15:00,19:00:00,1425,1.0,In-flight meal not included,12373


## 4. Preprocessing Operations

In [15]:
# airline
air_transformer = Pipeline(steps=[
    ("", SimpleImputer(strategy='most_frequent')),
    ('grouper', RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

#doj
features_to_extract = ['month', 'week', 'day_of_week','day_of_year']
doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=features_to_extract, yearfirst=True, format='mixed')),
    ('scaler', MinMaxScaler())
])

#source & destination
location_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='Other', n_categories=2)),
    ('encoder', MeanEncoder()), # to perform mean encoding you have to give target column also
    ('scaler', PowerTransformer()) # helps to transform variable as symmetric as possible
])


def is_north(X):
    columns = X.columns.to_list()
    north_cities = ['Delhi', 'Kolkata', 'Mumbai', 'New_Delhi']
    return (
        X
        .assign(**{
            f'{col}_is_north': X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

# FeatureUnion is able to concatenates results of multiple transformer objects. 
location_transformer = FeatureUnion(transformer_list=[
    ('part1', location_pipe1),
    ('part2', FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract = ['hour','minute'])),
    ('scaler', MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list() # store col to list
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour # to custom col change into datetime col & attract hour information
        for col in columns
    })

    return(
        X_temp
        .assign(**{
            f'{col}_part_of_day': np.select(    # naming the col name & performing the operation 
                [X_temp.loc[:, col].between(morning, noon, inclusive='left'),
                 X_temp.loc[:, col].between(noon, eve, inclusive='left'),
                 X_temp.loc[:, col].between(eve, night, inclusive='left')],
                ['morning', 'afternoon', 'evening'], 
                default='night'
            )  
            for col in columns
        })
    )

time_pipe2 = Pipeline(steps=[
    ('part', FunctionTransformer(func=part_of_day)),
    ('encoder', CountFrequencyEncoder()),
    ('scaler', MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ('part1', time_pipe1),
    ('part2', time_pipe2)
])

# duration
from sklearn.base import BaseEstimator, TransformerMixin
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)

    
def duration_category(X, short=180, med=400):
    return (
        X.assign(
            duration_cat=np.select(
                [X.duration.lt(short), 
                 X.duration.between(short, med, closed='left')
                ],
                ['short', 'medium'],
                default='long'
            )
        ).drop(columns='duration')
    )

def is_over(X, value=1000):
    return (
        X.assign(**{
            f'duration_over_{value}': X.duration.ge(value).astype(int) # ge mean greater than or equal to
        })
        .drop(columns='duration')
    )

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])


def is_direct(X): # if total stops value is non then make it zero otherwise make it 1
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))

total_stops_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('', FunctionTransformer(func=is_direct))
])

# total stops
def is_direct(X): # if total stops value is non then make it zero otherwise make it 1
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))

total_stops_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('', FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ('group', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='Other')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne('No Info').astype(int))

info_union = FeatureUnion(transformer_list=[
    ('part1', info_pipe1),
    ('part2', FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('union', info_union)
])

# ColumnTransformer: tool for applying different transformations to different columns of a dataset
column_transformer = ColumnTransformer(transformers=[
    ('air', air_transformer, ['airline']),
    ('doj', doj_transformer, ['date_of_journey']),
    ('location', location_transformer, ['source', 'destination']),
    ('time', time_transformer, ['dep_time', 'arrival_time']),
   # ('dur', duration_transformer, ['duration']),
    ('stops', total_stops_transformer, ['total_stops']),
    ('info', info_transformer,['additional_info'])
], remainder = 'passthrough') # Keep all other columns as they are

# Feature Selector
## Define the estimator
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

## Define the feature selector
selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring='r2',
    threshold=0.1 # Features with r2 below 0.1 will be dropped
)

# preprocessor
preprocessor = Pipeline(steps=[
    ('ct', column_transformer),
    ('selector', selector)
])


In [16]:
preprocessor.fit(
    train.drop(columns='price'),
    train.price.copy()
)

In [17]:
preprocessor.transform(train.drop(columns='price'))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__destination_is_north,time__arrival_time_hour,time__arrival_time,stops__total_stops,stops__is_direct_flight,remainder__duration
0,0.0,0.0,0.0,0.941176,0.923729,-0.757855,-1.747101,1,0.826087,0.826087,0.0,1,170
1,0.0,1.0,0.0,0.470588,0.457627,-0.757855,-1.747101,1,0.478261,0.478261,0.0,1,180
2,0.0,1.0,0.0,0.529412,0.533898,-0.757855,-1.747101,1,0.956522,0.956522,0.0,1,180
3,0.0,0.0,0.0,0.882353,0.898305,-0.757855,-1.747101,1,0.826087,0.826087,0.0,1,165
4,0.0,0.0,0.0,0.352941,0.330508,1.039377,1.045929,0,0.826087,0.826087,1.0,0,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.0,0.0,0.0,0.529412,0.533898,-1.928295,-0.980060,1,0.739130,0.739130,0.0,1,140
636,0.0,0.0,0.0,0.176471,0.169492,1.039377,1.045929,0,0.913043,0.913043,1.0,0,560
637,1.0,0.0,0.0,0.823529,0.796610,-1.928295,-0.980060,0,0.565217,0.565217,0.0,1,90
638,0.0,1.0,0.0,0.176471,0.169492,1.039377,1.045929,0,0.782609,0.782609,2.0,0,1925


## 4. Preprocess Data and Upload to Bucket

In [18]:
# seeting two global variable
BUCKET_NAME = "sagemaker-flights-predict-bucket"

DATA_PREFIX = 'data'

In [19]:
def get_file_name(name):
    return f'{name}.pre.csv'

In [20]:
def export_data(data, name, pre):
    # split data into x and y subsets
    X = data.drop(columns='price')
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    #exporting --> In sagemaker, database should have target column at first.
    #Below condition should be match befor using it.
    file_name = get_file_name(name)
    (
        y.to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header= False)
    )

In [21]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (boto3                  # calling the library   
     .Session()       #calling the envirnoment
     .resource('s3')  # which resource you wANT
     .Bucket(BUCKET_NAME)  # bucket name ?
     .Object(os.path.join(DATA_PREFIX, f'{name}/{name}.csv')) 
     .upload_file(file_name)
    )


In [22]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [23]:
export_and_upload_bucket(train, 'train', preprocessor)

In [24]:
export_and_upload_bucket(val, 'val', preprocessor)

In [25]:
export_and_upload_bucket(test, 'test', preprocessor)

## 5. Model and Hyperparameter Tuning

In [26]:
session = sagemaker.Session()   # to inform in which env we are working on
region_name = session.boto_region_name  # these are condition when we are using sagemaker

In [27]:
# after training where to save 
output_path = f's3://{BUCKET_NAME}/model/output'

In [28]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve('xgboost', region_name, '1.2-1'), # we r informing that we want to use xgboost model
    role=sagemaker.get_execution_role(),  #IAM role
    instance_count=1,   # how many EC2 we need?
    instance_type='ml.m4.xlarge',
    volume_size=5,  # GB
    output_path=output_path, # where to save the model?
    use_spot_instances=True, # this and below 2 lines were written to save the cost
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [29]:
model.set_hyperparameters(
    objective='reg:linear',
    num_round=10,
    eta=0.1,     #learning_rate
    max_depth=5,
    subsample=0.8,        # Sample 80% of data
    colsample_bytree=0.8, # Take 80% of columns from that sample. Its done to avoid overfitting
    alpha=0.1             #l2 regularization
)

In [30]:
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.05, 0.2),
    'alpha': ContinuousParameter(0, 1),
    'max_depth': IntegerParameter(3, 5)
}

In [31]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name='validation:rmse',
    hyperparameter_ranges=hyperparameter_ranges,
    strategy='Bayesian',
    objective_type='Minimize'
)

## 6. Data Channels

In [32]:
def get_data_channel(name):
    bucket_path =  f's3://{BUCKET_NAME}/{DATA_PREFIX}/{name}'
    return TrainingInput(bucket_path, content_type ='csv')

In [33]:
train_data_channel = get_data_channel('train')
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7fa80a22d330>

In [34]:
val_data_channel = get_data_channel('val')

In [35]:
data_channels = {
    'train': train_data_channel,
    'validation': val_data_channel
}

## 7. Train and Tune the Model

In [36]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.............................................!


In [37]:
# if you want to deploy your model on Sagemaker

# -- tuner.best_estimator().deploy()


## 8. Model Evaluation

In [38]:
# 1st download the model s3 bucket and upload on jupiter_notebook cloud
with open('xgboost-model', 'rb') as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7fa80a22ffa0>

In [39]:
# best_model.predict(x)--> it won't work here, xgboost take data in slightly different format
pd.read_csv('train.pre.csv').iloc[:,1:]

Unnamed: 0,0.0,0.0.1,0.0.2,0.9411764705882352,0.923728813559322,-0.7578545700979066,-1.747100705192746,1,0.8260869565217391,0.8260869565217391.1,0.0.3,1.1,170
0,0.0,1.0,0.0,0.470588,0.457627,-0.757855,-1.747101,1,0.478261,0.478261,0.0,1,180
1,0.0,1.0,0.0,0.529412,0.533898,-0.757855,-1.747101,1,0.956522,0.956522,0.0,1,180
2,0.0,0.0,0.0,0.882353,0.898305,-0.757855,-1.747101,1,0.826087,0.826087,0.0,1,165
3,0.0,0.0,0.0,0.352941,0.330508,1.039377,1.045929,0,0.826087,0.826087,1.0,0,735
4,0.0,0.0,1.0,0.647059,0.635593,1.039377,1.045929,0,0.521739,0.521739,1.0,0,355
...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,1.0,0.0,0.0,0.529412,0.533898,-1.928295,-0.980060,1,0.739130,0.739130,0.0,1,140
635,0.0,0.0,0.0,0.176471,0.169492,1.039377,1.045929,0,0.913043,0.913043,1.0,0,560
636,1.0,0.0,0.0,0.823529,0.796610,-1.928295,-0.980060,0,0.565217,0.565217,0.0,1,90
637,0.0,1.0,0.0,0.176471,0.169492,1.039377,1.045929,0,0.782609,0.782609,2.0,0,1925


In [40]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    x = xgb.DMatrix(data.iloc[:,1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(x)
    
    return r2_score(y, pred)

In [41]:
evaluate_model('train')

0.09163272380828857

In [42]:
evaluate_model('val')

-0.09521186351776123

In [43]:
evaluate_model('test')

-0.09037244319915771