##  1. Import Libraries

In [170]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [127]:
!pip install feature-engine



In [171]:
import os

import pickle

import boto3

import xgboost as xgb

import numpy as np
import pandas as pd

import sklearn

from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

import warnings

# needs after step 5

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Display Settings

In [129]:
pd.set_option('display.max_columns', None)

In [130]:
sklearn.set_config(transform_output='pandas')

In [131]:
warnings.filterwarnings('ignore')

# 3. Read Datasets

In [132]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-24,Kolkata,Banglore,20:00:00,20:45:00,1485,1.0,In-flight meal not included,10844
1,Goair,2019-04-03,Banglore,Delhi,07:45:00,10:40:00,175,0.0,No Info,4239
2,Indigo,2019-03-27,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4148
3,Jet Airways,2019-03-18,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4160
4,Jet Airways,2019-05-09,Delhi,Cochin,15:00:00,12:35:00,1295,1.0,In-flight meal not included,12373


In [133]:
val = pd.read_csv('valid.csv')
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-06,Delhi,Cochin,11:55:00,22:20:00,625,1.0,No Info,6442
1,Jet Airways,2019-05-09,Delhi,Cochin,23:05:00,19:00:00,1195,2.0,No Info,15129
2,Jet Airways,2019-05-21,Delhi,Cochin,08:00:00,19:00:00,660,1.0,No Info,16289
3,Air Asia,2019-05-01,Kolkata,Banglore,19:55:00,22:25:00,150,0.0,No Info,5989
4,Air India,2019-05-03,Banglore,Delhi,06:10:00,08:55:00,165,0.0,No Info,5228
...,...,...,...,...,...,...,...,...,...,...
155,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,12:35:00,810,2.0,No Info,15129
156,Vistara,2019-06-06,Kolkata,Banglore,20:20:00,22:40:00,1580,1.0,No Info,8662
157,Jet Airways,2019-05-01,Kolkata,Banglore,08:25:00,04:40:00,1215,1.0,No Info,13941
158,Indigo,2019-04-15,Banglore,Delhi,16:55:00,19:55:00,180,0.0,No Info,4823


In [134]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-24,Banglore,Delhi,07:10:00,10:05:00,175,0.0,No Info,4823
1,Indigo,2019-03-03,Banglore,New Delhi,18:25:00,21:20:00,175,0.0,No Info,7648
2,Air India,2019-05-09,Delhi,Cochin,09:45:00,23:00:00,795,1.0,No Info,7174
3,Indigo,2019-05-21,Mumbai,Hyderabad,09:10:00,10:40:00,90,0.0,No Info,4049
4,Vistara,2019-06-27,Delhi,Cochin,14:40:00,17:50:00,190,0.0,No Info,6216
...,...,...,...,...,...,...,...,...,...,...
195,Multiple Carriers,2019-06-06,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7354
196,Indigo,2019-03-15,Banglore,New Delhi,07:30:00,12:20:00,290,1.0,No Info,6114
197,Jet Airways,2019-05-15,Banglore,Delhi,07:10:00,10:10:00,180,0.0,In-flight meal not included,5198
198,Multiple Carriers,2019-06-24,Delhi,Cochin,10:00:00,19:00:00,540,1.0,No Info,12717


# 4. Preprocessing Operations

In [135]:
# airline
air_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('grouper', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='Other')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
     
])

# date of journey
feature_to_extract = ['month', 'week', 'day_of_week', 'day_of_month', 'day_of_year']

doj_transformer = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format='mixed')),
    ('scaler', MinMaxScaler())
])

# source and destination
location_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1,replace_with='Other', n_categories=2)),
    ('encoer' , MeanEncoder()),
    ('scaler' , PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ['Delhi', 'New Delhi', 'Kolkata', 'Mumbai']
    return (
        X.assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer=FeatureUnion(transformer_list=[
    ('part1', location_pipe1),
    ('part2', FunctionTransformer(func=is_north))
])

# dep_time & arrival_time

time_pipe1 = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaler', MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, evening=16, night=20):
    columns = X.columns.to_list()
    X_temp  = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour for col in columns
    })
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive='left'),
                 X_temp.loc[:, col].between(noon, evening, inclusive='left'),
                 X_temp.loc[:, col].between(evening, night, inclusive='left')
                ],
                ['morning', 'afternoon', 'evening'],
                default='night'
            ) 
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ('part', FunctionTransformer(func=part_of_day)),
    ('encoder', CountFrequencyEncoder()),
    ('scaler' , MinMaxScaler())
])


time_transformer = FeatureUnion(transformer_list=[
    ('part1', time_pipe1),
    ('part2', time_pipe2)
])

# duration

def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)
   

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns='duration')
    )

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])


# total stops

def is_direct(X):
    return X.assign(is_direct_flight = X.total_stops.eq(0).astype(int))

total_stops_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ("", FunctionTransformer(func=is_direct))
])

# additional information

info_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='Other')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne('No Info').astype(int))

info_union = FeatureUnion(transformer_list=[
    ('part1', info_pipe1),
    ('part2', FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

# column transformer


column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector

estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = 'r2',
    threshold=0.06
)

# preprocessor

preprocessor = Pipeline(steps=[
    ('ct', column_transformer),
    ('selector', selector)
])

In [136]:
preprocessor.fit(train.drop(columns='price'), train.price.copy())

In [137]:
preprocessor.transform(train.drop(columns='price'))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,time__arrival_time_hour,dur__duration_rbf_25,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.666667,0.705882,0.711864,-0.241814,-0.273227,0.869565,-0.335996,2.0,1.664009,1.0,0
1,0.0,0.0,0.333333,0.294118,0.279661,-0.926482,-1.906881,0.434783,3.194193,0.0,-0.909843,0.0,1
2,1.0,0.0,0.000000,0.235294,0.220339,-0.241814,-0.273227,0.347826,2.863948,0.0,-0.919667,0.0,1
3,0.0,1.0,0.000000,0.176471,0.144068,-1.896087,-0.795248,0.478261,-0.335996,0.0,-1.076849,0.0,1
4,0.0,1.0,0.666667,0.588235,0.584746,1.009536,1.005984,0.521739,-0.335996,2.0,1.290702,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.000000,0.176471,0.144068,-0.926482,-0.795248,0.434783,-0.335996,2.0,0.809333,1.0,0
636,0.0,0.0,0.666667,0.705882,0.711864,1.009536,1.005984,0.043478,-0.335996,1.0,-0.487417,1.0,0
637,0.0,1.0,0.000000,0.176471,0.194915,-1.896087,-0.795248,0.739130,-0.335996,0.0,-1.076849,0.0,1
638,1.0,0.0,1.000000,0.823529,0.847458,-0.241814,-0.273227,0.086957,-0.335996,1.0,-0.634775,1.0,0


#  5. Preprocess Data and Upload to Bucket

In [138]:
BUCKET_NAME = 'fligh-prices-bucket'
DATA_PREFIX = 'data'

In [139]:
def get_file_name(name):
    return f'{name}-pre.csv'       # returning the name of preprocess data

In [140]:
# first we export data then we upload on s3 bucket
# sagemaker wants its target variable first then after any input

def export_data(data, name, pre):
    # split data into X and y subsets
    
    X = data.drop(columns='price')
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    
    # here we save file into csv format. index=False, header=False this is the expectation of sagemaker
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [141]:
# interaction of python with aws services

def upload_to_bucket(name):
    
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()  # representing the curr working env
        .resource('s3')
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f'{name}/{name}.csv')) # which file you wanna save upto this folder
        .upload_file(file_name)
    )

In [142]:
def export_and_upload_to_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [143]:
export_and_upload_to_bucket(train, 'train', preprocessor)

In [144]:
export_and_upload_to_bucket(test, 'test', preprocessor)

In [145]:
export_and_upload_to_bucket(val , 'val' , preprocessor)

#  6. Model and Hyperparameter Tuning Set-up

In [146]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [147]:
# where to save our model => inside s3 bucket
output_path = f's3://{BUCKET_NAME}/model/output'

In [157]:
xgboost_container = sagemaker.image_uris.retrieve('xgboost', region_name, "1.2-1")
model = Estimator(
    image_uri = xgboost_container,
    role = sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path = output_path,
    sagemaker_session=session,
    max_run = 300,
    max_wait= 600,
    use_spot_instances=True
)

In [158]:
model.set_hyperparameters(
    objective='reg:linear',
    num_round=10,
    eta=0.1,
    max_depth=5, 
    subsample=0.7,
    colsample_bytree = 0.7,
    alpha=0.1
)

In [159]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [160]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

# 7. Data Channels

In [161]:
# model ko path dikhana => channel

In [162]:
def get_data_channel(name):
    bucket_path = f's3://{BUCKET_NAME}/{DATA_PREFIX}/{name}'
    return TrainingInput(bucket_path, content_type='csv')

In [163]:
train_data_channel = get_data_channel('train')

In [164]:
val_data_channel = get_data_channel('val')

In [165]:
data_channel = {
    'train':train_data_channel,
    'validation':val_data_channel
}

# 8. Train and Tune the Model

In [166]:
tuner.fit(data_channel)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


............................................!


In [167]:
tuner.best_estimator()


2024-07-05 14:51:02 Starting - Preparing the instances for training
2024-07-05 14:51:02 Downloading - Downloading the training image
2024-07-05 14:51:02 Training - Training image download completed. Training in progress.
2024-07-05 14:51:02 Uploading - Uploading generated training model
2024-07-05 14:51:02 Completed - Training job completed


<sagemaker.estimator.Estimator at 0x7fd7203385b0>

In [169]:
# for deploy the model on sagemaker 
# tuner.best_estimator().deploy()

# 9. Model Evaluation

In [173]:
with open('xgboost-model', 'rb') as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7fd71bf6a7a0>

In [174]:
pd.read_csv('train-pre.csv')

Unnamed: 0,10844,0.0,1.0,0.6666666666666665,0.7058823529411765,0.7118644067796611,-0.24181387591374567,-0.27322738173660915,0.8695652173913043,-0.3359958148605272,2.0,1.6640092843973011,1.0.1,0
0,4239,0.0,0.0,0.333333,0.294118,0.279661,-0.926482,-1.906881,0.434783,3.194193,0.0,-0.909843,0.0,1
1,4148,1.0,0.0,0.000000,0.235294,0.220339,-0.241814,-0.273227,0.347826,2.863948,0.0,-0.919667,0.0,1
2,4160,0.0,1.0,0.000000,0.176471,0.144068,-1.896087,-0.795248,0.478261,-0.335996,0.0,-1.076849,0.0,1
3,12373,0.0,1.0,0.666667,0.588235,0.584746,1.009536,1.005984,0.521739,-0.335996,2.0,1.290702,1.0,0
4,5000,1.0,0.0,1.000000,0.941176,0.923729,1.009536,1.005984,0.347826,-0.335996,1.0,-0.870548,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,11087,0.0,1.0,0.000000,0.176471,0.144068,-0.926482,-0.795248,0.434783,-0.335996,2.0,0.809333,1.0,0
635,8283,0.0,0.0,0.666667,0.705882,0.711864,1.009536,1.005984,0.043478,-0.335996,1.0,-0.487417,1.0,0
636,2228,0.0,1.0,0.000000,0.176471,0.194915,-1.896087,-0.795248,0.739130,-0.335996,0.0,-1.076849,0.0,1
637,6911,1.0,0.0,1.000000,0.823529,0.847458,-0.241814,-0.273227,0.086957,-0.335996,1.0,-0.634775,1.0,0


In [175]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])   # this is the formate for xgboost model
    y = data.iloc[:,0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [176]:
evaluate_model('train')

0.57494559618126

In [178]:
evaluate_model('val')

0.5760362073516443

In [179]:
evaluate_model('test')

0.395635897575785

- in our case we need sagemaker 