#1. Import Libraries

In [2]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.9.3-py3-none-any.whl.metadata (10 kB)
Downloading feature_engine-1.9.3-py3-none-any.whl (229 kB)
Installing collected packages: feature_engine
Successfully installed feature_engine-1.9.3


In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.4-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m80.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.4


In [7]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## 2. Display Settings

In [4]:
pd.set_option("display.max_columns", None)

In [8]:
sklearn.set_config(transform_output="pandas")

In [9]:
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [10]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-04-27,Delhi,Cochin,02:00:00,07:45:00,345,1.0,No Info,7173
1,Spicejet,2019-05-09,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No Info,3597
2,Spicejet,2019-06-06,Kolkata,Banglore,09:00:00,11:25:00,145,0.0,No check-in baggage included,3841
3,Air India,2019-05-12,Kolkata,Banglore,12:00:00,13:15:00,1515,1.0,No Info,6612
4,Multiple Carriers,2019-05-18,Delhi,Cochin,07:10:00,16:10:00,540,1.0,No Info,7038
...,...,...,...,...,...,...,...,...,...,...
635,Indigo,2019-03-03,Delhi,Cochin,06:40:00,15:30:00,530,1.0,No Info,14871
636,Air India,2019-04-01,Delhi,Cochin,17:20:00,09:25:00,965,1.0,No Info,5117
637,Spicejet,2019-04-01,Delhi,Cochin,06:05:00,11:20:00,315,1.0,No Info,5351
638,Vistara,2019-05-24,Kolkata,Banglore,20:20:00,10:45:00,865,1.0,No Info,9187


In [11]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-05-09,Delhi,Cochin,10:00:00,21:00:00,660,1.0,No Info,13727
1,Jet Airways,2019-05-18,Kolkata,Banglore,09:35:00,18:00:00,505,1.0,No Info,14151
2,Jet Airways,2019-03-24,Kolkata,Banglore,19:45:00,20:40:00,1495,1.0,No Info,13759
3,Multiple Carriers,2019-03-27,Delhi,Cochin,11:40:00,21:00:00,560,1.0,No Info,6637
4,Multiple Carriers,2019-03-21,Delhi,Cochin,04:45:00,12:35:00,470,1.0,No Info,9838
...,...,...,...,...,...,...,...,...,...,...
155,Spicejet,2019-03-15,Banglore,New Delhi,09:35:00,12:30:00,175,0.0,No check-in baggage included,6205
156,Spicejet,2019-04-24,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No check-in baggage included,3841
157,Air India,2019-03-06,Mumbai,Hyderabad,19:00:00,09:10:00,850,1.0,No Info,12308
158,Indigo,2019-06-06,Kolkata,Banglore,15:15:00,20:30:00,315,1.0,No Info,5170


In [12]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-01,Delhi,Cochin,10:55:00,19:15:00,500,2.0,No Info,13013
1,Goair,2019-04-27,Banglore,Delhi,20:55:00,23:40:00,165,0.0,No Info,3898
2,Jet Airways,2019-05-12,Banglore,Delhi,15:15:00,18:10:00,175,0.0,No Info,7229
3,Spicejet,2019-05-06,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No check-in baggage included,3543
4,Indigo,2019-03-12,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4148
...,...,...,...,...,...,...,...,...,...,...
195,Indigo,2019-06-06,Delhi,Cochin,07:30:00,21:00:00,810,1.0,No Info,6093
196,Vistara,2019-04-27,Banglore,Delhi,19:30:00,22:15:00,165,0.0,No Info,5613
197,Jet Airways,2019-06-15,Delhi,Cochin,11:45:00,12:35:00,1490,2.0,No Info,13882
198,Indigo,2019-06-06,Chennai,Kolkata,07:55:00,10:15:00,140,0.0,No Info,3850


## 4. Preprocessing Operations

In [13]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])


In [14]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

0,1,2
,steps,"[('ct', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('air', ...), ('doj', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function is_...x7f499726f1c0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,False
,utc,
,format,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,func,<function par...x7f498b9be950>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,1.5
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,variables,
,percentiles,"[0.25, 0.5, ...]"
,gamma,0.1

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function dur...x7f498b9bd090>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function is_...x7f498b9bec20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function is_...x7f498b9bf130>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function hav...x7f498b9bf1c0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,estimator,RandomForestR...ndom_state=42)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.0,0.470588,0.483051,1.025247,1.021079,-0.35562,1.0,0,-0.592393,1.0,0
1,0.0,0.0,1.0,0.588235,0.584746,-1.878015,-0.758515,-0.35562,0.0,0,-0.994546,0.0,1
2,0.0,0.0,1.0,0.823529,0.822034,-0.154463,-0.188294,-0.35562,0.0,0,-0.984737,0.0,1
3,0.0,0.0,0.0,0.588235,0.610169,-0.154463,-0.188294,-0.35562,2.0,1,1.702819,1.0,0
4,0.0,0.0,0.0,0.647059,0.661017,1.025247,1.021079,-0.35562,2.0,0,-0.209858,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.0,0.0,0.0,0.000000,0.016949,1.025247,1.021079,-0.35562,2.0,0,-0.229475,1.0,0
636,0.0,0.0,0.0,0.294118,0.262712,1.025247,1.021079,-0.35562,2.0,0,0.623873,1.0,0
637,0.0,0.0,1.0,0.294118,0.262712,1.025247,1.021079,-0.35562,1.0,0,-0.651245,1.0,0
638,0.0,0.0,1.0,0.705882,0.711864,-0.154463,-0.188294,-0.35562,2.0,0,0.427701,1.0,0


## 4. Preprocess Data and Upload to Bucket

In [16]:
BUCKET_NAME = "aero-ticker-bucket"

DATA_PREFIX = "data"

In [17]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [18]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [21]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [22]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [23]:
export_and_upload_bucket(train, "train", preprocessor)

In [24]:
pd.read_csv("train-pre.csv")

Unnamed: 0,7173,1.0,0.0,0.0.1,0.47058823529411764,0.48305084745762716,1.0252469906606678,1.0210789388278794,-0.3556202039945009,1.0.1,0,-0.592393312150945,1.0.2,0.1
0,3597,0.0,0.0,1.0,0.588235,0.584746,-1.878015,-0.758515,-0.35562,0.0,0,-0.994546,0.0,1
1,3841,0.0,0.0,1.0,0.823529,0.822034,-0.154463,-0.188294,-0.35562,0.0,0,-0.984737,0.0,1
2,6612,0.0,0.0,0.0,0.588235,0.610169,-0.154463,-0.188294,-0.35562,2.0,1,1.702819,1.0,0
3,7038,0.0,0.0,0.0,0.647059,0.661017,1.025247,1.021079,-0.35562,2.0,0,-0.209858,1.0,0
4,3100,0.0,0.0,0.0,0.882353,0.872881,-1.878015,-0.758515,-0.35562,0.0,0,-1.112249,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,14871,1.0,0.0,0.0,0.000000,0.016949,1.025247,1.021079,-0.35562,2.0,0,-0.229475,1.0,0
635,5117,0.0,0.0,0.0,0.294118,0.262712,1.025247,1.021079,-0.35562,2.0,0,0.623873,1.0,0
636,5351,0.0,0.0,1.0,0.294118,0.262712,1.025247,1.021079,-0.35562,1.0,0,-0.651245,1.0,0
637,9187,0.0,0.0,1.0,0.705882,0.711864,-0.154463,-0.188294,-0.35562,2.0,0,0.427701,1.0,0


In [25]:
export_and_upload_bucket(val, "val", preprocessor)

In [26]:
export_and_upload_bucket(test, "test", preprocessor)

## 5. Model and Hyperparatmeter Tuning Setup

In [27]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [28]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [29]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [30]:
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [31]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [33]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6. Data Channels

In [34]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [35]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f4979626110>

In [36]:
val_data_channel = get_data_channel("val")

In [37]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [42]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...................................!


In [38]:
# tuner.best_estimator().deploy()

## 8. Model Evaluation

In [43]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f498ba35180>

In [41]:
pd.read_csv("train-pre.csv").iloc[:,0]

0       3597
1       3841
2       6612
3       7038
4       3100
       ...  
634    14871
635     5117
636     5351
637     9187
638    14714
Name: 7173, Length: 639, dtype: int64

In [44]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [45]:
evaluate_model("train")

0.6985762119293213

In [46]:

evaluate_model("val")

0.5995759963989258

In [47]:
evaluate_model("test")

0.5301194190979004