In [None]:
!pip install -Uqq sagemaker 

In [1]:
import requests
from io import BytesIO
import pandas as pd
import boto3
import s3fs
from datetime import datetime
import time
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker.debugger import Rule, rule_configs

from IPython.core.display import display, HTML

In [2]:
# setup sagemaker variables shared by the rest of the notebook
role = sagemaker.get_execution_role()
sess = sagemaker.session.Session()
bucket = sess.default_bucket()
key_prefix = "higgs-boson"  # S3 key prefix for the objects this notebook uploads
# Read the region through the public property rather than the private
# `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name
s3 = s3fs.S3FileSystem(anon=False)  # anon=False: use credentials, not anonymous access

# Image URI for the "xgboost" framework, version "1.2-1", in this region.
xgboost_container = image_uris.retrieve("xgboost", region, "1.2-1")

In [3]:
# Show the container image URI resolved in the setup cell.
xgboost_container

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1'

In [4]:
# get data from CERN
data_url = "http://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz"

# Bound the request with a timeout and fail fast on an HTTP error status, so
# an error page is never handed to the gzip/CSV parser as if it were the data.
response = requests.get(data_url, timeout=60)
response.raise_for_status()

# Wrap the downloaded bytes in an in-memory file and parse the gzipped CSV.
gz_file = BytesIO(response.content)
df = pd.read_csv(gz_file, compression="gzip")

In [5]:
# identify feature, label, and unused columns
non_feature_cols = ["EventId", "Weight", "KaggleSet", "KaggleWeight", "Label"]
feature_cols = [col for col in df.columns if col not in non_feature_cols]
label_col = "Label"

# Encode the label as 1 where it equals "s" and 0 otherwise.
# The column is rewritten in place, so only encode while it is still
# non-numeric: re-running this cell on already-encoded values would compare
# 1/0 against "s" and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df[label_col]):
    df[label_col] = df[label_col].eq("s").astype("int64")

# take subsets of data per the original Kaggle competition
# (label first, then the feature columns)
train_data = df.loc[df["KaggleSet"] == "t", [label_col, *feature_cols]]
test_data = df.loc[df["KaggleSet"] == "b", [label_col, *feature_cols]]

In [6]:
# Push each split to S3 as a CSV body without header row or index column.
splits = {"train": train_data, "test": test_data}
for name, dataset in splits.items():
    csv_body = dataset.to_csv(index=False, header=False)
    sess.upload_string_as_file_body(
        body=csv_body,
        bucket=bucket,
        key=f"{key_prefix}/input/{name}.csv",
    )

In [7]:
# Describe the two CSV objects under the input/ prefix as SageMaker training
# inputs; the "test" split is the one supplied as the validation input.
train_input = TrainingInput(
    s3_data=f"s3://{bucket}/{key_prefix}/input/train.csv",
    content_type="text/csv",
)
validation_input = TrainingInput(
    s3_data=f"s3://{bucket}/{key_prefix}/input/test.csv",
    content_type="text/csv",
)

In [8]:
# Attach the built-in rule that produces the XGBoost training report.
xgboost_report_rule = Rule.sagemaker(rule_configs.create_xgboost_report())
rules = [xgboost_report_rule]

In [63]:
# Baseline configuration, kept for reference only (not used):
#   objective="binary:logistic", num_round="100", eval_metric="error"

# Hyperparameters for the first training run. All values are passed as strings.
hyperparameters = dict(
    eta="0.1",
    max_depth="6",
    objective="binary:logistic",
    scale_pos_weight="595",
    num_round="3000",
)

In [64]:
# Estimator for the first run: one ml.m5.2xlarge instance running the XGBoost
# container with the hyperparameters and report rule defined earlier.
estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    base_job_name="higgs-boson-model",
    hyperparameters=hyperparameters,
    rules=rules,
)

# Local timestamp captured immediately before the job is submitted.
training_job_time = datetime.now()

# Channel name -> input; wait=True blocks this cell until the job finishes.
data_channels = {"train": train_input, "validation": validation_input}
estimator.fit(data_channels, wait=True)

2021-01-11 16:34:57 Starting - Starting the training job...
2021-01-11 16:35:23 Starting - Launching requested ML instancesCreateXgboostReport: InProgress
ProfilerReport-1610382897: InProgress
......
2021-01-11 16:36:24 Starting - Preparing the instances for training......
2021-01-11 16:37:25 Downloading - Downloading input data
2021-01-11 16:37:25 Training - Downloading the training image...
2021-01-11 16:37:54 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV inp

In [65]:
import os
import posixpath  # S3 URIs always use "/" separators, whatever the local OS is

# Fetch the rule job summaries once and reuse them for both lookups below.
rule_summaries = estimator.latest_training_job.rule_job_summary()

# get name of profiler report: first rule whose configuration name contains "Profiler"
profiler_report_name = [rule["RuleConfigurationName"]
                        for rule in rule_summaries
                        if "Profiler" in rule["RuleConfigurationName"]][0]

# Name of the job that evaluates the CreateXgboostReport rule: the last
# "/"-separated segment of its evaluation job ARN.
xgb_profile_job_name = [rule["RuleEvaluationJobArn"].split("/")[-1]
                        for rule in rule_summaries
                        if "CreateXgboostReport" in rule["RuleConfigurationName"]][0]

# Rule outputs are looked up under "rule-output/" in the parent of the
# debugger artifacts path.
base_output_path = posixpath.dirname(estimator.latest_job_debugger_artifacts_path())
rule_output_path = posixpath.join(base_output_path, "rule-output/")
xgb_report_path = posixpath.join(rule_output_path, "CreateXgboostReport")
profile_report_path = posixpath.join(rule_output_path, profiler_report_name)

In [66]:
# Poll the XGBoost report processing job every 30 seconds until it finishes.
while True:

    xgb_job_info = sess.sagemaker_client.describe_processing_job(ProcessingJobName=xgb_profile_job_name)
    job_status = xgb_job_info["ProcessingJobStatus"]

    if job_status == "Completed":
        break
    # A failed or stopped job will never become "Completed"; raise instead of
    # polling forever.
    if job_status in ("Failed", "Stopped"):
        failure_reason = xgb_job_info.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Processing job {xgb_profile_job_name} ended with status {job_status}: {failure_reason}"
        )

    print(f"Job Status: {job_status}")
    time.sleep(30)

# Copy both report folders from S3 into local directories, then render links to them.
s3.download(xgb_report_path, "reports/xgb/", recursive=True)
s3.download(profile_report_path, "reports/profiler/", recursive=True)
display(HTML("""<h3>Training Profiling Jobs Finished!</h3>
            <h4>
            <a href='/view/XGBoost%20Viz%20Blog/reports/profiler/profiler-output/profiler-report.html'> 
            Click here to view the Profiler Report
            </a>
            </h4>
            <h4>
            <a href='/view/XGBoost%20Viz%20Blog/reports/xgb/xgboost_report.html'> 
            Click here to view the XGBoost Training Report
            </a>
            </h4>"""))

Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress
Job Status: InProgress


In [None]:
# Hyperparameters for the second training run. All values are passed as strings.
updated_hyperparameters = dict(
    eta="0.1",
    max_depth="6",
    objective="binary:logistic",
    num_round="50",
)

In [None]:
# Estimator for the second run: one ml.m5.xlarge instance, same container,
# job-name prefix and rules, trained with the updated hyperparameters.
improved_estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="higgs-boson-model",
    hyperparameters=updated_hyperparameters,
    rules=rules,
)

# Channel name -> input; wait=True blocks this cell until the job finishes.
improved_channels = {"train": train_input, "validation": validation_input}
improved_estimator.fit(improved_channels, wait=True)

In [None]:
# Show the name of the training job started by the improved estimator.
improved_estimator.latest_training_job.job_name