### DETECTING ZERO-DAY ATTACKS USING XGBOOST ON AWS SAGEMAKER

#### IMPORTING REQUIRED LIBRARIES

In [1]:
import os

import boto3
import pandas as pd
import sagemaker
from boto3.session import Session

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.session import TrainingInput

#### GET AWS REGION AND ROLE FOR FUTURE USE

In [2]:
# Resolve the two pieces of AWS context the rest of the notebook reuses:
# the region of the default SageMaker session and the notebook's execution role.
region = sagemaker.Session().boto_region_name
role = sagemaker.get_execution_role()

print("AWS Region: {}".format(region))
print("RoleArn: {}".format(role))

AWS Region: ap-south-1
RoleArn: arn:aws:iam::634842168668:role/service-role/SageMaker-MLOpsRole


#### CREATE A SESSION FOR ACCESSING S3

In [3]:
# Never paste AWS credentials into a notebook: they end up in version control
# and in shared copies. Read them from the standard AWS environment variables
# instead. When a variable is unset the value is None, and a boto3 Session given
# None for both keys falls back to its default credential provider chain
# (for example the IAM role attached to the notebook instance).
ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

In [4]:
# boto3 session used for direct S3 access (separate from the SageMaker session).
session = Session(
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_KEY,
)

In [5]:
# S3 location of the datasets and of every artifact the training job writes.
# Replace both values with your own bucket name and key prefix.
bucket = "zero-days-attack-ml-training"
prefix = "Vinaya"

#### FETCHING TEST DATA INTO A DATAFRAME FOR USE LATER

# Download <prefix>/test.csv from the bucket and load it into a DataFrame.
# The object key is derived from `prefix` rather than a hardcoded folder name,
# so changing `prefix` also redirects the test-data lookup.
s3 = session.client("s3")
obj = s3.get_object(
    Bucket=bucket,
    Key="{}/{}".format(prefix, "test.csv"),  # path to the object inside the bucket
)
test = pd.read_csv(obj["Body"])

In [7]:
# Display the test DataFrame loaded from S3 as this cell's output.
test

Unnamed: 0,Label,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0,135,1,1,6,6,6,6,6.000000,0.000000,...,0,20,0.00,0.0000,0,0,0.0,0.00,0,0
1,0,116159721,23,19,703,6328,318,0,30.565217,80.821254,...,4,32,53793.75,60145.8095,244782,36275,9626179.0,1330548.96,10000000,5401139
2,0,96068,2,2,70,194,35,35,35.000000,0.000000,...,1,20,0.00,0.0000,0,0,0.0,0.00,0,0
3,0,28210124,2,2,87,131,44,43,43.500000,0.707107,...,1,32,30811.00,0.0000,30811,30811,28100000.0,0.00,28100000,28100000
4,0,32739,4,4,136,520,34,34,34.000000,0.000000,...,3,32,0.00,0.0000,0,0,0.0,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142205,1,11512204,8,5,326,11632,326,0,40.750000,115.258405,...,1,32,892.00,0.0000,892,892,6507197.0,0.00,6507197,6507197
142206,1,11513325,5,5,471,3525,471,0,94.200000,210.637604,...,1,32,918.00,0.0000,918,918,6508582.0,0.00,6508582,6508582
142207,1,11509201,7,6,314,11632,314,0,44.857143,118.680845,...,1,32,899.00,0.0000,899,899,6503248.0,0.00,6503248,6503248
142208,1,11509095,8,5,369,11632,369,0,46.125000,130.461201,...,1,32,914.00,0.0000,914,914,6504954.0,0.00,6504954,6504954


#### FETCHING TRAINING AND VALIDATION DATA FROM S3 BUCKET

In [9]:
# Describe the two CSV channels for the training job. Both objects live under
# s3://<bucket>/<prefix>/ and differ only in file name.
train_input, validation_input = (
    TrainingInput(
        s3_data="s3://{}/{}/{}".format(bucket, prefix, csv_name),
        content_type="csv",
    )
    for csv_name in ("train.csv", "validate.csv")
)

#### CREATING AN XGBOOST ESTIMATOR

In [11]:
# The training job writes the model artifact and the rule reports under this S3 prefix.
s3_output_location = "s3://{}/{}/{}".format(bucket, prefix, "xgboost_model")

# Look up the region-specific image URI of the built-in XGBoost 1.7-1 container.
container = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1"
)
print(container)

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sagemaker.Session(),
    output_path=s3_output_location,
    # Compute resources for the training job (volume size is in GB).
    instance_type="ml.m5.xlarge",
    instance_count=1,
    volume_size=5,
    # Managed spot training: max_run caps the training time and max_wait caps
    # training time plus time spent waiting for spot capacity (both in seconds).
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
    # Built-in rules that produce the XGBoost training report and the profiler report.
    rules=[
        Rule.sagemaker(rule_configs.create_xgboost_report()),
        ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    ],
)

720646828776.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-xgboost:1.7-1


#### SPECIFYING HYPERPARAMETERS FOR XGBOOST ALGORITHM

In [12]:
# Hyperparameters passed to the XGBoost container for this training job.
xgb_model.set_hyperparameters(
    max_depth=5,                   # maximum depth of each tree
    eta=0.2,                       # learning rate (step-size shrinkage)
    gamma=4,                       # minimum loss reduction required to split a node
    min_child_weight=6,            # minimum sum of instance weight needed in a child
    subsample=0.7,                 # fraction of training rows sampled per tree
    objective="binary:logistic",   # binary classification, outputs a probability
    num_round=1000,                # number of boosting rounds
)

#### TRAINING THE MODEL

In [13]:
# Start the training job on the two channels and block until it completes.
xgb_model.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-12-12-56-15-272


2024-01-12 12:56:15 Starting - Starting the training job...
2024-01-12 12:56:44 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport: InProgress
......
2024-01-12 12:57:44 Downloading - Downloading input data...
2024-01-12 12:58:04 Downloading - Downloading the training image...
2024-01-12 12:58:44 Training - Training image download completed. Training in progress..[34m[2024-01-12 12:58:47.714 ip-10-0-82-56.ap-south-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-01-12 12:58:47.737 ip-10-0-82-56.ap-south-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-01-12:12:58:48:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-01-12:12:58:48:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-01-12:12:58:48:INFO] No GPUs detected (normal if no gpus installed)[0

#### TRAINING AND PROFILING REPORTS

In [14]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.job_name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

2024-01-12 13:10:09     411694 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/CreateXgboostReport/xgboost_report.html
2024-01-12 13:10:07     215856 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/CreateXgboostReport/xgboost_report.ipynb
2024-01-12 13:10:51     322359 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-report.html
2024-01-12 13:10:50     168698 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2024-01-12 13:10:46        192 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2024-01-12 13:10:46        200 Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2024-01-12 13:10:46        126 Vinaya/xgboost_mo

In [15]:
# Recursively copy everything under rule_output_path into the current working directory.
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://zero-days-attack-ml-training/Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/CreateXgboostReport/xgboost_report.html to CreateXgboostReport/xgboost_report.html
download: s3://zero-days-attack-ml-training/Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json to ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
download: s3://zero-days-attack-ml-training/Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json to ProfilerReport/profiler-output/profiler-reports/MaxInitializationTime.json
download: s3://zero-days-attack-ml-training/Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/rule-output/ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json to ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.jso

In [16]:
from IPython.display import FileLink, FileLinks

# Render a clickable link to the locally downloaded XGBoost training report.
display(
    "Click link below to view the XGBoost Training report",
    FileLink("CreateXgboostReport/xgboost_report.html"),
)

'Click link below to view the XGBoost Training report'

In [17]:
# Pick the first rule of the latest training job whose configuration name
# contains "Profiler"; that name is also the local folder holding the
# downloaded profiler report. next() stops at the first match instead of
# building a full list, and the stray no-op expression statement that used
# to follow the assignment has been removed.
profiler_report_name = next(
    rule["RuleConfigurationName"]
    for rule in xgb_model.latest_training_job.rule_job_summary()
    if "Profiler" in rule["RuleConfigurationName"]
)
display(
    "Click link below to view the profiler report",
    FileLink(profiler_report_name + "/profiler-output/profiler-report.html"),
)

'Click link below to view the profiler report'

In [18]:
# Show the S3 URI of the trained model artifact.
xgb_model.model_data

's3://zero-days-attack-ml-training/Vinaya/xgboost_model/sagemaker-xgboost-2024-01-12-12-56-15-272/output/model.tar.gz'

#### DEPLOYING MODEL TO AN ENDPOINT

In [None]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained model to a real-time endpoint. Request payloads are
# serialized as CSV before being sent to the endpoint.
xgb_predictor = xgb_model.deploy(
    serializer=CSVSerializer(),
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-01-12-13-15-21-334
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-01-12-13-15-21-334
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-01-12-13-15-21-334


------------!

In [20]:
# Show the name of the endpoint created by the deploy step.
xgb_predictor.endpoint_name

'sagemaker-xgboost-2024-01-12-13-15-21-334'

#### FUNCTION TO GET PREDICTIONS FROM MODEL ON TESTING DATA

In [26]:
import numpy as np
def predict(data, rows=1000, predictor=None):
    """Score ``data`` in batches and return the predictions as a 1-D float array.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Feature rows to score (no label column). It is split along the first
        axis with ``np.array_split``.
    rows : int, default 1000
        Maximum number of rows sent per request.
    predictor : object, optional
        Any object exposing ``predict(batch) -> bytes``. Defaults to the
        notebook-level ``xgb_predictor`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One float per value found in the responses, in request order.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError("rows must be a positive integer")
    if predictor is None:
        # Fall back to the endpoint created by the deploy cell.
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid sending an empty payload to the endpoint.
        return np.array([], dtype=float)

    # Ceiling division: the fewest batches such that none exceeds `rows`.
    n_batches = int(-(-n_rows // rows))

    scores = []
    for batch in np.array_split(data, n_batches):
        response = predictor.predict(batch).decode("utf-8")
        # Accept comma-, newline- or whitespace-separated values in the response.
        scores.extend(float(token) for token in response.replace(",", " ").split())
    return np.array(scores, dtype=float)

#### CHECKING ACCURACY OF MODEL

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# `predictions` was never assigned anywhere in the notebook, so this cell failed
# with NameError on a fresh kernel. Score the test set through the endpoint first:
# column 0 of `test` is used as the ground-truth label (as the metrics below
# already assume) and the remaining columns are sent as features.
y_true = test.iloc[:, 0]
predictions = predict(test.iloc[:, 1:].to_numpy())

# Turn the predicted scores into hard 0/1 labels at the chosen cutoff.
cutoff = 0.5
y_pred = np.where(predictions > cutoff, 1, 0)

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

In [None]:
import matplotlib.pyplot as plt
import sklearn

# Sweep the decision cutoff from 0.01 to 0.99 and record the log loss of the
# hard 0/1 labels obtained at each cutoff.
cutoffs = np.arange(0.01, 1, 0.01)
log_loss = [
    sklearn.metrics.log_loss(test.iloc[:, 0], np.where(predictions > c, 1, 0))
    for c in cutoffs
]

# Plot log loss as a function of the cutoff.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(cutoffs, log_loss)
ax.set_xlabel("Cutoff")
ax.set_ylabel("Log loss")
plt.show()

In [None]:
# Report the cutoff with the smallest log loss and the loss value at that point.
best_idx = np.argmin(log_loss)
print(
    'Log loss is minimized at a cutoff of ',
    cutoffs[best_idx],
    ', and the log loss value at the minimum is ',
    np.min(log_loss),
)