In [57]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container
my_region = boto3.session.Session().region_name # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-west-2 region. You will use the 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [58]:
bucket_name = 'de166-sagemaker-shihao' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1': 
        s3.create_bucket(Bucket=bucket_name)
    else: 
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)
    



S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [82]:
import pyarrow.parquet as pq
import s3fs

s3 = s3fs.S3FileSystem()
model_data = pq.read_table('s3://de166-sagemaker-shihao/merged.parquet', filesystem=s3).to_pandas()

In [83]:
model_data= model_data.drop(columns = ['Local Authority Distract Name','Region/Country','Second Tier Authority','Code','Year','Operator','Site','Postcode','Reference','Substance Name','Local Authority'])


In [87]:
model_data

Unnamed: 0,Industry Electricity,Industry Gas,Industry 'Other Fuels',Large Industrial Installations,Agriculture,Industry Total,2019 year,label
0,17.786253,64.760412,23.930058,0.050426,5.952103,112.479252,49.343000,1
1,110.906985,86.680235,130.746196,10.899522,45.994055,385.226992,0.043075,1
2,110.906985,86.680235,130.746196,10.899522,45.994055,385.226992,4.700560,1
3,45.752494,76.066494,35.248639,4.795098,1.542726,163.405451,22.400000,1
4,45.752494,76.066494,35.248639,4.795098,1.542726,163.405451,22.200000,1
...,...,...,...,...,...,...,...,...
619,38.559634,56.107100,217.406073,11.847051,45.652023,369.571881,1010.033350,-1
620,38.559634,56.107100,217.406073,11.847051,45.652023,369.571881,1141.294000,-1
621,75.808282,24.178805,349.697965,357.242559,92.253520,899.181131,18.977000,1
622,75.808282,24.178805,349.697965,357.242559,92.253520,899.181131,313.326000,-1


In [66]:
model_data['label'] = model_data['label'].replace(-1,0)

In [81]:
model_data=model_data.astype('float')

In [68]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=50), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(436, 8) (188, 8)


In [69]:
#step 5 
pd.concat([train_data['label'], train_data.drop(['label'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [70]:
#step 6 
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(containers[my_region],role, instance_count=1, instance_type='ml.m4.xlarge',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='binary:logistic',num_round=100)


In [71]:

#step 7 
xgb.fit({'train': s3_input_train})

2022-04-23 14:05:14 Starting - Starting the training job...
2022-04-23 14:05:41 Starting - Preparing the instances for trainingProfilerReport-1650722714: InProgress
.........
2022-04-23 14:07:04 Downloading - Downloading input data......
2022-04-23 14:07:59 Training - Downloading the training image...
2022-04-23 14:08:45 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-04-23:14:08:49:INFO] Running standalone xgboost training.[0m
[34m[2022-04-23:14:08:49:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-04-23:14:08:49:INFO] File size need to be processed in the node: 0.05mb. Available memory size in the node: 8492.56mb[0m
[34m[2022-04-23:14:08:49:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:08:49] S3DistributionType set as FullyReplicated[0m
[34m[14:08:49] 436x7 matrix with 3052 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[14:08:49] src/tree/

In [72]:
#step 8 
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')


-------!

In [73]:
#step 9 
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['label'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)


(188,)


In [74]:
#step 10 
cm = pd.crosstab(index=test_data['label'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Less than 50%", "More than 50%"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Less than 50%", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("More than 50%", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))



Overall Classification Rate: 93.1%

Predicted      Less than 50%  More than 50%
Observed
Less than 50%  93% (52)     7% (9)
More than 50%   7% (4)     93% (123) 



In [89]:
test_data

Unnamed: 0,Industry Electricity,Industry Gas,Industry 'Other Fuels',Large Industrial Installations,Agriculture,Industry Total,2019 year,label
429,21.471378,28.171075,48.760413,13.089610,14.229823,125.722299,7.49466,1
237,43.524392,18.127447,39.148399,0.016560,0.596807,101.413605,152.75500,0
384,30.576951,18.814030,58.443575,107.993055,14.428038,230.255650,3196.71800,0
366,50.480688,100.082530,13.109398,1.134588,0.071980,164.879184,51.29100,1
378,12.167436,4.994726,18.269672,1.660167,9.457461,46.549461,200.39100,0
...,...,...,...,...,...,...,...,...
70,47.666017,54.049591,37.599949,49.002407,1.204505,189.522469,17.74600,1
132,105.011511,115.378414,145.563009,524.963206,70.290386,961.206526,58.42880,1
289,27.238769,9.761711,31.429824,0.073522,6.951697,75.455523,11.13000,1
109,67.413388,207.137841,50.903814,178.246660,1.388202,505.089905,74.00000,1


In [76]:
js = test_data.iloc[0].to_json(orient = 'columns')
js

'{"Industry Electricity":21.4713780854,"Industry Gas ":28.1710746292,"Industry \'Other Fuels\'":48.760412829,"Large Industrial Installations":13.0896102274,"Agriculture":14.2298228162,"Industry Total":125.7222985872,"2019 year":7.49466,"label":1.0}'

In [77]:
xgb_predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'xgboost-2022-04-23-14-09-27-392'

In [97]:
xgb_predictor.predict([21,28,48,13,14,125,7])

b'0.9814813137054443'

In [93]:
import io
def np2csv(arr):
    csv = io.BytesIO()

    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    

    return csv.getvalue().decode().rstrip()

In [111]:
runtime = boto3.client("runtime.sagemaker")

# get the payload

payload = np2csv([[21,28,48,13,14,125,7]])
# invoke the endpoint to get the response

response = runtime.invoke_endpoint(

EndpointName='xgboost-2022-04-23-14-09-27-392', Body=payload, ContentType="text/csv")

In [103]:
result = json.loads(response["Body"].read().decode())

In [104]:
result

0.9814813137054443

In [105]:
test= { "data": [[21,28,48,13,14,125,7]]}

In [110]:
data = json.loads(json.dumps(test))
payload = data["data"]
print(payload)
payload

[[21, 28, 48, 13, 14, 125, 7]]


[[21, 28, 48, 13, 14, 125, 7]]

In [112]:
print(payload)

21,28,48,13,14,125,7
