# Step 2 - Deploy del modelo

## Seteos iniciales

Logueamos sólo los mensajes de warning y error de los loggers de SageMaker (`sagemaker.config` y `sagemaker.experiments.run`)

In [2]:
import logging

# Only WARNING and above from these SageMaker loggers reach the notebook output.
for noisy_logger_name in ("sagemaker.config", "sagemaker.experiments.run"):
    logging.getLogger(noisy_logger_name).setLevel(logging.WARNING)

Mostramos las versiones de las librerías de Python importantes para el proyecto

In [3]:
import awscli
import boto3
import numpy
import pandas
import sagemaker

# Report the version of each key library, one per line. Every label carries its
# own tab padding so the version column lines up in the output.
for label, module in (
    ("sagemaker\t", sagemaker),
    ("pandas\t\t", pandas),
    ("numpy\t\t", numpy),
    ("boto3\t\t", boto3),
    ("awscli\t\t", awscli),
):
    print(label, module.__version__)

sagemaker	 2.215.0
pandas		 2.2.2
numpy		 1.26.4
boto3		 1.34.84
awscli		 1.32.84


In [4]:
# AWS context: the execution role reported by SageMaker and the region of the
# default boto3 session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# All project artifacts are addressed relative to s3://<default bucket>/<prefix>.
sage_session = sagemaker.Session()
bucket_name = sage_session.default_bucket()
prefix = "australia-rain"
project_fd = f"s3://{bucket_name}/{prefix}"

print(f"Region: {region}")
print(f"Rol: {role}")
print("Info S3:", f"- Bucket: {bucket_name}", f"- Prefix: {prefix}", sep="\n")

Region: us-east-1
Rol: arn:aws:iam::335415446667:role/service-role/AmazonSageMakerExecutionRole-sagemaker-stack
Info S3:
- Bucket: sagemaker-us-east-1-335415446667
- Prefix: australia-rain


## Modelo

In [51]:
import boto3

def get_latest_training_job_name(base_job_name: str) -> str:
    """Return the name of the most recently created completed training job.

    Queries SageMaker for training jobs whose name contains ``base_job_name``
    and whose status is ``Completed``, sorted by creation time in descending
    order, and returns the first entry of the response.

    Args:
        base_job_name: Substring that the training job name must contain.

    Returns:
        The ``TrainingJobName`` of the first (newest) matching job.

    Raises:
        LookupError: If no completed training job matches ``base_job_name``.
    """
    client = boto3.client("sagemaker")
    response = client.list_training_jobs(
        NameContains=base_job_name,
        SortBy="CreationTime",
        SortOrder="Descending",
        StatusEquals="Completed",
    )
    summaries = response.get("TrainingJobSummaries") or []
    if not summaries:
        # Name the filter in the error so a typo in the job name is easy to spot.
        raise LookupError(
            f"Training job not found: no completed job whose name contains {base_job_name!r}."
        )
    return summaries[0]["TrainingJobName"]

def get_training_job_s3_model_artifacts(job_name: str):
    """Look up the S3 URI of the model artifacts produced by a training job.

    Describes the training job ``job_name`` through the SageMaker API and
    returns the ``S3ModelArtifacts`` entry of its ``ModelArtifacts`` section.
    """
    description = boto3.client("sagemaker").describe_training_job(TrainingJobName=job_name)
    return description["ModelArtifacts"]["S3ModelArtifacts"]

In [52]:
# Training job of the best model found by the hyperparameter optimisation.
train_base_job_name = "sagemaker-xgboost-241118-2043-016-64d25362"

# Resolve the newest completed job matching that name, then the S3 URI of its artifacts.
latest_train_job_name = get_latest_training_job_name(train_base_job_name)
model_path = get_training_job_s3_model_artifacts(latest_train_job_name)

print("Model path:", model_path)

Model path: s3://sagemaker-us-east-1-335415446667/australia-rain-processed/output/sagemaker-xgboost-241118-2043-016-64d25362/output/model.tar.gz


In [53]:
import time
from sagemaker.xgboost import XGBoostModel

# S3 prefix handed to the model as its `code_location`.
code_location = f"{project_fd}/code"

# The timestamp suffix makes the model name differ between runs of this cell.
model_name = f"{train_base_job_name}-model-{int(time.time())}"

xgboost_model = XGBoostModel(
    name=model_name,
    role=role,
    sagemaker_session=sage_session,
    model_data=model_path,
    framework_version="0.90-2",
    py_version="py3",
    entry_point="inference.py",
    source_dir="scripts/inference/",
    code_location=code_location,
)

## Deploy del modelo

In [54]:
import time

from sagemaker.model_monitor import DataCaptureConfig

# Settings for the deployment: where captured traffic is stored and a
# timestamped endpoint name.
s3_capture_upload_path = f"{project_fd}/monitoring/datacapture"
endpoint_name = f"{prefix}-sm-endpoint-{int(time.time())}"

# Flag checked by the deploy cell so that one endpoint name is only used once.
tried_deploying = False

print(f"The endpoint will upload captured data to {s3_capture_upload_path}")
print(f"\n*** Endpoint Name ***\n\n{endpoint_name}")

The endpoint will upload captured data to s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture

*** Endpoint Name ***

australia-rain-sm-endpoint-1731966974


In [55]:
# Refuse to deploy twice with the same endpoint name: the settings cell must be
# re-run first so that a fresh name is generated.
assert not tried_deploying, "Se debe volver a crear el endpoint_name antes de intentar deployar de nuevo"
tried_deploying = True

# Data capture is enabled with a 100% sampling percentage and writes to the
# S3 path defined in the settings cell.
capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=s3_capture_upload_path,
)

xgboost_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
    data_capture_config=capture_config,
)

------!

<sagemaker.xgboost.model.XGBoostPredictor at 0x7f13b8540460>

In [56]:
# # ! USE THIS CODE ONLY IF THE DEPLOY FAILED (uncomment it first).
# # The client is named `sm_client` so that uncommenting this cell does not
# # shadow the imported `sagemaker` module used by the rest of the notebook.
# import boto3
# sm_client = boto3.client("sagemaker")
# sm_client.delete_endpoint(EndpointName=endpoint_name)

## Inferencia

In [57]:
from sagemaker.deserializers import CSVDeserializer
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# The predictor talks CSV in both directions: requests go through a
# CSVSerializer and responses through a CSVDeserializer.
csv_serializer = CSVSerializer()
csv_deserializer = CSVDeserializer()

predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sage_session,
    serializer=csv_serializer,
    deserializer=csv_deserializer,
)

In [64]:
# Validation split; the boolean mask selects rows whose RainTomorrow value is above 0.5.
df_val = pandas.read_csv("australia-rain-val-dbjob_18Nov2024_1731954770752_part00000.csv")
mask_rain = df_val["RainTomorrow"].gt(0.5)
df_val

Unnamed: 0,RainTomorrow,MinTemp,MaxTemp,Rainfall,WindGustDir_north,WindGustDir_east,WindGustSpeed,WindDir9am_north,WindDir9am_east,WindDir3pm_north,WindDir3pm_east,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
0,0.0,-1.575823,-0.912322,-0.447464,0.622214,-1.306855,-1.089222,0.543832,-1.405455,0.083000,-1.376365,0.136098,-0.415678,0.705367,-0.151323,-5.492681e-13,-4.509145e-13,-1.348866,-0.789395,-0.537545
1,0.0,1.275882,1.985564,-0.447464,-0.489264,1.367870,0.122041,-0.549593,1.345424,-1.240775,0.575429,0.136098,-0.660744,-1.125007,-1.296875,-4.515951e-01,-5.822026e-01,1.128619,2.129049,-0.537545
2,1.0,-0.672004,-1.750436,3.147074,-1.275198,0.584461,1.414056,-1.322761,0.539710,-1.240775,0.575429,1.464127,1.544853,0.436194,1.089691,1.017893e+00,1.008099e+00,-1.178539,-1.737889,1.860173
3,0.0,-0.407091,-1.025965,-0.447464,-0.960397,-0.993066,-0.200962,-1.322761,-0.599741,-1.349844,0.035234,0.619018,-0.170611,-0.748165,-0.055861,1.221585e+00,1.387437e+00,-0.652073,-1.125016,-0.537545
4,0.0,-0.485007,-0.869706,-0.447464,-0.489264,-1.306855,1.333305,1.007312,-1.082731,0.083000,-1.376365,0.377558,1.544853,-0.371323,-1.153681,-2.479037e-01,-2.904041e-01,-0.899822,-0.745618,-0.537545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13546,0.0,-0.313593,0.920164,-0.447464,0.622214,1.367870,-0.523966,-0.549593,-1.405455,1.096173,-0.962917,-1.191930,-0.660744,-0.317489,-1.201412,3.486213e-01,1.035239e-01,-0.388840,0.742788,-0.537545
13547,0.0,-0.921333,-1.466330,3.329847,-1.275198,-0.523446,1.575558,-1.431508,-0.030015,-1.349844,0.035234,0.377558,-0.415678,0.328525,0.946497,5.377633e-01,1.124819e+00,-1.039180,-1.621151,1.860173
13548,0.0,0.917471,0.962780,0.100855,1.093347,1.054081,-0.927720,1.425747,-0.030015,0.631325,1.339381,-0.588281,-0.660744,0.759201,-0.199055,5.377633e-01,2.348333e-01,0.462795,1.034633,1.860173
13549,0.0,-0.407091,1.374735,-0.447464,1.093347,-0.993066,0.525796,-0.002880,1.458749,1.515844,0.035234,-1.433390,0.196988,0.813036,-1.296875,-6.989347e-01,-1.194979e+00,-0.203029,1.384846,-0.537545


In [70]:
def _print_endpoint_predictions(title, rows):
    """Print ``title`` followed by one endpoint prediction per row of ``rows``.

    Every row is serialized as a comma-separated line of all its values, in
    column order, and sent to ``predictor``.
    """
    print(title)
    for rid, row in rows.iterrows():
        # NOTE(review): the row still contains the RainTomorrow label as its
        # first field (the captured request shown later in this notebook starts
        # with it) - confirm that scripts/inference/inference.py expects or
        # drops it.
        payload = ",".join(map(str, row))
        print(f"[rid={rid}]\tPrediction:", predictor.predict(payload))


# First ten validation rows of each class.
_print_endpoint_predictions("RAIN TOMORROW", df_val[mask_rain].iloc[:10])
print("\n")

_print_endpoint_predictions("NO RAIN TOMORROW", df_val[~mask_rain].iloc[:10])

RAIN TOMORROW
[rid=2]	Prediction: [['no']]
[rid=12]	Prediction: [['yes']]
[rid=13]	Prediction: [['no']]
[rid=14]	Prediction: [['yes']]
[rid=16]	Prediction: [['no']]
[rid=17]	Prediction: [['yes']]
[rid=22]	Prediction: [['yes']]
[rid=26]	Prediction: [['no']]
[rid=31]	Prediction: [['yes']]
[rid=34]	Prediction: [['no']]


NO RAIN TOMORROW
[rid=0]	Prediction: [['no']]
[rid=1]	Prediction: [['no']]
[rid=3]	Prediction: [['no']]
[rid=4]	Prediction: [['no']]
[rid=5]	Prediction: [['no']]
[rid=6]	Prediction: [['no']]
[rid=7]	Prediction: [['no']]
[rid=8]	Prediction: [['no']]
[rid=9]	Prediction: [['no']]
[rid=10]	Prediction: [['no']]


### Data capturada

Dejamos un delay de 2 minutos para que S3 pueda recibir la data

In [71]:
import time

# Wait 12 x 10 s = 2 minutes, reporting the elapsed time before each pause.
for elapsed_seconds in range(0, 120, 10):
    print(f"{elapsed_seconds} segundos...")
    time.sleep(10)
print("LISTO")

0 segundos...
10 segundos...
20 segundos...
30 segundos...
40 segundos...
50 segundos...
60 segundos...
70 segundos...
80 segundos...
90 segundos...
100 segundos...
110 segundos...
LISTO


In [78]:
# List the data-capture objects written so far for the current endpoint.
s3_client = boto3.Session().client("s3")
current_endpoint_capture_prefix = f"{prefix}/monitoring/datacapture/{endpoint_name}"

result = s3_client.list_objects(Bucket=bucket_name, Prefix=current_endpoint_capture_prefix)
# "Contents" is missing from the response when nothing matches the prefix yet,
# so fall back to an empty listing instead of iterating over None.
capture_files = [
    f"s3://{bucket_name}/{capture_file['Key']}"
    for capture_file in result.get("Contents", [])
]

print("Capture Files: ")
print("\n".join(capture_files))
if not capture_files:
    print("(none yet - wait a little longer and re-run this cell)")

Capture Files: 
s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/00-45-707-5df1f258-11a1-42fd-b020-52abc68682e3.jsonl
s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/03-53-940-1bf54dc0-f23b-4ed0-b6f2-c1df76e1dede.jsonl
s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/05-07-314-539934b6-f1e9-436c-982e-ea0aabf7d748.jsonl


In [76]:
!aws s3 cp {capture_files[0]} datacapture/captured_data_example_0.jsonl
!aws s3 cp {capture_files[1]} datacapture/captured_data_example_1.jsonl
!aws s3 cp {capture_files[2]} datacapture/captured_data_example_2.jsonl
# !head datacapture/captured_data_example.jsonl

download: s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/00-45-707-5df1f258-11a1-42fd-b020-52abc68682e3.jsonl to datacapture/captured_data_example_0.jsonl
download: s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/03-53-940-1bf54dc0-f23b-4ed0-b6f2-c1df76e1dede.jsonl to datacapture/captured_data_example_1.jsonl
download: s3://sagemaker-us-east-1-335415446667/australia-rain/monitoring/datacapture/australia-rain-sm-endpoint-1731966974/AllTraffic/2024/11/18/22/05-07-314-539934b6-f1e9-436c-982e-ea0aabf7d748.jsonl to datacapture/captured_data_example_2.jsonl


Analizamos las primeras 4 predicciones capturadas en el tercer archivo descargado (`captured_data_example_2.jsonl`), que dieron no, yes, no, yes respectivamente

In [80]:
import json

with open("datacapture/captured_data_example_2.jsonl", "r") as f:
    data = f.read()

# Each line of the capture file is parsed as one JSON record; pretty-print the first four.
capture_lines = data.split("\n")
for line_idx in range(4):
    print(json.dumps(json.loads(capture_lines[line_idx]), indent=2))

{
  "captureData": {
    "endpointInput": {
      "observedContentType": "text/csv",
      "mode": "INPUT",
      "data": "1.0,-0.6720035672084514,-1.750436292789939,3.147074156972007,-1.2751979794448909,0.5844612877482018,1.4140559026347617,-1.3227605369525006,0.5397100975998633,-1.2407753183151389,0.5754290956249299,1.464126930349772,1.544852522085946,0.4361942319513341,1.0896905171715925,1.0178931757286425,1.008099343620971,-1.1785387167368813,-1.7378889535508568,1.860173368407406",
      "encoding": "CSV"
    },
    "endpointOutput": {
      "observedContentType": "text/csv; charset=utf-8",
      "mode": "OUTPUT",
      "data": "no",
      "encoding": "CSV"
    }
  },
  "eventMetadata": {
    "eventId": "d009f9ac-84d3-468c-9b18-9854686e2fb8",
    "inferenceTime": "2024-11-18T22:05:07Z"
  },
  "eventVersion": "0"
}
{
  "captureData": {
    "endpointInput": {
      "observedContentType": "text/csv",
      "mode": "INPUT",
      "data": "1.0,-1.4979072347007536,-0.983348843817234,-0.3

### Testing

In [101]:
# Test split; the boolean mask selects rows whose RainTomorrow value is above 0.5.
df_test = pandas.read_csv("australia-rain-test-dbjob_18Nov2024_1731955058171_part00000.csv")
mask_rain_test = df_test["RainTomorrow"].gt(0.5)
df_test

Unnamed: 0,RainTomorrow,MinTemp,MaxTemp,Rainfall,WindGustDir_north,WindGustDir_east,WindGustSpeed,WindDir9am_north,WindDir9am_east,WindDir3pm_north,WindDir3pm_east,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
0,1.0,-0.179724,-0.254568,-0.443274,-1.288629,-0.533888,0.662474,-0.001443,-1.523267,0.625809,1.321980,-0.109655,-1.509713,0.221667,-0.114998,3.179906e-01,3.640538e-01,-0.656743,-0.301944,-0.538776
1,0.0,1.614333,1.093263,-0.443274,0.625570,1.338691,-0.773296,1.320512,-0.606283,0.625809,1.321980,-1.428911,0.655646,-0.046651,0.411837,-9.891015e-01,-1.010811e+00,1.755201,1.154760,-0.538776
2,0.0,1.394010,0.369691,0.604382,-0.495742,1.338691,0.662474,-0.549015,1.334532,-0.470380,1.321980,2.049128,1.497730,1.133949,0.651308,4.204890e-02,2.616703e-01,0.665936,0.411841,1.855922
3,0.0,1.913342,1.135826,0.429772,-1.288629,0.563044,-1.092356,-1.323398,0.530620,1.090461,1.015929,0.609940,-0.427033,1.187613,0.938673,-1.235997e+00,-1.303336e+00,1.226129,1.300430,1.855922
4,1.0,-2.304265,-1.261895,-0.443274,1.100872,-0.998855,0.104119,0.546128,1.334532,0.625809,-1.286777,-1.189046,0.174455,0.007013,-0.210786,-3.355555e-01,-6.597818e-01,-1.668203,-1.350771,-0.538776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,0.0,-0.148249,-0.240380,-0.443274,-1.400151,0.014578,-0.055411,0.546128,-1.410195,-0.470380,1.321980,-0.349520,-0.427033,-1.388242,-0.546045,7.246414e-01,7.443356e-01,-0.112110,-0.185408,-0.538776
13579,1.0,-1.155439,-1.247707,-0.443274,-0.495742,-1.309536,-1.570946,-1.323398,-0.606283,0.077714,0.017601,-0.589384,-2.231499,0.597313,0.747096,1.436280e+00,1.314758e+00,-1.185814,-1.175966,-0.538776
13580,0.0,0.276659,0.412254,-0.443274,-1.288629,-0.533888,0.263649,-1.013224,1.012530,-1.354526,0.017601,0.370075,-0.186438,-0.797942,-0.929199,1.001419e-01,1.007818e-01,0.199108,0.572078,-0.538776
13581,1.0,0.481244,0.440629,0.429772,1.100872,-0.998855,-0.214941,1.320512,-0.606283,1.509954,0.017601,-1.428911,0.054157,0.328995,0.507626,-5.639770e-13,5.773921e-13,0.479205,0.309872,1.855922


Usamos el endpoint para predecir en test

In [104]:
# Columns sent to the endpoint: everything except the prediction column itself.
# Previously a placeholder "RainTomorrow_pred" column (filled with None) was
# created first, so every request ended with a literal "None" field, unlike the
# requests sent for the validation rows.
payload_columns = [col for col in df_test.columns if col != "RainTomorrow_pred"]

predictions = []
print("Total requests run:")
for rid, row in df_test[payload_columns].iterrows():
    # Progress output: a new line every 1000 rows, a marker every 100 rows.
    if (rid % 1000) == 0:
        print()
    if (rid % 100) == 0:
        print(rid, end="...")
    # NOTE(review): the payload still starts with the RainTomorrow label column -
    # confirm that scripts/inference/inference.py expects or drops it.
    payload = ",".join(str(v) for v in row)
    # The deserialized response is indexed as [row][column]; keep its first cell.
    predictions.append(predictor.predict(payload)[0][0])

# Assign all predictions in one step instead of writing cell by cell.
df_test["RainTomorrow_pred"] = predictions

print()
print()
print("LISTO")

Total requests run:

0...100...200...300...400...500...600...700...800...900...
1000...1100...1200...1300...1400...1500...1600...1700...1800...1900...
2000...2100...2200...2300...2400...2500...2600...2700...2800...2900...
3000...3100...3200...3300...3400...3500...3600...3700...3800...3900...
4000...4100...4200...4300...4400...4500...4600...4700...4800...4900...
5000...5100...5200...5300...5400...5500...5600...5700...5800...5900...
6000...6100...6200...6300...6400...6500...6600...6700...6800...6900...
7000...7100...7200...7300...7400...7500...7600...7700...7800...7900...
8000...8100...8200...8300...8400...8500...8600...8700...8800...8900...
9000...9100...9200...9300...9400...9500...9600...9700...9800...9900...
10000...10100...10200...10300...10400...10500...10600...10700...10800...10900...
11000...11100...11200...11300...11400...11500...11600...11700...11800...11900...
12000...12100...12200...12300...12400...12500...12600...12700...12800...12900...
13000...13100...13200...13300...13400.

In [105]:
# Turn the numeric label into booleans: truncate to an integer, then cast to bool.
df_test["RainTomorrow"] = df_test["RainTomorrow"].astype(int).astype(bool)

In [106]:
# Translate the endpoint's "yes"/"no" answers into booleans; any other value is kept as-is.
label_to_bool = {"yes": True, "no": False}
df_test["RainTomorrow_pred"] = df_test["RainTomorrow_pred"].map(lambda v: label_to_bool.get(v, v))

In [107]:
# Count every (actual, predicted) combination.
df_test.value_counts(subset=["RainTomorrow", "RainTomorrow_pred"]).sort_index()

RainTomorrow  RainTomorrow_pred
False         False                8997
              True                 1608
True          False                1839
              True                 1139
Name: count, dtype: int64

In [108]:
# Give the row index an explicit name ("indice").
df_test.index.name = "indice"

In [109]:
# Display the test frame, now with boolean labels and the prediction column.
df_test

Unnamed: 0_level_0,RainTomorrow,MinTemp,MaxTemp,Rainfall,WindGustDir_north,WindGustDir_east,WindGustSpeed,WindDir9am_north,WindDir9am_east,WindDir3pm_north,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow_pred
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,-0.179724,-0.254568,-0.443274,-1.288629,-0.533888,0.662474,-0.001443,-1.523267,0.625809,...,-0.109655,-1.509713,0.221667,-0.114998,3.179906e-01,3.640538e-01,-0.656743,-0.301944,-0.538776,False
1,False,1.614333,1.093263,-0.443274,0.625570,1.338691,-0.773296,1.320512,-0.606283,0.625809,...,-1.428911,0.655646,-0.046651,0.411837,-9.891015e-01,-1.010811e+00,1.755201,1.154760,-0.538776,False
2,False,1.394010,0.369691,0.604382,-0.495742,1.338691,0.662474,-0.549015,1.334532,-0.470380,...,2.049128,1.497730,1.133949,0.651308,4.204890e-02,2.616703e-01,0.665936,0.411841,1.855922,False
3,False,1.913342,1.135826,0.429772,-1.288629,0.563044,-1.092356,-1.323398,0.530620,1.090461,...,0.609940,-0.427033,1.187613,0.938673,-1.235997e+00,-1.303336e+00,1.226129,1.300430,1.855922,True
4,True,-2.304265,-1.261895,-0.443274,1.100872,-0.998855,0.104119,0.546128,1.334532,0.625809,...,-1.189046,0.174455,0.007013,-0.210786,-3.355555e-01,-6.597818e-01,-1.668203,-1.350771,-0.538776,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,False,-0.148249,-0.240380,-0.443274,-1.400151,0.014578,-0.055411,0.546128,-1.410195,-0.470380,...,-0.349520,-0.427033,-1.388242,-0.546045,7.246414e-01,7.443356e-01,-0.112110,-0.185408,-0.538776,False
13579,True,-1.155439,-1.247707,-0.443274,-0.495742,-1.309536,-1.570946,-1.323398,-0.606283,0.077714,...,-0.589384,-2.231499,0.597313,0.747096,1.436280e+00,1.314758e+00,-1.185814,-1.175966,-0.538776,False
13580,False,0.276659,0.412254,-0.443274,-1.288629,-0.533888,0.263649,-1.013224,1.012530,-1.354526,...,0.370075,-0.186438,-0.797942,-0.929199,1.001419e-01,1.007818e-01,0.199108,0.572078,-0.538776,False
13581,True,0.481244,0.440629,0.429772,1.100872,-0.998855,-0.214941,1.320512,-0.606283,1.509954,...,-1.428911,0.054157,0.328995,0.507626,-5.639770e-13,5.773921e-13,0.479205,0.309872,1.855922,False


In [110]:
# Number of missing values per column.
df_test.isna().sum()

RainTomorrow         0
MinTemp              0
MaxTemp              0
Rainfall             0
WindGustDir_north    0
WindGustDir_east     0
WindGustSpeed        0
WindDir9am_north     0
WindDir9am_east      0
WindDir3pm_north     0
WindDir3pm_east      0
WindSpeed9am         0
WindSpeed3pm         0
Humidity9am          0
Humidity3pm          0
Pressure9am          0
Pressure3pm          0
Temp9am              0
Temp3pm              0
RainToday            0
RainTomorrow_pred    0
dtype: int64

In [111]:
# Write the test frame, including the prediction column, to a local CSV file.
df_test.to_csv("australia-rain-test-predicted.csv")