In [None]:
# === Lab 8: Steps 2–6 in one cell (clean & self-contained) ===
import os, time, uuid, boto3
from pathlib import Path
from sagemaker import Session, image_uris
from sagemaker.sklearn.model import SKLearnModel

# --- Env/clients ---
# Configuration is read from environment variables. BUCKET, SM_ROLE_ARN and
# S3_ARTIFACTS are mandatory: a missing one raises KeyError right here.
# AWS_REGION and LAB_PREFIX are optional and fall back to the defaults shown.
REGION       = os.environ.get("AWS_REGION", "ap-northeast-2")
BUCKET       = os.environ["BUCKET"]          # bucket searched for the test input below
LABP         = os.environ.get("LAB_PREFIX", "student")  # prefix for the package group / job names
SM_ROLE_ARN  = os.environ["SM_ROLE_ARN"]     # role passed to the SageMaker model
S3_ARTIFACTS = os.environ["S3_ARTIFACTS"]    # presumably an s3:// prefix (used as one below) — confirm

# One boto3 session pinned to REGION backs both low-level clients and the
# SageMaker SDK session, so everything talks to the same region.
boto_sess = boto3.Session(region_name=REGION)
sm = boto_sess.client("sagemaker")
s3 = boto_sess.client("s3")
sagemaker_sess = Session(boto_session=boto_sess)

# --- Step 2: resolve your entrypoint ---
# NOTE: Path.resolve() anchors a relative path to the process's current working
# directory, so this only points "relative to this notebook" when the kernel's
# CWD is the notebook's own directory.
entry_point_path = Path("../sagemaker/code/inference.py").resolve()
assert entry_point_path.exists(), f"Missing: {entry_point_path}"

# --- Step 3: latest Approved package ARN ---
# Ask for a single result, newest first, restricted to Approved packages in
# the "<LABP>-telco-churn" group; the first summary is therefore the latest one.
MPG = f"{LABP}-telco-churn"
r = sm.list_model_packages(
    ModelPackageGroupName=MPG,
    ModelApprovalStatus="Approved",
    SortBy="CreationTime", SortOrder="Descending", MaxResults=1
)
assert r.get("ModelPackageSummaryList"), "No Approved package; approve one in Lab 7."
pkg_arn = r["ModelPackageSummaryList"][0]["ModelPackageArn"]

# --- Step 4: describe package -> image, model_data, eval json ---
# The image and model artifact come from the FIRST container of the inference
# specification; the eval JSON is the S3 URI of the model-quality statistics.
# These are plain dict lookups, so a package missing any of the fields raises
# KeyError (or IndexError for an empty container list) here.
info = sm.describe_model_package(ModelPackageName=pkg_arn)
image_uri      = info["InferenceSpecification"]["Containers"][0]["Image"]
model_data_url = info["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]
eval_json_s3   = info["ModelMetrics"]["ModelQuality"]["Statistics"]["S3Uri"]

# Echo what was resolved so the values can be checked before launching a job.
print("ENTRY:", entry_point_path)
print("IMAGE:", image_uri)
print("MODEL:", model_data_url)
print("EVAL :", eval_json_s3)

# --- Step 5: choose input/output S3 (find newest processed test.csv if not set) ---
def find_test_input():
    """Return the ``s3://`` URI of the newest ``test.csv`` object in BUCKET.

    Lists every object under ``data/processed/`` and ``artifacts/preprocess/``
    in ``BUCKET``, keeps the keys ending in ``test.csv`` and picks the one with
    the greatest ``LastModified`` value.

    Returns:
        ``s3://<BUCKET>/<key>`` for the most recently modified match.

    Raises:
        AssertionError: If no matching object exists under either prefix.
    """
    # NOTE(review): this is a plain suffix test, so a key such as
    # ".../latest.csv" also matches — confirm that is acceptable.
    pager = s3.get_paginator("list_objects_v2")
    matches = [
        (entry["LastModified"], entry["Key"])
        for prefix in ("data/processed/", "artifacts/preprocess/")
        for listing in pager.paginate(Bucket=BUCKET, Prefix=prefix)
        for entry in listing.get("Contents", [])
        if entry["Key"].endswith("test.csv")
    ]
    assert matches, "No processed test.csv found."
    # Compare on the timestamp only; ties keep the first match encountered.
    newest_key = max(matches, key=lambda pair: pair[0])[1]
    return f"s3://{BUCKET}/{newest_key}"

from datetime import datetime, timezone

# Reuse input_s3 / output_s3 when an earlier cell already defined them with a
# truthy value; otherwise discover the input and build a fresh output prefix.
input_s3 = globals().get("input_s3") or find_test_input()
# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC "now"
# formats to exactly the same YYYYmmdd-HHMMSS stamp.
output_s3 = globals().get("output_s3") or (
    f"{S3_ARTIFACTS.rstrip('/')}/batch/"
    f"{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}/"
)
print("INPUT:", input_s3)
print("OUT  :", output_s3)

# --- Step 6: build Model with your code, then run Batch Transform ---
# Location of the feature-column list, derived from the artifacts prefix.
feature_list_s3 = f"{S3_ARTIFACTS.rstrip('/')}/preprocess/columns.json"

# Wrap the registered package's image + model artifact in an SKLearnModel that
# also carries the local inference script (entry_point inside source_dir).
sk_model = SKLearnModel(
    model_data=model_data_url,
    image_uri=image_uri,
    role=SM_ROLE_ARN,
    entry_point=entry_point_path.name,
    source_dir=str(entry_point_path.parent),
    sagemaker_session=sagemaker_sess,
    code_location=f"{S3_ARTIFACTS.rstrip('/')}/code",   # force the code upload to land under this prefix
    # Environment variables set on the model container; presumably read by
    # inference.py at load/predict time — confirm against that script.
    env={
        "EVAL_JSON_S3": eval_json_s3,
        "FEATURE_LIST_JSON": feature_list_s3
    },
)


# One ml.m5.large instance; records are batched (MultiRecord) and the outputs
# are joined line by line. max_payload is in MB per the SageMaker SDK docs.
transformer = sk_model.transformer(
    instance_type="ml.m5.large",
    instance_count=1,
    strategy="MultiRecord",
    assemble_with="Line",
    output_path=output_s3,
    max_payload=6,
    max_concurrent_transforms=2,
)

# Random 8-hex-character suffix keeps the job name unique across reruns.
job_name = f"{LABP}-bt-{uuid.uuid4().hex[:8]}"
# NOTE(review): depending on the installed SageMaker SDK version, transform()
# may block until the job finishes by default — confirm; if so, "Started:" is
# printed only after completion and the polling further down returns at once.
transformer.transform(data=input_s3, content_type="text/csv", split_type="Line", job_name=job_name)
print("Started:", job_name)

# # prove your code is attached
# m = sm.describe_model(ModelName=transformer.latest_transform_job.describe()["ModelName"])
# env = m["PrimaryContainer"].get("Environment", {})
# print("PROGRAM:", env.get("SAGEMAKER_PROGRAM"))
# print("SUBMIT :", env.get("SAGEMAKER_SUBMIT_DIRECTORY"))

# --- prove your code is attached (robust) ---
# Look the model up through the transform job itself rather than through the
# SDK's transformer object, then read the container's environment variables.
d = sm.describe_transform_job(TransformJobName=job_name)
model_name = d.get("ModelName")

# Default to an empty dict so the prints below show None instead of raising
# when the job description carries no model name or the model has no
# PrimaryContainer / Environment entry.
env = {}
if model_name:
    m = sm.describe_model(ModelName=model_name)
    env = m.get("PrimaryContainer", {}).get("Environment", {})

# Presumably set by the SDK when an entry_point/source_dir is attached;
# None here would suggest the custom code was not attached — confirm.
print("PROGRAM:", env.get("SAGEMAKER_PROGRAM"))
print("SUBMIT :", env.get("SAGEMAKER_SUBMIT_DIRECTORY"))


# Poll the transform job every 20 seconds until it reaches a terminal state
# (Completed, Failed or Stopped); print the failure reason if it did not complete.
while True:
    d = sm.describe_transform_job(TransformJobName=job_name)
    status = d["TransformJobStatus"]
    print("Status:", status)
    if status not in ("Completed", "Failed", "Stopped"):
        # Not finished yet: wait before asking again.
        time.sleep(20)
        continue
    if status != "Completed":
        print("FailureReason:", d.get("FailureReason"))
    break


In [None]:
# --- Helper: parse an s3://bucket/prefix/ into (bucket, prefix)
from urllib.parse import urlparse

def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/prefix`` URI into ``(bucket, prefix)``.

    The URI is split by hand rather than with ``urlparse`` because S3 keys may
    legally contain ``?`` and ``#``; ``urlparse`` treats those as the start of
    a query string / fragment and would silently truncate the prefix.

    Args:
        s3_uri: URI of the form ``s3://<bucket>/<optional prefix>``. The
            scheme is matched case-insensitively.

    Returns:
        ``(bucket, prefix)``. Leading slashes are stripped from the prefix; it
        is otherwise returned unchanged (no trailing slash is added) and is
        ``""`` when the URI names only a bucket.

    Raises:
        AssertionError: If the scheme is not ``s3`` or the bucket is empty.
    """
    scheme, sep, rest = s3_uri.partition("://")
    bucket, _, prefix = rest.partition("/")
    if not sep or scheme.lower() != "s3" or not bucket:
        # Raised explicitly instead of via `assert` so the check survives
        # `python -O`; the exception type and message are unchanged.
        raise AssertionError(f"Bad S3 URI: {s3_uri}")
    return bucket, prefix.lstrip("/")

# Echo the output location chosen earlier so the listing below has context.
print("Output (we set this in Step 6):", output_s3)

# --- Collect the key of every .out object found under the output prefix
out_bucket, out_prefix = parse_s3_uri(output_s3)
paginator = s3.get_paginator("list_objects_v2")
out_keys = [
    entry["Key"]
    for listing in paginator.paginate(Bucket=out_bucket, Prefix=out_prefix)
    for entry in listing.get("Contents", [])
    if entry["Key"].endswith(".out")
]

# Stop here when nothing was produced; otherwise report the first key found.
assert out_keys, f"No .out files found under {output_s3}"
print(f"Found {len(out_keys)} output file(s). Showing the first one:")
print("  s3://%s/%s" % (out_bucket, out_keys[0]))

# --- Download the first output object (in full) and split it into text lines
resp = s3.get_object(Bucket=out_bucket, Key=out_keys[0])
lines = resp["Body"].read().decode("utf-8").splitlines()

# Contract check: the file must be non-empty, and its first line is expected
# to be the header "proba,pred" (compared case-insensitively, whitespace
# stripped). A mismatch only warns; it does not stop the cell.
assert lines, "Output file is empty."
header = lines[0].strip()
print("Header:", header)
if header.lower() != "proba,pred":
    print("WARNING: Unexpected header. Did your inference script change?")
    print("First line raw:", header)

# Show at most the first ten lines, one per output row.
print("\nPreview (first 10 lines):")
print(*lines[:10], sep="\n")