In [None]:
# Download the healthcare question/answer CSV into the notebook's working directory.
! wget https://healthcaredatasetbucket.s3.amazonaws.com/HealthCare.csv

In [173]:
!pip install "sagemaker>=2.48.0" --upgrade

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [174]:
import sagemaker
import os 
import boto3
import time
from sagemaker.workflow.parameters import ParameterBoolean , ParameterInteger , ParameterFloat , ParameterString
from sagemaker.pytorch import PyTorch , PyTorchModel
from sagemaker.processing import ProcessingInput , ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep , TrainingStep , TrainingInput
from sagemaker.huggingface import HuggingFaceModel , HuggingFace
from sagemaker.workflow.lambda_step import LambdaStep
from sagemaker.lambda_helper import Lambda
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo , ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.pipeline import Pipeline , PipelineExperimentConfig
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.step_collections import RegisterModel , CreateModelStep 
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.pipeline import Pipeline, PipelineExperimentConfig
from sagemaker.workflow.execution_variables import ExecutionVariables

In [175]:
# Bootstrap session: used to look up the region and the account's default bucket.
sess = sagemaker.Session()
region = sess.boto_region_name

# No bucket is configured here, so fall back to the session's default bucket.
sagemaker_bucket = None
if sess is not None and sagemaker_bucket is None:
    sagemaker_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()

# Re-create the session bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_bucket)

# Summary of the resolved environment.
for summary_line in (
    f'The sagemaker region is : {region}',
    f'The sagemaker role is : {role}',
    f'The bucket name : {sagemaker_bucket}',
    f'The sess : {sess}',
):
    print(summary_line)


The sagemaker region is : us-east-1
The sagemaker role is : arn:aws:iam::274743203955:role/service-role/AmazonSageMaker-ExecutionRole-20230619T115265
The bucket name : sagemaker-us-east-1-274743203955
The sess : <sagemaker.session.Session object at 0x7f99ce6c6590>


In [176]:
 
# Framework versions passed to the Hugging Face estimator and model below.
transformers_version = "4.28"
pytorch_version = "2.0"
python_version = "py310"

# Prefixes used for S3 keys and for job base names.
s3_perfix = "T5demo"
base_job_prefix = "T5demohealthcare"

# Base checkpoint; exposed as the "ModelName" pipeline parameter.
model_id_ = "t5-base"
model_id = ParameterString(name="ModelName", default_value=model_id_)


In [177]:
# Pipeline parameters describing the preprocessing job: script path, instance type, instance count.
processing_scripts = ParameterString(
    name="processingscript",
    default_value="./scripts/preprocessing.py",
)
preprocessing_instance = ParameterString(
    name="preprocess_instance",
    default_value="ml.c5.2xlarge",
)
processing_count = ParameterInteger(
    name="preprocess_instance_count",
    default_value=1,
)


In [178]:
# NOTE(review): this value is reassigned (with an s3:// scheme and a /data suffix) in the
# preprocessing-step cell before anything in the visible notebook reads it.
s3_bucket_store = f"{sagemaker_bucket}/{s3_perfix}"

In [179]:
# Upload the local CSV under the project prefix; the returned value is later passed to the
# preprocessing job as --dataset_name.
# NOTE(review): the download cell fetches HealthCare.csv, while this reads
# Preprocessed_HealthCare.csv, which no visible cell creates — confirm where it comes from.
s3_input = sess.upload_data(path="Preprocessed_HealthCare.csv",key_prefix=s3_perfix)

In [180]:
%%writefile scripts/preprocessing.py

import argparse
import logging
import os 
import sys 
import subprocess
import pandas as pd 
import numpy as np




def install(package):
    """Install ``package`` with pip, using the interpreter that runs this script.

    Args:
        package: A pip requirement specifier, e.g. ``"datasets==2.12.0"``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)



if __name__=="__main__":
    # Entry point of the processing job: read the question/answer CSV, split it into
    # train/test, tokenize both splits and save them under /opt/ml/processing/{train,test}.

    parse=argparse.ArgumentParser()

    parse.add_argument("--model_id", type=str)         # checkpoint name used to load the tokenizer
    parse.add_argument("--dataset_name", type=str)     # location of the CSV handed to pandas.read_csv
    parse.add_argument("--pytorch_version", type=str)  # accepted, but not read by this script

    # Parse first so a malformed command line fails before the pip installs run.
    args, _ = parse.parse_known_args()

    # Runtime dependencies are installed inside the container before they are imported.
    # NOTE(review): "datasets[s3]" is unpinned and is followed by the pinned datasets==2.12.0;
    # the install order is kept exactly as it was.
    install("fsspec==2023.1.0")
    install("s3fs==0.4.2")
    install("scikit-learn==0.22.1")
    install("datasets[s3]")
    install("datasets==2.12.0")
    install("transformers==4.30.1")
    install("torch==1.13.1")

    # Imported late because the packages above are only available after installation.
    from sklearn.model_selection import train_test_split
    from datasets import Dataset
    from transformers import AutoTokenizer

    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # pandas reads the CSV directly from the location given on the command line
    df=pd.read_csv(args.dataset_name)

    df_train,df_test=train_test_split(df,test_size=0.2,random_state=42)

    # Write ONLY the training rows to train.csv. Previously the test rows were concatenated
    # back into the training file, so the model was evaluated on data it had been trained on.
    df_train.to_csv("train.csv",index=False)
    df_test.to_csv("test.csv",index=False)

    # Reading the files back yields frames with a fresh default index.
    train_data=pd.read_csv("train.csv")
    test_data=pd.read_csv("test.csv")

    train_dataset = Dataset.from_pandas(train_data)
    test_dataset = Dataset.from_pandas(test_data)

    logger.info(f"The lenght of the train dataset : {len(train_dataset)}")

    logger.info(f"The lenght of test dataset : {len(test_dataset)}")

    tokenizer=AutoTokenizer.from_pretrained(args.model_id)

    # Maximum token lengths for the encoder input (question) and decoder target (answer).
    encoder_length =  128
    decoder_length =  512

    def preprocess_data(data):
        """Tokenize one batch of rows into model inputs and labels.

        Questions are prefixed with "question: " and answers with "answer: ". Both are
        padded/truncated to fixed lengths, and pad tokens in the labels are replaced by
        -100 (the value the training script passes as ``label_pad_token_id``).
        """
        inputs = ["question: " + item for item in data["Question"]]

        outputs = ["answer: " + str(ans) for ans in data["Answer"]]

        model_inputs = tokenizer(inputs,max_length=encoder_length,padding='max_length', truncation=True,add_special_tokens=True)

        labels= tokenizer(outputs, max_length=decoder_length,padding='max_length', truncation=True,add_special_tokens=True)

        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

        model_inputs["labels"]=labels["input_ids"]

        return model_inputs

    train_dataset=train_dataset.map(preprocess_data,batched=True,remove_columns=["Question","Answer"])

    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    test_dataset=test_dataset.map(preprocess_data,batched=True,remove_columns=["Question","Answer"])

    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # These directories are the `source` paths of the processing step's outputs.
    train_dataset.save_to_disk("/opt/ml/processing/train")

    test_dataset.save_to_disk("/opt/ml/processing/test")
    

Overwriting scripts/preprocessing.py


In [181]:
## preprocessing step 

# Root S3 location under which the tokenized train/test datasets are stored.
s3_bucket_store = f"s3://{sagemaker_bucket}/{s3_perfix}/data"

processor = SKLearnProcessor(
    role=role,
    sagemaker_session=sess,
    framework_version="0.23-1",
    instance_type=preprocessing_instance.default_value,
    instance_count=processing_count,
    base_job_name=base_job_prefix + "/preprocessing",
)

# One output per split: the directory the script writes to, and its S3 destination.
preprocessing_outputs = [
    ProcessingOutput(
        output_name=split,
        source=f"/opt/ml/processing/{split}",
        destination=f"{s3_bucket_store}/{split}",
    )
    for split in ("train", "test")
]

process_step = ProcessingStep(
    name="T5DemoPreProcessing",
    processor=processor,
    code=processing_scripts.default_value,
    job_arguments=[
        "--pytorch_version", pytorch_version,
        "--dataset_name", s3_input,
        "--model_id", model_id,
    ],
    outputs=preprocessing_outputs,
)





In [182]:
# training entry point

# Pipeline parameters describing the training job.
training_entry_point = ParameterString(
    name="trainingscript",
    default_value="train.py",
)
training_src_dir = ParameterString(
    name="trainsourcrdir",
    default_value="./scripts",
)
training_instance_type = ParameterString(
    name="traininstancetype",
    default_value="ml.g5.2xlarge",
)
training_instance_count = ParameterInteger(
    name="traininginstancecount",
    default_value=1,
)

# Hyperparameters exposed as pipeline parameters.
epochs = ParameterInteger(name="epochs", default_value=1)
train_batch_size = ParameterInteger(name="trainbatchsize", default_value=4)
test_batch_size = ParameterInteger(name="testbatchsize", default_value=4)
learning_rate = ParameterFloat(name="learningrate", default_value=3e-3)



In [183]:
%%writefile scripts/requirements.txt
# Pinned packages for the training script; this file sits in the estimator's source_dir.
# NOTE(review): torch is pinned to 1.13.1 here while the estimator is configured with
# pytorch_version "2.0" — confirm which version is intended.
rouge==1.0.1
rouge-score==0.1.2
transformers==4.30.2
torch==1.13.1
datasets==2.13.0
accelerate==0.20.3
tensorboard==2.11.2

Overwriting scripts/requirements.txt


In [184]:
%%writefile scripts/train.py 
import torch 
import gc 
import pandas as pd 
import numpy as np
from transformers import AdamW, AutoModelForSeq2SeqLM, Seq2SeqTrainer , Seq2SeqTrainingArguments , AutoTokenizer , DataCollatorForSeq2Seq
import os 
import warnings
import sys 
import argparse
import logging
import json
from datasets import load_from_disk , Dataset , load_metric
from transformers.trainer_utils import get_last_checkpoint

# Set the TOKENIZERS_PARALLELISM switch to "false" and suppress all Python warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')

if __name__=='__main__':
    # Entry point of the training job: fine-tune a seq2seq checkpoint on the tokenized
    # datasets, evaluate it with ROUGE, and save the model, tokenizer and evaluation
    # metrics into SM_MODEL_DIR.

    parse = argparse.ArgumentParser()

    ## hyperparameters (typed, so values given on the command line are not left as strings)
    parse.add_argument("--epochs",type=int,default=1)
    parse.add_argument("--train_batch_size",type=int,default=4)
    parse.add_argument("--test_batch_size",type=int,default=2)
    parse.add_argument("--train_grad_accumulation",type=int,default=1)
    parse.add_argument("--test_grad_accumulation",type=int,default=1)
    parse.add_argument("--learning_rate",type=float,default=3e-3)
    parse.add_argument("--warmup_steps",type=int,default=1000)
    parse.add_argument("--model_id",default="t5-base")
    ## sagemaker values
    parse.add_argument("--output_data_dir",default=os.environ["SM_OUTPUT_DATA_DIR"])
    parse.add_argument("--model_dir",default=os.environ["SM_MODEL_DIR"])
    parse.add_argument("--n_gpus",default=os.environ["SM_NUM_GPUS"])
    parse.add_argument("--training_dir",default=os.environ["SM_CHANNEL_TRAIN"])
    parse.add_argument("--test_dir",default=os.environ["SM_CHANNEL_TEST"])

    args,_ =parse.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Datasets were written with save_to_disk by the preprocessing script.
    train_dataset=load_from_disk(args.training_dir)
    test_dataset=load_from_disk(args.test_dir)

    tokenizer=AutoTokenizer.from_pretrained(args.model_id)

    metric = load_metric('rouge')

    def compute_metrics(eval_preds):
        """Decode generated ids and labels and return ROUGE mid f-measures.

        Returns a dict with the keys rouge1, rouge2, rougeL and rougeLsum.
        """
        preds, labels = eval_preds

        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 is a masking value, not a vocabulary id, so it cannot be decoded:
        # swap it for the pad token in both predictions and labels.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True ,clean_up_tokenization_spaces=True)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        return {'rouge1':result['rouge1'].mid.fmeasure,
                'rouge2':result['rouge2'].mid.fmeasure,
                'rougeL':result['rougeL'].mid.fmeasure,
                'rougeLsum':result['rougeLsum'].mid.fmeasure }

    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_id)

    label_pad_token_id = -100

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8
    )

    # Look up an existing checkpoint once; the result drives both overwrite_output_dir
    # and whether training resumes.
    last_checkpoint = get_last_checkpoint(args.output_data_dir)

    training_args = Seq2SeqTrainingArguments(
        output_dir=args.output_data_dir,
        overwrite_output_dir=last_checkpoint is not None,
        num_train_epochs=args.epochs,
        do_eval=True,
        generation_max_length=512, # decoder length
        logging_strategy="steps",
        logging_steps=500,
        # NOTE(review): hard-coded; --train_grad_accumulation is parsed but not used here.
        gradient_accumulation_steps=3,
        predict_with_generate=True,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.test_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=args.learning_rate,
        load_best_model_at_end=True,
        report_to="tensorboard"
    )

    # create Trainer instance
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Train exactly once: resume when a checkpoint exists, otherwise start fresh.
    # (An extra unconditional trainer.train() used to follow this branch, which ran
    # the whole training a second time.)
    if last_checkpoint is not None:
        logger.info("***** continue training *****")
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        trainer.train()

    # evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_dataset)

    # writes eval result to a file inside the model directory so it can be read later
    with open(os.path.join(os.environ["SM_MODEL_DIR"], "evaluation.json"), "w") as writer:
        print(f"***** Eval results *****")
        writer.write(json.dumps(eval_result))

    # Save model and tokenizer to SM_MODEL_DIR (not output_dir, which holds the checkpoints)
    trainer.save_model(os.environ["SM_MODEL_DIR"])
    tokenizer.save_pretrained(os.environ["SM_MODEL_DIR"])

Overwriting scripts/train.py


In [185]:

# Hyperparameters read by train.py's argument parser.
training_hyperparameters = {
    'epochs': epochs,
    'test_batch_size': test_batch_size,
    'train_batch_size': train_batch_size,
    'learning_rate': learning_rate,
    'model_id': model_id,
}

estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./scripts",
    base_job_name="t5demojob",
    role=role,
    sagemaker_session=sess,
    instance_type=training_instance_type.default_value,
    instance_count=1,
    framework_version=pytorch_version,
    transformers_version=transformers_version,
    pytorch_version=pytorch_version,
    py_version=python_version,
    hyperparameters=training_hyperparameters,
)

# Each training channel reads the S3 URI of the same-named preprocessing output.
training_inputs = {
    channel: TrainingInput(
        s3_data=process_step.properties.ProcessingOutputConfig.Outputs[channel].S3Output.S3Uri
    )
    for channel in ("train", "test")
}

step_train = TrainingStep(
    name="TrainingT5Model",
    estimator=estimator,
    inputs=training_inputs,
)


In [186]:
# Pipeline parameter holding the path of the evaluation script; its default value is
# what the evaluation step passes as `code`.
evaluation_script = ParameterString(name="EvaluationScript", default_value="./scripts/evaluate.py")


In [187]:
%%writefile scripts/evaluate.py


import subprocess
import sys
import json
import logging
import pathlib
import tarfile
import os

import numpy as np
import pandas as pd


# Root logger at INFO level with a stream handler attached.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


def load_eval_result(model_archive, member_name="evaluation.json"):
    """Read a JSON member from a model archive without unpacking the whole archive.

    Args:
        model_archive: Path to a tar archive (compression is auto-detected by ``tarfile.open``).
        member_name: Normalized path of the JSON file inside the archive. A leading
            ``./`` in the stored member name is ignored when matching.

    Returns:
        The decoded JSON content of the member.

    Raises:
        FileNotFoundError: If the archive contains no regular file with that name.
    """
    with tarfile.open(model_archive) as tar:
        for member in tar:
            if member.isfile() and os.path.normpath(member.name) == member_name:
                with tar.extractfile(member) as handle:
                    return json.load(handle)
    raise FileNotFoundError(f"{member_name} not found in {model_archive}")


if __name__ == "__main__":
    # Logged at INFO: the logger is configured at INFO level, so a DEBUG message
    # would never be emitted.
    logger.info("Starting evaluation.")
    model_path = "/opt/ml/processing/model/model.tar.gz"

    # Only the small metrics file is needed, so the model weights are not extracted.
    eval_result = load_eval_result(model_path)

    logger.info(eval_result)
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Re-emit the metrics as evaluation.json in the directory the step exposes as its output.
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(eval_result))

Overwriting scripts/evaluate.py


In [188]:
# Processor for the evaluation script; uses the same instance settings as preprocessing.
script_eval = SKLearnProcessor(
    role=role,
    sagemaker_session=sess,
    framework_version="0.23-1",
    instance_type=preprocessing_instance.default_value,
    instance_count=processing_count,
    base_job_name=base_job_prefix + "/evaluation",
)

# Declares evaluation.json (in the "evaluation" output) as a property file for the pipeline.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

# Input: the trained model artifact. Output: the directory holding the evaluation report.
trained_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
evaluation_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
    destination=f"s3://{sagemaker_bucket}/{s3_perfix}/evaluation_report",
)

step_eval = ProcessingStep(
    name="EvalStep",
    processor=script_eval,
    code=evaluation_script.default_value,
    inputs=[trained_model_input],
    outputs=[evaluation_output],
    property_files=[evaluation_report],
)

In [189]:
# Minimum score required by the condition step; despite the variable name, the value it is
# compared against is the "eval_rouge1" entry of the evaluation report.
threshold_accuracy = ParameterFloat(name="ThresholdRouge", default_value=0.25)

In [190]:
# Step used as the else-branch of the condition step: fails the execution with this message.
failed_step=FailStep(
    name="FailedStep",
    error_message="The Rouge score is less than threshold",
    display_name="Failed due to less Rouge score"
)

In [191]:
# Model package group that collects the registered versions.
model_package_group_name = "T5Group"

# Inference model built from the training step's artifact.
model = HuggingFaceModel(
    role=role,
    sagemaker_session=sess,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    transformers_version=transformers_version,
    pytorch_version=pytorch_version,
    py_version=python_version,
)

# Register the model with approval status "Approved".
step_register = RegisterModel(
    name="T5RegisterModel",
    model=model,
    model_package_group_name=model_package_group_name,
    approval_status="Approved",
    content_types=["application/json"],
    response_types=["application/json"],
    inference_instances=["ml.g5.xlarge"],
)


In [192]:
%%writefile scripts/iam_helper.py

import boto3
import json

iam = boto3.client("iam")


def create_lambda_role(role_name):
    """Return the ARN of an IAM role assumable by Lambda, creating it when it is missing.

    A newly created role gets the AWSLambdaBasicExecutionRole and
    AmazonSageMakerFullAccess managed policies attached. When a role with this name
    already exists, its ARN is returned and its policies are left untouched.

    Args:
        role_name: Name of the IAM role.

    Returns:
        The role's ARN.
    """
    # Trust policy letting the Lambda service assume the role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "lambda.amazonaws.com"},
                "Action": "sts:AssumeRole",
            }
        ],
    }
    managed_policy_arns = (
        "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole",
        "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess",
    )

    try:
        created = iam.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trust_policy),
            Description="Role for Lambda to call SageMaker functions",
        )
        role_arn = created["Role"]["Arn"]

        for policy_arn in managed_policy_arns:
            iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

        return role_arn

    except iam.exceptions.EntityAlreadyExistsException:
        print(f"Using ARN from existing role: {role_name}")
        existing = iam.get_role(RoleName=role_name)
        return existing["Role"]["Arn"]


Overwriting scripts/iam_helper.py


In [193]:
from scripts.iam_helper import create_lambda_role

# ARN of the execution role used by the deployment Lambda (created on first use, reused afterwards).
lambda_role = create_lambda_role("deploy-model-lambda-role")

Using ARN from existing role: deploy-model-lambda-role


In [194]:
%%writefile scripts/deploy_model_lambda.py


"""
This Lambda function deploys the model to SageMaker Endpoint. 
If Endpoint exists, then Endpoint will be updated with new Endpoint Config.
"""

import json
import boto3
import time


sm_client = boto3.client("sagemaker")


def lambda_handler(event, context):
    """Create a model and endpoint config, then create or update the endpoint.

    Args:
        event: Dict with the keys ``endpoint_instance_type``, ``model_name``,
            ``model_package_arn``, ``endpoint_config_name``, ``endpoint_name`` and ``role``.
        context: Lambda context object (unused).

    Returns:
        Dict with ``statusCode`` 200 and a JSON-encoded message body.
    """
    print(f"Received Event: {event}")

    current_time = time.strftime("%m-%d-%H-%M-%S", time.localtime())
    endpoint_instance_type = event["endpoint_instance_type"]
    # Model names must be unique, so suffix the requested name with a timestamp exactly
    # like the endpoint config name. With a fixed name, create_model fails on the second
    # invocation because the model already exists.
    model_name = "{}-{}".format(event["model_name"], current_time)
    model_package_arn = event["model_package_arn"]
    endpoint_config_name = "{}-{}".format(event["endpoint_config_name"], current_time)
    endpoint_name = event["endpoint_name"]
    role=event["role"]

    container = {"ModelPackageName": model_package_arn}

    create_model_response = sm_client.create_model(
        ModelName=model_name, ExecutionRoleArn=role, Containers=[container]
    )
    print(f"create_model_response: {create_model_response}")

    # Create Endpoint Configuration
    create_endpoint_config_response = sm_client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[
            {
                "InstanceType": endpoint_instance_type,
                "InitialVariantWeight": 1,
                "InitialInstanceCount": 1,
                "ModelName": model_name,
                "VariantName": "AllTraffic",
            }
        ],
    )
    print(f"create_endpoint_config_response: {create_endpoint_config_response}")

    # Check if an endpoint with exactly this name exists. NameContains is a substring
    # filter, so the results are compared against the full name; otherwise a
    # similarly-named endpoint would trigger an update of an endpoint that does not exist.
    paginator = sm_client.get_paginator("list_endpoints")
    endpoint_exists = any(
        summary["EndpointName"] == endpoint_name
        for page in paginator.paginate(NameContains=endpoint_name)
        for summary in page["Endpoints"]
    )
    print(f"endpoint_exists: {endpoint_exists}")

    if endpoint_exists:
        print("Updating Endpoint with new Endpoint Configuration")
        update_endpoint_response = sm_client.update_endpoint(
            EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
        )
        print(f"update_endpoint_response: {update_endpoint_response}")
    else:
        print("Creating Endpoint")
        create_endpoint_response = sm_client.create_endpoint(
            EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
        )
        print(f"create_endpoint_response: {create_endpoint_response}")

    return {"statusCode": 200, "body": json.dumps("Endpoint Created Successfully")}

Overwriting scripts/deploy_model_lambda.py


In [195]:

# Timestamp suffix that makes the endpoint and Lambda function names unique per notebook run.
current_time = time.strftime("%m-%d-%H-%M-%S", time.localtime())

endpoint_config_name = "t5-healthcaredataset-config"
endpoint_name = f"t5-healthcare-endpoint-{current_time}"
deploy_model_lambda_function_name = f"sagemaker-deploy-t5model{current_time}"

deploy_model_lambda_function = Lambda(
    function_name=deploy_model_lambda_function_name,
    execution_role_arn=lambda_role,
    script="./scripts/deploy_model_lambda.py",
    handler="deploy_model_lambda.lambda_handler",
)

# Event payload handed to the Lambda handler.
deploy_event = {
    "model_name": f"{model_id_}-healthcaredataset",
    "endpoint_config_name": endpoint_config_name,
    "model_package_arn": step_register.properties.ModelPackageArn,
    "endpoint_name": endpoint_name,
    "role": role,
    "endpoint_instance_type": "ml.g5.xlarge",
}

# Despite its variable name, this is the deployment step (it invokes the deploy Lambda).
step_lower_rogue_score = LambdaStep(
    name="Deploy-T5-HealthcareModel",
    lambda_func=deploy_model_lambda_function,
    inputs=deploy_event,
)

In [196]:
# Value of "eval_rouge1" read from the evaluation step's property file.
rouge1_from_report = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="eval_rouge1",
)

# Condition: reported score >= configured threshold.
cond_gte = ConditionGreaterThanOrEqualTo(left=rouge1_from_report, right=threshold_accuracy)

# When the condition holds: register and deploy; otherwise: fail the execution.
step_cond = ConditionStep(
    name="CheckEvalRogue",
    conditions=[cond_gte],
    if_steps=[step_register, step_lower_rogue_score],
    else_steps=[failed_step],
)

In [197]:
# Every parameter object declared above, in declaration order.
pipeline_parameters = [
    processing_scripts,
    preprocessing_instance,
    processing_count,
    model_id,
    training_entry_point,
    training_src_dir,
    training_instance_type,
    training_instance_count,
    epochs,
    train_batch_size,
    test_batch_size,
    learning_rate,
    threshold_accuracy,
    evaluation_script,
]

# Steps: preprocess -> train -> evaluate -> condition (register + deploy, or fail).
pipeline = Pipeline(
    name="T5DemoPipleline",
    parameters=pipeline_parameters,
    steps=[process_step, step_train, step_eval, step_cond],
    sagemaker_session=sess,
)

In [198]:
import json

# Parse the pipeline definition string so the cell displays it as a nested Python structure.
json.loads(pipeline.definition())

Popping out 'CertifyForMarketplace' from the pipeline definition since it will be overridden in pipeline execution time.


Using provided s3_resource


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'processingscript',
   'Type': 'String',
   'DefaultValue': './scripts/preprocessing.py'},
  {'Name': 'preprocess_instance',
   'Type': 'String',
   'DefaultValue': 'ml.c5.2xlarge'},
  {'Name': 'preprocess_instance_count', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelName', 'Type': 'String', 'DefaultValue': 't5-base'},
  {'Name': 'trainingscript', 'Type': 'String', 'DefaultValue': 'train.py'},
  {'Name': 'trainsourcrdir', 'Type': 'String', 'DefaultValue': './scripts'},
  {'Name': 'traininstancetype',
   'Type': 'String',
   'DefaultValue': 'ml.g5.2xlarge'},
  {'Name': 'traininginstancecount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'epochs', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'trainbatchsize', 'Type': 'Integer', 'DefaultValue': 4},
  {'Name': 'testbatchsize', 'Type': 'Integer', 'DefaultValue': 4},
  {'Name': 'learningrate', 'Type': 'Float', 'DefaultValue': 0.003},
  {'Name': 'Thresho

In [199]:
# Create the pipeline, or update the existing one of the same name, using the execution role.
pipeline.upsert(role_arn=role)

Popping out 'CertifyForMarketplace' from the pipeline definition since it will be overridden in pipeline execution time.


Using provided s3_resource
Using provided s3_resource


Popping out 'CertifyForMarketplace' from the pipeline definition since it will be overridden in pipeline execution time.


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:274743203955:pipeline/T5DemoPipleline',
 'ResponseMetadata': {'RequestId': 'bb19a489-3d0a-4920-a047-2420e5cf0b26',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bb19a489-3d0a-4920-a047-2420e5cf0b26',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '83',
   'date': 'Fri, 23 Jun 2023 14:44:35 GMT'},
  'RetryAttempts': 0}}

In [200]:
# Start an execution with the default parameter values (none are overridden here).
executor=pipeline.start()

In [None]:
# Block until the pipeline execution started above has finished.
executor.wait()

In [205]:
from sagemaker.huggingface import HuggingFacePredictor

In [206]:
# Attach to the endpoint whose name was generated in the deploy-step cell, instead of a
# pasted name from one specific past run (which breaks on any fresh Run All).
# The deploy Lambda returns right after requesting creation/update, so wait here
# until the endpoint is ready before using it.
sess.wait_for_endpoint(endpoint_name)
predictor = HuggingFacePredictor(endpoint_name=endpoint_name, sagemaker_session=sess)

In [221]:
%%time
# Generation settings sent along with the request.
parameters = {
    "early_stopping": True,
    "length_penalty": 1.0,
    "max_new_tokens": 512,
    "temperature": 0.3,
    "min_length": 10,
    "no_repeat_ngram_size": 3,
}

# Request body: the question text plus the generation settings.
request_payload = {
    "inputs": "Any info about Serum progesterone",
    "parameters": parameters,
}

predictor.predict(request_payload)

CPU times: user 5.59 ms, sys: 190 µs, total: 5.78 ms
Wall time: 5.5 s


[{'generated_text': 'answer: Serum progesterone is a hormone produced by the body. It is injected into the bloodstream by a doctor or nurse in a medical facility. The test is done to check the levels of serum progestin in the blood. Serum is inserted into the uterus and is pumped through the urethra. The ureters are a tube that carries urine from the lungs to the intestines. The tube is placed in the rectum and is placed directly into the stomach. The tubes are placed directly through the stomach and are inserted through the tube. The catheter is inflated and inflated to help the esophagus. The needle is pushed through the catheter and inserted. The machine is positioned in the abdomen and the stoma. This is called a ureter. The procedure is done in sterone a procedure that uses a needle to draw blood from the stomach through a catheter. Then the needle is placed into the vagina and the needle inserted to draw the blood from a vein. The vagina is dilated and dilsated. The device is spr

In [219]:
# pandas is only imported inside the %%writefile scripts, never in the notebook kernel,
# so import it here; otherwise this cell raises NameError on a fresh kernel.
import pandas as pd

pd.read_csv("HealthCare.csv")

Unnamed: 0,Question,Answer
0,What is (are) Non-Small Cell Lung Cancer ?,Key Points\n - Non-small ce...
1,Who is at risk for Non-Small Cell Lung Cancer? ?,Smoking is the major risk factor for non-small...
2,What are the symptoms of Non-Small Cell Lung C...,Signs of non-small cell lung cancer include a ...
3,How to diagnose Non-Small Cell Lung Cancer ?,Tests that examine the lungs are used to detec...
4,What is the outlook for Non-Small Cell Lung Ca...,Certain factors affect prognosis (chance of re...
...,...,...
19020,What is (are) Acrodermatitis ?,Gianotti-Crosti syndrome is a childhood skin c...
19021,Do you have information about Serum progesterone,Serum progesterone is a test to measure the am...
19022,What is (are) Osteomalacia ?,Osteomalacia is softening of the bones. It mos...
19023,What is (are) Elbow replacement ?,Elbow replacement is surgery to replace the el...
