In [177]:
pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import boto3
import json

# Connect to S3 Bucket

In [179]:
# Create an S3 client
s3 = boto3.client('s3')

# List all S3 buckets
response = s3.list_buckets()

# Print bucket names
print("Available S3 buckets:")
for bucket in response['Buckets']:
    print(bucket['Name'])
bucket_name = bucket['Name']

# List the objects in the bucket
response = s3.list_objects_v2(Bucket=bucket_name)
# Print the full path of the zip file
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.zip'):
        print(f"s3://{bucket_name}/{obj['Key']}")

Available S3 buckets:
sagemaker-studio-443370720429-v31y0qov1w
sagemaker-us-east-2-443370720429
usd-projects
s3://usd-projects/final_model-20241009T224405Z-001.zip


In [180]:
# List objects in the bucket
response = s3.list_objects_v2(Bucket=bucket_name)

# Check if the bucket is not empty
if 'Contents' in response:
    print(f"Contents of the bucket '{bucket_name}':")
    for obj in response['Contents']:
        print(obj['Key'])
else:
    print(f"The bucket '{bucket_name}' is empty or does not exist.")

Contents of the bucket 'usd-projects':
final_model-20241009T224405Z-001.tar.gz
final_model-20241009T224405Z-001.zip
inference.py
requirement.txt


In [181]:
# Specify the bucket and file names
bucket_name = 'usd-projects'
file_name = 'final_model-20241009T224405Z-001.zip'
download_path = '/home/ec2-user/SageMaker/final_model.zip'  # Path to download the zip file

# Download the zip file
s3.download_file(bucket_name, file_name, download_path)

In [182]:
ls /home/ec2-user/SageMaker/source_dir

inference.py  requirements.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [183]:
import zipfile
import os

# Specify the extraction path
extraction_path = '/home/ec2-user/SageMaker/final_model'

# Unzip the file
with zipfile.ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# Load Model 

In [184]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set the paths to the model and tokenizer directories
model_dir = os.path.join(extraction_path, '/home/ec2-user/SageMaker/final_model/final_model')  
tokenizer_dir = os.path.join(extraction_path, '/home/ec2-user/SageMaker/final_model/final_model')  

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

In [185]:
# Set the model to evaluation mode
model.eval()

# Define a test input (make sure it's compatible with your model, e.g., distilgpt2)
test_input = "<|context|> The Great Wall of China stretches over 13,000 miles and is a series of fortifications. <|question|> How long is the Great Wall of China?"
input_ids = tokenizer.encode(test_input, return_tensors='pt').to(model.device)

# Generate output
with torch.no_grad():
    output = model.generate(input_ids, max_length=50, num_return_sequences=1)

# Decode the output
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Output:")
print(decoded_output)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Output:
 The Great Wall of China stretches over 13,000 miles and is a series of fortifications.  How long is the Great Wall of China?  13,000 miles


In [None]:
def generate_response(context, question):
    # Combine context and question into the input text
    input_text = f"<|context|> {context} <|question|> {question}"

    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)

    # Create attention mask
    attention_mask = (input_ids != tokenizer.pad_token_id).type(torch.int).to(model.device)

    with torch.no_grad():
        # Generate output from the model
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=150,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    # Decode the output
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=False)

    # Extract the answer from the decoded output
    answer_start = decoded_output.find("<|answer|>") + len("<|answer|>")
    if answer_start != -1:
        answer = decoded_output[answer_start:].strip()  # Extract answer
        # Remove any padding tokens from the end of the answer
        answer = answer.split('[PAD]')[0].strip()  # Trim padding tokens
    else:
        answer = "No answer token found."

    print(f"Decoded Output: {decoded_output}")  # Keep this for debugging
    return answer


In [None]:
# Define lists of contexts and questions
contexts = [
    "The Eiffel Tower is located in Paris, France. It is one of the most recognizable structures in the world.",
    "The Great Wall of China stretches over 13,000 miles and is a series of fortifications.",
    "Mount Everest is the highest mountain in the world, with a peak that reaches 29,029 feet.",
    "The Amazon Rainforest is home to an estimated 390 billion trees and millions of species of plants and animals.",
    "The Mona Lisa, painted by Leonardo da Vinci, is one of the most famous works of art in history."
]

questions = [
    "Where is the Eiffel Tower located?",
    "How long is the Great Wall of China?",
    "What is the height of Mount Everest?",
    "What is significant about the Amazon Rainforest?",
    "Who painted the Mona Lisa?"
]

# Loop through each context and question
for context, question in zip(contexts, questions):
    answer = generate_response(context, question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print()  # Print a newline for better readability

# Deploy Model to Sagemaker

In [186]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)

In [191]:
import sagemaker
from sagemaker.pytorch import PyTorchModel

sagemaker_session = sagemaker.Session()
role = 'arn:aws:iam::443370720429:role/service-role/AmazonSageMaker-ExecutionRole-20241010T105606'  # Replace with your IAM role with SageMaker permissions

# Define the S3 path for your model
model_path = "s3://usd-projects/final_model-20241009T224405Z-001.tar.gz"

# Create the PyTorch model object
pytorch_model = PyTorchModel(
    model_data=model_path,
    role=role,
    entry_point="inference.py",  # Your inference script
    source_dir="source_dir",  # Directory containing the inference script
    framework_version='1.12',  # Match the PyTorch version you used
    py_version='py38',  # Match the Python version
    dependencies=["source_dir/requirements.txt"],
)

In [192]:
predictor = pytorch_model.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=600  # Increase timeout (in seconds)
)

------!

In [None]:
# Prepare input data
test_input = "<|context|> The Great Wall of China stretches over 13,000 miles and is a series of fortifications. <|question|> How long is the Great Wall of China?"

# Log the input to verify
print(f"Sending input: {test_input}")

response = predictor.predict(
    test_input,
    initial_args={
        "ContentType": "text/plain",
        "Accept": "application/json"
    }
)

# Print the response
print(response)


In [6]:
question_context1 = ["Where is the Eiffel Tower located?","The Eiffel Tower is located in Paris, France. It is one of the most recognizable structures in the world."]
question_context2 = ["How long is the wall of China?", "The Great Wall of China stretches over 13,000 miles and is a series of fortifications."]

In [9]:
newline = '\n'
bold = '\033[1m'
unbold = '\033[0m'
def query_endpoint(encoded_text):
    endpoint_name = 'usd-chatbot-20241012-055920'
    client = boto3.client('runtime.sagemaker')
    response = client.invoke_endpoint(EndpointName=endpoint_name, ContentType='application/list-text', Body=encoded_text)
    return response

def parse_response(query_response):
    model_predictions = json.loads(query_response['Body'].read())
    answer = model_predictions['answer']
    return answer

for question_context in [question_context1, question_context2]:
    query_response = query_endpoint(json.dumps(question_context).encode('utf-8'))
    answer = parse_response(query_response)
    print (f"Inference:{newline}"
            f"Question: {bold}{question_context[0]}{unbold}{newline}"
            f"Context: {question_context[1]}{newline}"
            f"model answer: {bold}{answer}{unbold}{newline}")
          

Inference:
Question: [1mWhere is the Eiffel Tower located?[0m
Context: The Eiffel Tower is located in Paris, France. It is one of the most recognizable structures in the world.
model answer: [1mParis, France[0m

Inference:
Question: [1mHow long is the wall of China?[0m
Context: The Great Wall of China stretches over 13,000 miles and is a series of fortifications.
model answer: [1m13,000 miles[0m

