In [1]:
# Use the %pip magic so the package lands in the environment of the running
# kernel, and pin the SDK version so a re-run resolves the same release.
%pip install -q sagemaker==2.242.0

Collecting sagemaker
  Downloading sagemaker-2.242.0-py3-none-any.whl.metadata (16 kB)
Collecting attrs<24,>=23.1.0 (from sagemaker)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting boto3<2.0,>=1.35.75 (from sagemaker)
  Downloading boto3-1.37.19-py3-none-any.whl.metadata (6.7 kB)
Collecting docker (from sagemaker)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastapi (from sagemaker)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting importlib-metadata<7.0,>=1.4.0 (from sagemaker)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting numpy<2.0,>=1.9.0 (from sagemaker)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 2.6 MB/s eta 0:00:00
Collecting omegaconf<=2.3,>=2.2 (from sagemaker)
  Downloading omegaconf-2.3.0-py3-none

In [1]:
import sagemaker

import boto3

import pandas as pd



# One SageMaker session object, reused by every SDK call in this notebook.
sagemaker_session = sagemaker.Session()

# Storage location: the session's default bucket plus a key prefix for this demo.
bucket = sagemaker_session.default_bucket()
prefix = "nlp-model-demo"

# Execution role handed to the estimator and the model via their `role=` argument.
role = sagemaker.get_execution_role()








sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [2]:
%%writefile train.py
"""Training entry point: fit a TF-IDF + logistic-regression sentiment pipeline.

Reads ``processed_reviews.csv`` from the training channel directory, fits the
pipeline on the ``Text`` column against the ``Sentiment`` column, and stores
the fitted pipeline as ``model.joblib`` in the model directory.
"""
import argparse
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


def train():
    """Parse the command line, fit the text pipeline and persist it.

    Command-line arguments:
        --train_data: directory holding ``processed_reviews.csv``. Defaults to
            the ``SM_CHANNEL_TRAIN`` environment variable.
        --model_dir: directory the fitted pipeline is written to. Defaults to
            ``SM_MODEL_DIR`` and falls back to ``/opt/ml/model``.

    Exits with a usage error when no training directory is available.
    """
    parser = argparse.ArgumentParser()
    # .get() instead of [] so the script can also be started outside the
    # training container, where SM_CHANNEL_TRAIN is not set.
    parser.add_argument("--train_data", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument(
        "--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
    )
    args = parser.parse_args()

    if not args.train_data:
        parser.error("--train_data is required when SM_CHANNEL_TRAIN is not set")

    # Load dataset from the provided path
    train_data_path = os.path.join(args.train_data, "processed_reviews.csv")
    df = pd.read_csv(train_data_path)

    # Features are the raw review text, labels are the sentiment column
    X = df["Text"]
    y = df["Sentiment"]

    # Create a text-processing pipeline
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression()),
    ])

    # Train model
    pipeline.fit(X, y)

    # Save trained model (create the directory first for runs outside the container)
    os.makedirs(args.model_dir, exist_ok=True)
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(pipeline, model_path)
    print("Model saved at", model_path)


# Dunder names need two underscores on each side; the single-underscore
# spelling raised NameError before train() could ever run.
if __name__ == "__main__":
    train()

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# S3 location used as the "train" channel. This name was referenced below but
# never defined, so the cell failed on a fresh kernel.
# NOTE(review): assumes processed_reviews.csv has been uploaded under this
# prefix; adjust the URI if the data lives elsewhere.
s3_train_data = f"s3://{bucket}/{prefix}/train"

# Define SageMaker SKLearn Estimator.
# entry_point must name a script that exists in the working directory.
sklearn_estimator = SKLearn(
    entry_point="train.py",
    framework_version="0.23-1",
    instance_type="ml.m5.large",
    role=role,
    sagemaker_session=sagemaker_session,
)

# Train the model on SageMaker; the "train" key is the channel name.
sklearn_estimator.fit({"train": s3_train_data})

In [4]:
%%writefile inference.py

import joblib

import os

import json

import pandas as pd

# Load trained model

def model_fn(model_dir):
    """Load and return the object stored as ``model.joblib`` inside ``model_dir``."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))

# Parse input JSON

def input_fn(request_body, request_content_type):
    """Turn a JSON request body into a DataFrame with a single ``Text`` column.

    Args:
        request_body: JSON payload, either an array of documents or a single
            JSON string (treated as a one-document batch).
        request_content_type: Content type sent with the request. Only the
            media type is compared, so ``application/json; charset=utf-8`` is
            accepted as well.

    Returns:
        pandas.DataFrame with one column named ``Text``.

    Raises:
        ValueError: if the media type is not ``application/json``.
    """
    # Drop any parameters (e.g. "; charset=utf-8") and normalise case before comparing.
    media_type = (request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != "application/json":
        raise ValueError("Unsupported content type: {}".format(request_content_type))

    data = json.loads(request_body)

    # A bare string cannot be passed to the DataFrame constructor; wrap it so a
    # single document behaves like a batch of one.
    if isinstance(data, str):
        data = [data]

    return pd.DataFrame(data, columns=["Text"])

# Generate predictions

def predict_fn(input_data, model):
    """Run ``model.predict`` on the ``Text`` entry of ``input_data`` and return a list."""
    texts = input_data["Text"]
    predictions = model.predict(texts)
    return predictions.tolist()






Writing inference.py


In [5]:
from sagemaker.sklearn.model import SKLearnModel

# Model artifact location reported by the estimator
model_data = sklearn_estimator.model_data

# Pair the artifact with the handlers defined in inference.py
sklearn_model = SKLearnModel(
    entry_point="inference.py",
    framework_version="0.23-1",
    model_data=model_data,
    role=role,
    sagemaker_session=sagemaker_session,
)

# Stand up a single-instance real-time endpoint and keep its predictor
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)


In [7]:
import json

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# inference.py only accepts "application/json", whereas the SKLearn predictor
# defaults to NumPy (application/x-npy) serialization. Switch both directions
# to JSON so the request passes the handler's content-type check.
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

# Pass the plain list: the serializer JSON-encodes it, so encoding it here as
# well would send a doubly-encoded string.
test_data = ["This product is amazing!", "Worst product ever."]
response = predictor.predict(test_data)
print("Predictions:", response)

# The endpoint stays deployed after this cell; call predictor.delete_endpoint()
# once it is no longer needed.