In [1]:
# =====================
# 1. Import Libraries
# =====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import boto3
import sagemaker

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Resolve the execution role and create the SageMaker session; later cells
# use `session` to upload the model artifact and `role` to deploy the model.
role = sagemaker.get_execution_role()
session = sagemaker.Session()


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# =====================
# 2. Load Dataset
# =====================
# Location of the raw customer table in S3; `bucket` is reused later when the
# packaged model artifact is uploaded.
bucket, filename = "genmab-assessment", "customer_segmentation_data.csv"
data_path = f"s3://{bucket}/{filename}"

# pandas reads the s3:// URL directly.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as the cell output.
df.head(5)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,Customer_ID,Age,Income,Purchases,Gender
0,1,26,50640,35,Male
1,2,18,61969,22,Male
2,3,43,89262,2,Female
3,4,60,105248,21,Male
4,5,23,53550,49,Male


In [3]:
# =====================
# 3. Preprocess & Train
# =====================
# Select the model features explicitly rather than "everything except
# Customer_ID". This cell writes a "Cluster" column back into `df` at the end,
# so a drop-based selection would feed the previous cluster labels in as a
# feature (and fit the scaler on five columns) whenever the cell is re-run.
# The explicit list also pins the column order to the one the endpoint test
# cell sends: Age, Income, Purchases, Gender.
feature_columns = ["Age", "Income", "Purchases", "Gender"]
df_processed = df[feature_columns].copy()

# Encode Gender as integers. The fitted encoder is NOT stored in the model
# artifact, so endpoint callers must send Gender already encoded.
encoder = LabelEncoder()
df_processed["Gender"] = encoder.fit_transform(df_processed["Gender"])

# Normalize all feature columns to zero mean / unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_processed)

# Choose k by the highest silhouette score over the candidate range.
k_values = list(range(2, 11))
silhouette_scores = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    silhouette_scores.append(silhouette_score(scaled_data, km.labels_))

# np.argmax returns the first maximum, matching list.index(max(...)); looking
# the winner up in k_values avoids a hard-coded offset tied to the range start.
optimal_k = k_values[int(np.argmax(silhouette_scores))]
print("Optimal k:", optimal_k)
# NOTE(review): if optimal_k equals the largest candidate, the true optimum may
# lie outside the searched range - consider widening k_values.

# Refit with the chosen k and attach the cluster labels to the original frame.
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["Cluster"] = kmeans_final.fit_predict(scaled_data)

Optimal k: 10


In [4]:
# =====================
# 4. Save Model Artifact
# =====================
import tarfile
import os
import pickle

# Persist the fitted KMeans model together with its scaler as one pickle so
# that inference applies exactly the scaling used during training.
with open("kmeans_model.pkl", "wb") as f:
    pickle.dump((kmeans_final, scaler), f)

print("Saved trained model: kmeans_model.pkl")

# Create a requirements.txt file
# NOTE(review): these pins are not derived from the running kernel, and the
# deploy cell requests framework_version="1.0-1" - confirm that the versions
# used for training, pinned here, and used for serving are compatible.
with open('requirements.txt', 'w') as f:
    f.write('scikit-learn==1.3.2\n')
    f.write('pandas==2.0.1\n')

print("Created requirements.txt")

# Package the model and inference script into a tar.gz archive
model_dir = "." # Set model_dir to the current directory

# inference.py is written by the "Inference Script" cell, which comes AFTER
# this one. On a fresh kernel (Restart & Run All) the file does not exist yet,
# and adding it unconditionally would raise FileNotFoundError part-way through
# writing the archive, leaving a truncated model.tar.gz behind. Decide what to
# package before the archive is opened.
candidate_files = ["kmeans_model.pkl", "inference.py", "requirements.txt"]
artifact_files = [name for name in candidate_files if os.path.exists(name)]
skipped_files = [name for name in candidate_files if name not in artifact_files]

with tarfile.open("model.tar.gz", "w:gz") as tar:
    for artifact_name in artifact_files:
        tar.add(artifact_name)

if skipped_files:
    # The deploy cell passes entry_point="inference.py", so the serving script
    # is supplied at deploy time even when it is absent from this archive.
    print("Not found, left out of model.tar.gz:", ", ".join(skipped_files))

print("Successfully packaged model artifacts into model.tar.gz")

# Upload the tar.gz archive to S3
prefix = "customer-segmentation"
model_artifact = session.upload_data("model.tar.gz", bucket=bucket, key_prefix=prefix)
print("Model uploaded to S3:", model_artifact)

Saved trained model: kmeans_model.pkl
Created requirements.txt
Successfully packaged model artifacts into model.tar.gz
Model uploaded to S3: s3://genmab-assessment/customer-segmentation/model.tar.gz


In [5]:
# =====================
# 5. Inference Script
# =====================
# Source of the hosting entry point, written to inference.py below. The deploy
# cell passes that file as `entry_point`, and the packaging cell adds it to
# model.tar.gz, so this cell has to run before either of them.
inference_code = """
import json
import os
import pickle

import numpy as np

JSON_CONTENT_TYPE = "application/json"


def _media_type(header_value):
    \"\"\"Return the bare media type: parameters such as '; charset=utf-8' are
    dropped and the result is lower-cased. None yields an empty string.\"\"\"
    return (header_value or "").split(";")[0].strip().lower()


def model_fn(model_dir):
    \"\"\"Load the (model, scaler) pair stored in kmeans_model.pkl.\"\"\"
    # NOTE: pickle.load can execute code embedded in the file; only load
    # artifacts produced by the training notebook. pickle imports the
    # scikit-learn classes it needs by itself, so no sklearn import is
    # required in this script.
    with open(os.path.join(model_dir, "kmeans_model.pkl"), "rb") as f:
        model, scaler = pickle.load(f)
    return model, scaler


def input_fn(request_body, request_content_type):
    \"\"\"Deserialize a JSON request of the form {"instances": [[...], ...]}.

    Returns a 2-D float array with one row per instance. A single instance
    sent as a flat list is promoted to one row. Raises ValueError for any
    content type other than application/json.
    \"\"\"
    if _media_type(request_content_type) != JSON_CONTENT_TYPE:
        raise ValueError("Unsupported content type: " + str(request_content_type))
    data = json.loads(request_body)
    return np.atleast_2d(np.asarray(data["instances"], dtype=float))


def predict_fn(input_data, model_and_scaler):
    \"\"\"Scale the rows with the training scaler and return cluster ids as a list.\"\"\"
    model, scaler = model_and_scaler
    scaled = scaler.transform(input_data)
    preds = model.predict(scaled)
    return preds.tolist()


def output_fn(prediction, accept):
    \"\"\"Serialize predictions as {"predictions": [...]} for JSON clients.

    Returns (body, accept). Raises ValueError for any other accept type.
    \"\"\"
    if _media_type(accept) != JSON_CONTENT_TYPE:
        raise ValueError("Unsupported accept type: " + str(accept))
    return json.dumps({"predictions": prediction}), accept
"""

with open("inference.py", "w") as f:
    f.write(inference_code)

In [6]:
# =====================
# 6. Deploy Model
# =====================
from sagemaker.sklearn.model import SKLearnModel

# Describe the model to SageMaker: the uploaded artifact, the execution role,
# and inference.py as the hosting entry point.
sklearn_model = SKLearnModel(
    entry_point="inference.py",
    model_data=model_artifact,
    role=role,
    py_version="py3",
    framework_version="1.0-1",  # must be a version offered by the SKLearn container
)

# Create a single-instance endpoint under a fixed name.
predictor = sklearn_model.deploy(
    endpoint_name="customer-segmentation-endpoint",
    initial_instance_count=1,
    instance_type="ml.m5.large",
)

print("Endpoint deployed:", predictor.endpoint_name)

------!Endpoint deployed: customer-segmentation-endpoint


In [7]:
# =====================
# 7. Test Endpoint
# =====================
import json
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# Exchange JSON with the endpoint in both directions.
predictor.deserializer = JSONDeserializer()
predictor.serializer = JSONSerializer()

# First three training rows, restricted to the model features (Gender is
# already label-encoded in df_processed).
feature_names = ['Age', 'Income', 'Purchases', 'Gender']
test_data = df_processed.loc[:, feature_names].head(3)

# Nested list of rows, wrapped under the 'instances' key the inference
# script's input_fn reads.
sample_data = test_data.to_numpy().tolist()
payload = {"instances": sample_data}

# Invoke the deployed endpoint and show the returned cluster assignments.
response = predictor.predict(payload)

print("Predictions:", response)

Predictions: {'predictions': [4, 4, 9]}


In [8]:
# =====================
# 8. Cleanup Endpoint
# =====================
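# Left commented out so that Run All does not tear down the endpoint right
# after testing it. Uncomment and run once you are finished: a deployed
# endpoint keeps running (and is billed) until it is deleted.
# predictor.delete_endpoint()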