# Model Training

### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import re
import os
import io
import time
import tarfile
import matplotlib.pyplot as plt
from io import BytesIO
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import KMeans
import sagemaker
from sagemaker import PCA, KMeans, get_execution_role, Session
from sagemaker import KMeansModel, KMeansModel
from sklearn.metrics import silhouette_score
from sagemaker.model import Model
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor
import json
import pickle

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Load Training Dataset from S3

In [3]:
train_df = pd.read_csv('s3://team6datasets/splitDatasets/train.csv')

In [4]:
train_df.shape

(284778, 46)

In [5]:
train_df.head()

Unnamed: 0,CustomerID,gender_Agender,gender_Bigender,gender_Female,gender_Genderfluid,gender_Genderqueer,gender_Male,gender_Non-binary,gender_Polygender,Education_2n Cycle,...,SpendingCategory_Medium,SpendingCategory_High,SpendingCategory_Very High,IncomeLevel_Low,IncomeLevel_Medium,IncomeLevel_High,IncomeLevel_Very High,Purchase_Frequency,Age,Total_Spent_byCustomer
0,4426,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,1,0.652632,0.892857,0.021736
1,1100,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0.589474,0.839286,0.027672
2,3602,0,0,1,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0.547368,0.678571,0.021624
3,5268,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0.421053,0.714286,0.028717
4,716,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0.421053,0.357143,0.024864


## Sagemaker Session and Role

In [6]:
# Define the SageMaker session and role
sagemaker_session = Session()
role = get_execution_role()

## Dimensionality Reduction using Variance

In [7]:
# Set the variance threshold
variance_threshold = 0.2

# Create a VarianceThreshold object
selector = VarianceThreshold(threshold=variance_threshold)

# Fit the selector to the original data (train_df)
selector.fit(train_df)

# Get the support mask (boolean array indicating features to keep)
support = selector.get_support(indices=True)

# Select features based on the support mask
reduced_features = train_df.iloc[:, support]  # Using .iloc for clarity

# Now 'reduced_features' is a DataFrame containing features with variance exceeding the threshold
print(f"Number of features removed: {len(train_df.columns) - len(reduced_features.columns)}")

Number of features removed: 35


In [8]:
# Show the reduced_features DataFrame
print("Reduced Features DataFrame:")
reduced_features.head()

Reduced Features DataFrame:


Unnamed: 0,CustomerID,gender_Female,gender_Male,Kidhome_0,Kidhome_1,Kidhome_2,Teenhome_0,Teenhome_1,Teenhome_2,region_Asia,region_Europe
0,4426,0,1,1,0,0,0,1,0,1,0
1,1100,0,1,0,0,1,1,0,0,1,0
2,3602,1,0,0,0,1,0,0,1,1,0
3,5268,0,1,0,1,0,0,1,0,1,0
4,716,0,1,1,0,0,0,1,0,0,1


In [9]:
reduced_features.shape

(284778, 11)

## Model Training with KMeans Algorithm

### Use Elbow Method to Determine Ideal K

In [10]:
# # Define a range of k values
# k_values = range(2, 11)  

# # Convert the reduced_features DataFrame to float32
# reduced_features_float32 = reduced_features.astype('float32')

# # Train KMeans models for each k value and compute WCSS
# wcss_values = []
# for k in k_values:
#     kmeans = KMeans(role=role,
#                     instance_count=1,
#                     instance_type='ml.c4.xlarge',
#                     k=k)
#     kmeans.fit(kmeans.record_set(reduced_features_float32.values)) 
#     # Filter the DataFrame for the row where metric_name is 'train:msd'
#     wcss_row = kmeans.training_job_analytics.dataframe().loc[kmeans.training_job_analytics.dataframe()['metric_name'] == 'train:msd']
#     # Extract the WCSS value from the 'value' column of the filtered row
#     wcss_value = wcss_row['value'].iloc[0]
#     # Append the WCSS value to the list
#     wcss_values.append(wcss_value)

# # Plot the WCSS values against k
# plt.plot(k_values, wcss_values, marker='o')
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters (k)')
# plt.ylabel('Within-cluster sum of squares (WCSS)')
# plt.show()


In [11]:
# Drop the CustomerID column from the feature DataFrame
features_for_training = reduced_features.drop(columns=['CustomerID'])

# Store CustomerID separately for reference
customer_ids = reduced_features['CustomerID']

# Initialize S3 client
s3 = boto3.client('s3')

# Define hyperparameters for KMeans model
hyperparameters = {'k': 4, 'init_method': 'random'}

# Set the output path
output_path = 's3://team6datasets/kmeans_model_output/'

# Set k value
k_value = hyperparameters['k']
model_output_path = f'{output_path}kmeans-{k_value}/output/'

# Train the model
print(f"Training model for k={k_value}...")

# Convert DataFrame to NumPy array (excluding CustomerID)
features_np = features_for_training.values.astype('float32')

# Create KMeans estimator
kmeans = KMeans(role=role,
                instance_count=1,
                instance_type='ml.c4.xlarge',
                k=k_value,
                output_path=output_path)

# Fit the model
job_name = f"kmeans-{k_value}-2"
kmeans.fit(kmeans.record_set(features_np), job_name=job_name)

print(f"Model for k={k_value} trained and saved to {model_output_path}")


Training model for k=4...


INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-4-2


2024-04-08 04:46:55 Starting - Starting the training job...
2024-04-08 04:47:10 Starting - Preparing the instances for training...
2024-04-08 04:47:47 Downloading - Downloading input data...
2024-04-08 04:48:22 Downloading - Downloading the training image.........
2024-04-08 04:49:43 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/08/2024 04:49:51 INFO 139646646785856] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_to_read': 'false', '_enable_profiler': 'fal

## Model Validation

### Load Validation Data from S3

In [12]:
validate_df = pd.read_csv('s3://team6datasets/splitDatasets/validation.csv')

In [13]:
validate_df.shape

(61024, 46)

### Dimensionality Reduction using Variance 

In [14]:
# Set the variance threshold
variance_threshold = 0.2

# Create a VarianceThreshold object
selector = VarianceThreshold(threshold=variance_threshold)

# Fit the selector to the original data (train_df)
selector.fit(validate_df)

# Get the support mask (boolean array indicating features to keep)
support = selector.get_support(indices=True)

# Select features based on the support mask
reduced_features_validate = validate_df.iloc[:, support]  # Using .iloc for clarity

# Now 'reduced_features' is a DataFrame containing features with variance exceeding the threshold
print(f"Number of features removed: {len(validate_df.columns) - len(reduced_features_validate.columns)}")

Number of features removed: 35


In [15]:
# Show the reduced_features_validate DataFrame
print("Reduced Features DataFrame:")
reduced_features_validate.head()

Reduced Features DataFrame:


Unnamed: 0,CustomerID,gender_Female,gender_Male,Kidhome_0,Kidhome_1,Kidhome_2,Teenhome_0,Teenhome_1,Teenhome_2,region_Asia,region_Europe
0,9706,0,0,1,0,0,1,0,0,0,0
1,5555,1,0,0,0,1,0,0,1,0,0
2,3766,0,1,1,0,0,0,0,1,0,1
3,6515,1,0,1,0,0,0,0,1,0,0
4,1158,0,0,0,0,1,1,0,0,0,1


### Validate Model

In [20]:
from sagemaker import get_execution_role, session

# Define the role
role = get_execution_role()

# Load the trained model
model_key = 'kmeans_model_output/kmeans-4/output/model.tar.gz'
model_location = f's3://team6datasets/{model_key}'

# Create a session
sagemaker_session = session.Session()

# Choose one of the existing endpoint names
endpoint_name = "Kmeans-Endpoint"

# Create a predictor object using the chosen endpoint name
predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)

# Extract features for validation (excluding CustomerID)
features_for_validation = reduced_features_validate.drop(columns=['CustomerID'])
features_np_validation = features_for_validation.values.astype('float32')

# Convert data to JSON format
json_data = json.dumps(features_np_validation.tolist())  # Convert NumPy array to a list for JSON

# Serialize the JSON data to bytes
buffer = BytesIO()
buffer.write(json_data.encode('utf-8'))  # Encode JSON string as UTF-8 bytes
buffer.seek(0)
data_bytes = buffer.read()

# Predict clusters for the validation data with content type set to application/json
initial_args = {'ContentType': 'application/json'}
clusters = predictor.predict(data_bytes, initial_args=initial_args)

# Add the cluster labels to the validation DataFrame
reduced_features_validate['Cluster'] = clusters


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "unable to evaluate payload provided". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/Kmeans-Endpoint in account 654654507448 for more information.

## Evaluating KMeans Model

### List objects in S3 folder for KMeans

In [None]:
# # Initialize S3 client
# s3 = boto3.client('s3')

# # Bucket name
# bucket_name = 'team6datasets'

# # Prefix
# prefix = 'kmeans_model_output/'

# # List objects in the bucket with the given prefix
# response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# # # Extract folder names from the object keys
# # folders = []

# # if 'Contents' in response:
# #     for obj in response['Contents']:
# #         key = obj['Key']
# #         folder_name = key.split('/')[1]
# #         if folder_name not in folders:
# #             folders.append(folder_name)

# # print("Folders found:", folders)
# # List objects in the bucket with the given prefix
# response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# # Print the object keys excluding folders
# if 'Contents' in response:
#     for obj in response['Contents']:
#         key = obj['Key']
#         if not key.endswith('/'):  # Exclude folders
#             print("Object key:", key)

### Elbow Method

In [None]:
# # Initialize list to store WCSS values
# wcss = []

# # Specify the IAM role ARN
# role = "arn:aws:iam::533267092316:role/LabRole"

# # Iterate through each S3 location containing KMeans models
# model_locations = [
#     'kmeans_model_output/kmeans-5/output',
#     'kmeans_model_output/kmeans-10/output',
#     'kmeans_model_output/kmeans-15/output',
#     'kmeans_model_output/kmeans-20/output'
# ]

# for model_location in model_locations:
#     # Construct the full S3 key
#     key = f"{model_location}/model.tar.gz"
    
#     # Download the file to a local directory
#     file_path = f"/tmp/{model_location.split('/')[-1]}_model.tar.gz"
#     s3.download_file(bucket_name, key, file_path)
    
#     # Extract the model
#     with tarfile.open(file_path, 'r:gz') as tar:
#         tar.extractall('/tmp/')
    
#     # Load the model
#     model = KMeansModel(model_data=f's3://{model_location}/model.tar.gz', role=role)
    
#     # Compute WCSS (Within-cluster sum of squares)
#     wcss.append(model.inertia_)

# # Plot the elbow curve
# plt.plot([5, 10, 15, 20], wcss, marker='o')
# plt.xlabel('Number of clusters (K)')
# plt.ylabel('Within-cluster sum of squares (WCSS)')
# plt.title('Elbow Method for Optimal K')
# plt.xticks([5, 10, 15, 20])
# plt.grid(True)
# plt.show()

### Calculating Silhouette Scores

In [None]:
# # Define a function to evaluate the KMeans model
# def evaluate_kmeans(predictor, features, k_value):
#     # Split the data into training and validation sets
#     X_train, X_val = train_test_split(features, test_size=0.2, random_state=42)
    
#     # Convert DataFrame to NumPy array
#     val_np = X_val.values.astype('float32')
    
#     # Make predictions on the validation set
#     cluster_labels = predictor.predict(val_np)
    
#     # Evaluate the model using silhouette score
#     silhouette_avg = silhouette_score(X_val, cluster_labels)
#     print(f"Silhouette Score for k={k_value}: {silhouette_avg}")
    
#     return silhouette_avg

# # Define hyperparameters for KMeans models
# hyperparameter_sets = [
#     {'k': 5, 'init_method': 'random'},
#     {'k': 10, 'init_method': 'random'},
#     {'k': 15, 'init_method': 'random'},
#     {'k': 20, 'init_method': 'random'}
# ]

# # Set the output path
# output_path = 's3://team6datasets/kmeans_model_output/'

# # Load the trained model
# trained_model_path = 's3://team6datasets/kmeans_model_output/kmeans-5-1712529217/output/model.tar.gz'
# trained_model = KMeans.attach(trained_model_path)

# # Deploy the trained model to a SageMaker endpoint
# predictor = trained_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

# # Perform cross-validation for each set of hyperparameters
# for hyperparameters in hyperparameter_sets:
#     k_value = hyperparameters['k']
#     model_output_path = f'{output_path}kmeans-{k_value}/output/'
    
#     # Evaluate the model
#     silhouette_avg = evaluate_kmeans(predictor, features_for_training, k_value)
    
#     print(f"Model for k={k_value} evaluated with Silhouette Score: {silhouette_avg}")

# # Delete the SageMaker endpoint after evaluation
# predictor.delete_endpoint()
