In [238]:
# SageMaker Python SDK version 2.x is required
import sagemaker
import sys
import sagemaker_datawrangler           # For interactive data prep widget
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker 
import zipfile   
import time
from time import strftime, gmtime
import boto3
import pandas as pd
import numpy as np
import io
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup

In [239]:
prefix = 'sagemaker-featurestore-introduction'
role = get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()

# Create Features and Save into Feature Store

### Import data

In [240]:
!wget https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
with zipfile.ZipFile('bank-additional.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

--2023-08-31 01:22:44--  https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
Resolving sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)... 52.92.241.146, 3.5.77.165, 3.5.78.111, ...
Connecting to sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)|52.92.241.146|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 432828 (423K) [application/zip]
Saving to: ‘bank-additional.zip.7’


2023-08-31 01:22:45 (1.64 MB/s) - ‘bank-additional.zip.7’ saved [432828/432828]



In [241]:
df = pd.read_csv('./bank-additional/bank-additional-full.csv')
pd.set_option('display.max_columns', 50)    
pd.set_option('display.max_rows', 5)       

### Feature Transformation

In [242]:
df['no_previous_contact'] = np.where(df['pdays'] == 999, 1, 0)                                 # Indicator variable to capture when pdays takes a value of 999
df['not_working'] = np.where(np.in1d(df['job'], ['student', 'retired', 'unemployed']), 1, 0) 

In [243]:
df.drop(columns=['duration','emp.var.rate','cons.price.idx','cons.conf.idx','nr.employed'], inplace=True)

In [244]:
df.columns = [col.replace('.', '_') for col in df.columns]

### Create unique ID and timestamp

In [245]:
df['FS_ID'] = df.index + 1000
current_time_sec = int(round(time.time()))
df['FS_time'] = pd.Series([current_time_sec]*len(df), dtype="float64") 

### Generate Feature Group

In [246]:
customers_feature_group_name = 'customers-feature-group-' + strftime('%d-%H-%M-%S', gmtime())

In [247]:
customers_feature_group = FeatureGroup(
    name=customers_feature_group_name, sagemaker_session=sagemaker_session
)

In [248]:
customers_feature_group.load_feature_definitions(data_frame=df)

[FeatureDefinition(feature_name='age', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='job', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='marital', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='education', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='default', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='housing', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='loan', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='contact', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='month', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='day_of_week', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='campaign', feature_type=<FeatureTypeEnum

In [249]:
customers_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name='FS_ID',
    event_time_feature_name='FS_time',
    role_arn=role,
    enable_online_store=True
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group/customers-feature-group-31-01-24-13',
 'ResponseMetadata': {'RequestId': 'd18ea1f0-da1b-4032-a780-a5f08da83aec',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd18ea1f0-da1b-4032-a780-a5f08da83aec',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '112',
   'date': 'Thu, 31 Aug 2023 01:24:26 GMT'},
  'RetryAttempts': 0}}

In [None]:
customers_feature_group.ingest(data_frame = df, max_workers=10, wait=True)

In [None]:
print(customers_feature_group.describe().get("FeatureGroupName"))
print(customers_feature_group.describe().get( 'FeatureDefinitions'))

# Retrieve Feature for Model Training

In [199]:
featurename=customers_feature_group.describe().get("FeatureGroupName")


In [250]:
featurename='customers-feature-group-31-00-56-43'

In [251]:
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [252]:
# Build SQL query to features group
feature_group_name =featurename
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = 'SELECT * FROM "'+fs_table+'"'
print('Running ' + query_string)

Running SELECT * FROM "customers_feature_group_31_00_56_43_1693443414"


In [233]:
# Run Athena query. The output is loaded to a Pandas dataframe.
bucket=sagemaker.Session().default_bucket()
fs_query.run(query_string=query_string, output_location='s3://'+bucket+'/'+prefix+'/fs_query_results/')
fs_query.wait()
model_data = fs_query.as_dataframe()
# model_data = model_data.drop(['fs_id', 'fs_time', 'write_time', 'api_invocation_time', 'is_deleted'], axis=1)

In [234]:
model_data.head(20)

    age          job  marital          education  default housing loan  \
0    33   technician   single  university.degree       no      no   no   
1    28   unemployed   single        high.school       no     yes  yes   
..  ...          ...      ...                ...      ...     ...  ...   
18   37     services  married        high.school  unknown     yes   no   
19   29  blue-collar   single        high.school       no     yes   no   

      contact month day_of_week  campaign  pdays  previous     poutcome  \
0    cellular   aug         wed         1    999         0  nonexistent   
1   telephone   may         mon         3    999         0  nonexistent   
..        ...   ...         ...       ...    ...       ...          ...   
18  telephone   may         mon         3    999         0  nonexistent   
19  telephone   jun         tue         2    999         0  nonexistent   

    euribor3m   y  no_previous_contact  not_working  fs_id       fs_time  \
0       4.965  no           

In [235]:
model_data[:2]

   age         job marital          education default housing loan    contact  \
0   33  technician  single  university.degree      no      no   no   cellular   
1   28  unemployed  single        high.school      no     yes  yes  telephone   

  month day_of_week  campaign  pdays  previous     poutcome  euribor3m   y  \
0   aug         wed         1    999         0  nonexistent      4.965  no   
1   may         mon         3    999         0  nonexistent      4.858  no   

   no_previous_contact  not_working  fs_id       fs_time  \
0                    1            0  21609  1.693443e+09   
1                    1            1   5132  1.693443e+09   

                write_time      api_invocation_time  is_deleted  
0  2023-08-31 01:14:35.764  2023-08-31 01:09:35.000       False  
1  2023-08-31 01:14:35.764  2023-08-31 01:09:36.000       False  

# Retrieve Feature for real-time inference

In [253]:
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

feature_group_name =featurename
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)

In [254]:
record = feature_group.get_record('24663')

In [255]:
record

[{'FeatureName': 'age', 'ValueAsString': '35'},
 {'FeatureName': 'job', 'ValueAsString': 'technician'},
 {'FeatureName': 'marital', 'ValueAsString': 'single'},
 {'FeatureName': 'education', 'ValueAsString': 'university.degree'},
 {'FeatureName': 'default', 'ValueAsString': 'no'},
 {'FeatureName': 'housing', 'ValueAsString': 'yes'},
 {'FeatureName': 'loan', 'ValueAsString': 'no'},
 {'FeatureName': 'contact', 'ValueAsString': 'cellular'},
 {'FeatureName': 'month', 'ValueAsString': 'aug'},
 {'FeatureName': 'day_of_week', 'ValueAsString': 'thu'},
 {'FeatureName': 'campaign', 'ValueAsString': '17'},
 {'FeatureName': 'pdays', 'ValueAsString': '999'},
 {'FeatureName': 'previous', 'ValueAsString': '0'},
 {'FeatureName': 'poutcome', 'ValueAsString': 'nonexistent'},
 {'FeatureName': 'euribor3m', 'ValueAsString': '4.962'},
 {'FeatureName': 'y', 'ValueAsString': 'no'},
 {'FeatureName': 'no_previous_contact', 'ValueAsString': '1'},
 {'FeatureName': 'not_working', 'ValueAsString': '0'},
 {'FeatureNa

# Discover Features and Featuregroups

In [256]:
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [257]:
sagemaker_client.search(
    Resource="FeatureMetadata",
)  

# Search for all features that belong to a feature group that contain the "ver" substring
sagemaker_client.search(
    Resource="FeatureMetadata",
    SearchExpression={
        'Filters': [
            {
                'Name': 'FeatureGroupName',
                'Operator': 'Contains',
                'Value': 'customer'
            },
        ]
    }
)

{'Results': [{'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group/customers-feature-group-31-01-24-13',
    'FeatureGroupName': 'customers-feature-group-31-01-24-13',
    'FeatureName': 'age',
    'FeatureType': 'Integral',
    'CreationTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group/customers-feature-group-31-01-24-13',
    'FeatureGroupName': 'customers-feature-group-31-01-24-13',
    'FeatureName': 'contact',
    'FeatureType': 'String',
    'CreationTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-gro

In [258]:
sagemaker_client.search(
    Resource="FeatureMetadata",
    SearchExpression={
        'Filters': [
            {
                'Name':'FeatureName',
                'Operator': 'Contains',
                'Value': 'age'
            }

        ]
    }
) 

{'Results': [{'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group/customers-feature-group-31-01-24-13',
    'FeatureGroupName': 'customers-feature-group-31-01-24-13',
    'FeatureName': 'age',
    'FeatureType': 'Integral',
    'CreationTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 8, 31, 1, 24, 26, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group/customers-feature-group-31-01-17-28',
    'FeatureGroupName': 'customers-feature-group-31-01-17-28',
    'FeatureName': 'age',
    'FeatureType': 'Integral',
    'CreationTime': datetime.datetime(2023, 8, 31, 1, 17, 32, tzinfo=tzlocal()),
    'LastModifiedTime': datetime.datetime(2023, 8, 31, 1, 17, 32, tzinfo=tzlocal()),
    'Parameters': []}},
  {'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:996025384640:feature-group

In [259]:
sagemaker_client.update_feature_metadata(
    FeatureGroupName=feature_group_name,
    FeatureName="age",
    Description="mdoel"
)


{'ResponseMetadata': {'RequestId': '53df5fe8-1dae-4756-8d85-e1b705ea7879',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '53df5fe8-1dae-4756-8d85-e1b705ea7879',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 31 Aug 2023 01:32:20 GMT'},
  'RetryAttempts': 0}}