# Feature Store 
## Feature group creation, batch data ingestion, and record retrieval

References:
 - https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-introduction-notebook.html
 - https://boto3.amazonaws.com/v1/documentation/api/1.20.9/reference/services/sagemaker-featurestore-runtime.html



In [1]:
from sagemaker import get_execution_role
from sagemaker.session import Session
import sagemaker
import boto3
import pandas as pd
import numpy as np 
from time import strftime, gmtime
import time
from sagemaker.feature_store.feature_group import FeatureGroup



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Create a single SageMaker session and derive every setting from it, instead
# of constructing a fresh Session() (and its underlying boto clients) per lookup.
sagemaker_session = Session()
REGION = sagemaker_session.boto_region_name
BUCKET = sagemaker_session.default_bucket()
# Resolve the execution role through the same session for consistency.
ROLE = get_execution_role(sagemaker_session=sagemaker_session)
print(REGION, " ", BUCKET, " ", ROLE)

# Key prefix inside BUCKET under which this notebook reads and writes its data.
PREFIX = "FraudDetection_AutoInsurance"

us-east-1   sagemaker-us-east-1-205930620783   arn:aws:iam::205930620783:role/service-role/AmazonSageMaker-ExecutionRole-20250401T145997


In [3]:
# Root URI of the session bucket; later cells reuse it for the offline store path.
s3_uri = "s3://" + BUCKET

# Read both preprocessed CSVs from <bucket>/<PREFIX>/data/ (claims first, then customers).
claims_data, customers_data = (
    pd.read_csv(f"{s3_uri}/{PREFIX}/data/{dataset}_preprocessed.csv")
    for dataset in ("claims", "customers")
)

In [5]:
# Show the column index of each dataset, each followed by a blank line.
for frame in (claims_data, customers_data):
    print(frame.columns, "\n")
print()
print("Policy id is the unique identifier in both cases")
# Row count next to the number of distinct policy_id values, per dataset;
# matching numbers mean every row has its own policy_id.
for frame in (claims_data, customers_data):
    print(len(frame), frame['policy_id'].nunique())

Index(['policy_id', 'incident_severity', 'num_vehicles_involved',
       'num_injuries', 'num_witnesses', 'police_report_available',
       'injury_claim', 'vehicle_claim', 'total_claim_amount', 'incident_month',
       'incident_day', 'incident_dow', 'incident_hour', 'fraud',
       'driver_relationship_self', 'driver_relationship_na',
       'driver_relationship_spouse', 'driver_relationship_child',
       'driver_relationship_other', 'incident_type_collision',
       'incident_type_breakin', 'incident_type_theft', 'collision_type_front',
       'collision_type_rear', 'collision_type_side', 'collision_type_na',
       'authorities_contacted_police', 'authorities_contacted_none',
       'authorities_contacted_fire', 'authorities_contacted_ambulance'],
      dtype='object') 

Index(['policy_id', 'customer_age', 'customer_education', 'months_as_customer',
       'policy_deductable', 'policy_annual_premium', 'policy_liability',
       'auto_year', 'num_claims_past_year', 'num_insurers_pa

### Create Two Feature Groups for claims and customers.
A feature group is the Feature Store equivalent of a table/dataset.

In [6]:
# Compute the timestamp suffix once so both feature groups are guaranteed to
# share it; two separate gmtime() calls could straddle a second boundary.
# Note the suffix is day-hour-minute-second only (no month or year).
feature_group_suffix = strftime("%d-%H-%M-%S", gmtime())
customers_feature_group_name = "customers-feature-group-" + feature_group_suffix
claims_feature_group_name = "claims-feature-group-" + feature_group_suffix
print(customers_feature_group_name, "\n", claims_feature_group_name)

customers-feature-group-20-06-17-56 
 claims-feature-group-20-06-17-56


In [7]:
# Client-side handles for the two feature groups, both bound to the shared session.
# Nothing is created in Feature Store until .create() is called.
customers_feature_group = FeatureGroup(
    name=customers_feature_group_name,
    sagemaker_session=sagemaker_session,
)
claims_feature_group = FeatureGroup(
    name=claims_feature_group_name,
    sagemaker_session=sagemaker_session,
)

In [8]:
# Every record needs an event-time feature, so stamp each dataset with the
# current epoch time (seconds, as a float) in a new 'EventTime' column.
for frame in (customers_data, claims_data):
    frame['EventTime'] = time.time()
print(customers_data['EventTime'])

0       1.745130e+09
1       1.745130e+09
2       1.745130e+09
3       1.745130e+09
4       1.745130e+09
            ...     
4995    1.745130e+09
4996    1.745130e+09
4997    1.745130e+09
4998    1.745130e+09
4999    1.745130e+09
Name: EventTime, Length: 5000, dtype: float64


### Load Feature Definitions into the feature groups

In [9]:
# Derive each group's feature definitions (name + type) from its DataFrame.
# Only the second call's return value is rendered as the cell output.
customers_feature_group.load_feature_definitions(data_frame=customers_data)
claims_feature_group.load_feature_definitions(data_frame=claims_data)

[FeatureDefinition(feature_name='policy_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='incident_severity', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='num_vehicles_involved', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='num_injuries', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='num_witnesses', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='police_report_available', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='injury_claim', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='vehicle_claim', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>,

### The following calls create two feature groups, customers_feature_group and claims_feature_group, respectively:
Note: this only creates the feature group structure in Feature Store; the data still has to be ingested in a later step.

In [10]:

# Both groups are created with the same settings: same offline-store S3 location,
# policy_id as record identifier, EventTime as event time, online store enabled.
feature_group_settings = dict(
    s3_uri=f"{s3_uri}/{PREFIX}/data/feature_store",
    record_identifier_name='policy_id',
    event_time_feature_name="EventTime",
    role_arn=ROLE,
    enable_online_store=True,
)

customers_feature_group.create(**feature_group_settings)

# Last expression: the claims create response is what the cell displays.
claims_feature_group.create(**feature_group_settings)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:205930620783:feature-group/claims-feature-group-20-06-17-56',
 'ResponseMetadata': {'RequestId': '91371b5b-1678-4466-94ab-4c8405b45d30',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '91371b5b-1678-4466-94ab-4c8405b45d30',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '109',
   'date': 'Sun, 20 Apr 2025 06:19:31 GMT'},
  'RetryAttempts': 2}}

In [11]:
# Fetch the customers feature group description and print it as plain text.
customers_description = customers_feature_group.describe()
print(customers_description)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:205930620783:feature-group/customers-feature-group-20-06-17-56', 'FeatureGroupName': 'customers-feature-group-20-06-17-56', 'RecordIdentifierFeatureName': 'policy_id', 'EventTimeFeatureName': 'EventTime', 'FeatureDefinitions': [{'FeatureName': 'policy_id', 'FeatureType': 'Integral'}, {'FeatureName': 'customer_age', 'FeatureType': 'Integral'}, {'FeatureName': 'customer_education', 'FeatureType': 'Integral'}, {'FeatureName': 'months_as_customer', 'FeatureType': 'Integral'}, {'FeatureName': 'policy_deductable', 'FeatureType': 'Integral'}, {'FeatureName': 'policy_annual_premium', 'FeatureType': 'Integral'}, {'FeatureName': 'policy_liability', 'FeatureType': 'Integral'}, {'FeatureName': 'auto_year', 'FeatureType': 'Integral'}, {'FeatureName': 'num_claims_past_year', 'FeatureType': 'Integral'}, {'FeatureName': 'num_insurers_past_5_years', 'FeatureType': 'Integral'}, {'FeatureName': 'customer_gender_male', 'FeatureType': 'Fractional'}, {'Featur

### List Feature Groups

In [12]:
# Two ways to reach the same API: a client built from the SageMaker session's
# boto session, and a client built from boto3's default session.
session_sm_client = sagemaker_session.boto_session.client('sagemaker')
print(session_sm_client.list_feature_groups())
print()
default_sm_client = boto3.client('sagemaker')
print(default_sm_client.list_feature_groups())

{'FeatureGroupSummaries': [{'FeatureGroupName': 'customers-feature-group-20-06-17-56', 'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:205930620783:feature-group/customers-feature-group-20-06-17-56', 'CreationTime': datetime.datetime(2025, 4, 20, 6, 19, 28, 702000, tzinfo=tzlocal()), 'FeatureGroupStatus': 'Created'}, {'FeatureGroupName': 'claims-feature-group-20-06-17-56', 'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:205930620783:feature-group/claims-feature-group-20-06-17-56', 'CreationTime': datetime.datetime(2025, 4, 20, 6, 19, 30, 828000, tzinfo=tzlocal()), 'FeatureGroupStatus': 'Creating'}], 'ResponseMetadata': {'RequestId': '4eb90569-66ec-4c9a-89b3-d41f0a6b4a95', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4eb90569-66ec-4c9a-89b3-d41f0a6b4a95', 'content-type': 'application/x-amz-json-1.1', 'content-length': '488', 'date': 'Sun, 20 Apr 2025 06:19:45 GMT'}, 'RetryAttempts': 0}}

{'FeatureGroupSummaries': [{'FeatureGroupName': 'customers-feature-group-20-06-17-56'

In [13]:
def check_feature_group_status(feature_group, poll_interval=5):
    """Block until ``feature_group`` leaves the "Creating" state.

    Polls ``feature_group.describe()`` and sleeps ``poll_interval`` seconds
    between polls for as long as the reported ``FeatureGroupStatus`` is
    "Creating".

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping with
            a ``FeatureGroupStatus`` key) and a ``name`` attribute.
        poll_interval: Seconds to wait between status polls. Defaults to 5.

    Raises:
        RuntimeError: If the final status is anything other than "Created"
            (for example "CreateFailed"), so that later cells do not run
            against a feature group that was never created.
    """
    description = feature_group.describe()
    while description.get("FeatureGroupStatus") == "Creating":
        print("Waiting for Feature Group to be Created")
        time.sleep(poll_interval)
        description = feature_group.describe()

    status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface the service-provided reason when the description carries one.
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"FeatureGroup {feature_group.name} ended in status {status!r}: {reason}"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Wait on each feature group in turn (customers first, then claims).
for feature_group in (customers_feature_group, claims_feature_group):
    check_feature_group_status(feature_group)

FeatureGroup customers-feature-group-20-06-17-56 successfully created.
FeatureGroup claims-feature-group-20-06-17-56 successfully created.


In [14]:
# Same ingestion settings for both groups: 3 worker threads, block until done.
ingest_options = {"max_workers": 3, "wait": True}

customers_feature_group.ingest(data_frame=customers_data, **ingest_options)
# Last expression: the claims ingestion manager is what the cell displays.
claims_feature_group.ingest(data_frame=claims_data, **ingest_options)

IngestionManagerPandas(feature_group_name='claims-feature-group-20-06-17-56', feature_definitions={'policy_id': {'FeatureName': 'policy_id', 'FeatureType': 'Integral'}, 'incident_severity': {'FeatureName': 'incident_severity', 'FeatureType': 'Fractional'}, 'num_vehicles_involved': {'FeatureName': 'num_vehicles_involved', 'FeatureType': 'Integral'}, 'num_injuries': {'FeatureName': 'num_injuries', 'FeatureType': 'Integral'}, 'num_witnesses': {'FeatureName': 'num_witnesses', 'FeatureType': 'Integral'}, 'police_report_available': {'FeatureName': 'police_report_available', 'FeatureType': 'Fractional'}, 'injury_claim': {'FeatureName': 'injury_claim', 'FeatureType': 'Integral'}, 'vehicle_claim': {'FeatureName': 'vehicle_claim', 'FeatureType': 'Integral'}, 'total_claim_amount': {'FeatureName': 'total_claim_amount', 'FeatureType': 'Integral'}, 'incident_month': {'FeatureName': 'incident_month', 'FeatureType': 'Integral'}, 'incident_day': {'FeatureName': 'incident_day', 'FeatureType': 'Integral'

In [15]:
# Look up a single record in the customers feature group by its policy_id.
customer_id = 1

featurestore_runtime = sagemaker_session.boto_session.client(
    'sagemaker-featurestore-runtime', region_name=REGION
)
sample_record = featurestore_runtime.get_record(
    FeatureGroupName=customers_feature_group_name,
    # The runtime API takes the record identifier as a string.
    RecordIdentifierValueAsString=str(customer_id),
)
print(sample_record)

{'ResponseMetadata': {'RequestId': '4c8deea1-b96c-4d38-8490-97e8aa385dcf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4c8deea1-b96c-4d38-8490-97e8aa385dcf', 'content-type': 'application/json', 'content-length': '1595', 'date': 'Sun, 20 Apr 2025 06:21:34 GMT'}, 'RetryAttempts': 0}, 'Record': [{'FeatureName': 'policy_id', 'ValueAsString': '1'}, {'FeatureName': 'customer_age', 'ValueAsString': '54'}, {'FeatureName': 'customer_education', 'ValueAsString': '2'}, {'FeatureName': 'months_as_customer', 'ValueAsString': '381'}, {'FeatureName': 'policy_deductable', 'ValueAsString': '750'}, {'FeatureName': 'policy_annual_premium', 'ValueAsString': '3000'}, {'FeatureName': 'policy_liability', 'ValueAsString': '1'}, {'FeatureName': 'auto_year', 'ValueAsString': '2019'}, {'FeatureName': 'num_claims_past_year', 'ValueAsString': '0'}, {'FeatureName': 'num_insurers_past_5_years', 'ValueAsString': '1'}, {'FeatureName': 'customer_gender_male', 'ValueAsString': '0.0'}, {'FeatureName': 'cu

In [18]:
# Fetch records "1" and "2" from both feature groups in a single batch call.
runtime_client = sagemaker_session.boto_session.client(
    "sagemaker-featurestore-runtime", region_name=REGION
)
lookup_requests = [
    {
        "FeatureGroupName": group_name,
        "RecordIdentifiersValueAsString": ["1", "2"],
    }
    for group_name in (customers_feature_group_name, claims_feature_group_name)
]
all_records = runtime_client.batch_get_record(Identifiers=lookup_requests)

In [19]:
# Display the stored batch_get_record response as the cell output.
all_records

{'ResponseMetadata': {'RequestId': 'ee2474eb-228b-4fbc-977d-e7f538bf2243',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ee2474eb-228b-4fbc-977d-e7f538bf2243',
   'content-type': 'application/json',
   'content-length': '8913',
   'date': 'Sun, 20 Apr 2025 06:29:17 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'claims-feature-group-20-06-17-56',
   'RecordIdentifierValueAsString': '2',
   'Record': [{'FeatureName': 'policy_id', 'ValueAsString': '2'},
    {'FeatureName': 'incident_severity', 'ValueAsString': '1.0'},
    {'FeatureName': 'num_vehicles_involved', 'ValueAsString': '2'},
    {'FeatureName': 'num_injuries', 'ValueAsString': '3'},
    {'FeatureName': 'num_witnesses', 'ValueAsString': '0'},
    {'FeatureName': 'police_report_available', 'ValueAsString': '0.0'},
    {'FeatureName': 'injury_claim', 'ValueAsString': '30000'},
    {'FeatureName': 'vehicle_claim', 'ValueAsString': '19500'},
    {'FeatureName': 'total_claim_amount', 'ValueAsString'

In [21]:
# Display the customers feature group description as the cell output.
customers_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:205930620783:feature-group/customers-feature-group-20-06-17-56',
 'FeatureGroupName': 'customers-feature-group-20-06-17-56',
 'RecordIdentifierFeatureName': 'policy_id',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'policy_id',
   'FeatureType': 'Integral'},
  {'FeatureName': 'customer_age', 'FeatureType': 'Integral'},
  {'FeatureName': 'customer_education', 'FeatureType': 'Integral'},
  {'FeatureName': 'months_as_customer', 'FeatureType': 'Integral'},
  {'FeatureName': 'policy_deductable', 'FeatureType': 'Integral'},
  {'FeatureName': 'policy_annual_premium', 'FeatureType': 'Integral'},
  {'FeatureName': 'policy_liability', 'FeatureType': 'Integral'},
  {'FeatureName': 'auto_year', 'FeatureType': 'Integral'},
  {'FeatureName': 'num_claims_past_year', 'FeatureType': 'Integral'},
  {'FeatureName': 'num_insurers_past_5_years', 'FeatureType': 'Integral'},
  {'FeatureName': 'customer_gender_male', 'FeatureTyp