# Load and Prepare Your Dataset from S3

In [1]:
!pip install s3fs



In [5]:
import boto3

# Create an S3 client.
s3 = boto3.client('s3')

# Local file to upload, destination bucket, and destination object key.
# The key is defined once so the upload call and the log message cannot drift apart.
file_path = 'filtered_data.csv'
bucket_name = 'mycapstone5566'
object_key = 'filtered_data.csv'

# Upload the local file to s3://<bucket_name>/<object_key>.
s3.upload_file(file_path, bucket_name, object_key)

print(f"The file {file_path} has been uploaded to s3://{bucket_name}/{object_key}")


The file filtered_data.csv has been uploaded to s3://mycapstone5566/filtered_data.csv


In [7]:
import pandas as pd
import time

# Load the CSV directly from S3 (pandas relies on s3fs for s3:// paths: pip install s3fs).
s3_path = "s3://mycapstone5566/filtered_data.csv"
df = pd.read_csv(s3_path)

print(df.head())

# One Unix timestamp (whole seconds), taken once and shared by every record.
current_time = int(time.time())

# Replace missing values with 0, then append the two columns used by the
# Feature Store cells below:
#   EventTime - the timestamp captured above, identical for all rows
#   RecordId  - the DataFrame index rendered as a string, used as a simple unique id
df = df.fillna(0).assign(
    EventTime=current_time,
    RecordId=lambda frame: frame.index.astype(str),
)


   scaled_amount  scaled_time        V1       V13       V15       V20  \
0       1.783274    -0.994983 -1.359807 -0.991390  1.468177  0.251412   
1      -0.269825    -0.994983  1.191857  0.489095  0.635558 -0.069083   
2       4.983721    -0.994972 -1.358354  0.717293  2.345865  0.524980   
3       1.418291    -0.994972 -0.966272  0.507757 -0.631418 -0.208038   
4       0.670579    -0.994960 -1.158233  1.345852  0.175121  0.408542   

        V23       V24       V25       V26       V27       V28  Class  
0 -0.110474  0.066928  0.128539 -0.189115  0.133558 -0.021053      0  
1  0.101288 -0.339846  0.167170  0.125895 -0.008983  0.014724      0  
2  0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752      0  
3 -0.190321 -1.175575  0.647376 -0.221929  0.062723  0.061458      0  
4 -0.137458  0.141267 -0.206010  0.502292  0.219422  0.215153      0  


Create a Feature Group in SageMaker Feature Store

Initialize the SageMaker Session:

In [8]:
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role

# Build the SageMaker session first; the default bucket and region are read from it.
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Execution role returned by the SageMaker SDK for this environment.
role = get_execution_role()

# S3 location for the offline store.
# NOTE(review): `prefix` is defined but not used in `offline_store_uri`, which
# points at ".../dataset" instead - confirm which location is intended.
prefix = "sagemaker-featurestore"
offline_store_uri = "s3://{}/dataset".format(default_bucket)




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Define and Create the Feature Group:

Create the Feature Group:

In [9]:
# Convert EventTime from integer seconds to float before the feature
# definitions are loaded from the DataFrame.
# NOTE(review): presumably needed so the event-time feature is registered as
# Fractional rather than Integral - confirm against the Feature Store docs.
df = df.astype({'EventTime': float})

In [10]:
from sagemaker.feature_store.feature_group import FeatureGroup

import time

feature_group_name = "filteredfgroup"
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

# Derive the feature definitions (names and types) from the DataFrame columns.
feature_group.load_feature_definitions(data_frame=df)

# Create the feature group with an offline store only.
create_response = feature_group.create(
    s3_uri="s3://mycapstone5566/filteredfeaturegroup",
    record_identifier_name="RecordId",
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=False,  # Offline only
)

# create() returns before the feature group is usable. Poll until it leaves
# the "Creating" state so that a later ingest (e.g. on Restart & Run All)
# does not run against a group that is still being provisioned.
status = feature_group.describe().get("FeatureGroupStatus")
while status == "Creating":
    time.sleep(5)
    status = feature_group.describe().get("FeatureGroupStatus")
if status != "Created":
    raise RuntimeError(
        f"Feature group {feature_group_name} was not created (status: {status})"
    )

# Keep the create response as the cell's displayed output.
create_response


{'FeatureGroupArn': 'arn:aws:sagemaker:ap-south-1:419622399030:feature-group/filteredfgroup',
 'ResponseMetadata': {'RequestId': '2e91e42b-540c-473e-9220-120d1f8281ad',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2e91e42b-540c-473e-9220-120d1f8281ad',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Wed, 26 Mar 2025 10:49:44 GMT'},
  'RetryAttempts': 0}}

Ingest Data into the Feature Store

In [11]:
# Send every row of `df` to the feature group with two workers;
# wait=True makes the call block until ingestion has completed.
feature_group.ingest(
    data_frame=df,
    max_workers=2,
    wait=True,
)

Validate Ingestion and Data

In [31]:
# Show the feature group's metadata (ARN, name, record-id / event-time feature
# names, feature definitions) to confirm how it was registered.
# NOTE(review): the output recorded in this notebook reports 'cc-feature-group',
# not the 'filteredfgroup' created above - it looks stale; re-run after a kernel restart.
feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:ap-south-1:419622399030:feature-group/cc-feature-group',
 'FeatureGroupName': 'cc-feature-group',
 'RecordIdentifierFeatureName': 'RecordId',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'Time', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V1', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V2', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V3', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V4', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V5', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V6', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V7', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V8', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V9', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V10', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V11', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V12', 'FeatureType': 'Fractional'},
  {'FeatureName': 'V13', 'FeatureType': 'Fractional'

In [35]:
import s3fs
# Offline store location to validate.
# NOTE(review): this hard-coded path belongs to 'cc-feature-group' under the
# 'featuregroup' prefix, while this notebook creates 'filteredfgroup' under
# 'filteredfeaturegroup' - confirm it points at the group you mean to check.
offline_store_path = "s3://mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/"
fs = s3fs.S3FileSystem()

# Recursively list every Parquet file under the offline store path.
parquet_files = fs.glob(offline_store_path + "**/*.parquet")
print("Found Parquet files:", parquet_files)

# Fail early with a clear message when nothing has been written yet.
if not parquet_files:
    raise ValueError("No Parquet files found in the offline store!")

# Read each Parquet file and close its S3 handle as soon as it has been
# parsed, instead of leaving one open handle per file behind.
frames = []
for parquet_file in parquet_files:
    with fs.open(parquet_file, mode='rb') as handle:
        frames.append(pd.read_parquet(handle))

# Combine all files into a single DataFrame (original per-file indices are kept).
df_offline = pd.concat(frames)

# Display sample records
print(df_offline.head())


Found Parquet files: ['mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/year=2025/month=03/day=11/hour=20/20250311T201007Z_0OG0KzGwk7AMpi4Z.parquet', 'mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/year=2025/month=03/day=11/hour=20/20250311T201007Z_0X6xjrXRZ1EeKsov.parquet', 'mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/year=2025/month=03/day=11/hour=20/20250311T201007Z_0lTsuuWpALSlQiNX.parquet', 'mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/year=2025/month=03/day=11/hour=20/20250311T201007Z_0wu1IKCi3B4HCQpB.parquet', 'mycapstone5566/featuregroup/419622399030/sagemaker/ap-south-1/offline-store/cc-feature-group-1741723945/data/year=2025/month=03/day=11/hour=20/20250311T201007Z_12i72rRMY8J6liAr.parquet', 'mycapstone5566/featuregroup/419622399