## Part V - Feature Engineering and Feature Store

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [99]:
# setup environment
%run 0-Environment_Setup.ipynb

Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


### Get a rollup of total sales by store, by date

In [12]:
table_name = "cleaned_data"

# Define SQL query
statement = """SELECT 
    date,
    store_nbr,
    round(sum(sales), 2) as sales,
    max(dcoilwtico) as oil,
    max(is_holiday) as is_holiday,
    max(city) as city,
    max(state) as state,
    max(cluster) as cluster,
    max(onpromotion) as onpromotion
    FROM aai540finalprojectdb.cleaned_data
    GROUP BY (date, store_nbr)
ORDER BY date, store_nbr""".format(
    database_name, table_name
)

# get the results
store_sales_df = wr.athena.read_sql_query(statement, database_name)

### Do some further cleanup

- Check for null values
- Impute as necessary

In [13]:
# check nulls in housing_gmaps_data
print(store_sales_df.isnull().sum())

date               0
store_nbr          0
sales              0
oil            25974
is_holiday         0
city               0
state              0
cluster            0
onpromotion        0
dtype: int64


In [15]:
# impute oil values using forward fill
store_sales_df['oil'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  store_sales_df['oil'].fillna(method='ffill', inplace=True)
  store_sales_df['oil'].fillna(method='ffill', inplace=True)


In [20]:
# double check all nulls are imputed
store_sales_df[store_sales_df['oil'].isnull()]

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion


In [21]:
# break down date into year, month, day, day of week
store_sales_df["date"] = pd.to_datetime(store_sales_df["date"])
store_sales_df["year"] = store_sales_df["date"].dt.year
store_sales_df["month"] = store_sales_df["date"].dt.month
store_sales_df["day"] = store_sales_df["date"].dt.day
store_sales_df["dow"] = store_sales_df["date"].dt.dayofweek

In [22]:
# inspect
store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow
0,2013-01-01,1,0.00,93.14,1,Quito,Pichincha,13,0,2013,1,1,1
1,2013-01-01,2,0.00,93.14,1,Quito,Pichincha,13,0,2013,1,1,1
2,2013-01-01,3,0.00,93.14,1,Quito,Pichincha,8,0,2013,1,1,1
3,2013-01-01,4,0.00,93.14,1,Quito,Pichincha,9,0,2013,1,1,1
4,2013-01-01,5,0.00,93.14,1,Santo Domingo,Santo Domingo de los Tsachilas,4,0,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,Ambato,Tungurahua,14,35,2017,8,15,1
90932,2017-08-15,51,20154.56,47.57,0,Guayaquil,Guayas,17,29,2017,8,15,1
90933,2017-08-15,52,18600.05,47.57,0,Manta,Manabi,11,37,2017,8,15,1
90934,2017-08-15,53,8208.19,47.57,0,Manta,Manabi,13,33,2017,8,15,1


### Encode Categorical Columns

In [23]:
# use labelencoder to encode categorical columns
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

store_sales_df['city'] = le.fit_transform(store_sales_df['city'])
store_sales_df['state'] = le.fit_transform(store_sales_df['state'])

store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow
0,2013-01-01,1,0.00,93.14,1,18,12,13,0,2013,1,1,1
1,2013-01-01,2,0.00,93.14,1,18,12,13,0,2013,1,1,1
2,2013-01-01,3,0.00,93.14,1,18,12,8,0,2013,1,1,1
3,2013-01-01,4,0.00,93.14,1,18,12,9,0,2013,1,1,1
4,2013-01-01,5,0.00,93.14,1,21,14,4,0,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,0,15,14,35,2017,8,15,1
90932,2017-08-15,51,20154.56,47.57,0,8,6,17,29,2017,8,15,1
90933,2017-08-15,52,18600.05,47.57,0,14,10,11,37,2017,8,15,1
90934,2017-08-15,53,8208.19,47.57,0,14,10,13,33,2017,8,15,1


In [35]:
# convert date to format compatible with feature store
store_sales_df['date'] = store_sales_df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

In [41]:
# engineer id feature based on date and store_nbr
store_sales_df['sales_record_id'] = store_sales_df['date'].apply(lambda x: str(x)) + ":" + store_sales_df['store_nbr'].apply(lambda x: str(x))
store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id
0,2013-01-01,1,0.00,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:1
1,2013-01-01,2,0.00,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:2
2,2013-01-01,3,0.00,93.14,1,18,12,8,0,2013,1,1,1,2013-01-01:3
3,2013-01-01,4,0.00,93.14,1,18,12,9,0,2013,1,1,1,2013-01-01:4
4,2013-01-01,5,0.00,93.14,1,21,14,4,0,2013,1,1,1,2013-01-01:5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,0,15,14,35,2017,8,15,1,2017-08-15:50
90932,2017-08-15,51,20154.56,47.57,0,8,6,17,29,2017,8,15,1,2017-08-15:51
90933,2017-08-15,52,18600.05,47.57,0,14,10,11,37,2017,8,15,1,2017-08-15:52
90934,2017-08-15,53,8208.19,47.57,0,14,10,13,33,2017,8,15,1,2017-08-15:53


In [42]:
store_sales_df.dtypes

date               string[python]
store_nbr                   Int64
sales                     float64
oil                       float64
is_holiday                  Int32
city                        int64
state                       int64
cluster                     Int64
onpromotion                 Int64
year                        int32
month                       int32
day                         int32
dow                         int32
sales_record_id            object
dtype: object

### Create Features and Feature Store

In [70]:
# initialize sagemaker featurestore session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [71]:
# helper function to convert col data types to non-objects
def cast_columns(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")
        elif data_frame.dtypes[label] == "bool":
            data_frame[label] = data_frame[label].astype("int")
            
cast_columns(store_sales_df)

In [72]:
# create fime feature
import time

current_time_sec = int(round(time.time()))

# set sales_record_id as primary key
record_identifier_feature_name = 'sales_record_id'
event_time_feature_name = "event_time"

# append EventTime feature
store_sales_df[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(store_sales_df), dtype="float64"
)

In [73]:
from time import gmtime, strftime, sleep

# define feature group names    
store_sales_feature_group_name = "store-sales-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

# define feature group
store_sales_feature_group = FeatureGroup(
    name=store_sales_feature_group_name, sagemaker_session=feature_store_session
)

store_sales_feature_group.load_feature_definitions(data_frame=store_sales_df)

[FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='store_nbr', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='sales', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='oil', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='is_holiday', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='city', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='state', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='cluster', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='onpromotion', featur

In [74]:
# helper function to monitor feature group creation status
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

In [75]:
# Default S3 bucket for offline feature store
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-featurestore-store-sales"

In [76]:
# Create Offline Feature Store

# create feature group
store_sales_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=store_sales_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup store-sales-feature-group-24-06-02-34 successfully created.


In [77]:
# ingest sales data
store_sales_feature_group.ingest(data_frame=store_sales_df, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='store-sales-feature-group-24-06-02-34', feature_definitions={'date': {'FeatureName': 'date', 'FeatureType': 'String'}, 'store_nbr': {'FeatureName': 'store_nbr', 'FeatureType': 'Integral'}, 'sales': {'FeatureName': 'sales', 'FeatureType': 'Fractional'}, 'oil': {'FeatureName': 'oil', 'FeatureType': 'Fractional'}, 'is_holiday': {'FeatureName': 'is_holiday', 'FeatureType': 'Integral'}, 'city': {'FeatureName': 'city', 'FeatureType': 'Integral'}, 'state': {'FeatureName': 'state', 'FeatureType': 'Integral'}, 'cluster': {'FeatureName': 'cluster', 'FeatureType': 'Integral'}, 'onpromotion': {'FeatureName': 'onpromotion', 'FeatureType': 'Integral'}, 'year': {'FeatureName': 'year', 'FeatureType': 'Integral'}, 'month': {'FeatureName': 'month', 'FeatureType': 'Integral'}, 'day': {'FeatureName': 'day', 'FeatureType': 'Integral'}, 'dow': {'FeatureName': 'dow', 'FeatureType': 'Integral'}, 'sales_record_id': {'FeatureName': 'sales_record_id', 'FeatureType': 'St

In [78]:
# Test a batch query
featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": store_sales_feature_group_name,
            "RecordIdentifiersValueAsString": ["2017-08-15:1", "2017-08-15:2", "2017-08-15:3", "2017-08-15:4", "2017-08-15:5"],
        }
    ]
)

{'ResponseMetadata': {'RequestId': 'b7106e4b-e793-4b95-8397-2a539d4f3923',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b7106e4b-e793-4b95-8397-2a539d4f3923',
   'content-type': 'application/json',
   'content-length': '6266',
   'date': 'Tue, 24 Sep 2024 06:09:59 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'store-sales-feature-group-24-06-02-34',
   'RecordIdentifierValueAsString': '2017-08-15:5',
   'Record': [{'FeatureName': 'date', 'ValueAsString': '2017-08-15'},
    {'FeatureName': 'store_nbr', 'ValueAsString': '5'},
    {'FeatureName': 'sales', 'ValueAsString': '8703.69'},
    {'FeatureName': 'oil', 'ValueAsString': '47.57'},
    {'FeatureName': 'is_holiday', 'ValueAsString': '0'},
    {'FeatureName': 'city', 'ValueAsString': '21'},
    {'FeatureName': 'state', 'ValueAsString': '14'},
    {'FeatureName': 'cluster', 'ValueAsString': '4'},
    {'FeatureName': 'onpromotion', 'ValueAsString': '26'},
    {'FeatureName': 'year', 'ValueAsString': '

### Load dataset from Offline Feature Store

In [84]:
store_sales_feature_group_resolved_output_s3_uri = (
    store_sales_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

store_sales_feature_group_s3_prefix = store_sales_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)

offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=default_s3_bucket_name, Prefix=store_sales_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...\n")
        sleep(60)

print("Data available.")

Data available.


In [92]:
# test loading dataset from feature group
get_dataset_from_offline_feature_group(store_sales_feature_group)

Running 
    SELECT *
    FROM
        "store_sales_feature_group_24_06_02_34_1727157764"
    


Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2015-12-11,21,8163.51,35.65,0,21,14,6,55,2015,12,11,4,2015-12-11:21,1.727158e+09,2024-09-24 06:14:14.122,2024-09-24 06:09:14.000,False
1,2017-07-06,40,13983.18,45.52,0,13,4,3,66,2017,7,6,3,2017-07-06:40,1.727158e+09,2024-09-24 06:14:14.122,2024-09-24 06:09:14.000,False
2,2015-12-12,44,63828.27,35.65,0,18,12,5,39,2015,12,12,5,2015-12-12:44,1.727158e+09,2024-09-24 06:14:14.122,2024-09-24 06:09:14.000,False
3,2015-12-12,46,38898.76,35.65,0,18,12,14,27,2015,12,12,5,2015-12-12:46,1.727158e+09,2024-09-24 06:14:14.122,2024-09-24 06:09:14.000,False
4,2014-05-29,48,9507.73,104.26,0,18,12,14,0,2014,5,29,3,2014-05-29:48,1.727158e+09,2024-09-24 06:14:14.122,2024-09-24 06:09:15.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2015-12-09,43,8929.80,37.16,0,6,5,10,71,2015,12,9,2,2015-12-09:43,1.727158e+09,2024-09-24 06:09:14.650,2024-09-24 06:09:13.000,False
90932,2015-12-09,48,18060.71,37.16,0,18,12,14,194,2015,12,9,2,2015-12-09:48,1.727158e+09,2024-09-24 06:09:14.650,2024-09-24 06:09:13.000,False
90933,2014-05-25,54,8118.94,105.01,0,5,10,3,0,2014,5,25,6,2014-05-25:54,1.727158e+09,2024-09-24 06:09:14.650,2024-09-24 06:09:13.000,False
90934,2014-05-26,30,3259.29,104.78,0,8,6,3,0,2014,5,26,0,2014-05-26:30,1.727158e+09,2024-09-24 06:09:14.650,2024-09-24 06:09:13.000,False
