## Part V - Feature Engineering and Feature Store

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [1]:
# setup environment
%run 0-Environment_Setup.ipynb

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


### Get a rollup of total sales by store, by date

In [2]:
table_name = "cleaned_data"

# Define SQL query
statement = """SELECT 
    date,
    store_nbr,
    round(sum(sales), 2) as sales,
    max(dcoilwtico) as oil,
    max(is_holiday) as is_holiday,
    max(city) as city,
    max(state) as state,
    max(cluster) as cluster,
    max(onpromotion) as onpromotion
    FROM aai540finalprojectdb.cleaned_data
    GROUP BY (date, store_nbr)
ORDER BY date, store_nbr""".format(
    database_name, table_name
)

# get the results
store_sales_df = wr.athena.read_sql_query(statement, database_name)

2024-09-25 01:09:48,671	INFO worker.py:1771 -- Started a local Ray instance.


### Do some further cleanup

- Check for null values
- Impute as necessary

In [3]:
# check nulls in housing_gmaps_data
print(store_sales_df.isnull().sum())

date               0
store_nbr          0
sales              0
oil            25974
is_holiday         0
city               0
state              0
cluster            0
onpromotion        0
dtype: int64


In [4]:
# impute oil values using forward fill
store_sales_df['oil'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  store_sales_df['oil'].fillna(method='ffill', inplace=True)
  store_sales_df['oil'].fillna(method='ffill', inplace=True)


In [5]:
# double check all nulls are imputed
store_sales_df[store_sales_df['oil'].isnull()]

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion


In [6]:
# break down date into year, month, day, day of week
store_sales_df["date"] = pd.to_datetime(store_sales_df["date"])
store_sales_df["year"] = store_sales_df["date"].dt.year
store_sales_df["month"] = store_sales_df["date"].dt.month
store_sales_df["day"] = store_sales_df["date"].dt.day
store_sales_df["dow"] = store_sales_df["date"].dt.dayofweek

In [7]:
# inspect
store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow
0,2013-01-01,1,0.00,93.14,1,Quito,Pichincha,13,0,2013,1,1,1
1,2013-01-01,2,0.00,93.14,1,Quito,Pichincha,13,0,2013,1,1,1
2,2013-01-01,3,0.00,93.14,1,Quito,Pichincha,8,0,2013,1,1,1
3,2013-01-01,4,0.00,93.14,1,Quito,Pichincha,9,0,2013,1,1,1
4,2013-01-01,5,0.00,93.14,1,Santo Domingo,Santo Domingo de los Tsachilas,4,0,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,Ambato,Tungurahua,14,35,2017,8,15,1
90932,2017-08-15,51,20154.56,47.57,0,Guayaquil,Guayas,17,29,2017,8,15,1
90933,2017-08-15,52,18600.05,47.57,0,Manta,Manabi,11,37,2017,8,15,1
90934,2017-08-15,53,8208.19,47.57,0,Manta,Manabi,13,33,2017,8,15,1


### Encode Categorical Columns

In [8]:
# use labelencoder to encode categorical columns
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

store_sales_df['city'] = le.fit_transform(store_sales_df['city'])
store_sales_df['state'] = le.fit_transform(store_sales_df['state'])

store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow
0,2013-01-01,1,0.00,93.14,1,18,12,13,0,2013,1,1,1
1,2013-01-01,2,0.00,93.14,1,18,12,13,0,2013,1,1,1
2,2013-01-01,3,0.00,93.14,1,18,12,8,0,2013,1,1,1
3,2013-01-01,4,0.00,93.14,1,18,12,9,0,2013,1,1,1
4,2013-01-01,5,0.00,93.14,1,21,14,4,0,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,0,15,14,35,2017,8,15,1
90932,2017-08-15,51,20154.56,47.57,0,8,6,17,29,2017,8,15,1
90933,2017-08-15,52,18600.05,47.57,0,14,10,11,37,2017,8,15,1
90934,2017-08-15,53,8208.19,47.57,0,14,10,13,33,2017,8,15,1


In [9]:
# convert date to format compatible with feature store
store_sales_df['date'] = store_sales_df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

In [10]:
# engineer id feature based on date and store_nbr
store_sales_df['sales_record_id'] = store_sales_df['date'].apply(lambda x: str(x)) + ":" + store_sales_df['store_nbr'].apply(lambda x: str(x))
store_sales_df

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id
0,2013-01-01,1,0.00,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:1
1,2013-01-01,2,0.00,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:2
2,2013-01-01,3,0.00,93.14,1,18,12,8,0,2013,1,1,1,2013-01-01:3
3,2013-01-01,4,0.00,93.14,1,18,12,9,0,2013,1,1,1,2013-01-01:4
4,2013-01-01,5,0.00,93.14,1,21,14,4,0,2013,1,1,1,2013-01-01:5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90931,2017-08-15,50,16879.12,47.57,0,0,15,14,35,2017,8,15,1,2017-08-15:50
90932,2017-08-15,51,20154.56,47.57,0,8,6,17,29,2017,8,15,1,2017-08-15:51
90933,2017-08-15,52,18600.05,47.57,0,14,10,11,37,2017,8,15,1,2017-08-15:52
90934,2017-08-15,53,8208.19,47.57,0,14,10,13,33,2017,8,15,1,2017-08-15:53


In [11]:
store_sales_df.dtypes

date                object
store_nbr            Int64
sales              float64
oil                float64
is_holiday           Int32
city                 int64
state                int64
cluster              Int64
onpromotion          Int64
year                 int32
month                int32
day                  int32
dow                  int32
sales_record_id     object
dtype: object

### Create Features and Feature Store

In [12]:
# helper function to convert col data types to non-objects
def cast_columns(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")
        elif data_frame.dtypes[label] == "bool":
            data_frame[label] = data_frame[label].astype("int")
            
cast_columns(store_sales_df)

In [13]:
# create fime feature
import time

current_time_sec = int(round(time.time()))

# set sales_record_id as primary key
record_identifier_feature_name = 'sales_record_id'
event_time_feature_name = "event_time"

# append EventTime feature
store_sales_df[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(store_sales_df), dtype="float64"
)

In [14]:
from time import gmtime, strftime, sleep
store_sales_feature_group.load_feature_definitions(data_frame=store_sales_df)

[FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='store_nbr', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='sales', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='oil', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='is_holiday', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='city', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='state', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='cluster', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='onpromotion', featur

In [15]:
# helper function to monitor feature group creation status
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

In [21]:
# Create Offline Feature Store

# create feature group
store_sales_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=store_sales_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup store-sales-feature-group-offline successfully created.


In [22]:
# ingest sales data
store_sales_feature_group.ingest(data_frame=store_sales_df, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='store-sales-feature-group-offline', feature_definitions={'date': {'FeatureName': 'date', 'FeatureType': 'String'}, 'store_nbr': {'FeatureName': 'store_nbr', 'FeatureType': 'Integral'}, 'sales': {'FeatureName': 'sales', 'FeatureType': 'Fractional'}, 'oil': {'FeatureName': 'oil', 'FeatureType': 'Fractional'}, 'is_holiday': {'FeatureName': 'is_holiday', 'FeatureType': 'Integral'}, 'city': {'FeatureName': 'city', 'FeatureType': 'Integral'}, 'state': {'FeatureName': 'state', 'FeatureType': 'Integral'}, 'cluster': {'FeatureName': 'cluster', 'FeatureType': 'Integral'}, 'onpromotion': {'FeatureName': 'onpromotion', 'FeatureType': 'Integral'}, 'year': {'FeatureName': 'year', 'FeatureType': 'Integral'}, 'month': {'FeatureName': 'month', 'FeatureType': 'Integral'}, 'day': {'FeatureName': 'day', 'FeatureType': 'Integral'}, 'dow': {'FeatureName': 'dow', 'FeatureType': 'Integral'}, 'sales_record_id': {'FeatureName': 'sales_record_id', 'FeatureType': 'String

In [23]:
# Test a batch query
featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": store_sales_feature_group_name,
            "RecordIdentifiersValueAsString": ["2017-08-15:1", "2017-08-15:2", "2017-08-15:3", "2017-08-15:4", "2017-08-15:5"],
        }
    ]
)

{'ResponseMetadata': {'RequestId': 'd26026e7-3363-44e7-be10-2954b97b7615',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd26026e7-3363-44e7-be10-2954b97b7615',
   'content-type': 'application/json',
   'content-length': '6246',
   'date': 'Wed, 25 Sep 2024 01:23:54 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'store-sales-feature-group-offline',
   'RecordIdentifierValueAsString': '2017-08-15:5',
   'Record': [{'FeatureName': 'date', 'ValueAsString': '2017-08-15'},
    {'FeatureName': 'store_nbr', 'ValueAsString': '5'},
    {'FeatureName': 'sales', 'ValueAsString': '8703.69'},
    {'FeatureName': 'oil', 'ValueAsString': '47.57'},
    {'FeatureName': 'is_holiday', 'ValueAsString': '0'},
    {'FeatureName': 'city', 'ValueAsString': '21'},
    {'FeatureName': 'state', 'ValueAsString': '14'},
    {'FeatureName': 'cluster', 'ValueAsString': '4'},
    {'FeatureName': 'onpromotion', 'ValueAsString': '26'},
    {'FeatureName': 'year', 'ValueAsString': '2017

### Load dataset from Offline Feature Store

In [24]:
store_sales_feature_group_resolved_output_s3_uri = (
    store_sales_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

store_sales_feature_group_s3_prefix = store_sales_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)

offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=default_s3_bucket_name, Prefix=store_sales_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...\n")
        sleep(60)

print("Data available.")

Data available.


In [25]:
# test loading dataset from feature group
sales_features_timestamp = get_raw_dataset_from_offline_feature_group(store_sales_feature_group)

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1727227039"
    


In [26]:
# inspect the dataset
sales_features_timestamp

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2016-01-31,43,13725.43,33.66,0,6,5,10,41,2016,1,31,6,2016-01-31:43,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:18:05.000,False
1,2013-01-01,10,0.00,93.14,1,18,12,15,0,2013,1,1,1,2013-01-01:10,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:18:05.000,False
2,2016-02-02,1,10655.79,29.90,0,18,12,13,38,2016,2,2,1,2016-02-02:1,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:18:06.000,False
3,2014-07-18,43,8631.46,103.83,0,6,5,10,3,2014,7,18,4,2014-07-18:43,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:18:06.000,False
4,2016-02-02,32,4258.27,29.90,0,8,6,3,43,2016,2,2,1,2016-02-02:32,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:18:07.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77315,2017-05-20,41,15098.72,50.32,0,13,4,4,41,2017,5,20,5,2017-05-20:41,1.727227e+09,2024-09-25 01:23:17.579,2024-09-25 01:23:02.000,False
77316,2017-05-21,15,7859.87,50.32,0,9,7,15,52,2017,5,21,6,2017-05-21:15,1.727227e+09,2024-09-25 01:23:17.579,2024-09-25 01:23:02.000,False
77317,2014-04-28,12,4277.32,101.13,0,10,3,15,0,2014,4,28,0,2014-04-28:12,1.727227e+09,2024-09-25 01:23:17.579,2024-09-25 01:23:02.000,False
77318,2014-04-28,14,5573.60,101.13,0,19,2,7,0,2014,4,28,0,2014-04-28:14,1.727227e+09,2024-09-25 01:23:17.579,2024-09-25 01:23:02.000,False


In [27]:
# get features stacked by store then date
sales_features_store = get_store_dataset_from_offline_feature_group(store_sales_feature_group)

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1727227039"
    ORDER BY
        store_nbr ASC, date ASC
    


In [28]:
# inspect
sales_features_store

Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.00,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:1,1.727227e+09,2024-09-25 01:23:03.697,2024-09-25 01:18:05.000,False
1,2013-01-02,1,7417.15,93.14,0,18,12,13,0,2013,1,2,2,2013-01-02:1,1.727227e+09,2024-09-25 01:23:07.224,2024-09-25 01:18:06.000,False
2,2013-01-03,1,5873.24,92.97,0,18,12,13,0,2013,1,3,3,2013-01-03:1,1.727227e+09,2024-09-25 01:23:03.964,2024-09-25 01:18:07.000,False
3,2013-01-04,1,5919.88,93.12,0,18,12,13,0,2013,1,4,4,2013-01-04:1,1.727227e+09,2024-09-25 01:23:03.859,2024-09-25 01:18:07.000,False
4,2013-01-05,1,6318.79,93.12,1,18,12,13,0,2013,1,5,5,2013-01-05:1,1.727227e+09,2024-09-25 01:23:03.726,2024-09-25 01:18:08.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77315,2017-05-16,54,9195.98,48.64,0,5,10,3,76,2017,5,16,1,2017-05-16:54,1.727227e+09,2024-09-25 01:23:03.905,2024-09-25 01:22:59.000,False
77316,2017-05-17,54,17721.09,49.04,0,5,10,3,63,2017,5,17,2,2017-05-17:54,1.727227e+09,2024-09-25 01:23:03.691,2024-09-25 01:22:59.000,False
77317,2017-05-18,54,5997.13,49.36,0,5,10,3,47,2017,5,18,3,2017-05-18:54,1.727227e+09,2024-09-25 01:23:17.789,2024-09-25 01:23:00.000,False
77318,2017-05-19,54,13040.34,50.32,0,5,10,3,45,2017,5,19,4,2017-05-19:54,1.727227e+09,2024-09-25 01:23:03.680,2024-09-25 01:23:01.000,False
