# AWS SageMaker Data Lake and Feature Store for Time Series Forecasting

This notebook creates:
1. **Data Lake** - Parquet files in the default SageMaker S3 bucket + Athena tables for the aggregated time series and the engineered features
2. **Feature Store** - SageMaker Feature Store for engineered features

This supports the demand forecasting models built in `Modeling.ipynb`.

## Part 1: Data Lake Setup

Initialize SageMaker SDK and configure AWS resources.

In [None]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
from datetime import datetime
import time

# SageMaker session, its default bucket, and the notebook's execution role
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region comes from the default boto3 session; both service clients are pinned to it
region = boto3.Session().region_name
s3_client, athena_client = (
    boto3.client(service, region_name=region) for service in ('s3', 'athena')
)

# Echo the resolved configuration so a reader can confirm the target account setup
for label, value in (
    ("SageMaker Role", role),
    ("Default S3 Bucket", bucket),
    ("Region", region),
):
    print(f"{label}: {value}")

### Load and Prepare Data
Load dataset.csv and create aggregated + engineered feature tables.

In [None]:
# Read the raw observations from the local CSV
df_raw = pd.read_csv('dataset.csv')
print(f"Raw data shape: {df_raw.shape}")
print(f"Raw data columns: {df_raw.columns.tolist()}")

# Parse the timestamp column so the grouping key is a real datetime
df_raw['timestamp'] = pd.to_datetime(df_raw['timestamp'])

# One row per timestamp: total `value` over every raw row sharing that timestamp,
# renamed to ds/y, ordered by time, with a copy of ds kept as event_time
# (the column later used for SageMaker Feature Store).
df_agg = (
    df_raw.groupby('timestamp')['value']
    .sum()
    .reset_index()
    .set_axis(['ds', 'y'], axis=1)
    .sort_values('ds')
    .reset_index(drop=True)
    .assign(event_time=lambda frame: frame['ds'])
)

date_start, date_end = df_agg['ds'].min(), df_agg['ds'].max()
print(f"\nAggregated data shape: {df_agg.shape}")
print(f"Date range: {date_start} to {date_end}")
print(df_agg.head())

In [None]:
# Create engineered features
def create_features(df):
    """Return a copy of ``df`` extended with calendar, lag and rolling features.

    Args:
        df: DataFrame with a datetime column ``ds`` and a numeric column ``y``.
            The input frame is not modified.

    Returns:
        A new DataFrame holding the original columns followed by:
        ``month``, ``quarter``, ``year`` (taken from ``ds``);
        ``month_sin``, ``month_cos`` (month encoded on a 12-step circle);
        ``trend`` (0-based row position);
        ``lag_1``, ``lag_2``, ``lag_3``, ``lag_6``, ``lag_12`` (``y`` shifted by that many rows);
        ``rolling_mean_3``, ``rolling_mean_12``, ``rolling_std_3`` (statistics over
        the preceding rows only, because ``y`` is shifted by one row first).
        Leading rows without enough history contain NaN in the lag/rolling columns.
    """
    stamps = df['ds']
    target = df['y']

    month_no = stamps.dt.month
    month_angle = 2 * np.pi * month_no / 12

    # Shift once so every rolling window excludes the current row's own value.
    previous = target.shift(1)

    calendar_cols = {
        'month': month_no,
        'quarter': stamps.dt.quarter,
        'year': stamps.dt.year,
        'month_sin': np.sin(month_angle),
        'month_cos': np.cos(month_angle),
        'trend': np.arange(len(df)),
    }
    lag_cols = {f'lag_{step}': target.shift(step) for step in (1, 2, 3, 6, 12)}
    rolling_cols = {
        'rolling_mean_3': previous.rolling(3).mean(),
        'rolling_mean_12': previous.rolling(12).mean(),
        'rolling_std_3': previous.rolling(3).std(),
    }

    # assign() returns a new frame and keeps keyword order, so column order is
    # calendar -> lags -> rolling statistics.
    return df.assign(**calendar_cols, **lag_cols, **rolling_cols)

# Build the feature table and drop every row that has a NaN in any column
# (the leading rows lack enough history for the lag/rolling features).
df_feat = (
    create_features(df_agg)
    .dropna()
    .reset_index(drop=True)
    .assign(event_time=lambda frame: frame['ds'])  # For SageMaker Feature Store
)

feature_cols = [c for c in df_feat.columns if c not in ['ds', 'y']]
print(f"Engineered features shape: {df_feat.shape}")
print(f"Feature columns: {feature_cols}")
print(df_feat.head())

### Upload Data to S3 Data Lake

In [None]:
# Define S3 paths.
# Key prefixes are kept WITHOUT a trailing slash and reused for both the upload
# and the Athena LOCATION, so the two can never drift apart.
s3_prefix = 'timeseries-demand-forecasting'
agg_key_prefix = f'{s3_prefix}/data-lake/aggregated'
feat_key_prefix = f'{s3_prefix}/data-lake/features'
s3_agg_path = f's3://{bucket}/{agg_key_prefix}/'
s3_feat_path = f's3://{bucket}/{feat_key_prefix}/'
s3_staging_dir = f's3://{bucket}/{s3_prefix}/athena/staging'

# Convert to Parquet for better performance
df_agg.to_parquet('df_agg.parquet', index=False)
df_feat.to_parquet('df_feat.parquet', index=False)

# Upload to S3.
# NOTE(review): Session.upload_data appears to build the object key as
# "<key_prefix>/<file name>", so a key_prefix ending in "/" would produce a key
# with an empty "//" path segment. Passing the prefix without the trailing slash
# yields ".../aggregated/df_agg.parquet" either way -- confirm against the SDK version in use.
sess.upload_data('df_agg.parquet', bucket=bucket, key_prefix=agg_key_prefix)
sess.upload_data('df_feat.parquet', bucket=bucket, key_prefix=feat_key_prefix)

print(f"✓ Data uploaded to S3")
print(f"  Aggregated: {s3_agg_path}")
print(f"  Features:   {s3_feat_path}")

### Create Athena Database & Tables

Use Athena to query data in the data lake.

In [None]:
from pyathena import connect

# Setup Athena connection
database_name = "demand_forecasting"
table_agg = "timeseries_aggregated"
table_feat = "timeseries_features"

conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# DDL statements return no rows, so run them through a cursor rather than
# pd.read_sql (which is meant for queries that produce a result set).
ddl_cursor = conn.cursor()

# Create database
create_db = f"CREATE DATABASE IF NOT EXISTS {database_name}"
ddl_cursor.execute(create_db)
print(f"✓ Database '{database_name}' created")

# Create aggregated table
create_agg_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_agg} (
  ds timestamp,
  y double,
  event_time timestamp
)
STORED AS PARQUET
LOCATION '{s3_agg_path}'
"""
ddl_cursor.execute(create_agg_table)
print(f"✓ Table '{table_agg}' created")

# Create features table.
# `trend` is produced by np.arange, which yields 64-bit integers on Linux, so it
# is written to Parquet as INT64 and must be declared bigint: a 64-bit Parquet
# column cannot be read through a 32-bit `int` table column.
# NOTE(review): month/quarter/year come from the pandas .dt accessors, whose
# integer width depends on the pandas version (int32 in pandas 2.x) -- confirm
# they are written as INT32, otherwise declare them bigint as well.
# NOTE(review): because of IF NOT EXISTS, a table created by an earlier run keeps
# its old column types; drop it first if it was created with `trend int`.
create_feat_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_feat} (
  ds timestamp,
  y double,
  event_time timestamp,
  month int,
  quarter int,
  year int,
  month_sin double,
  month_cos double,
  trend bigint,
  lag_1 double,
  lag_2 double,
  lag_3 double,
  lag_6 double,
  lag_12 double,
  rolling_mean_3 double,
  rolling_mean_12 double,
  rolling_std_3 double
)
STORED AS PARQUET
LOCATION '{s3_feat_path}'
"""
ddl_cursor.execute(create_feat_table)
print(f"✓ Table '{table_feat}' created")

In [None]:
# Row-count checks: one COUNT(*) per table, each reported as soon as it returns
query_agg = f"SELECT COUNT(*) as row_count FROM {database_name}.{table_agg}"
query_feat = f"SELECT COUNT(*) as row_count FROM {database_name}.{table_feat}"

df_agg_verify = pd.read_sql(query_agg, conn)
agg_rows = df_agg_verify['row_count'].values[0]
print(f"Aggregated table rows: {agg_rows}")

df_feat_verify = pd.read_sql(query_feat, conn)
feat_rows = df_feat_verify['row_count'].values[0]
print(f"Features table rows: {feat_rows}")

# Peek at a handful of aggregated rows
sample_query = f"SELECT ds, y FROM {database_name}.{table_agg} LIMIT 5"
df_sample = pd.read_sql(sample_query, conn)
print("\nSample data from aggregated table:")
print(df_sample)

---

## Part 2: SageMaker Feature Store Setup

Create feature groups for aggregated data and engineered features.

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup
# NOTE(review): the previous import of FeatureGroupInputFormat / FeatureGroupOutputFormat
# from sagemaker.feature_store.inputs was dropped: neither name is used in this
# notebook and they do not appear to be exported by that module, so the import
# would abort this cell before the clients below are created -- confirm against
# the installed SDK version.

# Runtime client (record-level put/get) and control-plane client (describe/list)
fs_client = boto3.client('sagemaker-featurestore-runtime', region_name=region)
sm_client = boto3.client('sagemaker', region_name=region)

# Account id of the caller, resolved through STS
account_id = boto3.client('sts').get_caller_identity()['Account']

print(f"Feature Store initialized for account {account_id}")
print(f"Region: {region}")

### Create Feature Group: Aggregated Time Series Data

In [None]:
# Prepare data for aggregated feature group
fg_agg_name = 'timeseries-aggregated-fg'

# Build a dedicated ingestion frame instead of overwriting df_agg['event_time']:
# the in-place conversion divided the column by 10**9 again on every re-run.
df_agg_fs = df_agg.copy()
# Feature definitions are inferred from pandas dtypes as String / Integral /
# Fractional, so the datetime column is stored as an ISO-8601 string; it also
# serves as the record identifier. Timestamps are treated as UTC, as the
# original epoch conversion did (assumes `ds` is tz-naive -- TODO confirm).
df_agg_fs['ds'] = df_agg['ds'].dt.strftime('%Y-%m-%dT%H:%M:%SZ').astype('string')
# Event time must be a String or Fractional feature: use Unix seconds as float.
agg_epoch_seconds = (df_agg['ds'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
df_agg_fs['event_time'] = agg_epoch_seconds.astype('float64')

# Create Feature Group
fg_agg = FeatureGroup(
    name=fg_agg_name,
    sagemaker_session=sess
)

print(f"Creating feature group: {fg_agg_name}")

try:
    # describe() raises ResourceNotFound when the group does not exist yet.
    fg_agg.describe()
    print(f"✓ Feature group '{fg_agg_name}' already exists")
except sess.sagemaker_client.exceptions.ResourceNotFound:
    # A group must be created (schema + stores) before records can be ingested.
    fg_agg.load_feature_definitions(data_frame=df_agg_fs)
    fg_agg.create(
        s3_uri=f's3://{bucket}/{s3_prefix}/feature-store',  # offline store location
        record_identifier_name='ds',
        event_time_feature_name='event_time',
        role_arn=role,
        enable_online_store=False,
    )

    # Creation is asynchronous: poll until the group leaves the Creating state.
    fg_agg_status = fg_agg.describe().get('FeatureGroupStatus')
    while fg_agg_status == 'Creating':
        time.sleep(5)
        fg_agg_status = fg_agg.describe().get('FeatureGroupStatus')
    if fg_agg_status != 'Created':
        raise RuntimeError(
            f"Feature group '{fg_agg_name}' creation ended with status {fg_agg_status}"
        )
    print(f"✓ Feature group is active")

    fg_agg.ingest(
        data_frame=df_agg_fs,
        max_workers=3,
        wait=True
    )
    print(f"✓ Feature group '{fg_agg_name}' created and data ingested")

### Create Feature Group: Engineered Features

In [None]:
# Prepare data for features feature group
fg_feat_name = 'timeseries-engineered-features-fg'

# Build a dedicated ingestion frame instead of overwriting df_feat['event_time']:
# the in-place conversion divided the column by 10**9 again on every re-run.
df_feat_fs = df_feat.copy()
# Feature definitions are inferred from pandas dtypes as String / Integral /
# Fractional, so the datetime column is stored as an ISO-8601 string; it also
# serves as the record identifier. Timestamps are treated as UTC, as the
# original epoch conversion did (assumes `ds` is tz-naive -- TODO confirm).
df_feat_fs['ds'] = df_feat['ds'].dt.strftime('%Y-%m-%dT%H:%M:%SZ').astype('string')
# Event time must be a String or Fractional feature: use Unix seconds as float.
feat_epoch_seconds = (df_feat['ds'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
df_feat_fs['event_time'] = feat_epoch_seconds.astype('float64')
# NOTE(review): the calendar columns may be int32 depending on the pandas
# version; widen every integer column to int64 so the dtype-to-feature-type
# inference sees a uniformly supported dtype -- confirm whether the installed
# SDK already accepts int32.
feat_int_cols = df_feat_fs.select_dtypes(include='integer').columns
df_feat_fs[feat_int_cols] = df_feat_fs[feat_int_cols].astype('int64')

# Create Feature Group
fg_feat = FeatureGroup(
    name=fg_feat_name,
    sagemaker_session=sess
)

print(f"Creating feature group: {fg_feat_name}")

try:
    # describe() raises ResourceNotFound when the group does not exist yet.
    fg_feat.describe()
    print(f"✓ Feature group '{fg_feat_name}' already exists")
except sess.sagemaker_client.exceptions.ResourceNotFound:
    # A group must be created (schema + stores) before records can be ingested.
    fg_feat.load_feature_definitions(data_frame=df_feat_fs)
    fg_feat.create(
        s3_uri=f's3://{bucket}/{s3_prefix}/feature-store',  # offline store location
        record_identifier_name='ds',
        event_time_feature_name='event_time',
        role_arn=role,
        enable_online_store=False,
    )

    # Creation is asynchronous: poll until the group leaves the Creating state.
    fg_feat_status = fg_feat.describe().get('FeatureGroupStatus')
    while fg_feat_status == 'Creating':
        time.sleep(5)
        fg_feat_status = fg_feat.describe().get('FeatureGroupStatus')
    if fg_feat_status != 'Created':
        raise RuntimeError(
            f"Feature group '{fg_feat_name}' creation ended with status {fg_feat_status}"
        )
    print(f"✓ Feature group is active")

    fg_feat.ingest(
        data_frame=df_feat_fs,
        max_workers=3,
        wait=True
    )
    print(f"✓ Feature group '{fg_feat_name}' created and data ingested")

### Verify Feature Store Setup

In [None]:
# List this notebook's feature groups. Filtering server-side with NameContains
# avoids missing them when the account holds more groups than MaxResults.
fgs = sm_client.list_feature_groups(
    NameContains='timeseries',
    MaxResults=10,
    SortBy='CreationTime',
    SortOrder='Descending'
)

print("Feature Groups in SageMaker Feature Store:")
print("=" * 60)
for fg in fgs.get('FeatureGroupSummaries', []):
    print(f"  • {fg['FeatureGroupName']}")
    # The status field is optional in the list response, so read it defensively.
    print(f"    Status: {fg.get('FeatureGroupStatus', 'N/A')}")
    print(f"    Created: {fg['CreationTime']}")
    print()

# Show feature group descriptions
print("\nFeature Group Details:")
print("=" * 60)

# Describe both groups with the same code path; remember failures so the final
# banner does not claim success when a group could not be described.
fg_descriptions = {}
for fg_name in (fg_agg_name, fg_feat_name):
    print(f"\n{fg_name}:")
    try:
        fg_desc = sm_client.describe_feature_group(FeatureGroupName=fg_name)
    except Exception as e:
        # Best-effort reporting: show the error and continue with the next group.
        print(f"  Error: {e}")
        continue
    fg_descriptions[fg_name] = fg_desc
    fg_feature_names = [f['FeatureName'] for f in fg_desc.get('FeatureDefinitions', [])]
    # The describe response carries no ingested-record count; report the fields
    # it does provide instead of a key that is never present.
    print(f"  Status: {fg_desc.get('FeatureGroupStatus', 'N/A')}")
    print(f"  Record identifier: {fg_desc.get('RecordIdentifierFeatureName', 'N/A')}")
    print(f"  Event time feature: {fg_desc.get('EventTimeFeatureName', 'N/A')}")
    print(f"  Feature count: {len(fg_feature_names)}")
    print(f"  Features: {fg_feature_names[:5]}...")

# Keep the per-group names the cell previously exposed (None when describe failed).
agg_desc = fg_descriptions.get(fg_agg_name)
feat_desc = fg_descriptions.get(fg_feat_name)

if len(fg_descriptions) == 2:
    print("\n✓ Feature Store setup complete!")
else:
    print("\n✗ Feature Store setup incomplete - see errors above")

---

### Using Feature Store with Models

Query the feature store to retrieve data for training your forecasting models.

In [None]:
import json

# Example: Query features from the Feature Store offline store for model training

print("Example: Retrieving training data from Feature Store")
print("=" * 60)

# athena_query() only builds a query handle: the SQL has to be submitted with
# run() and awaited with wait() before as_dataframe() can download the result.
# NOTE(review): offline-store records can take several minutes to become
# queryable after ingestion, so an immediate run may return few or no rows.
feat_query = fg_feat.athena_query()
feat_query.run(
    query_string=f'SELECT * FROM "{feat_query.table_name}"',
    output_location=f'{s3_staging_dir}/feature-store-queries/',
)
feat_query.wait()
df_training = feat_query.as_dataframe()

print(f"✓ Retrieved {len(df_training)} records from feature store")
print(f"  Shape: {df_training.shape}")
print(f"  Columns: {df_training.columns.tolist()}")

# Show sample
print("\nSample features:")
print(df_training.head())

# Export feature information for documentation
feature_store_info = {
    'database': database_name,
    'tables': {
        'aggregated': table_agg,
        'features': table_feat
    },
    'feature_groups': {
        'aggregated': fg_agg_name,
        'engineered': fg_feat_name
    },
    's3_paths': {
        'data_lake_prefix': s3_prefix,
        'aggregated_data': s3_agg_path,
        'feature_data': s3_feat_path,
        'staging_dir': s3_staging_dir
    },
    'docs': 'Use these resources to fetch training data for forecasting models'
}

print("\n✓ Feature Store Configuration Summary:")
print(json.dumps(feature_store_info, indent=2, default=str))