In [10]:
import pandas as pd
import boto3
import os
from io import StringIO

def preprocess_stock_data_from_s3(input_bucket_name, input_file_name, output_bucket_name, output_file_name):
    """Download a stock CSV from S3, normalize its columns, and upload the result.

    The CSV is read from ``input_bucket_name``/``input_file_name``, its
    'Data' column is parsed as datetimes, all column names are lower-cased,
    the 'data' column is renamed to 'date', and every other column is
    prefixed with the ticker. The ticker is the part of the input file's
    base name that precedes the first underscore. The resulting CSV is
    written to ``output_bucket_name``/``output_file_name`` and a
    confirmation line is printed.

    Args:
        input_bucket_name: Bucket holding the raw CSV.
        input_file_name: Object key of the raw CSV.
        output_bucket_name: Bucket that receives the preprocessed CSV.
        output_file_name: Object key for the preprocessed CSV.
    """
    s3_client = boto3.client('s3')

    # Fetch the object and decode its payload as UTF-8 text.
    raw_text = (
        s3_client.get_object(Bucket=input_bucket_name, Key=input_file_name)['Body']
        .read()
        .decode('utf-8')
    )
    frame = pd.read_csv(StringIO(raw_text))

    # Parse the 'Data' column before any renaming takes place.
    frame['Data'] = pd.to_datetime(frame['Data'])

    # Everything before the first underscore of the base name is the ticker.
    ticker, _, _ = os.path.basename(input_file_name).partition('_')

    # Lower-case the headers, then rename: 'data' -> 'date', others get the ticker prefix.
    renamed = []
    for col in frame.columns.str.lower():
        if col == 'data':
            renamed.append('date')
        else:
            renamed.append(f"{ticker}_{col}")
    frame.columns = renamed

    # With no target given, to_csv returns the CSV text directly.
    csv_text = frame.to_csv(index=False)
    s3_client.put_object(Bucket=output_bucket_name, Key=output_file_name, Body=csv_text)

    print(f"Preprocessed file uploaded to s3://{output_bucket_name}/{output_file_name}")


def list_files_in_s3_folder(bucket_name, folder_name):
    """Return the base names of all objects stored under an S3 key prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        folder_name: Key prefix, passed to S3 unchanged as ``Prefix``.

    Returns:
        A list with the last path component of every matching key. Keys
        ending in '/' (folder placeholder objects) are skipped. The list is
        empty when nothing matches the prefix.
    """
    s3_client = boto3.client('s3')

    # A single list_objects_v2 call returns only one page of results (up to
    # 1000 keys), so a larger folder would be silently truncated. The
    # paginator follows the continuation tokens until every key is seen.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)

    files = []
    for page in pages:
        # Pages without matches carry no 'Contents' entry at all.
        for item in page.get('Contents', []):
            key = item['Key']
            if not key.endswith('/'):
                files.append(os.path.basename(key))
    return files
        
# Example usage: preprocess every raw stock file found in the bucket and
# store each result under the same file name in the preprocessed folder.
bucket_name = 'sagemaker-bucket-ds'
input_folder_name = '01_STOCKS/DATA/RAW'
output_folder_name = '01_STOCKS/DATA/PREPROCESSED'

file_list = list_files_in_s3_folder(bucket_name, input_folder_name)

for f in file_list:
    # Source and destination share one bucket; only the folder differs.
    preprocess_stock_data_from_s3(
        bucket_name,
        f"{input_folder_name}/{f}",
        bucket_name,
        f"{output_folder_name}/{f}",
    )

Preprocessed file uploaded to s3://sagemaker-bucket-ds/01_STOCKS/DATA/PREPROCESSED/^spx_20000101_20231231_d.csv
Preprocessed file uploaded to s3://sagemaker-bucket-ds/01_STOCKS/DATA/PREPROCESSED/aapl.us_20000101_20231231_d.csv
Preprocessed file uploaded to s3://sagemaker-bucket-ds/01_STOCKS/DATA/PREPROCESSED/mcd.us_20000101_20231231_d.csv
Preprocessed file uploaded to s3://sagemaker-bucket-ds/01_STOCKS/DATA/PREPROCESSED/msft.us_20000101_20231231_d.csv
