In [1]:
import boto3
import pandas as pd
from io import StringIO
import os

def list_files_in_s3_folder(bucket_name, folder_name):
    """Return the keys of all ``.csv`` objects found under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket to search.
        folder_name: Value passed to S3 as the listing ``Prefix``.

    Returns:
        list[str]: Keys ending in ``.csv``, in the order S3 lists them.
        Empty when no listed key matches.
    """
    s3_client = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page; otherwise files beyond the first page would be silently ignored.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)

    return [
        item['Key']
        for page in pages
        # A page with no matching objects carries no 'Contents' entry.
        for item in page.get('Contents', [])
        if item['Key'].endswith('.csv')
    ]

def read_and_join_csv_files(bucket_name, folder_name):
    """Load every CSV under an S3 prefix and outer-join them on 'date'.

    Args:
        bucket_name: S3 bucket holding the CSV files.
        folder_name: Prefix handed to ``list_files_in_s3_folder`` to find
            the ``.csv`` keys.

    Returns:
        The merged DataFrame, or None (after printing a notice) when no
        CSV keys are found.
    """
    s3 = boto3.client('s3')
    csv_keys = list_files_in_s3_folder(bucket_name, folder_name)

    # Guard clause: nothing to read, nothing to merge.
    if not csv_keys:
        print("No CSV files found in the specified folder.")
        return None

    def _load(key):
        # Pull the whole object into memory and parse its UTF-8 text as CSV.
        raw_bytes = s3.get_object(Bucket=bucket_name, Key=key)['Body'].read()
        return pd.read_csv(StringIO(raw_bytes.decode('utf-8')))

    frames = [_load(key) for key in csv_keys]

    # Fold the frames left to right, keeping every date seen in any file.
    result, *remaining = frames
    for frame in remaining:
        result = pd.merge(result, frame, on='date', how='outer')

    return result

# Example usage

# Define the inputs in this cell so it runs on a fresh kernel instead of
# depending on names assigned by a later cell.
bucket_name = 'sagemaker-bucket-ds'
folder_name = '01_STOCKS/DATA/PREPROCESSED/'

merged_df = read_and_join_csv_files(bucket_name, folder_name)


In [5]:
import boto3
import pandas as pd
from io import StringIO
import os

def list_files_in_s3_folder(bucket_name, folder_name):
    """Return the keys of all ``.csv`` objects found under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket to search.
        folder_name: Value passed to S3 as the listing ``Prefix``.

    Returns:
        list[str]: Keys ending in ``.csv``, in the order S3 lists them.
        Empty when no listed key matches.
    """
    s3_client = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page; otherwise files beyond the first page would be silently ignored.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)

    return [
        item['Key']
        for page in pages
        # A page with no matching objects carries no 'Contents' entry.
        for item in page.get('Contents', [])
        if item['Key'].endswith('.csv')
    ]

def _read_csv_from_s3(s3_client, bucket_name, file_key):
    """Download one S3 object and parse its UTF-8 text as a CSV DataFrame."""
    s3_object = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    file_content = s3_object['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(file_content))


def _outer_join_on_date(frames):
    """Outer-join a non-empty list of DataFrames on 'date', left to right."""
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='date', how='outer')
    return merged


def read_and_join_csv_files(input_bucket_name, input_folder_name, output_bucket_name, output_file_name):
    """Join every CSV under an S3 prefix on 'date' and upload the result.

    Args:
        input_bucket_name: Bucket the source CSV files are read from.
        input_folder_name: Prefix handed to ``list_files_in_s3_folder`` to
            find the ``.csv`` keys.
        output_bucket_name: Bucket the joined CSV is written to.
        output_file_name: Object key of the joined CSV.

    Returns:
        None. When no CSV keys are found a notice is printed and nothing is
        uploaded; otherwise the upload location is printed.
    """
    s3_client = boto3.client('s3')
    files = list_files_in_s3_folder(input_bucket_name, input_folder_name)

    if not files:
        print("No CSV files found in the specified folder.")
        return None

    # `files` is non-empty here, so there is always at least one frame to join.
    frames = [_read_csv_from_s3(s3_client, input_bucket_name, key) for key in files]
    merged_df = _outer_join_on_date(frames)

    # Serialise without the index and hand S3 explicit UTF-8 bytes, matching
    # the encoding used when the inputs were decoded.
    csv_bytes = merged_df.to_csv(index=False).encode('utf-8')
    s3_client.put_object(Bucket=output_bucket_name, Key=output_file_name, Body=csv_bytes)

    print(f"Merged file uploaded to s3://{output_bucket_name}/{output_file_name}")

# Example usage

# Source and destination live in the same bucket.
bucket_name = input_bucket_name = output_bucket_name = 'sagemaker-bucket-ds'

# Prefix of the CSVs to join; keep the trailing slash when it names a folder.
folder_name = input_folder_name = '01_STOCKS/DATA/PREPROCESSED/'

# Object key the joined dataset is written to.
output_file_name = '01_STOCKS/DATA/JOINED/joined_dataset.csv'

read_and_join_csv_files(
    input_bucket_name=input_bucket_name,
    input_folder_name=input_folder_name,
    output_bucket_name=output_bucket_name,
    output_file_name=output_file_name,
)

Merged file uploaded to s3://sagemaker-bucket-ds/01_STOCKS/DATA/JOINED/joined_dataset.csv
