In [4]:
import boto3
import pandas as pd
import numpy as np
import io
import timeit

# Upload destination: target bucket and the object key used for the direct upload
bucket_name = "dataset-ingested"
s3_key_direct = "temp/direct_no_disk.parquet"

# Shared S3 client used by the upload helpers below
s3_client = boto3.client("s3")

# Generate a DataFrame in memory
def generate_random_dataframe(rows=1_500_000, str_len=40):
    """Build a one-column DataFrame of random alphanumeric strings.

    Args:
        rows: Number of strings (DataFrame rows) to generate. Zero or a
            negative value yields an empty DataFrame.
        str_len: Length of every generated string.

    Returns:
        A ``pandas.DataFrame`` with a single ``"random_string"`` column
        holding ``rows`` strings of ``str_len`` characters, each character
        drawn from ``a-z``, ``A-Z`` and ``0-9`` via NumPy's global random
        state.
    """
    print("Generating random DataFrame in memory...")
    # Built once; the alphabet does not change between rows.
    alphabet = np.array(
        list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    )

    if rows <= 0 or str_len == 0:
        # Nothing to sample: either no rows at all, or every string is empty.
        # (A zero-width string dtype also cannot be used for the view below.)
        strings = [""] * rows
    else:
        # Draw every character index in a single RNG call instead of one
        # call per row. The alphabet has 62 symbols, so uint8 indices are
        # enough and keep the index matrix small.
        picks = np.random.randint(
            0, alphabet.size, size=(rows, str_len), dtype=np.uint8
        )
        # alphabet[picks] is a contiguous (rows, str_len) array of 1-character
        # strings; viewing each row's buffer as one str_len-wide string joins
        # the characters without a Python-level loop.
        strings = alphabet[picks].view(f"U{str_len}").ravel().tolist()

    return pd.DataFrame({"random_string": strings})

# Upload DataFrame to S3 directly
def upload_df_to_s3(df, s3_bucket, s3_key):
    """Serialize ``df`` to Parquet in memory and put it to S3.

    The Parquet bytes are written to an in-memory buffer with the pyarrow
    engine and handed to the module-level ``s3_client``.

    Args:
        df: DataFrame to serialize.
        s3_bucket: Name of the destination bucket.
        s3_key: Object key to write within the bucket.
    """
    parquet_payload = io.BytesIO()
    df.to_parquet(parquet_payload, engine="pyarrow")
    # Rewind so put_object reads the payload from its first byte.
    parquet_payload.seek(0)

    s3_client.put_object(
        Bucket=s3_bucket,
        Key=s3_key,
        Body=parquet_payload,
    )
    print(f"Uploaded DataFrame directly to S3 as {s3_key}")

# Benchmark the direct upload
def benchmark_upload(df, s3_bucket, s3_key):
    """Time one in-memory Parquet serialization plus S3 upload of ``df``.

    The timed section covers both writing the Parquet bytes to an in-memory
    buffer and the ``put_object`` call on the module-level ``s3_client``.

    Args:
        df: DataFrame to serialize and upload.
        s3_bucket: Name of the destination bucket.
        s3_key: Object key to write within the bucket.

    Returns:
        Elapsed time in seconds for a single run, as reported by ``timeit``.
    """
    def _serialize_and_put():
        payload = io.BytesIO()
        df.to_parquet(payload, engine="pyarrow")
        payload.seek(0)  # rewind before handing the buffer to put_object
        s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body=payload)

    # Single execution: the upload is far too slow to repeat for averaging.
    timer = timeit.Timer(_serialize_and_put)
    elapsed_time = timer.timeit(number=1)

    print(f"Direct upload took {elapsed_time:.2f} seconds.")
    return elapsed_time

# Build the test DataFrame with the default size
df = generate_random_dataframe()

# Measure serialization + upload straight from memory (no temporary file)
time_direct = benchmark_upload(
    df,
    s3_bucket=bucket_name,
    s3_key=s3_key_direct,
)

# Report the measured time
print(f"Direct upload (no disk) time: {time_direct:.2f} seconds")

Generating random DataFrame in memory...
Direct upload took 4.62 seconds.
Direct upload (no disk) time: 4.62 seconds
