## 1. Configuration

Set up the S3 bucket paths and test parameters used by the rest of the notebook.

In [None]:
from pyspark.sql import functions as F

# Base S3 path (same as production); the connection test writes into its own
# "_connection_test" sub-folder beneath it.
s3_base_path = "s3://dev-cf-databricks-catalog-bucket/dev/dashboard/compliance"
s3_test_path = s3_base_path + "/_connection_test"

# Emit the whole banner with a single print instead of one call per line.
banner_rule = "=" * 80
print(
    "\n".join(
        [
            banner_rule,
            "S3 CONNECTION TEST",
            banner_rule,
            f"S3 Base Path:  {s3_base_path}",
            f"Test Path:     {s3_test_path}",
            banner_rule,
        ]
    )
)

## 2. Test 1: List S3 Bucket Contents

Verify read access to S3 bucket.

In [None]:
# TEST 1: list the base path to confirm the cluster has read access.
print("\n" + "=" * 80)
print("TEST 1: LIST S3 BUCKET CONTENTS (READ ACCESS)")
print("=" * 80)

try:
    files = dbutils.fs.ls(s3_base_path)

    print(f"‚úÖ SUCCESS: Listed {len(files)} items in {s3_base_path}")
    print("\nContents:")

    # Only the first few entries are shown to keep the output short.
    preview_limit = 10
    for entry in files[:preview_limit]:
        if entry.isDir():
            # Directories are reported with a size of zero.
            file_type, size_mb = "DIR ", 0
        else:
            file_type, size_mb = "FILE", entry.size / (1024 * 1024)
        print(f"  [{file_type}] {entry.name:50s} ({size_mb:.2f} MB)")

    hidden_count = len(files) - preview_limit
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more items")

except Exception as e:
    # Report the failure, then re-raise so the notebook run stops here.
    print("‚ùå FAILED: Cannot list S3 bucket")
    print(f"Error: {e!s}")
    raise

## 3. Test 2: Write Test File to S3

Verify write access and CSV export functionality.

In [None]:
# TEST 2: write a tiny partitioned, gzip-compressed CSV to confirm write access.
print("\n" + "=" * 80)
print("TEST 2: WRITE TEST CSV TO S3 (WRITE ACCESS)")
print("=" * 80)

try:
    # Three hand-written rows spread over two companies.
    test_columns = [
        "company_id",
        "date",
        "account_id",
        "region_id",
        "score",
        "total_rules",
        "total_passed",
    ]
    test_data = [
        ("test_company_001", "2024-12-26", "123456789012", "us-east-1", 85.5, 120, 103),
        ("test_company_001", "2024-12-26", "123456789012", "us-west-2", 90.0, 120, 108),
        ("test_company_002", "2024-12-26", "987654321098", "us-east-1", 75.0, 100, 75),
    ]
    test_df = spark.createDataFrame(test_data, test_columns)

    print(f"Created test DataFrame with {test_df.count()} rows")
    print("\nTest data:")
    test_df.show(truncate=False)

    # Configure the writer with the same options as production, then write.
    partitioned_writer = (
        test_df.repartition("company_id")
        .write.mode("overwrite")
        .option("header", "true")
        .option("compression", "gzip")
        .option("maxRecordsPerFile", 200000)
        .partitionBy("company_id", "date")
    )
    partitioned_writer.csv(s3_test_path)

    print(f"‚úÖ SUCCESS: Test CSV written to {s3_test_path}")

except Exception as e:
    # Report the failure, then re-raise so the notebook run stops here.
    print("‚ùå FAILED: Cannot write to S3")
    print(f"Error: {e!s}")
    raise

## 4. Test 3: Read Back Test File

Verify written data can be read back correctly.

In [None]:
# TEST 3: read the test CSV back from S3 and check that every written row is
# returned.
print("\n" + "="*80)
print("TEST 3: READ BACK TEST FILE (VERIFY WRITE)")
print("="*80)

# Number of rows the write test (TEST 2) puts under s3_test_path.
expected_row_count = 3

try:
    # Read back the test file
    read_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(s3_test_path)
    )

    row_count = read_df.count()
    print(f"‚úÖ SUCCESS: Read {row_count} rows from S3")

    print("\nRead data:")
    read_df.orderBy("company_id", "region_id").show(truncate=False)

except Exception as e:
    print(f"‚ùå FAILED: Cannot read from S3")
    print(f"Error: {str(e)}")
    raise

# Verify data integrity. This runs outside the try block so that a mismatch is
# not reported as an S3 read failure, and it raises instead of only warning so
# that later cells cannot report success after a failed verification.
if row_count == expected_row_count:
    print("‚úÖ Data integrity verified: Row count matches")
else:
    print(f"‚ùå FAILED: Expected {expected_row_count} rows, got {row_count}")
    raise RuntimeError(
        f"Row count mismatch reading {s3_test_path}: "
        f"expected {expected_row_count}, got {row_count}"
    )

## 5. Test 4: Verify S3 Folder Structure

Check that partitioning creates the expected folder hierarchy.

In [None]:
# TEST 4: walk the test output and check the company_id=/date=/*.csv.gz layout
# produced by partitionBy("company_id", "date").
print("\n" + "="*80)
print("TEST 4: VERIFY S3 FOLDER STRUCTURE (PARTITIONING)")
print("="*80)

try:
    # Keep only the partition folders. The output directory can also contain
    # non-partition entries (Spark writers typically leave marker files such as
    # _SUCCESS), which must not be counted as company folders.
    company_folders = [
        entry for entry in dbutils.fs.ls(s3_test_path)
        if entry.name.startswith("company_id=")
    ]
    if not company_folders:
        raise RuntimeError(f"No company_id= folders found under {s3_test_path}")

    print(f"‚úÖ Found {len(company_folders)} company folders:")

    # Collect every structural problem so that all of them are reported at once.
    problems = []
    for folder in company_folders:
        print(f"  - {folder.name}")

        # List date folders within each company
        date_folders = [
            entry for entry in dbutils.fs.ls(folder.path)
            if entry.name.startswith("date=")
        ]
        if not date_folders:
            problems.append(f"no date= folders in {folder.path}")

        for date_folder in date_folders:
            print(f"    - {date_folder.name}")

            # Count the gzip-compressed CSV part files
            csv_count = sum(
                1 for entry in dbutils.fs.ls(date_folder.path)
                if entry.name.endswith(".csv.gz")
            )
            print(f"      ‚Üí {csv_count} CSV file(s)")
            if csv_count == 0:
                problems.append(f"no .csv.gz files in {date_folder.path}")

    # Only claim success when every level of the hierarchy was actually found.
    if problems:
        raise RuntimeError("Unexpected folder structure: " + "; ".join(problems))

    print("\n‚úÖ SUCCESS: Folder structure matches expected pattern:")
    print("   s3://.../company_id=xxx/date=YYYY-MM-DD/*.csv.gz")

except Exception as e:
    print(f"‚ùå FAILED: Folder structure verification failed")
    print(f"Error: {str(e)}")
    raise

## 6. Test 5: Test Parallel Write with Repartition

Verify that writing a DataFrame repartitioned by `company_id` produces a separate partition folder for each company.

In [None]:
# TEST 5: write a larger multi-company DataFrame after repartitioning by
# company_id, then check that one company_id= folder exists per company.
print("\n" + "=" * 80)
print("TEST 5: PARALLEL WRITE VERIFICATION (REPARTITION)")
print("=" * 80)

try:
    # 5 companies x 10 rows each, generated in company-major order.
    large_test_data = [
        (
            f"test_company_{i:03d}",
            "2024-12-26",
            f"12345678{i:04d}",
            f"region-{j}",
            85.0 + i,
            100,
            85 + i,
        )
        for i in range(1, 6)
        for j in range(1, 11)
    ]
    large_columns = [
        "company_id",
        "date",
        "account_id",
        "region_id",
        "score",
        "total_rules",
        "total_passed",
    ]
    large_df = spark.createDataFrame(large_test_data, large_columns)

    total_rows = large_df.count()
    company_count = large_df.select("company_id").distinct().count()
    print(f"Created DataFrame: {total_rows} rows, {company_count} companies")

    # Repartition by company_id (Serverless auto-scales partitions)
    repartitioned_df = large_df.repartition("company_id")
    print("Repartitioned by company_id (Databricks Serverless auto-scales)")

    # Destination for this test, next to the connection-test folder.
    s3_parallel_test_path = f"{s3_base_path}/_parallel_test"

    parallel_writer = (
        repartitioned_df.write.mode("overwrite")
        .option("header", "true")
        .option("compression", "gzip")
        .partitionBy("company_id", "date")
    )
    parallel_writer.csv(s3_parallel_test_path)

    print(f"‚úÖ SUCCESS: Parallel write completed to {s3_parallel_test_path}")

    # Count the company_id= entries at the top level of the output.
    company_folders = dbutils.fs.ls(s3_parallel_test_path)
    company_folder_count = sum(
        1 for entry in company_folders if entry.name.startswith("company_id=")
    )

    print(f"\n‚úÖ Verification: Created {company_folder_count} company folders")
    if company_folder_count != company_count:
        print(f"‚ö†Ô∏è  WARNING: Expected {company_count} folders, found {company_folder_count}")
    else:
        print("‚úÖ PASSED: Each company has separate S3 folder (parallel write confirmed)")

except Exception as e:
    # Report the failure, then re-raise so the notebook run stops here.
    print("‚ùå FAILED: Parallel write test failed")
    print(f"Error: {e!s}")
    raise


## 7. Cleanup: Remove Test Files

Clean up test files from S3 bucket.

In [None]:
# CLEANUP: remove both test folders from S3. Each path is handled on its own so
# that a failure on one does not leave the other behind. Cleanup is best-effort
# and never raises.
print("\n" + "="*80)
print("CLEANUP: REMOVING TEST FILES")
print("="*80)

cleanup_paths = [
    s3_test_path,
    f"{s3_base_path}/_parallel_test",
]
failed_cleanup_paths = []

for cleanup_path in cleanup_paths:
    try:
        removed = dbutils.fs.rm(cleanup_path, recurse=True)
    except Exception as e:
        failed_cleanup_paths.append(cleanup_path)
        print(f"‚ö†Ô∏è  WARNING: Cleanup failed for {cleanup_path} (files may still exist)")
        print(f"Error: {str(e)}")
        continue

    # dbutils.fs.rm returns a boolean; an explicit False means nothing was
    # deleted, which is reported separately instead of claiming a removal.
    if removed is False:
        print(f"Nothing to remove: {cleanup_path}")
    else:
        print(f"‚úÖ Removed: {cleanup_path}")

if failed_cleanup_paths:
    print(f"\n‚ö†Ô∏è  WARNING: Cleanup incomplete for {len(failed_cleanup_paths)} path(s)")
else:
    print("\n‚úÖ CLEANUP COMPLETE")

## 8. Summary Report

Final summary of all S3 connection tests.

In [None]:
# Final report. NOTE: this cell prints a fixed summary; it does not inspect the
# outcome of the earlier cells.
summary_rule = "=" * 80

completed_tests = (
    "TEST 1: List S3 bucket contents (READ)",
    "TEST 2: Write CSV to S3 (WRITE)",
    "TEST 3: Read back CSV from S3 (VERIFY)",
    "TEST 4: Verify folder structure (PARTITIONING)",
    "TEST 5: Parallel write with repartition (PERFORMANCE)",
)
verified_capabilities = (
    "Read access",
    "Write access",
    "Compression (gzip)",
    "Partitioning (company_id/date)",
    "Parallel writes (.repartition)",
)

print("\n" + summary_rule)
print("S3 CONNECTION TEST SUMMARY")
print(summary_rule)
for test_label in completed_tests:
    print("‚úÖ " + test_label)
print(summary_rule)

print("\nüéâ ALL TESTS PASSED!")
print("\nS3 connection is working correctly:")
for capability in verified_capabilities:
    print("  - " + capability + ": ‚úÖ")

print("\n‚úÖ Ready for production deployment!")
print(summary_rule)