In [None]:
# For connecting to s3 bucket

import boto3
import pandas as pd
import os
from io import BytesIO
import dask.dataframe as dd
import s3fs
from dask import delayed

# Credentials are deliberately NOT written into the notebook. boto3 resolves
# them through its default credential chain (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared ~/.aws/credentials
# file, or an attached IAM role). The s3:// reads done through dask further
# down are given no explicit keys either, so resolving credentials from the
# environment keeps both access paths on the same identity.
s3 = boto3.client('s3')

# Bucket used by the cells below. Set the S3_BUCKET_NAME environment variable,
# or replace the placeholder default.
bucket_name = os.environ.get("S3_BUCKET_NAME", "<YOUR_BUCKET_NAME>")

In [None]:

# Lift pandas' display limits so wide frames print in full: for each of these
# options, None means "no fixed limit".
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [None]:
# List the contents of a folder

# Key prefix handed to the listing call below (placeholder — replace with a real
# prefix). It is used as an S3 `Prefix`, so the trailing "/" keeps the listing
# to keys under this folder.
folder_path = "<FOLDER_PATH>/"

def list_folder_contents(bucket, folder):
    """List every object stored under a key prefix.

    Args:
        bucket: Name of the S3 bucket to query.
        folder: Key prefix to list (passed to S3 as ``Prefix``).

    Returns:
        A pandas DataFrame with the columns ``"File Name"`` (object key) and
        ``"Size (Bytes)"``, one row per object. The frame is empty, but keeps
        both columns, when nothing matches or when the request fails.

    Notes:
        Errors are reported with ``print`` and turned into an empty frame
        rather than raised, so callers only need to check ``.empty``.
    """
    columns = ["File Name", "Size (Bytes)"]

    try:
        # A single list_objects_v2 call returns at most 1000 keys; the
        # paginator follows the continuation tokens so large folders are
        # listed completely instead of being silently truncated.
        paginator = s3.get_paginator('list_objects_v2')
        rows = [
            {"File Name": obj['Key'], "Size (Bytes)": obj['Size']}
            for page in paginator.paginate(Bucket=bucket, Prefix=folder)
            for obj in page.get('Contents', [])  # key is absent on empty pages
        ]
    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame(columns=columns)

    if not rows:
        print(f"No contents found in folder '{folder}'.")

    return pd.DataFrame(rows, columns=columns)

# Fetch the listing, then print either the full table (without the index
# column) or a short notice when nothing came back.
folder_contents_df = list_folder_contents(bucket_name, folder_path)

print(
    "Folder is empty."
    if folder_contents_df.empty
    else folder_contents_df.to_string(index=False)
)

In [None]:
# Read a file dynamically based on extension

def read_s3_file(bucket_name, s3_file_path):
    """Open an S3 object as a Dask DataFrame, choosing the reader by extension.

    Args:
        bucket_name: Name of the S3 bucket that holds the file.
        s3_file_path: Object key inside the bucket. The extension is matched
            case-insensitively; ``.csv``, ``.tsv``, ``.xlsx`` and ``.xls``
            are supported.

    Returns:
        A Dask DataFrame. CSV/TSV files are read with every column as a
        string (``dtype=str``). Excel files are read from the first sheet
        through ``pandas.read_excel`` without a ``dtype`` override.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    full_path = f's3://{bucket_name}/{s3_file_path}'

    # Match on a lower-cased copy so keys such as "DATA.CSV" or "Report.XLSX"
    # are accepted; the original key is still the one that gets read.
    extension_key = s3_file_path.lower()

    if extension_key.endswith('.csv'):
        return dd.read_csv(full_path, dtype=str, low_memory=False)

    if extension_key.endswith('.tsv'):
        return dd.read_csv(full_path, sep='\t', dtype=str, low_memory=False)

    if extension_key.endswith(('.xlsx', '.xls')):
        sheet_number = 0  # Default to the first tab
        # The Excel read goes through pandas, wrapped as a single delayed
        # task that becomes the only partition of the resulting frame.
        delayed_df = delayed(pd.read_excel)(full_path, sheet_name=sheet_number)
        return dd.from_delayed([delayed_df])

    raise ValueError(f"Unsupported file format: {s3_file_path!r}")

def process_data_with_inferred_dtypes(dask_df):
    """Cast every column to the dtype observed in a sample of the first rows.

    Args:
        dask_df: Dask DataFrame to cast. It is not modified.

    Returns:
        A new Dask DataFrame whose columns are cast to the dtypes found in
        ``dask_df.head(n=1000)``.

    Notes:
        The dtypes are taken from the sampled frame as-is; no values are
        parsed. For a frame that was read with ``dtype=str`` the sampled
        dtypes equal the existing ones, so the cast leaves the data unchanged.
    """
    sample_df = dask_df.head(n=1000, compute=True)
    inferred_dtypes = sample_df.dtypes.to_dict()

    # astype with a column -> dtype mapping returns a new frame. Assigning the
    # columns one by one would rewrite the caller's frame in place, so
    # re-running the cell would operate on already-modified input.
    return dask_df.astype(inferred_dtypes)

# Key of the object to load (placeholder — replace with a real key in the bucket)
s3_file_path = '<S3_FILE_PATH>'

# Build the Dask frame for that key, then cast its columns to the dtypes
# observed in a sample of its first rows.
s3_file_df = read_s3_file(bucket_name, s3_file_path)
final_dask_df = process_data_with_inferred_dtypes(s3_file_df)

# Last expression of the cell: the first rows are shown as the cell output.
final_dask_df.head()

In [None]:
# For uploading files

# The two prefixes are concatenated (base_folder + new_folder) to form the
# target folder, so each one ends with "/".
base_folder = "<BASE_FOLDER>/"  # Replace with your base folder
new_folder = "<NEW_FOLDER>/"  # Replace with your new folder

def create_folder(bucket, folder_path):
    """Create a "folder" in S3 by writing an empty ``.keep`` object into it.

    Args:
        bucket: Name of the target S3 bucket.
        folder_path: Key prefix of the folder. A missing trailing ``/`` is
            added, so ``"a/b"`` and ``"a/b/"`` both produce ``a/b/.keep``.
            An empty string places ``.keep`` at the bucket root.
    """
    # Without the separator the marker would land next to the folder
    # ("a/b.keep") instead of inside it ("a/b/.keep").
    if folder_path and not folder_path.endswith("/"):
        folder_path += "/"

    placeholder_key = folder_path + ".keep"
    s3.put_object(Bucket=bucket, Key=placeholder_key, Body=b"")
    print(f"{folder_path} created")

def upload_file(bucket, folder_path, local_file_path):
    """Upload a local file into an S3 folder, keeping its base name.

    Args:
        bucket: Name of the target S3 bucket.
        folder_path: Key prefix of the destination folder. A missing trailing
            ``/`` is added; an empty string uploads to the bucket root.
        local_file_path: Path of the file on the local machine.

    Raises:
        FileNotFoundError: If ``local_file_path`` is not an existing file.
    """
    # Fail before any network call when the source file is missing.
    if not os.path.isfile(local_file_path):
        raise FileNotFoundError(f"File not found: {local_file_path}")

    # Without the separator the file name would be glued onto the folder name
    # ("folderfile.csv") instead of being placed inside it.
    if folder_path and not folder_path.endswith("/"):
        folder_path += "/"

    file_name = os.path.basename(local_file_path)

    s3.upload_file(local_file_path, bucket, folder_path + file_name)
    print(f"{file_name} uploaded to {folder_path}")

# Create the folder's placeholder object, then upload the local file into the
# same folder. Any exception from either step is caught and reported as a
# printed message, so a failed step does not show a traceback.
try:
    create_folder(bucket_name, base_folder + new_folder)
    
    local_file_path = "<LOCAL_FILE_PATH>"  # Replace with your local file path
    
    upload_file(bucket_name, base_folder + new_folder, local_file_path)

except Exception as e:
    print(f"Error: {e}")

In [None]:
# For deleting folder

# Prefixes of the folder to delete; they are concatenated
# (base_folder + new_folder) when passed to delete_folder below.
# NOTE(review): same placeholders as the upload cell — presumably repeated so
# this cell can be run without it; confirm.
base_folder = "<BASE_FOLDER>/" 
new_folder = "<NEW_FOLDER>/" 

def delete_folder(bucket, folder_path):
    """Delete every object stored under a folder prefix.

    Args:
        bucket: Name of the S3 bucket.
        folder_path: Key prefix of the folder. A missing trailing ``/`` is
            added so that ``"data"`` cannot also match ``"data2/..."`` or
            ``"database.csv"``.

    Warning:
        An empty ``folder_path`` matches every key in the bucket.
    """
    # A bare prefix is a plain string match in S3; anchoring it with "/"
    # limits the deletion to keys inside the folder.
    if folder_path and not folder_path.endswith("/"):
        folder_path += "/"

    # list_objects_v2 returns at most 1000 keys per call; paginate so folders
    # larger than one page are removed completely.
    paginator = s3.get_paginator('list_objects_v2')
    deleted_count = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=folder_path):
        for obj in page.get('Contents', []):  # key is absent on empty pages
            s3.delete_object(Bucket=bucket, Key=obj['Key'])
            print(f"Deleted: {obj['Key']}")
            deleted_count += 1

    if deleted_count == 0:
        # Nothing matched: do not claim that a folder was deleted.
        print(f"No objects found in {folder_path}")
        return

    print(f"Folder '{folder_path}' deleted.")

# Delete everything under base_folder + new_folder. Any exception is caught
# and reported as a printed message, so a failure does not show a traceback.
try:
    delete_folder(bucket_name, base_folder + new_folder)

except Exception as e:
    print(f"Error: {e}")