Step 1: Create an IAM Role with an Inline Policy for S3 Access


In [0]:
import os

# Notebook configuration.
# Secrets are read from environment variables when they are set, so real keys
# never have to be typed into (and saved with) the notebook. If a variable is
# not set, the original placeholder string is used and must be edited by hand.
# Note: an environment variable, when present, takes precedence over the
# placeholder literal below.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', 'replace with your access key')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'replace with your secret key')
iam_role_for_s3_access = 'ec2 role name that u want to create to access bucket'
policy_name='policy name'
s3_bucket_name = 'bucket name that u need to acess'
workspace_id = 'workspaceid'
#workspace_id2 = ''
aws_account_id = 'AWS acc id'

databricks_role_name = 'databricks workspace deployment role' #databricks workspace deployment role
DATABRICKS_INSTANCE = os.environ.get('DATABRICKS_HOST', "databricks workspace url note: remove https:// ")
DATABRICKS_TOKEN = os.environ.get('DATABRICKS_TOKEN', "databricks token")

# The request URL is later built as f'https://{DATABRICKS_INSTANCE}/api/...',
# so the value must be a bare host name: drop a leading scheme and a trailing
# slash if the user (or DATABRICKS_HOST) supplied them.
if DATABRICKS_INSTANCE.startswith('https://'):
    DATABRICKS_INSTANCE = DATABRICKS_INSTANCE[len('https://'):]
DATABRICKS_INSTANCE = DATABRICKS_INSTANCE.rstrip('/')
     

In [0]:
import boto3
import json
import requests

In [0]:
def create_iam_client(aws_access_key_id, aws_secret_access_key):
    """Build a boto3 IAM client authenticated with the given key pair.

    Args:
        aws_access_key_id: AWS access key ID passed through to boto3.
        aws_secret_access_key: AWS secret access key passed through to boto3.

    Returns:
        Whatever ``boto3.client('iam', ...)`` returns for these credentials.
    """
    credentials = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key,
    }
    return boto3.client('iam', **credentials)

def create_s3_client(aws_access_key_id, aws_secret_access_key):
    """Build a boto3 S3 client authenticated with the given key pair.

    Args:
        aws_access_key_id: AWS access key ID passed through to boto3.
        aws_secret_access_key: AWS secret access key passed through to boto3.

    Returns:
        Whatever ``boto3.client('s3', ...)`` returns for these credentials.
    """
    credentials = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key,
    }
    return boto3.client('s3', **credentials)

Step 1: Create an IAM Role with an Inline Policy for S3 Access

In [0]:
def setup_iam_s3_access(aws_access_key_id, aws_secret_access_key, iam_role_for_s3_access, policy_name, s3_bucket_name):
    """Ensure the S3-access IAM role exists and carries the inline S3 policy.

    If the role is missing it is created with a trust policy that lets
    ``ec2.amazonaws.com`` assume it. An existing role is left as is (its trust
    policy is not modified here). In both cases the inline policy
    ``policy_name`` is then written with ``put_role_policy``.

    Args:
        aws_access_key_id: AWS access key ID used to build the IAM client.
        aws_secret_access_key: AWS secret access key used to build the IAM client.
        iam_role_for_s3_access: Name of the role to look up or create.
        policy_name: Name of the inline policy written to the role.
        s3_bucket_name: Bucket the inline policy grants access to.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Build the client exactly once, and before the try block: the except
    # clause below evaluates ``iam_client.exceptions``, so the name must be
    # bound even if the first API call is what fails.
    iam_client = create_iam_client(aws_access_key_id, aws_secret_access_key)

    # Step 1: Create IAM role for EC2
    assume_role_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {
                    "Service": "ec2.amazonaws.com"
                },
                "Action": "sts:AssumeRole"
            }
        ]
    }
    # Check if the role already exists
    try:
        iam_client.get_role(RoleName=iam_role_for_s3_access)
        print(f"Role {iam_role_for_s3_access} already exists.")
    except iam_client.exceptions.NoSuchEntityException:
        # Create the role if it does not exist
        create_role_response = iam_client.create_role(
            RoleName=iam_role_for_s3_access,
            AssumeRolePolicyDocument=json.dumps(assume_role_policy)
        )
        role_arn = create_role_response['Role']['Arn']
        print(f"Role ARN: {role_arn}")

    # Step 2: Create inline policy for S3 access
    s3_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                # Bucket-level permission: listing targets the bucket ARN itself.
                "Effect": "Allow",
                "Action": ["s3:ListBucket"],
                "Resource": [f"arn:aws:s3:::{s3_bucket_name}"]
            },
            {
                # Object-level permissions: these target the objects ("/*").
                "Effect": "Allow",
                "Action": ["s3:PutObject", "s3:GetObject", "s3:DeleteObject", "s3:PutObjectAcl"],
                "Resource": [f"arn:aws:s3:::{s3_bucket_name}/*"]
            }
        ]
    }
    # Add the inline policy to the role
    iam_client.put_role_policy(
        RoleName=iam_role_for_s3_access,
        PolicyName=policy_name,
        PolicyDocument=json.dumps(s3_policy)
    )
    print("S3 access policy added to role.")

Step 2: Enable Policy for Serverless Resources


In [0]:
def update_trust_policy(iam_role_for_s3_access, workspace_id):
    """Add the serverless trust statement to the role's trust policy if absent.

    The statement allows the role
    ``arn:aws:iam::790110701330:role/serverless-customer-resource-role`` to
    assume ``iam_role_for_s3_access`` when the external ID equals
    ``databricks-serverless-<workspace_id>``.

    Note: the IAM client is built from the notebook-level variables
    ``aws_access_key_id`` and ``aws_secret_access_key``; they are not
    parameters of this function and must be defined before it is called.

    Args:
        iam_role_for_s3_access: Name of the role whose trust policy is updated.
        workspace_id: Workspace ID embedded in the ``sts:ExternalId`` condition.

    Returns:
        None. Progress is reported with ``print``.
    """
    trust_policy_extension = {
        "Effect": "Allow",
        "Principal": {
            "AWS": "arn:aws:iam::790110701330:role/serverless-customer-resource-role"
        },
        "Action": "sts:AssumeRole",
        "Condition": {
            "StringEquals": {
                "sts:ExternalId": f"databricks-serverless-{workspace_id}"
            }
        }
    }
    # Get existing trust policy
    iam_client = create_iam_client(aws_access_key_id, aws_secret_access_key)
    current_trust_policy = iam_client.get_role(RoleName=iam_role_for_s3_access)['Role']['AssumeRolePolicyDocument']

    # A policy's "Statement" may be a single object instead of a list (and may
    # be absent); normalise to a list so the membership test and the append
    # below work in every case.
    statements = current_trust_policy.get('Statement', [])
    if isinstance(statements, dict):
        statements = [statements]

    if trust_policy_extension in statements:
        print("Trust policy extension already exists.")
        return

    # Append new trust policy
    current_trust_policy['Statement'] = list(statements) + [trust_policy_extension]
    # Update the role's trust policy
    iam_client.update_assume_role_policy(
        RoleName=iam_role_for_s3_access,
        PolicyDocument=json.dumps(current_trust_policy)
    )
    print("Trust policy updated for serverless resources.")

Step 3: Create the S3 Bucket Policy


In [0]:
def update_s3_bucket_policy(aws_account_id, iam_role_for_s3_access, s3_bucket_name):
    """Attach a bucket policy granting the S3-access role access to the bucket.

    If the bucket already has a policy it is left untouched and the function
    returns early. The new policy is written only when the lookup reports
    ``NoSuchBucketPolicy``; any other lookup error is re-raised rather than
    being treated as "no policy", so an unreadable existing policy is never
    overwritten.

    Note: the S3 client is built from the notebook-level variables
    ``aws_access_key_id`` and ``aws_secret_access_key``; they are not
    parameters of this function and must be defined before it is called.

    Args:
        aws_account_id: AWS account ID used to build the role ARN.
        iam_role_for_s3_access: Name of the role named as principal.
        s3_bucket_name: Bucket whose policy is checked and, if absent, set.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        The client's ``ClientError`` when reading the existing policy fails
        for a reason other than ``NoSuchBucketPolicy``.
    """
    role_arn = f"arn:aws:iam::{aws_account_id}:role/{iam_role_for_s3_access}"
    bucket_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "AllowS3Read",
                "Effect": "Allow",
                "Principal": {
                    "AWS": role_arn
                },
                "Action": ["s3:GetBucketLocation", "s3:ListBucket"],
                "Resource": f"arn:aws:s3:::{s3_bucket_name}"
            },
            {
                "Sid": "AllowS3RW",
                "Effect": "Allow",
                "Principal": {
                    "AWS": role_arn
                },
                "Action": ["s3:PutObject", "s3:GetObject", "s3:DeleteObject", "s3:PutObjectAcl"],
                "Resource": f"arn:aws:s3:::{s3_bucket_name}/*"
            }
        ]
    }
    s3_client = create_s3_client(aws_access_key_id, aws_secret_access_key)

    # Check if bucket policy already exists
    try:
        existing_policy = s3_client.get_bucket_policy(Bucket=s3_bucket_name)
    except s3_client.exceptions.ClientError as error:
        # Only "this bucket has no policy" means it is safe to create one.
        if error.response.get('Error', {}).get('Code') != 'NoSuchBucketPolicy':
            raise
        existing_policy = None

    if existing_policy:
        print("Bucket policy already exists.")
        return

    # Update the bucket policy
    s3_client.put_bucket_policy(
        Bucket=s3_bucket_name,
        Policy=json.dumps(bucket_policy)
    )

    print("S3 bucket policy updated.")

Step 4: Add S3 Role to Databricks IAM Role


In [0]:
def _append_statement_if_missing(policy_doc, statement):
    """Add ``statement`` to ``policy_doc['Statement']`` unless already present.

    Handles a missing ``Statement`` key and a single-object ``Statement`` by
    normalising to a list. Returns True when ``policy_doc`` was modified.
    """
    statements = policy_doc.get('Statement', [])
    if isinstance(statements, dict):
        statements = [statements]
    if statement in statements:
        return False
    policy_doc['Statement'] = list(statements) + [statement]
    return True


def update_iam_policies_for_databricks(aws_account_id, iam_role_for_s3_access, databricks_role_name):
    """Grant ``iam:PassRole`` on the S3-access role via the Databricks role's policies.

    Every customer-managed policy attached to ``databricks_role_name`` and
    every inline policy on it gets an ``iam:PassRole`` statement for
    ``iam_role_for_s3_access`` unless an identical statement is already there.
    AWS-managed policies (ARN under ``:iam::aws:policy/``) are skipped because
    they cannot be edited. Any exception raised along the way is caught and
    printed; nothing is re-raised.

    Note: the IAM client is built from the notebook-level variables
    ``aws_access_key_id`` and ``aws_secret_access_key``; they are not
    parameters of this function and must be defined before it is called.

    Args:
        aws_account_id: AWS account ID used to build the role ARN.
        iam_role_for_s3_access: Name of the role that may be passed.
        databricks_role_name: Role whose policies are updated.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    iam_client = create_iam_client(aws_access_key_id, aws_secret_access_key)

    # Define the new statement to add (shared by managed and inline policies)
    new_statement = {
        "Effect": "Allow",
        "Action": "iam:PassRole",
        "Resource": f"arn:aws:iam::{aws_account_id}:role/{iam_role_for_s3_access}"
    }

    try:
        # Get attached policies for the specified role
        attached_policies = iam_client.list_attached_role_policies(RoleName=databricks_role_name)
        if attached_policies['AttachedPolicies']:
            print("Attached policies found for the role.")
            for policy in attached_policies['AttachedPolicies']:
                policy_arn = policy['PolicyArn']
                print(f"PolicyName: {policy['PolicyName']}, PolicyArn: {policy_arn}")

                # AWS-managed policies are read-only; trying to version them
                # would fail and abort the rest of this function.
                if ':iam::aws:policy/' in policy_arn:
                    print("Skipping AWS managed policy (cannot be modified).")
                    continue

                # Get the current policy document
                policy_version = iam_client.get_policy(PolicyArn=policy_arn)
                policy_document = iam_client.get_policy_version(
                    PolicyArn=policy_arn,
                    VersionId=policy_version['Policy']['DefaultVersionId']
                )
                policy_doc = policy_document['PolicyVersion']['Document']

                # Append new statement if it doesn't exist
                if _append_statement_if_missing(policy_doc, new_statement):
                    # Create a new version of the policy
                    iam_client.create_policy_version(
                        PolicyArn=policy_arn,
                        PolicyDocument=json.dumps(policy_doc),
                        SetAsDefault=True
                    )
                    print(f"Policy updated with new statement: {new_statement}")
                else:
                    print("The statement already exists in the policy.")
        else:
            print("No attached policies found for the role.")

        # Get inline policies for the specified role
        inline_policies = iam_client.list_role_policies(RoleName=databricks_role_name)
        if inline_policies['PolicyNames']:
            print("\nInline Policies:")
            for policy_name in inline_policies['PolicyNames']:
                print(f"PolicyName: {policy_name}")

                # Get the current inline policy document
                policy_document = iam_client.get_role_policy(
                    RoleName=databricks_role_name,
                    PolicyName=policy_name
                )
                policy_doc = policy_document['PolicyDocument']

                # Append new statement if it doesn't exist
                if _append_statement_if_missing(policy_doc, new_statement):
                    # Update the inline policy
                    iam_client.put_role_policy(
                        RoleName=databricks_role_name,
                        PolicyName=policy_name,
                        PolicyDocument=json.dumps(policy_doc)
                    )
                    print(f"Inline policy updated with new statement: {new_statement}")
                else:
                    print("The statement already exists in the inline policy.")
        else:
            print("No inline policies found for the role.")
        print("Databricks deployment role updated successfully.")
    except Exception as e:
        print(f"Error updating policy: {str(e)}")

Step 5: Add the Instance Profile to Databricks


In [0]:
def add_instance_profile_to_databricks(aws_access_key_id, aws_secret_access_key, iam_role_for_s3_access, DATABRICKS_INSTANCE, DATABRICKS_TOKEN, max_retries=3, retry_delay_seconds=10, request_timeout_seconds=30):
    """Create the instance profile for the S3 role and register it in Databricks.

    An instance profile named after ``iam_role_for_s3_access`` is created (or
    fetched if it already exists), the role is added to it when missing, and
    the profile ARN is posted to
    ``https://<DATABRICKS_INSTANCE>/api/2.0/instance-profiles/add``. Failed
    attempts, including transport errors raised by ``requests``, are retried.

    Args:
        aws_access_key_id: AWS access key ID used to build the IAM client.
        aws_secret_access_key: AWS secret access key used to build the IAM client.
        iam_role_for_s3_access: Role name; also used as the instance profile name.
        DATABRICKS_INSTANCE: Workspace host name without the ``https://`` scheme.
        DATABRICKS_TOKEN: Bearer token sent in the Authorization header.
        max_retries: Total number of POST attempts (default 3).
        retry_delay_seconds: Pause between attempts (default 10).
        request_timeout_seconds: Timeout for each POST so a stalled connection
            cannot hang the notebook (default 30).

    Returns:
        None. Progress and the final outcome are reported with ``print``.
    """
    import time

    iam_client = create_iam_client(aws_access_key_id, aws_secret_access_key)

    try:
        instance_profile = iam_client.create_instance_profile(
            InstanceProfileName=iam_role_for_s3_access
        )
        instance_profile_arn = instance_profile['InstanceProfile']['Arn']
        print(f"Instance Profile ARN: {instance_profile_arn}")
    except iam_client.exceptions.EntityAlreadyExistsException:
        instance_profile = iam_client.get_instance_profile(
            InstanceProfileName=iam_role_for_s3_access
        )
        instance_profile_arn = instance_profile['InstanceProfile']['Arn']
        print(f"Instance Profile ARN (existing): {instance_profile_arn}")

    # Attach the role only if the profile does not already contain it.
    attached_roles = [role['RoleName'] for role in instance_profile['InstanceProfile'].get('Roles', [])]
    if iam_role_for_s3_access not in attached_roles:
        iam_client.add_role_to_instance_profile(
            InstanceProfileName=iam_role_for_s3_access,
            RoleName=iam_role_for_s3_access
        )

    url = f'https://{DATABRICKS_INSTANCE}/api/2.0/instance-profiles/add'
    headers = {
        'Authorization': f'Bearer {DATABRICKS_TOKEN}',
        'Content-Type': 'application/json'
    }
    payload = {
        'instance_profile_arn': instance_profile_arn
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=request_timeout_seconds)
        except requests.RequestException as error:
            # Connection problems and timeouts count as a failed attempt
            # instead of aborting the retry loop.
            failure_detail = str(error)
        else:
            if response.status_code == 200:
                print("Instance profile added successfully.")
                return
            if response.status_code == 400 and "already been added" in response.text:
                print("Instance profile already added to Databricks.")
                return
            failure_detail = response.text

        print(f"Attempt {attempt + 1} failed: {failure_detail}")
        if attempt < max_retries - 1:
            print("Retrying...")
            time.sleep(retry_delay_seconds)  # Wait before retrying
        else:
            print("Failed to add instance profile after multiple attempts.")
In [0]:
# Run the setup steps in order. Each function builds the AWS client it needs,
# so no client has to be created here (the clients previously constructed at
# the top of this cell were discarded without being used).
setup_iam_s3_access(aws_access_key_id, aws_secret_access_key, iam_role_for_s3_access, policy_name, s3_bucket_name)
update_trust_policy(iam_role_for_s3_access,workspace_id)
update_s3_bucket_policy(aws_account_id, iam_role_for_s3_access, s3_bucket_name)
update_iam_policies_for_databricks(aws_account_id, iam_role_for_s3_access, databricks_role_name)
add_instance_profile_to_databricks(aws_access_key_id, aws_secret_access_key, iam_role_for_s3_access, DATABRICKS_INSTANCE, DATABRICKS_TOKEN)


In [0]:
# Read a CSV object from S3 into a DataFrame and render it.
# NOTE(review): the bucket ("ucccc1") and object key are hard-coded here and do
# not use `s3_bucket_name` from the configuration cell — confirm this is the
# bucket the role above was granted access to.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it
# is the session object provided by the Databricks runtime — confirm.
df=spark.read.csv("s3://ucccc1/Creator_2024-09-19_2024-10-18 (1).csv")
df.display()