<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# lakeFS Role-Based Access Control (RBAC) Demo

## Use Case: Data Governance - secure your data lake

## Prerequisites

###### This Notebook requires connecting to lakeFS Cloud.
###### Register for the lakeFS Cloud: https://lakefs.cloud/register

## The following image explains the steps you will run in this notebook:

![RBAC](./images/RBAC.png)

## Config

### Change your lakeFS credentials

In [None]:
# lakeFS endpoint and key pair used to build the super-user client further down.
# These are placeholder values: replace them with your own before running.
lakefsEndPoint = '127.0.0.1:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information
##### Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base location in the object store; the repo name is appended to it when the repository is created.
storageNamespace = 's3://<S3 Bucket Name>/' # e.g. "s3://username-lakefs-cloud/"

## Setup

**(you shouldn't need to change anything in this section, just run it)**

## You can change the lakeFS repo name (use the name of an existing repo or provide a new one)

In [None]:
# Name of the lakeFS repository used by the cells in this notebook.
repo_name = "rbac-repo"

## Versioning Information

In [None]:
# Default branch of the repo; other branches are created from it and merged back into it.
sourceBranch = "main"
# NOTE(review): not referenced by any other cell visible in this notebook.
newBranch = "experiment1"
# Sample file read from /data and uploaded under the "PII/" prefix.
fileName = "lakefs_test.csv"

### Import libraries

In [None]:
%xmode Minimal
import lakefs
from lakefs.client import Client
import lakefs_sdk
from lakefs_sdk.client import LakeFSClient
from lakefs_sdk import models
from assets.lakefs_demo import print_commit

## Working with the lakeFS Python client API

In [None]:
# Build the super-user SDK client only once: re-running this cell keeps the
# client that already exists instead of creating a new one.
if 'superUserClient' not in locals():
    configuration = lakefs_sdk.Configuration(
        host=lakefsEndPoint, username=lakefsAccessKey, password=lakefsSecretKey)
    superUserClient = LakeFSClient(configuration)

### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the super-user credentials by asking the server for its version.
print("Verifying lakeFS credentials…")
try:
    v = superUserClient.internal_api.get_lake_fs_version().version
except Exception as e:
    # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still
    # propagate, and surface the underlying reason to make failures debuggable.
    print(f"🛑 failed to get lakeFS version: {e}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

## Super User creates "admin1" user

In [None]:
# The super user registers the "admin1" user; the call's result is echoed as the cell output.
admin1_creation = models.UserCreation(id='admin1')
superUserClient.auth_api.create_user(user_creation=admin1_creation)

## Super User adds "admin1" user to lakeFS created "Admins" group

In [None]:
# Look up the id of the "Admins" group by paging through all groups, then add
# "admin1" to it.
groupName = 'Admins'

# Reset first so a value left over from an earlier run of this cell is never reused.
groupId = None
has_more = True
next_offset = ""
# Stop paging as soon as the group is found or the listing is exhausted.
while has_more and groupId is None:
    groups = superUserClient.auth_api.list_groups(after=next_offset)
    groupId = next((g.id for g in groups.results if g.name == groupName), None)
    has_more = groups.pagination.has_more
    next_offset = groups.pagination.next_offset

if groupId is None:
    # Fail with a clear message instead of a NameError on the call below.
    raise RuntimeError(f"Group '{groupName}' was not found on the lakeFS server")

superUserClient.auth_api.add_group_membership(
    group_id=groupId,
    user_id='admin1')

## Create credentials for "admin1" user

In [None]:
# Issue a key pair for "admin1" and keep both halves for building its clients.
credentials = superUserClient.auth_api.create_credentials(user_id='admin1')
admin1AccessKey, admin1SecretKey = credentials.access_key_id, credentials.secret_access_key
# NOTE(review): this prints the whole credentials object, which presumably includes the secret key.
print(credentials)

## Create lakeFS Python client for "admin1" user

In [None]:
# Connection settings shared by both clients created for "admin1".
admin1_connection = dict(
    host=lakefsEndPoint,
    username=admin1AccessKey,
    password=admin1SecretKey,
)

# Generated SDK client (used for the auth / repository APIs).
configuration = lakefs_sdk.Configuration(**admin1_connection)
admin1Client = LakeFSClient(configuration)

# High-level client (used for repository / branch / object operations).
admin1LakefsClient = Client(**admin1_connection)

print("Created lakeFS client for admin1.")

## Verify user for "admin1Client" Python client

In [None]:
# Ask the server which user the admin1 client authenticates as and echo the answer.
admin1_identity = admin1Client.auth_api.get_current_user()
admin1_identity

## Now "admin1" will do the rest of the setup
#### "admin1" creates "developer1" user

In [None]:
# "admin1" registers the "developer1" user; the call's result is echoed as the cell output.
developer1_creation = models.UserCreation(id='developer1')
admin1Client.auth_api.create_user(user_creation=developer1_creation)

## "admin1" adds "developer1" user to lakeFS created "Developers" group

In [None]:
# Look up the id of the "Developers" group by paging through all groups, then
# add "developer1" to it. Both calls are made as "admin1".
groupNameDevelopers = 'Developers'

# Reset first so a value left over from an earlier run of this cell is never reused.
groupIdDevelopers = None
has_more = True
next_offset = ""
# Stop paging as soon as the group is found or the listing is exhausted.
while has_more and groupIdDevelopers is None:
    groups = admin1Client.auth_api.list_groups(after=next_offset)
    groupIdDevelopers = next(
        (g.id for g in groups.results if g.name == groupNameDevelopers), None)
    has_more = groups.pagination.has_more
    next_offset = groups.pagination.next_offset

if groupIdDevelopers is None:
    # Fail with a clear message instead of a NameError on the call below.
    raise RuntimeError(f"Group '{groupNameDevelopers}' was not found on the lakeFS server")

admin1Client.auth_api.add_group_membership(
    group_id=groupIdDevelopers,
    user_id='developer1')

## Create credentials for "developer1" user

In [None]:
# Issue a key pair for "developer1" and keep both halves for building its clients.
credentials = admin1Client.auth_api.create_credentials(user_id='developer1')
developer1AccessKey, developer1SecretKey = credentials.access_key_id, credentials.secret_access_key
# NOTE(review): this prints the whole credentials object, which presumably includes the secret key.
print(credentials)

## Create lakeFS Python client for "developer1" user

In [None]:
# Connection settings shared by both clients created for "developer1".
developer1_connection = dict(
    host=lakefsEndPoint,
    username=developer1AccessKey,
    password=developer1SecretKey,
)

# Generated SDK client (used for the auth APIs).
configuration = lakefs_sdk.Configuration(**developer1_connection)
developer1Client = LakeFSClient(configuration)

# High-level client (used for repository / branch / object operations).
developer1LakeFSClient = Client(**developer1_connection)

print("Created lakeFS client for developer1.")

## Verify user for "developer1Client" Python client

In [None]:
# Ask the server which user the developer1 client authenticates as and echo the answer.
developer1_identity = developer1Client.auth_api.get_current_user()
developer1_identity

## "admin1" creates "DataScientists" group

In [None]:
# "admin1" creates the "DataScientists" group; the returned group object is kept
# because later cells use its id.
data_scientists_spec = models.GroupCreation(id='DataScientists')
DataScientistsGroup = admin1Client.auth_api.create_group(group_creation=data_scientists_spec)

## "admin1" attaches lakeFS created "AuthManageOwnCredentials" policy to "DataScientists" group

In [None]:
# Attach the "AuthManageOwnCredentials" policy to the "DataScientists" group.
admin1Client.auth_api.attach_policy_to_group(
    policy_id='AuthManageOwnCredentials',
    group_id=DataScientistsGroup.id,
)

## "admin1" attaches lakeFS created "FSReadWriteAll" policy to "DataScientists" group

In [None]:
# Attach the "FSReadWriteAll" policy to the "DataScientists" group.
admin1Client.auth_api.attach_policy_to_group(
    policy_id='FSReadWriteAll',
    group_id=DataScientistsGroup.id,
)

## "admin1" attaches lakeFS created "RepoManagementReadAll" policy to "DataScientists" group

In [None]:
# Attach the "RepoManagementReadAll" policy to the "DataScientists" group.
admin1Client.auth_api.attach_policy_to_group(
    policy_id='RepoManagementReadAll',
    group_id=DataScientistsGroup.id,
)

## "admin1" creates "data_scientist1" user

In [None]:
# "admin1" registers the "data_scientist1" user; the call's result is echoed as the cell output.
data_scientist1_creation = models.UserCreation(id='data_scientist1')
admin1Client.auth_api.create_user(user_creation=data_scientist1_creation)

## "admin1" adds "data_scientist1" user to "DataScientists" group

In [None]:
# Make "data_scientist1" a member of the "DataScientists" group.
admin1Client.auth_api.add_group_membership(
    user_id='data_scientist1',
    group_id=DataScientistsGroup.id,
)

## Create credentials for "data_scientist1" user

In [None]:
# Issue a key pair for "data_scientist1" and keep both halves for building its clients.
credentials = admin1Client.auth_api.create_credentials(user_id='data_scientist1')
data_scientist1AccessKey, data_scientist1SecretKey = (
    credentials.access_key_id,
    credentials.secret_access_key,
)
# NOTE(review): this prints the whole credentials object, which presumably includes the secret key.
print(credentials)

## Create lakeFS Python client for "data_scientist1" user

In [None]:
# Connection settings shared by both clients created for "data_scientist1".
data_scientist1_connection = dict(
    host=lakefsEndPoint,
    username=data_scientist1AccessKey,
    password=data_scientist1SecretKey,
)

# Generated SDK client (used for the auth APIs).
configuration = lakefs_sdk.Configuration(**data_scientist1_connection)
data_scientist1Client = LakeFSClient(configuration)

# High-level client (used for repository / branch / object operations).
data_scientist1LakeFSClient = Client(**data_scientist1_connection)

print("Created lakeFS client for data_scientist1.")

## Verify user for "data_scientist1Client" Python client

In [None]:
# Ask the server which user the data_scientist1 client authenticates as and echo the answer.
data_scientist1_identity = data_scientist1Client.auth_api.get_current_user()
data_scientist1_identity

## "admin1" creates "FSBlockMergingToMain" policy to prevent commits to the main branch

In [None]:
# Single "deny" statement: no fs:CreateCommit on the "main" branch of any repository.
deny_commit_on_main = models.Statement(
    effect="deny",
    action=["fs:CreateCommit"],
    resource="arn:lakefs:fs:::repository/*/branch/main",
)

# Register the policy; the call's result is echoed as the cell output.
admin1Client.auth_api.create_policy(
    policy=models.Policy(id='FSBlockMergingToMain', statement=[deny_commit_on_main]))

## "admin1" attaches "FSBlockMergingToMain" policy to "DataScientists" group

In [None]:
# Attach the "FSBlockMergingToMain" policy to the "DataScientists" group.
admin1Client.auth_api.attach_policy_to_group(
    policy_id='FSBlockMergingToMain',
    group_id=DataScientistsGroup.id,
)

## "admin1" creates "FSBlockAccessToPIIData" policy which denies access to any objects in "PII" folder

In [None]:
# Single "deny" statement: no fs:* action on any object under "PII/" in this repository.
deny_pii_access = models.Statement(
    effect="deny",
    action=["fs:*"],
    resource=f"arn:lakefs:fs:::repository/{repo_name}/object/PII/*",
)

# Register the policy; the call's result is echoed as the cell output.
admin1Client.auth_api.create_policy(
    policy=models.Policy(id='FSBlockAccessToPIIData', statement=[deny_pii_access]))

## "admin1" attaches "FSBlockAccessToPIIData" policy to "Developers" group

In [None]:
# Attach the "FSBlockAccessToPIIData" policy to the "Developers" group.
admin1Client.auth_api.attach_policy_to_group(
    policy_id='FSBlockAccessToPIIData',
    group_id=groupIdDevelopers,
)

## "admin1" creates the repo; run this step even if the repo already exists on your lakeFS server (the existing repo is then reused)

In [None]:
# Create the repository as "admin1" (exist_ok=True tolerates a repo that already
# exists) and keep a handle on its default branch for later cells.
# Strip any trailing "/" from the configured namespace so joining it with the
# repo name never produces a double slash ("s3://bucket//repo").
repo = lakefs.Repository(repo_name, client=admin1LakefsClient).create(
    storage_namespace=f"{storageNamespace.rstrip('/')}/{repo_name}",
    default_branch=sourceBranch,
    exist_ok=True,
)
branchMain = repo.branch(sourceBranch)
print(repo)

## "admin1" protects main branch so no one can write directly to main branch and any subsequent writes must be done via the merge of a branch

In [None]:
# Protection rule whose pattern is the default branch name.
main_protection = models.BranchProtectionRule(pattern=sourceBranch)

# Set the repository's branch protection rules to this single rule.
admin1Client.repositories_api.set_branch_protection_rules(
    repository=repo_name,
    branch_protection_rule=[main_protection],
)

## "admin1" tries to upload a file to the "PII" folder on the main branch, but it fails because the main branch is protected

In [None]:
# Read the sample file (the context manager closes the handle right after the
# read), then attempt to upload it straight to the main branch.
with open(f"/data/{fileName}", 'r') as sample_file:
    contentToUpload = sample_file.read()
branchMain.object('PII/'+fileName).upload(data=contentToUpload, mode='wb', pre_sign=False)

## "admin1" creates "ingest-pii-data" branch

In [None]:
# Branch "ingest-pii-data" off the default branch, then show the id of the commit it points to.
branchIngestPIIData = repo.branch('ingest-pii-data').create(source_reference=sourceBranch)
ingest_head = branchIngestPIIData.get_commit()
print("ingest-pii-data ref:", ingest_head.id)

## "admin1" uploads the file to "PII" folder in "ingest-pii-data" branch

In [None]:
# Read the sample file (the context manager closes the handle right after the
# read), then upload it under "PII/" on the "ingest-pii-data" branch.
with open(f"/data/{fileName}", 'r') as sample_file:
    contentToUpload = sample_file.read()
branchIngestPIIData.object('PII/'+fileName).upload(data=contentToUpload, mode='wb', pre_sign=False)

## "admin1" commits changes and attaches some metadata

In [None]:
# Commit the uploaded file on "ingest-pii-data" with a metadata tag, then print the commit.
ref = branchIngestPIIData.commit(
    metadata={'using': 'python_sdk'},
    message='Added PII file!',
)
pii_commit = ref.get_commit()
print_commit(pii_commit)

## "admin1" merges "ingest-pii-data" branch to main branch

In [None]:
# Merge "ingest-pii-data" into the default branch and print what merge_into returns.
res = branchIngestPIIData.merge_into(branchMain)
print(res)

## "admin1" reads object under "PII" folder successfully

In [None]:
# Resolve the PII object on the default branch through admin1's client, then read and echo it.
admin1_pii_object = (
    lakefs.Repository(repo_name, client=admin1LakefsClient)
    .branch(sourceBranch)
    .object(path=f"PII/{fileName}")
)
admin1_pii_object.reader(mode='r').read()

## "data_scientist1" reads object under "PII" folder successfully

In [None]:
# Resolve the PII object on the default branch through data_scientist1's client, then read and echo it.
data_scientist1_pii_object = (
    lakefs.Repository(repo_name, client=data_scientist1LakeFSClient)
    .branch(sourceBranch)
    .object(path=f"PII/{fileName}")
)
data_scientist1_pii_object.reader(mode='r').read()

## But "developer1" can't read object under "PII" folder due to "FSBlockAccessToPIIData" policy attached to "Developers" group

In [None]:
# Same read through developer1's client; the notebook expects this one to be denied.
developer1_pii_object = (
    lakefs.Repository(repo_name, client=developer1LakeFSClient)
    .branch(sourceBranch)
    .object(path=f"PII/{fileName}")
)
developer1_pii_object.reader(mode='r').read()

## "data_scientist1" creates "ds_branch" branch

In [None]:
# data_scientist1 branches "ds_branch" off the default branch, then shows the id of the commit it points to.
ds_repo = lakefs.Repository(repo_name, client=data_scientist1LakeFSClient)
branchDSBranch = ds_repo.branch('ds_branch').create(source_reference=sourceBranch)
ds_head = branchDSBranch.get_commit()
print("ds_branch ref:", ds_head.id)

## "data_scientist1" uploads a new file to "ds_branch" branch

In [None]:
# Read the new sample file (the context manager closes the handle right after
# the read), then upload it under "ds/" on the "ds_branch" branch.
with open('/data/lakefs_test_new.csv', 'r') as sample_file:
    contentToUpload = sample_file.read()
branchDSBranch.object('ds/lakefs_test_new.csv').upload(data=contentToUpload, mode='wb', pre_sign=False)

## "data_scientist1" commits changes and attaches some metadata

In [None]:
# Commit the uploaded file on "ds_branch" with a metadata tag, then print the commit.
ref = branchDSBranch.commit(
    metadata={'using': 'python_sdk'},
    message='Added new data file!',
)
ds_commit = ref.get_commit()
print_commit(ds_commit)

## But "data_scientist1" can't merge "ds_branch" branch to main branch due to "FSBlockMergingToMain" policy attached to "DataScientists" group

In [None]:
# data_scientist1 attempts to merge "ds_branch" into the default branch; the notebook expects this call to be denied.
branchDSBranch.merge_into(sourceBranch)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack