### Important Note!

This notebook ran using with a 12.2 LTS Runtime version!

There are no concerns regarding cluster size (memory and cores).

Insert `spark.databricks.delta.retentionDurationCheck.enabled false` during cluster configs creation to be able to use VACUUM properly in this notebook.

The purpose of this notebook is just to show the version of the SQL commands for Python, **always use the SQL version as a reference**, as it was the one used during the Databricks SQL course.

## Using DBFS

In [0]:
%fs ls /

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1694631178000
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/mnt/,mnt/,0,1694530632000
dbfs:/user/,user/,0,1693629492000
dbfs:/volume/,volume/,0,0
dbfs:/volumes/,volumes/,0,0


In [0]:
display(dbutils.fs.ls('/'))

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1694631178000
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/mnt/,mnt/,0,1694530632000
dbfs:/user/,user/,0,1693629492000
dbfs:/volume/,volume/,0,0
dbfs:/volumes/,volumes/,0,0



## Some examples of DBFS Unix-Like Commands

| Command | Result |
|--------|--------|
| `ls` | List the directory files. |
| `mkdirs` | Create a new directory. |
| `cp` | Able to copy files from one directory to another. |
| `mv` | Able to move files from one directory to another. |
| `rm` | Can remove a file from a path. |
| `rm -r` | Recursively removes nested folders or files. |
| `put` | Let you insert data or even create new data files. |
| `head` | Allows file reading in the first rows. |
| `mount` | Create a link between a workspace and cloud object storage. |
| `secrets.get` | Collect a secret from a certain scope of Databricks Secrets. |


In [0]:
%fs help

In [0]:
help(dbutils.fs)

Help on FSHandler in module dbruntime.dbutils object:

class FSHandler(builtins.object)
 |  FSHandler(fsutils, fsutils_parallel, dbcore, sc, entry_point, displayHTML)
 |  
 |  Methods defined here:
 |  
 |  __call__(self)
 |      Call self as a function.
 |  
 |  __init__(self, fsutils, fsutils_parallel, dbcore, sc, entry_point, displayHTML)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  cacheFiles = f_with_exception_handling(*args, **kwargs)
 |  
 |  cacheTable = f_with_exception_handling(*args, **kwargs)
 |  
 |  check_types(self, vars_and_types)
 |  
 |  cp = f_with_exception_handling(*args, **kwargs)
 |  
 |  create_list_from_jschema(self, jschema, create_obj_from_jschema)
 |  
 |  head = f_with_exception_handling(*args, **kwargs)
 |  
 |  help(self, method_name=None)
 |  
 |  ls = f_with_exception_handling(*args, **kwargs)
 |  
 |  mkdirs = f_with_exception_handling(*args, **kwargs)
 |  
 |  mount 

In [0]:
%fs mkdirs /databricks_cc_example

In [0]:
%fs ls /databricks_cc_example/

In [0]:
dbutils.fs.put("/databricks_cc_example/data.csv", """Name, Age, Job
Angelo, 25, Nurse
Maria, 23, Architect
Ronaldo, 33, Actor
Jessica, 18, Student
Mara, 27, Doctor""",
True)

Wrote 110 bytes.
Out[4]: True

In [0]:
%fs ls /databricks_cc_example

path,name,size,modificationTime
dbfs:/databricks_cc_example/data.csv,data.csv,110,1694650055000


In [0]:
%fs head /databricks_cc_example/data.csv

In [0]:
%fs ls /

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,1694631178000
dbfs:/Volume/,Volume/,0,0
dbfs:/Volumes/,Volumes/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/databricks_cc_example/,databricks_cc_example/,0,1694649911000
dbfs:/mnt/,mnt/,0,1694530632000
dbfs:/user/,user/,0,1693629492000
dbfs:/volume/,volume/,0,0
dbfs:/volumes/,volumes/,0,0


In [0]:
%fs ls /mnt/

In [0]:
%fs mounts

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,
/volumes,DbfsReserved,
/,DatabricksRoot,
/volume,DbfsReserved,


In [0]:
access_key = dbutils.secrets.get(scope = "aws", key = "aws-access-key")
secret_key = dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
encoded_secret_key = secret_key.replace("/", "%2F")
aws_bucket_name = "databricks-cc-eng-academy"
mount_name = "s3_dbfs"

dbutils.fs.mount(f"s3a://{access_key}:{encoded_secret_key}@{aws_bucket_name}", f"/mnt/{mount_name}")

Out[5]: True

In [0]:
top_secret = dbutils.secrets.get(scope = "databricks", key = "secrets")
for letter in top_secret:
    print(letter)

f
l
a
m
e
n
g
o


In [0]:
%fs mounts

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/s3_dbfs,s3a://databricks-cc-eng-academy,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,
/volumes,DbfsReserved,
/,DatabricksRoot,
/volume,DbfsReserved,


In [0]:
%fs ls /mnt/s3_dbfs/

path,name,size,modificationTime
dbfs:/mnt/s3_dbfs/remuneracao202206.csv,remuneracao202206.csv,86911959,1694562402000


In [0]:
spark.sql(
"CREATE TABLE IF NOT EXISTS remuneracao_day2" + \
"USING csv" + \
"OPTIONS (header 'true'," + \
"      delimiter ';'," + \
"      path '/mnt/s3_dbfs/remuneracao202206.csv')"
)

In [0]:
display(df_remuneracao_day2.history())

In [0]:
display(df_remuneracao_day2.describe())

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
csv,,hive_metastore.default.remuneracao_day2,,dbfs:/mnt/s3_dbfs/remuneracao202206.csv,2023-09-14T00:20:10.000+0000,,List(),,,Map(),,,,Map()


In [0]:
df_remuneracao_day2.createOrReplaceTemporaryView("remuneracao_day2")
display(spark.sql("DESCRIBE TABLE EXTENDED remuneracao_day2"))

col_name,data_type,comment
NOME,string,
CPF,string,
?RG?O,string,
CARGO,string,
FUN??O,string,
SITUA??O,string,
M?S,string,
ANO,string,
C?DIGO DO ?RG?O,string,
MATR?CULA,string,


In [0]:
#dbutils.fs.unmount(/mnt/s3_dbfs)
#%fs unmount /mnt/s3_dbfs

In [0]:
display(df_remuneracao_day2.limit(5))

NOME,CPF,?RG?O,CARGO,FUN??O,SITUA??O,M?S,ANO,C?DIGO DO ?RG?O,MATR?CULA,REMUNERA??O B?SICA,BENEF?CIOS,VALOR DA FUN??O,COMISS?O CONSELHEIRO,HORA EXTRA,VERBAS EVENTUAIS,VERBAS JUDICIAIS,DESCONTOS A MAIOR,LICEN?A PR?MIO,IRRF,SEG. SOCIAL,TETO REDUTOR,OUTROS RECEBIMENTOS,OUTROS DESCONTOS OBRIGAT?RIOS,PAGAMENTOS A MAIOR,BRUTO,L?QUIDO
EDMUNDO BARBOSA DE FREITAS,***898181**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,CAIXA BANC�RIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1569,611126,189530,128459,",00",",00",794115,",00",",00",",00",850987,419359,",00",",00",",00",",00",1723230,452884
CLEUSA MARIA DE CAMPOS MORAIS,***939691**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1877,1176312,189530,",00",",00",",00",",00",",00",",00",",00",174228,257998,",00",",00",",00",",00",1365842,933616
DIVINO CARLOS SEROZINO,***730581**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,CAIXA BANC�RIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,2485,1477048,189530,385377,",00",",00",",00",",00",",00",",00",343661,326264,",00",",00",",00",",00",2051955,1382030
ADEMILSON CARNEIRO,***749601**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,3264,1795543,199520,",00",",00",",00",",00",",00",",00",",00",330407,342973,",00",",00",",00",",00",1995063,1321683
ANA MARIA NOGUEIRA,***040221**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,4674,920404,189530,",00",",00",",00",",00",",00",",00",",00",122866,187180,",00",",00",",00",",00",1109934,799888


In [0]:
#FONTE DO DICIONÁRIO DE DADOS: http://dados.df.gov.br/dataset/462126f8-8a61-4cec-91a2-38615b7f70f6/resource/0514402e-cd29-440b-83e5-f97787dd1ad3/download/dicdadosremuneracaodosservidores.html
spark.sql(
"CREATE TABLE IF NOT EXISTS remuneracao_day (nome STRING COMMENT 'NOME COMPLETO DO SERVIDOR', " + \
"                                            cpf STRING COMMENT 'Nº DO CADASTRO DE PESSOA FÍSICA MASCARADO'," + \ 
"                                            orgao STRING COMMENT 'ÓRGÃO VINCULADO AO SERVIDOR'," + \
"                                            cargo STRING COMMENT 'POSIÇÃO QUE O SERVIDOR OCUPA NO ÓRGÃO DE FORMA PERMANENTE'," + \
"                                            funcao STRING COMMENT 'DESIGNAÇÃO TEMPORÁRIA PARA DESEMPENHO DE DETERMINADAS ATRIBUIÇÕES (CARGO COMISSIONADO)'," + \
"                                            situacao STRING COMMENT 'CONDIÇÃO DO SERVIDOR EM RELAÇÃO AO EXERCÍCIO DE SUAS ATIVIDADES'," + \
"                                            mes INTEGER COMMENT 'MÊS DE REFERÊNCIA'," + \
"                                            ano INTEGER COMMENT 'ANO DE REFERÊNCIA'," + \
"                                            codigo_do_orgao INTEGER COMMENT 'ÓRGÃO VINCULADO AO SERVIDOR'," + \
"                                            matricula STRING COMMENT 'MATRÍCULA DO SERVIDOR NO ÓRGÃO VINCULADO'," + \
"                                            remuneracao_basica STRING COMMENT 'REMUNERAÇÃO BÁSICA'," + \
"                                            beneficios STRING COMMENT 'VALOR DOS BENEFÍCIOS'," + \
"                                            valor_das_funcoes STRING COMMENT 'VALOR DAS FUNÇÕES'," + \
"                                            comissao_conselheiro STRING COMMENT 'COMISSÃO CONSELHEIRO'," + \
"                                            hora_extra STRING COMMENT 'TOTAL DE HORA EXTRA'," + \
"                                            verbas_eventuais STRING COMMENT 'VALOR DAS VERBAS EVENTUAIS'," + \
"                                            verbas_judiciais STRING COMMENT 'VALOR DAS VERBAS JUDICIAIS'," + \
"                                            descontos_a_maior STRING COMMENT 'VALOR DA REPOSIÇÃO DE DESCONTOS A MAIOR'," + \
"                                            licenca_premio STRING COMMENT 'VALOR DA LICENÇA PRÊMIO'," + \
"                                            irrf STRING COMMENT 'VALOR DO DESCONTO DE IMPOSTO DE RENDA RETIDO NA FONTE'," + \
"                                            seguridade_social STRING COMMENT 'VALOR DO DESCONTO DE SEGURIDADE SOCIAL'," + \
"                                            teto_redutor STRING COMMENT 'VALOR DO DESCONTO DE TETO REDUTOR'," + \
"                                            outros_recebimentos STRING COMMENT 'VALOR DE OUTROS RECEBIMENTOS'," + \
"                                            outros_descontos_obrigatorios STRING COMMENT 'VALOR DE OUTROS DESCONTOS OBRIGATÓRIOS'," + \
"                                            pagamento_a_maior STRING COMMENT 'VALOR DOS DESCONTOS DE PAGAMENTOS A MAIOR'," + \
"                                            bruto STRING COMMENT 'VALOR BRUTO DA REMUNERAÇÃO'," + \
"                                            liquido STRING COMMENT 'VALOR LÍQUIDO APÓS DESCONTOS OBRIGATÓRIOS')" + \
"COMMENT 'Este conjunto de dados apresenta a remuneração dos servidores do Governo do DF, detalhada por órgão e nome do servidor.'"
)

In [0]:
df_remuneracao_day2.selectExpr("*").write.insertInto("remuneracao_day")

num_affected_rows,num_inserted_rows
247567,247567


## Welcome Unity Catalog

In [0]:
spark.sql("CREATE CATALOG IF NOT EXISTS hive_to_uc_sync")
spark.sql("USE CATALOG hive_to_uc_sync")
spark.sql("CREATE SCHEMA IF NOT EXISTS sync_data")
spark.sql("USE SCHEMA sync_data")

In [0]:
spark.sql("SYNC SCHEMA hive_to_uc_sync.sync_data FROM hive_metastore.default DRY RUN")

source_schema,source_name,source_type,target_catalog,target_schema,target_name,status_code,description
default,people_10millions,managed,hive_to_uc_sync,sync_data,people_10millions,DBFS_ROOT_LOCATION,[UPGRADE_NOT_SUPPORTED.DBFS_ROOT_LOCATION] Table is not eligible for upgrade from Hive Metastore to Unity Catalog. Reason: Table located on DBFS root.
default,remuneracao_day,managed,hive_to_uc_sync,sync_data,remuneracao_day,DBFS_ROOT_LOCATION,[UPGRADE_NOT_SUPPORTED.DBFS_ROOT_LOCATION] Table is not eligible for upgrade from Hive Metastore to Unity Catalog. Reason: Table located on DBFS root.
default,remuneracao_day2,external,hive_to_uc_sync,sync_data,remuneracao_day2,EXTERNAL_LOCATION_DOES_NOT_EXIST,parent external location for path `s3://databricks-cc-eng-academy/remuneracao202206.csv` does not exist.
default,titanic_clean,managed,hive_to_uc_sync,sync_data,titanic_clean,DBFS_ROOT_LOCATION,[UPGRADE_NOT_SUPPORTED.DBFS_ROOT_LOCATION] Table is not eligible for upgrade from Hive Metastore to Unity Catalog. Reason: Table located on DBFS root.


### CLONING TABLES


| Deep Clone Features | Shallow Clone Features |
|--------|--------|
| Completely independent of the source. | It depends exclusively on the source. |
| Duplicated all data & metadata | Duplicates metadata only. |
| You can clone a Deep Clone Table | Cannot create Shallow Clone from another Shallow Clone Table. |
| Works with Managed and Unmanaged tables | Works only with Managed Tables on Unity Catalog. |
| Scenarios: migrations, back-ups, upgrade Hive -> UC  | Scenarios: tests, short-duration workloads, low-cost experiments. |

PS: Not only exclusive for Delta Tables as you also can clone Parquet and Iceberg tables.

In [0]:
df_people = spark.read.table("default.people_10millions")

In [0]:
df_people.clone("dbfs://clone_path/deep_clone", isShallow = False, replace)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
236570589,4,0,4,0,236570589


In [0]:
df_people.clone("dbfs://clone_path/shallow_clone", isShallow, replace)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
236570589,4,0,0,0,0


In [0]:
display(df_people.count())

count(1)
10000000


In [0]:
df_people_deep = spark.read.load("dbfs://clone_path/deep_clone")
display(df_people_deep.count())

count(1)
10000000


In [0]:
df_people_shallow = spark.read.load("dbfs://clone_path/shallow_clone")
display(df_people_shallow.count())

count(1)
10000000


In [0]:
display(df_people_deep.history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2023-09-14T01:06:09.000+0000,2657276046068378,vitojon@outlook.com,CLONE,"Map(source -> hive_metastore.default.people_10millions, sourceVersion -> 4, isShallow -> false)",,List(4008888185532145),0913-194424-ztitdjd5,-1,Serializable,False,"Map(removedFilesSize -> 0, numRemovedFiles -> 0, sourceTableSize -> 236570589, numCopiedFiles -> 4, copiedFilesSize -> 236570589, sourceNumOfFiles -> 4)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
df_people.delete("birthDate >= '2000-01-01'")

num_affected_rows
16758


In [0]:
display(df_people.count())

count(1)
9983242


In [0]:
display(df_people_deep.count())

count(1)
10000000


In [0]:
display(df_people_deep.history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
5,2023-09-14T01:08:58.000+0000,2657276046068378,vitojon@outlook.com,DELETE,"Map(predicate -> [""(birthDate#5324 >= 2000-01-01 00:00:00)""])",,List(4008888185532145),0913-194424-ztitdjd5,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 4, numRemovedBytes -> 236570589, numCopiedRows -> 9983242, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 15373, numDeletedRows -> 16758, scanTimeMs -> 1388, numAddedFiles -> 4, numAddedBytes -> 236178358, rewriteTimeMs -> 13975)",,Databricks-Runtime/12.2.x-scala2.12
4,2023-09-13T20:59:58.000+0000,2657276046068378,vitojon@outlook.com,RESTORE,"Map(version -> 0, timestamp -> null)",,List(639040945428698),0913-194424-ztitdjd5,3.0,Serializable,False,"Map(numRestoredFiles -> 4, removedFilesSize -> 236178357, numRemovedFiles -> 4, restoredFilesSize -> 236570589, numOfFilesAfterRestore -> 4, tableSizeAfterRestore -> 236570589)",,Databricks-Runtime/12.2.x-scala2.12
3,2023-09-13T20:59:21.000+0000,2657276046068378,vitojon@outlook.com,DELETE,"Map(predicate -> [""(birthDate#27008 >= 2000-01-01 00:00:00)""])",,List(639040945428698),0913-194424-ztitdjd5,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 4, numRemovedBytes -> 236570588, numCopiedRows -> 9983242, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 15361, numDeletedRows -> 16758, scanTimeMs -> 1049, numAddedFiles -> 4, numAddedBytes -> 236178357, rewriteTimeMs -> 14312)",,Databricks-Runtime/12.2.x-scala2.12
2,2023-09-13T20:59:00.000+0000,2657276046068378,vitojon@outlook.com,UPDATE,"Map(predicate -> [""(gender#26154 = F)""])",,List(639040945428698),0913-194424-ztitdjd5,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 4, numRemovedBytes -> 236570589, numCopiedRows -> 4812698, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 15963, scanTimeMs -> 1329, numAddedFiles -> 4, numUpdatedRows -> 5187302, numAddedBytes -> 236570588, rewriteTimeMs -> 14634)",,Databricks-Runtime/12.2.x-scala2.12
1,2023-09-13T20:58:42.000+0000,2657276046068378,vitojon@outlook.com,UPDATE,"Map(predicate -> [""(gender#25338 = M)""])",,List(639040945428698),0913-194424-ztitdjd5,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 3, numRemovedBytes -> 177167561, numCopiedRows -> 2670016, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 13510, scanTimeMs -> 1202, numAddedFiles -> 3, numUpdatedRows -> 4812698, numAddedBytes -> 177167561, rewriteTimeMs -> 12307)",,Databricks-Runtime/12.2.x-scala2.12
0,2023-09-13T20:58:11.000+0000,2657276046068378,vitojon@outlook.com,CREATE TABLE AS SELECT,"Map(isManaged -> true, description -> null, partitionBy -> [], properties -> {})",,List(639040945428698),0913-194424-ztitdjd5,,WriteSerializable,True,"Map(numFiles -> 4, numOutputRows -> 10000000, numOutputBytes -> 236570589)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
df_people.restoreToVersion(0)

table_size_after_restore,num_of_files_after_restore,num_removed_files,num_restored_files,removed_files_size,restored_files_size
236570589,4,4,4,236178358,236570589


In [0]:
display(df_people.count())

count(1)
10000000


#### To enable a duration out of the minimum allowed (168h / 1week) you can set:
`spark.databricks.delta.retentionDurationCheck.enabled false`
as parameter settings on your cluster.

In [0]:
df_people.vacuum(0)

path
dbfs:/user/hive/warehouse/people_10millions


In [0]:
df_people.clone("dbfs://hive_to_uc_sync/people_deep_clone", isShallow = False, replace)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
236570589,4,0,4,0,236570589


In [0]:
df_titanic = spark.read.table("default.titanic_clean")
df_titanic.clone("dbfs://hive_to_uc_sync/titanic_clean", isShallow = False, replace)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
26853,1,0,1,0,26853


In [0]:
df_titanic = spark.read.table("default.remuneracao_day")
df_remuneracao_day.clone("dbfs://hive_to_uc_sync/remuneracao_day", isShallow = False, replace)

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
15030736,4,0,4,0,15030736


## Dealing with Dynamic Views

In [0]:
table_path = "dbfs://hive_to_uc_sync/remuneracao_day"
df_remuneracao_day = spark.read.load(table_path)
display(df_remuneracao_day.limit(5))

nome,cpf,orgao,cargo,funcao,situacao,mes,ano,codigo_do_orgao,matricula,remuneracao_basica,beneficios,valor_das_funcoes,comissao_conselheiro,hora_extra,verbas_eventuais,verbas_judiciais,descontos_a_maior,licenca_premio,irrf,seguridade_social,teto_redutor,outros_recebimentos,outros_descontos_obrigatorios,pagamento_a_maior,bruto,liquido
EDMUNDO BARBOSA DE FREITAS,***898181**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,CAIXA BANC�RIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1569,611126,189530,128459,",00",",00",794115,",00",",00",",00",850987,419359,",00",",00",",00",",00",1723230,452884
CLEUSA MARIA DE CAMPOS MORAIS,***939691**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1877,1176312,189530,",00",",00",",00",",00",",00",",00",",00",174228,257998,",00",",00",",00",",00",1365842,933616
DIVINO CARLOS SEROZINO,***730581**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,CAIXA BANC�RIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,2485,1477048,189530,385377,",00",",00",",00",",00",",00",",00",343661,326264,",00",",00",",00",",00",2051955,1382030
ADEMILSON CARNEIRO,***749601**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,3264,1795543,199520,",00",",00",",00",",00",",00",",00",",00",330407,342973,",00",",00",",00",",00",1995063,1321683
ANA MARIA NOGUEIRA,***040221**,BRB-BANCO DE BRASILIA S.A.,ESCRITUR�RIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,4674,920404,189530,",00",",00",",00",",00",",00",",00",",00",122866,187180,",00",",00",",00",",00",1109934,799888


In [0]:
df_remuneracao_day.createOrReplaceTemporaryView("remuneracao_day")
display(spark.sql(
"SELECT" + \
"      mask(cpf,"O","i","*") AS cpf_masked, c" + \
"      cargo, " + \
"      ano, " + \
"      bruto" + \
"FROM" + \
"  remuneracao_day" + \
"LIMIT 10"
))

cpf_masked,cargo,ano,bruto
***********,ESCRITUR�RIO,2022,1723230
***********,ESCRITUR�RIO,2022,1365842
***********,ESCRITUR�RIO,2022,2051955
***********,ESCRITUR�RIO,2022,1995063
***********,ESCRITUR�RIO,2022,1109934
***********,ESCRITUR�RIO,2022,785295
***********,ESCRITUR�RIO,2022,1372087
***********,ESCRITUR�RIO,2022,2494278
***********,ESCRITUR�RIO,2022,1105650
***********,ESCRITUR�RIO,2022,1558854


## Symmetric Encryption

Advanced Encryption Standard (AES) is a specification used as syncronous symmetric cryptographic keys assuming 128 bytes, 192 bytes or 256 bytes of size:
| AES Byte Size | Length Key |
|--------|--------|
| AES-128 | 16 |
| AES-192 | 24 |
| AES-256 | 32 |

In [0]:
display(spark.sql("SELECT aes_encrypt('Day 2 - Fighter - Databricks SQL: From Zero to Hero', 'databricks012345databricks012345')"))

"aes_encrypt(Day 2 - Fighter - Databricks SQL: From Zero to Hero, databricks012345databricks012345, GCM, DEFAULT)"
KAwERKYuitmalltqEsPLuC7YcLH3kNNMU45J4gjULPr3ijL7Wt9K3X5vpR7uPDtjoo0/dUISR2+qY5HvSom/NHgmtuBQHQds2PKYyiYirw==


In [0]:
display(spark.sql("SELECT CAST(aes_decrypt(unbase64('KAwERKYuitmalltqEsPLuC7YcLH3kNNMU45J4gjULPr3ijL7Wt9K3X5vpR7uPDtjoo0/dUISR2+qY5HvSom/NHgmtuBQHQds2PKYyiYirw=='), 'databricks012345databricks012345') AS STRING)"))

"CAST(aes_decrypt(unbase64(KAwERKYuitmalltqEsPLuC7YcLH3kNNMU45J4gjULPr3ijL7Wt9K3X5vpR7uPDtjoo0/dUISR2+qY5HvSom/NHgmtuBQHQds2PKYyiYirw==), databricks012345databricks012345, GCM, DEFAULT) AS STRING)"
Day 2 - Fighter - Databricks SQL: From Zero to Hero


In [0]:
display(spark.sql(
"SELECT" + \
"      aes_encrypt(cpf , 'databricks012345databricks012345') AS encrypted_cpf, " + \
"      cargo, " + \
"      ano, " + \
"      bruto" + \
"FROM" + \
"  remuneracao_day" + \
"LIMIT 10"
))

encrypted_cpf,cargo,ano,bruto
nCO8Y7WMyR9L6zNeRR2Nlmft0nivYuAgBxiYZlAxKW5ZfOayCyvp,ESCRITUR�RIO,2022,1723230
KCL+RqSqVsezeznboSBOpVD7jl+BPSGykpf4q2CD/hLY//a2I1C1,ESCRITUR�RIO,2022,1365842
dy10S7atqskmD6Y2Vnnb1/B7OZW436Lt0glptRKD1eXvUAJMrJ/M,ESCRITUR�RIO,2022,2051955
UBNUsYk4Kcu1BMhg1mdR0NKaQkaU2zmb+HYJdXno3eGvm73XaRlO,ESCRITUR�RIO,2022,1995063
pqz1IABxRSyfpq5h4W3ySbZTMw83vwNtjDRBQBBr4PdwDlmy/xRu,ESCRITUR�RIO,2022,1109934
0bZrFusVKUx8iA+IfS6COHc/1QGAeO0KI/sqfdg4qtlNSavUj+pA,ESCRITUR�RIO,2022,785295
ZTAaKnOdW05VSy2CrvMo11P4h6GSkCuHa1HQX7Cgvcr8C0Vh9FvW,ESCRITUR�RIO,2022,1372087
+C7xRiZ7cGuUf2rW5erfA+9cfalxL9gRDWvOPCNZcsMFRmjo5uhB,ESCRITUR�RIO,2022,2494278
AK5/SFFBGQ4qM+5SohZtqgTXWLpUwLhlCsERMD+wpi9+/ZaI3QeZ,ESCRITUR�RIO,2022,1105650
2+fhu164I4UDeNIAyDzilC5vzd0tjuRUdS7l5xPoY13D7oFIR8DH,ESCRITUR�RIO,2022,1558854


In [0]:
spark.sql(
"CREATE VIEW remuneracao_column_security AS " + \
"SELECT" + \
"    CASE" + \
"      WHEN is_account_group_member('ML_Team') THEN 'ACESSO NEGADO'" + \
"      ELSE cpf" + \
"    END AS cpf," + \
"    cargo," + \
"    ano," + \
"    bruto" + \
"FROM" + \
"  remuneracao_day"
)

In [0]:
display(spark.sql("SELECT * FROM remuneracao_column_security LIMIT 5"))

cpf,cargo,ano,bruto
ACESSO NEGADO,ESCRITUR�RIO,2022,1723230
ACESSO NEGADO,ESCRITUR�RIO,2022,1365842
ACESSO NEGADO,ESCRITUR�RIO,2022,2051955
ACESSO NEGADO,ESCRITUR�RIO,2022,1995063
ACESSO NEGADO,ESCRITUR�RIO,2022,1109934


In [0]:
display(spark.sql(
"SELECT + \
"    cargo," + \
"    COUNT(cpf) AS TOTAL" + \
"FROM" + \
"    remuneracao_day" + \
"GROUP BY CARGO" + \
"ORDER BY 2 DESC"
))

cargo,TOTAL
PROFESSOR DE EDUC. BASICA,46505
TECNICO ENFERMAGEM,14653
CONTRATO TEMPORARIO,13594
,11334
,9238
PRIMEIRO SARGENTO,7976
ANALISTA GEST ASS PUB SAUDE,6560
ANALISTA POL PUBL E GEST GOV,6332
ANALISTA TECNICO-ASSIST.PPGG,6032
SEGUNDO SARGENTO,5703


In [0]:
spark.sql(
"CREATE VIEW remuneracao_row_column_security AS " + \
"SELECT" + \
"    CASE" + \
"      WHEN is_account_group_member("ML_Team") THEN "ACCESSO NEGADO"" + \
"      ELSE cpf" + \
"    END AS cpf," + \
"    cargo," + \
"    ano," + \
"    bruto" + \
"FROM" + \
"    remuneracao_day" + \
"WHERE" + \
"    CASE" + \
"      WHEN is_account_group_member('ML_Team') THEN CARGO NOT LIKE '%TECNICO ENFERMAGEM%'" + \
"      ELSE TRUE" + \
"    END"
)

In [0]:
display(spark.sql(
"SELECT" + \
"    cargo," + \
"    COUNT(cpf) AS TOTAL" + \
"FROM" + \
"    hive_to_uc_sync.sync_data.remuneracao_row_column_security" + \
"GROUP BY cargo" + \
"ORDER BY 2 DESC"
))

cargo,TOTAL
PROFESSOR DE EDUC. BASICA,46505
CONTRATO TEMPORARIO,13594
,11334
PRIMEIRO SARGENTO,7976
ANALISTA GEST ASS PUB SAUDE,6560
ANALISTA POL PUBL E GEST GOV,6332
ANALISTA TECNICO-ASSIST.PPGG,6032
SEGUNDO SARGENTO,5703
AGENTE G.E.- CONS E LIMPEZA,4483
ENFERMEIRO,4406


## VOLUMES

In [0]:
spark.read.option("header", True).format("csv").load('/Volumes/titanic/clean/volume/titanic.csv').show()

+------+--------+--------------------+------+------+-----+-----+--------+--------+-------+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|  ticket|    fare|  cabin|embarked|boat|body|           home.dest|
+------+--------+--------------------+------+------+-----+-----+--------+--------+-------+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|    29|    0|    0|   24160|211.3375|     B5|       S|   2|null|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|  113781|151.5500|C22 C26|       S|  11|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|     2|    1|    2|  113781|151.5500|C22 C26|       S|null|null|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|    30|    1|    2|  113781|151.5500|C22 C26|       S|null| 135|Montreal, PQ / Ch...|
|     1|       0|Allison, Mrs. Hud...|female|    25|    1|    2|  113781|151.5500|C

In [0]:
#df_clone_covid_shallow = df_covid_clone.clone('dbfs://hive_to_uc_sync/deep_clone', isShallow = True)
df_remuneracao_volume = spark.read.load('dbfs://hive_to_uc_sync/remuneracao_deep_clone')
df_remuneracao_volume.write.saveAsTable("hive_to_uc_sync.sync_data.remuneracao_volume")

source_table_size,source_num_of_files,num_removed_files,num_copied_files,removed_files_size,copied_files_size
15030736,4,0,4,0,15030736


In [0]:
#df_remuneracao_volume.write.mode("overwrite").option("truncate", "true").save()
spark.sql("TRUNCATE TABLE hive_to_uc_sync.sync_data.remuneracao_volume")

In [0]:
spark.sql(
"COPY INTO remuneracao_volume" + \
"FROM '/Volumes/hive_to_uc_sync/sync_data/volume'" + \
"FILEFORMAT = CSV" + \
"FORMAT_OPTIONS ('mergeSchema' = 'true'," + \
"                  'inferSchema' = 'true'," + \
"                  'delimiter' = ';'," + \
"                  'encoding' = 'ISO-8859-1'," + \
"                  'header' = 'true')" + \
"COPY_OPTIONS ('mergeSchema' = 'true')"
)

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
248355,248355,0


In [0]:
display(spark.sql("SELECT COUNT(*) FROM remuneracao_volume"))

count(1)
495922


In [0]:
display(spark.sql("SELECT * FROM remuneracao_volume LIMIT 5"))

nome,cpf,orgao,cargo,funcao,situacao,mes,ano,codigo_do_orgao,matricula,remuneracao_basica,beneficios,valor_das_funcoes,comissao_conselheiro,hora_extra,verbas_eventuais,verbas_judiciais,descontos_a_maior,licenca_premio,irrf,seguridade_social,teto_redutor,outros_recebimentos,outros_descontos_obrigatorios,pagamento_a_maior,bruto,liquido
EDMUNDO BARBOSA DE FREITAS,***898181**,BRB-BANCO DE BRASILIA S.A.,ESCRITURÁRIO,CAIXA BANCÁRIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1569,611126,189530,128459,",00",",00",794115,",00",",00",",00",850987,419359,",00",",00",",00",",00",1723230,452884
CLEUSA MARIA DE CAMPOS MORAIS,***939691**,BRB-BANCO DE BRASILIA S.A.,ESCRITURÁRIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,1877,1176312,189530,",00",",00",",00",",00",",00",",00",",00",174228,257998,",00",",00",",00",",00",1365842,933616
DIVINO CARLOS SEROZINO,***730581**,BRB-BANCO DE BRASILIA S.A.,ESCRITURÁRIO,CAIXA BANCÁRIO,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,2485,1477048,189530,385377,",00",",00",",00",",00",",00",",00",343661,326264,",00",",00",",00",",00",2051955,1382030
ADEMILSON CARNEIRO,***749601**,BRB-BANCO DE BRASILIA S.A.,ESCRITURÁRIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,3264,1795543,199520,",00",",00",",00",",00",",00",",00",",00",330407,342973,",00",",00",",00",",00",1995063,1321683
ANA MARIA NOGUEIRA,***040221**,BRB-BANCO DE BRASILIA S.A.,ESCRITURÁRIO,,AFASTADO/ABONO ASSIDUIDADE,6,2022,300001,4674,920404,189530,",00",",00",",00",",00",",00",",00",",00",122866,187180,",00",",00",",00",",00",1109934,799888
