### Início

#### Importação das Libs e criação da sessão spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as f

import pandas as pd
import requests
import zipfile
import os

import findspark
findspark.init()

spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)


In [2]:
import pyspark.sql.functions as f

# Local Datalake Paths
local_zones = {
    'landing': './data/landing_zone',
    'raw': './data/raw_zone',
    'staging': './data/staging_zone',
    'consumer': './data/consumer_zone',
}

#### Etapa 1 - Download do Dataset de Pensionistas

In [3]:
# Link para download do arquivo de pensionistas
zipUrl = 'http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_082021.zip'

### Landing Zone
# Path do diretorio para gravação do arquivo
outPath = f'{local_zones["landing"]}/pensionistas'
os.makedirs(outPath, exist_ok=True)
    
# baixar arquivo direto da fonte
req = requests.get(zipUrl)
zipOut = f'{outPath}/PENSIONISTAS.zip'
print('Saving:', zipOut)
with open(zipOut, 'wb') as f:
    f.write(req.content)
    f.close()


### Raw Zone
outPath = f'{local_zones["raw"]}/pensionistas'
os.makedirs(outPath, exist_ok=True)

# Unzip - Extract all
with zipfile.ZipFile(zipOut, 'r') as zipObj:
   # Extract all
   listOfFileNames = zipObj.namelist()
   # varre a lista de arquivos - file names
   for fileName in listOfFileNames:
       # Apenas .csv
       if fileName.endswith('.csv'):
           print('Unzip:', fileName, ' -> ', outPath)
           zipObj.extract(fileName, path=outPath)

Saving: ./data/landing_zone/pensionistas/PENSIONISTAS.zip
Unzip: PENSIONISTAS_082021.csv  ->  ./data/raw_zone/pensionistas


#### Etapa 02 - Leitura do dataframe da Raw e formatação para Staging

In [101]:
import pyspark.sql.functions as f

csvFile = f'{local_zones["raw"]}/pensionistas/{fileName}'

# Carregando o Dataframe
df_origem = (
    spark.read
    .format('csv')
    .option('sep',';')
    .option('header',True)
    .load(csvFile)
)

# Selecionando as Colunas representativas
cols = ['NOME DO BENEFICIARIO', 
        'NOME DO ORGAO',
        'DATA DE NASCIMENTO',
        'UF DA UPAG DE VINCULACAO',
        'NATUREZA PENSAO',
        'DATA INICIO DO BENEFICIO',
        'DATA FIM DO BENEFICIO',
        'RENDIMENTO LIQUIDO',
        'PAGAMENTO SUSPENSO']

# Renomeando das colunas
df_pensionistas = (
    df_origem
    .select(cols)
    .withColumnRenamed('NOME DO BENEFICIARIO','nome')
    .withColumnRenamed('NOME DO ORGAO','orgao')
    .withColumnRenamed('DATA DE NASCIMENTO','dtnasc')
    .withColumnRenamed('UF DA UPAG DE VINCULACAO','uf')
    .withColumnRenamed('NATUREZA PENSAO','natpensao')
    .withColumnRenamed('DATA INICIO DO BENEFICIO','dtiniben')
    .withColumnRenamed('DATA FIM DO BENEFICIO','dtfimben')
    .withColumnRenamed('RENDIMENTO LIQUIDO','rendLiquido')
    .withColumnRenamed('PAGAMENTO SUSPENSO','pagsuspenso')
)


# Trantando e convertendo as colunas
df_pensionistas = (
    df_pensionistas
    .select('nome', 'orgao', 'dtnasc', 'uf', 'natpensao', 'dtiniben', 'dtfimben', 'rendLiquido', 'pagsuspenso')
    .withColumn('dtnasc', f.to_date( f.col('dtnasc'),'ddMMyyyy') )
    .withColumn('dtiniben', f.to_date( f.col('dtiniben'),'ddMMyyyy') )
    .withColumn('dtfimben', f.to_date( f.col('dtfimben'),'ddMMyyyy') )
    .withColumn('rendLiquido', f.regexp_replace(f.regexp_replace(f.col('rendLiquido'), "\\.", ""), "\\,", "."))
    .withColumn('rendLiquido', f.col('rendLiquido').cast('float') )
    .withColumn('pagsuspenso', f.when(f.col('pagsuspenso') == 'NAO', False)
                            .when(f.col('pagsuspenso') == 'SIM', True)
                            .otherwise(None) )
    .withColumn('limitMax35', f.expr('round(rendLiquido*0.35,2)').cast('float') )
    .withColumn('limitMax40', f.expr('round(rendLiquido*0.40,2)').cast('float') )
    .withColumn('faixaRenda', f.when(f.col('rendLiquido').between(0,3000),'Entre 0 e 3000')
                             .when(f.col('rendLiquido').between(3001,7000),'Entre 3001 e 7000')
                             .when(f.col('rendLiquido').between(7001,15000),'Entre 7001 e 15000')
                             .when(f.col('rendLiquido').between(15001,30000),'Entre 15001 e 30000')
                             .otherwise('Acima de 30.000') )
    .withColumn('idade' , f.floor( f.datediff( f.current_date(), f.col('dtnasc') ) / 365))
    .withColumn('faixaIdade', f.when(f.col('idade').between(0,15),'Entre 0 e 15 anos')
                             .when(f.col('idade').between(16,30),'Entre 16 e 30 anos')
                             .when(f.col('idade').between(31,60),'Entre 31 e 60 anos')
                             .otherwise('Acima de 60 anos') )
)

del df_origem

In [102]:
df_pensionistas.limit(5).toPandas()

Unnamed: 0,nome,orgao,dtnasc,uf,natpensao,dtiniben,dtfimben,rendLiquido,pagsuspenso,limitMax35,limitMax40,faixaRenda,idade,faixaIdade
0,MARIA DA CRUZ DOS SANTOS,"MINIST.DA AGRICULTURA,PECUARIA E ABAST.",1955-01-28,DF,VITALICIA,2021-01-29,,3039.580078,False,1063.849976,1215.829956,Entre 3001 e 7000,66,Acima de 60 anos
1,CARMEN LUCINDA FARKAS DE ARAUJO,"MINIST.DA AGRICULTURA,PECUARIA E ABAST.",1951-04-18,DF,VITALICIA,1998-02-10,,1216.25,False,425.690002,486.5,Entre 0 e 3000,70,Acima de 60 anos
2,MARIA DE LOURDES DA SILVA,"MINIST.DA AGRICULTURA,PECUARIA E ABAST.",1948-01-14,DF,TEMPORARIA,2016-01-25,,2697.459961,False,944.109985,1078.97998,Entre 0 e 3000,73,Acima de 60 anos
3,DIOMARINA ALVES DOS SANTOS,"MINIST.DA AGRICULTURA,PECUARIA E ABAST.",1944-06-18,DF,VITALICIA,2010-12-19,,6953.77002,False,2433.820068,2781.51001,Entre 3001 e 7000,77,Acima de 60 anos
4,MARIZETE DIAS SOUZA,"MINIST.DA AGRICULTURA,PECUARIA E ABAST.",1954-03-18,DF,VITALICIA,2012-10-13,,14994.889648,False,5248.209961,5997.959961,Entre 7001 e 15000,67,Acima de 60 anos


In [6]:
(
    df_pensionistas
    .groupBy('pagsuspenso')
    .agg(f.count('nome'))
    .toPandas()
)

Unnamed: 0,pagsuspenso,count(nome)
0,True,2791
1,False,289255


In [7]:
(
    df_pensionistas
    .groupBy('uf','natpensao','pagsuspenso')
    .agg(f.count('nome'))
    .toPandas()
)

Unnamed: 0,uf,natpensao,pagsuspenso,count(nome)
0,PA,TEMPORARIA,True,30
1,PE,TEMPORARIA,True,31
2,SE,TEMPORARIA,False,564
3,PR,TEMPORARIA,True,3
4,MA,VITALICIA,True,20
...,...,...,...,...
103,AL,VITALICIA,False,1663
104,ES,TEMPORARIA,True,2
105,AC,TEMPORARIA,True,1
106,PA,TEMPORARIA,False,2255


In [106]:
(
    df_pensionistas
    .groupBy('faixaRenda')
    .agg(f.sum(f.col('rendliquido')).cast('float').alias('valor'))
    .toPandas()
)

Unnamed: 0,faixaRenda,valor
0,Entre 7001 e 15000,281589312.0
1,Entre 15001 e 30000,160190880.0
2,Acima de 30.000,8150895.5
3,Entre 0 e 3000,232925504.0
4,Entre 3001 e 7000,550267136.0


In [107]:
### Gravação na Staging
outPath = f'{local_zones["staging"]}/pensionistas'
df_pensionistas.write.format('parquet').save(outPath, mode='overwrite')

Py4JJavaError: An error occurred while calling o2348.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:645)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1230)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1435)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:493)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:678)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1868)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1910)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:332)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:402)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:375)
	at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:48)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:182)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:220)
	... 32 more
