# EDA Covid

In [2]:
# !curl -o /tmp/postgresql-42.1.4.jar https://jdbc.postgresql.org/download/postgresql-42.1.4.jar

In [3]:
# !pyspark --driver-class-path /tmp/postgresql-42.1.4.jar --jars /tmp/postgresql-42.1.4.jar

In [4]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrameReader, SQLContext, SparkSession
from pyspark.sql.functions import expr

In [15]:
# Create (or reuse) the "DataSUS" session, registering the PostgreSQL JDBC
# driver jar through the spark.jars setting.
spark = (
    SparkSession.builder
    .appName("DataSUS")
    .config("spark.jars", "/opt/spark/jars/postgresql-42.2.22.jar")
    .getOrCreate()
)

In [16]:
# Build a DataFrame listing the tables known to the session. No .show() is
# called, so the cell only displays the DataFrame's column signature.
spark.sql("SHOW TABLES")

DataFrame[database: string, tableName: string, isTemporary: boolean]

## Mode 1: SparkSession with the generic JDBC data source

In [4]:
# Read the `sia_pa_rs` table from the local PostgreSQL `datasus` database over JDBC.
# The user and password are taken from the environment when set, so real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original local-development values.
jdbcDF = (
    spark.read.format("jdbc")
    .options(
        url='jdbc:postgresql://localhost:5432/datasus',  # jdbc:postgresql://<host>:<port>/<database>
        dbtable='sia_pa_rs',
        user=os.getenv('DATASUS_DB_USER', 'postgres'),
        password=os.getenv('DATASUS_DB_PASSWORD', 'postgres'),
        driver='org.postgresql.Driver',
        # Left disabled: per the Spark JDBC docs, partitionColumn must be given
        # together with lowerBound, upperBound and numPartitions.
        # partitionColumn='index',
    )
    .load()
)

In [5]:
# Print the schema of jdbcDF as a tree (column name, type, nullability).
jdbcDF.printSchema()

root
 |-- index: long (nullable = true)
 |-- pa_gestao: long (nullable = true)
 |-- pa_condic: string (nullable = true)
 |-- pa_ufmun: long (nullable = true)
 |-- pa_regct: long (nullable = true)
 |-- pa_incout: long (nullable = true)
 |-- pa_incurg: long (nullable = true)
 |-- pa_tpups: long (nullable = true)
 |-- pa_tippre: long (nullable = true)
 |-- pa_mn_ind: string (nullable = true)
 |-- pa_cnpjcpf: long (nullable = true)
 |-- pa_cnpjmnt: long (nullable = true)
 |-- pa_cnpj_cc: long (nullable = true)
 |-- pa_mvm: long (nullable = true)
 |-- pa_cmp: long (nullable = true)
 |-- pa_proc_id: long (nullable = true)
 |-- pa_tpfin: long (nullable = true)
 |-- pa_subfin: long (nullable = true)
 |-- pa_nivcpl: long (nullable = true)
 |-- pa_docorig: string (nullable = true)
 |-- pa_autoriz: string (nullable = true)
 |-- pa_cnsmed: long (nullable = true)
 |-- pa_cbocod: string (nullable = true)
 |-- pa_motsai: long (nullable = true)
 |-- pa_obito: long (nullable = true)
 |-- pa_encerr: lon

In [7]:
# spark.sql('select count(1) from datasus.sia_pa_rs').show()

In [None]:
# jdbcDF.count()

In [None]:
# Preview only: the table holds about 40.7M rows (see the df.count() result
# further down), so converting all of it with toPandas() would pull every row
# into driver memory. Bound the conversion to a small sample instead.
jdbcDF.limit(20).toPandas()

In [None]:
# Collect only a handful of Row objects: an unbounded collect() on this table
# (about 40.7M rows per the df.count() result further down) would materialise
# everything on the driver.
jdbcDF.limit(20).collect()

In [None]:
# jdbcDF.show(1)
# jdbcDF.take(1)


In [6]:
# Register jdbcDF under the name "tableA" so the spark.sql(...) queries in the
# following cells can refer to it.
jdbcDF.createOrReplaceTempView("tableA")

In [7]:
# Smallest PA_IDADE among the rows of tableA whose PA_CIDPRI is 'H57'.
# NOTE(review): PA_IDADE is presumably the patient's age; confirm against the
# data dictionary. The result name `teenagerDF` is kept as-is.
min_idade_sql = "SELECT min(PA_IDADE) FROM tableA WHERE PA_CIDPRI = 'H57'"
teenagerDF = spark.sql(min_idade_sql)
teenagerDF.show()

+-------------+
|min(PA_IDADE)|
+-------------+
|            0|
+-------------+



In [8]:
# Largest PA_IDADE among the rows of tableA whose PA_CIDPRI is 'H57'.
# NOTE(review): PA_IDADE is presumably the patient's age; confirm against the
# data dictionary. The result name `teenagerDF` is kept as-is.
max_idade_sql = "SELECT max(PA_IDADE) FROM tableA WHERE PA_CIDPRI = 'H57'"
teenagerDF = spark.sql(max_idade_sql)
teenagerDF.show()

+-------------+
|max(PA_IDADE)|
+-------------+
|           76|
+-------------+



In [None]:
# spark.sql("SELECT count(*) from tableA").show()

## Mode 2: explicit SparkConf, SparkContext and SQLContext, reading via `.jdbc()`

In [5]:
# Location of the PostgreSQL JDBC driver jar; SPARK_CLASSPATH overrides the default.
sparkClassPath = os.getenv('SPARK_CLASSPATH', '/opt/spark/jars/postgresql-42.2.22.jar')

# Build the configuration in one pass: the jar is passed via spark.jars and
# added to both the driver and executor classpaths, and driver and executor
# are each given 1 core and 4G of memory.
conf = SparkConf().setAppName('DataSUS')
conf.setAll([
    ('spark.jars', 'file:%s' % sparkClassPath),
    ('spark.executor.extraClassPath', sparkClassPath),
    ('spark.driver.extraClassPath', sparkClassPath),
    ('spark.driver.cores', '1'),
    ('spark.executor.cores', '1'),
    ('spark.driver.memory', '4G'),
    ('spark.executor.memory', '4G'),
])

<pyspark.conf.SparkConf at 0x7f7cd4985208>

In [6]:
# spark = SparkSession.builder.getOrCreate()

In [7]:
# Reuse the SparkContext that is already active in this kernel (for example the
# one backing the SparkSession created in Mode 1) instead of constructing a
# second one: `SparkContext(conf=conf)` raises ValueError when a context is
# already running, which breaks a top-to-bottom run of the notebook.
# NOTE: when an existing context is reused, the settings in `conf` are not
# applied to it; they only take effect if this call creates the context.
sc = SparkContext.getOrCreate(conf=conf)

sqlContext = SQLContext(sc)

In [8]:
# Connection target (the 'jdbc:' scheme prefix is added where the URL is used)
# and login properties. The user and password are taken from the environment
# when set, so real credentials do not have to be stored in the notebook; the
# fallbacks are the original local-development values.
url = 'postgresql://127.0.0.1:5432/datasus'
properties = {
    'user': os.getenv('DATASUS_DB_USER', 'postgres'),
    'password': os.getenv('DATASUS_DB_PASSWORD', 'postgres'),
}

In [9]:
# Load the `sia_pa_rs` table over JDBC through the context's public `read`
# accessor rather than instantiating DataFrameReader by hand (the PySpark docs
# describe the reader as something obtained via `.read`).
df = sqlContext.read.jdbc(url='jdbc:%s' % url, table='sia_pa_rs', properties=properties)

In [10]:
# Print the schema of df as a tree (column name, type, nullability).
df.printSchema()

root
 |-- index: long (nullable = true)
 |-- pa_gestao: long (nullable = true)
 |-- pa_condic: string (nullable = true)
 |-- pa_ufmun: long (nullable = true)
 |-- pa_regct: long (nullable = true)
 |-- pa_incout: long (nullable = true)
 |-- pa_incurg: long (nullable = true)
 |-- pa_tpups: long (nullable = true)
 |-- pa_tippre: long (nullable = true)
 |-- pa_mn_ind: string (nullable = true)
 |-- pa_cnpjcpf: long (nullable = true)
 |-- pa_cnpjmnt: long (nullable = true)
 |-- pa_cnpj_cc: long (nullable = true)
 |-- pa_mvm: long (nullable = true)
 |-- pa_cmp: long (nullable = true)
 |-- pa_proc_id: long (nullable = true)
 |-- pa_tpfin: long (nullable = true)
 |-- pa_subfin: long (nullable = true)
 |-- pa_nivcpl: long (nullable = true)
 |-- pa_docorig: string (nullable = true)
 |-- pa_autoriz: string (nullable = true)
 |-- pa_cnsmed: long (nullable = true)
 |-- pa_cbocod: string (nullable = true)
 |-- pa_motsai: long (nullable = true)
 |-- pa_obito: long (nullable = true)
 |-- pa_encerr: lon

In [11]:
# Display every column name of df as a plain Python list.
df.columns

['index',
 'pa_gestao',
 'pa_condic',
 'pa_ufmun',
 'pa_regct',
 'pa_incout',
 'pa_incurg',
 'pa_tpups',
 'pa_tippre',
 'pa_mn_ind',
 'pa_cnpjcpf',
 'pa_cnpjmnt',
 'pa_cnpj_cc',
 'pa_mvm',
 'pa_cmp',
 'pa_proc_id',
 'pa_tpfin',
 'pa_subfin',
 'pa_nivcpl',
 'pa_docorig',
 'pa_autoriz',
 'pa_cnsmed',
 'pa_cbocod',
 'pa_motsai',
 'pa_obito',
 'pa_encerr',
 'pa_perman',
 'pa_alta',
 'pa_transf',
 'pa_cidpri',
 'pa_cidsec',
 'pa_cidcas',
 'pa_catend',
 'pa_idade',
 'idademin',
 'idademax',
 'pa_flidade',
 'pa_sexo',
 'pa_racacor',
 'pa_munpcn',
 'pa_qtdpro',
 'pa_qtdapr',
 'pa_valpro',
 'pa_valapr',
 'pa_ufdif',
 'pa_mndif',
 'pa_dif_val',
 'nu_vpa_tot',
 'nu_pa_tot',
 'pa_indica',
 'pa_codoco',
 'pa_flqt',
 'pa_fler',
 'pa_etnia',
 'pa_vl_cf',
 'pa_vl_cl',
 'pa_vl_inc',
 'pa_srv_c',
 'pa_ine',
 'pa_nat_jur']

In [12]:
# Show which SparkContext and SQLContext objects are in use, one per line.
print(sc, sqlContext, sep='\n')

<SparkContext master=local[*] appName=DataSUS>
<pyspark.sql.context.SQLContext object at 0x7f7cd4974748>


In [9]:
# Fetch a single row (returned as a list of Row objects): a cheap sanity check
# compared with collecting the whole table.
df.take(1)

In [13]:
# Register df under the name "tableA". Mode 1 registers jdbcDF under the same
# name, so this replaces that view if it exists.
df.createOrReplaceTempView("tableA")

In [17]:
# Smallest PA_IDADE among the rows of tableA whose PA_CIDPRI is 'H57'.
# NOTE(review): this cell uses `spark`, which is created in the Mode 1 section,
# not the `sqlContext` built in Mode 2.
min_idade_sql = "SELECT min(PA_IDADE) FROM tableA WHERE PA_CIDPRI = 'H57'"
teenagerDF = spark.sql(min_idade_sql)
teenagerDF.show()

+-------------+
|min(PA_IDADE)|
+-------------+
|            0|
+-------------+



In [9]:
# df.show(1, vertical=True)

In [12]:
# Summary statistics (count, mean, stddev, min, max) for the two CID columns.
# NOTE(review): values such as 'Z999' indicate these are code strings, so the
# mean/stddev rows are presumably not meaningful.
cid_codes = df.select("PA_CIDPRI", "PA_CIDSEC")
cid_codes.describe().show()

+-------+-------------------+---------+
|summary|          PA_CIDPRI|PA_CIDSEC|
+-------+-------------------+---------+
|  count|           40722508| 40722508|
|   mean|0.04845269043950959|      0.0|
| stddev|  7.336797206585033|      0.0|
|    min|               0000|        0|
|    max|               Z999|     Z940|
+-------+-------------------+---------+



In [14]:
# Show the first 20 rows of the two CID columns (the output reports "top 20 rows").
df.select("PA_CIDPRI", "PA_CIDSEC").show()

+---------+---------+
|PA_CIDPRI|PA_CIDSEC|
+---------+---------+
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
|     0000|     0000|
+---------+---------+
only showing top 20 rows



In [7]:
# Number of rows per PA_CIDPRI value. No ordering is requested, so the 20
# groups displayed are not the most or least frequent ones.
cidpri_counts = df.groupby('PA_CIDPRI').count()
cidpri_counts.show()

+---------+-----+
|PA_CIDPRI|count|
+---------+-----+
|     F900|12074|
|      Z21|17926|
|     S903| 1098|
|     M238| 5179|
|     F985| 2811|
|     N921|  736|
|     H490|  345|
|     J689|   41|
|     N179|  735|
|     C718|  837|
|     L010|  444|
|     S501|  501|
|     S666|  716|
|     Z002|   23|
|     V284|  197|
|     G930|   32|
|     C491|  291|
|     S599|   75|
|     Q373|    2|
|     I676|   11|
+---------+-----+
only showing top 20 rows



In [8]:
# Number of rows per PA_CIDSEC value. No ordering is requested, so the 20
# groups displayed are not the most or least frequent ones.
cidsec_counts = df.groupby('PA_CIDSEC').count()
cidsec_counts.show()

+---------+-----+
|PA_CIDSEC|count|
+---------+-----+
|     H904|  663|
|     C718|    3|
|     C491|    2|
|      Z21|    2|
|     H906| 1012|
|     C676|    2|
|     G710|   26|
|      E83|    2|
|     N110|    4|
|     M179|    1|
|      C22|   16|
|     E142|   12|
|     I071|    1|
|     C186|   15|
|      C77|  211|
|     C159|   26|
|     N039|   27|
|     C248|    2|
|     C509|  673|
|     N189|  357|
+---------+-----+
only showing top 20 rows



In [10]:
# Total number of rows in df.
df.count()

40722508

In [13]:
# Per-row boolean column: is PA_CIDPRI equal to "H57"? This selects the flag
# for every row rather than filtering, so the first 20 rows shown can all be false.
# `expr` is imported in the notebook's top import cell.
df.select(expr('PA_CIDPRI') == "H57").show()

+-----------------+
|(PA_CIDPRI = H57)|
+-----------------+
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
|            false|
+-----------------+
only showing top 20 rows



In [8]:
# Keep only the rows whose PA_CIDPRI is "H57" and display the first 20 of them.
is_h57 = df["PA_CIDPRI"] == "H57"
df.filter(is_h57).show()

+-------+---------+---------+--------+--------+---------+---------+--------+---------+---------+--------------+--------------+----------+------+------+----------+--------+---------+---------+----------+-------------+---------------+---------+---------+--------+---------+---------+-------+---------+---------+---------+---------+---------+--------+--------+--------+----------+-------+----------+---------+---------+---------+---------+---------+--------+--------+----------+----------+---------+---------+---------+-------+-------+--------+--------+--------+---------+--------+------+----------+
|  index|pa_gestao|pa_condic|pa_ufmun|pa_regct|pa_incout|pa_incurg|pa_tpups|pa_tippre|pa_mn_ind|    pa_cnpjcpf|    pa_cnpjmnt|pa_cnpj_cc|pa_mvm|pa_cmp|pa_proc_id|pa_tpfin|pa_subfin|pa_nivcpl|pa_docorig|   pa_autoriz|      pa_cnsmed|pa_cbocod|pa_motsai|pa_obito|pa_encerr|pa_perman|pa_alta|pa_transf|pa_cidpri|pa_cidsec|pa_cidcas|pa_catend|pa_idade|idademin|idademax|pa_flidade|pa_sexo|pa_racacor|pa_munp

In [18]:
# df.select("*").toPandas()

# End

In [19]:
# Shut down the Spark session; run this last, since cells that use `spark`
# need it to still be running.
spark.stop()