In [59]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [60]:
# Criação da sessão Spark
spark = SparkSession.builder \
    .appName("PySpark BigQuery Connection") \
    .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.23.2') \
    .config("spark.jars", "/usr/local/lib/spark-connectors/bigquery-connector-hadoop2-latest.jar") \
    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Dio.netty.noUnsafe=true") \
    .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Dio.netty.noUnsafe=true") \
    .getOrCreate()

In [61]:
spark.conf.set("viewsEnabled", True)
spark.conf.set("materializationDataset", "SOR")

In [62]:
sc = spark.sparkContext
sc.setLogLevel("INFO")
sc._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
sc._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.json.keyfile', '/usr/local/lib/gcp/credentials/my-project-1508437523553-e9bafe7e3368.json')

In [63]:
# Função para salvar DataFrame em formato Parquet
def save_to_bigquery(df, dataset, table_name):
    # Salva o DataFrame em formato Parquet
    df.write \
    .format("bigquery") \
    .option("table", f"{dataset.upper()}.{table_name}") \
    .option("temporaryGcsBucket", "meu-bucket-temporario-spark") \
    .option("credentialsFile", "/usr/local/lib/gcp/credentials/my-project-1508437523553-e9bafe7e3368.json") \
    .mode("overwrite") \
    .save()

In [64]:
# Função para ler dados do BigQuery
def read_from_bigquery(dataset, table_name):  
    df = spark.read \
        .format('bigquery') \
        .option('table', f"{dataset.upper()}.{table_name}") \
        .option("credentialsFile", "/usr/local/lib/gcp/credentials/my-project-1508437523553-e9bafe7e3368.json") \
        .load()

    return df

In [65]:
df_dimensao = read_from_bigquery('SOR', 'tbx002_dimensao_geral')
df_dimensao.createOrReplaceTempView("tbx002_dimensao_geral")

In [66]:
df = read_from_bigquery('SOR', 'tbx001_data')

In [67]:
# Dicionário de mapeamento de renomeação de colunas
col_rename_map = {
    "UF": "uf",
    "Ano": "ano",
    "V1013": "mes",
    "V1012": "semana",
    "A001B3": "ano_nascimento",
    "A003": "sexo",
    "A004": "cor_raca",
    "V1023": "tipo_area",
    "A005": "escolaridade",
    "B002": "foi_posto_saude",
    "B0031": "ficou_em_casa",
    "B005": "ficou_internado",
    "B009B": "resultado_covid",
    "B009D" : "resultado_covid_2",
    "B009F": "resultado_covid_3",
    "B007": "tem_plano_saude",
    "C01011": "faixa_rendimento",
    "F001": "situacao_domicilio",
    "B0011": "teve_febre",
    "B0012": "teve_tosse",
    "B0013": "teve_dor_garganta",
    "B0014": "teve_dificuldade_respirar",
    "B0015": "teve_dor_cabeca",
    "B0016": "teve_dor_peito",
    "B0017": "teve_nausea",
    "B0018": "teve_nariz_entupido_escorrendo",
    "B0019": "teve_fadiga",
    "B00110": "teve_dor_olhos",
    "B00111": "teve_perda_olfato_sabor",
    "B00112": "teve_dor_muscular",
    "B00113": "teve_diarreia"

}



# Aplicar a renomeação das colunas e selecionar apenas as colunas renomeadas
df = df.select([F.col(old_name).alias(new_name) for old_name, new_name in col_rename_map.items()])


In [68]:
df.printSchema()

root
 |-- uf: string (nullable = true)
 |-- ano: string (nullable = true)
 |-- mes: string (nullable = true)
 |-- semana: string (nullable = true)
 |-- ano_nascimento: string (nullable = true)
 |-- sexo: string (nullable = true)
 |-- cor_raca: string (nullable = true)
 |-- tipo_area: string (nullable = true)
 |-- escolaridade: string (nullable = true)
 |-- foi_posto_saude: string (nullable = true)
 |-- ficou_em_casa: string (nullable = true)
 |-- ficou_internado: string (nullable = true)
 |-- resultado_covid: string (nullable = true)
 |-- resultado_covid_2: string (nullable = true)
 |-- resultado_covid_3: string (nullable = true)
 |-- tem_plano_saude: string (nullable = true)
 |-- faixa_rendimento: string (nullable = true)
 |-- situacao_domicilio: string (nullable = true)
 |-- teve_febre: string (nullable = true)
 |-- teve_tosse: string (nullable = true)
 |-- teve_dor_garganta: string (nullable = true)
 |-- teve_dificuldade_respirar: string (nullable = true)
 |-- teve_dor_cabeca: strin

In [69]:
df.createOrReplaceTempView("tbx001_data")

In [70]:
# realizar o join entre os dataframes, dimensionando a tabela de fatos
query = """
SELECT 
T2.categoria_descricao AS uf,
T1.ano,
T1.mes,
T1.semana,
T1.ano_nascimento,
T3.categoria_descricao AS sexo,
T4.categoria_descricao AS cor_raca,
T5.categoria_descricao AS tipo_area,
T6.categoria_descricao AS escolaridade,
case
        when T1.teve_febre in (1) then 'Sim'
        when T1.teve_tosse in (1) then 'Sim'
        when T1.teve_dor_garganta in (1) then 'Sim'
        when T1.teve_dificuldade_respirar in (1) then 'Sim'
        when T1.teve_dor_cabeca in (1) then 'Sim'
        when T1.teve_dor_peito in (1) then 'Sim'
        when T1.teve_nausea in (1) then 'Sim'
        when T1.teve_nariz_entupido_escorrendo in (1) then 'Sim'
        when T1.teve_fadiga in (1) then 'Sim'
        when T1.teve_dor_olhos in (1) then 'Sim'
        when T1.teve_perda_olfato_sabor in (1) then 'Sim'
        when T1.teve_dor_muscular in (1) then 'Sim'
        when T1.teve_diarreia in (1) then 'Sim'
        when T1.teve_febre is null
                and T1.teve_tosse is null
                and T1.teve_dor_garganta is null
                and T1.teve_dificuldade_respirar is null
                and T1.teve_dor_cabeca is null
                and T1.teve_dor_peito is null
                and T1.teve_nausea is null
                and T1.teve_nariz_entupido_escorrendo is null
                and T1.teve_fadiga is null
                and T1.teve_dor_olhos is null
                and T1.teve_perda_olfato_sabor is null
                and T1.teve_dor_muscular is null
                and T1.teve_diarreia is null 
        then null
        else 'Não'
end as teve_sintomas_covid,
T12.categoria_descricao AS foi_posto_saude,
T13.categoria_descricao AS ficou_em_casa,
T14.categoria_descricao AS ficou_internado,
case
        when T1.resultado_covid in (1) then 'Sim'
        when T1.resultado_covid_2 in (1) then 'Sim'
        when T1.resultado_covid_3 in (1) then 'Sim' 
        when T1.resultado_covid is null and T1.resultado_covid_2 is null and T1.resultado_covid_3 is null then null      
        else 'Não'
end as teve_covid,
case
        when T15.categoria_descricao is not null then T15.categoria_descricao
        when T19.categoria_descricao is not null then T19.categoria_descricao
        when T20.categoria_descricao is not null then T20.categoria_descricao
end as resultado_covid,
T16.categoria_descricao AS tem_plano_saude,
T17.categoria_descricao AS faixa_rendimento,
T18.categoria_descricao AS situacao_domicilio
FROM tbx001_data T1
LEFT JOIN tbx002_dimensao_geral T2 on T2.codigo_variavel = 'UF' and T1.uf = T2.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T3 on T3.codigo_variavel = 'A003' and T1.sexo = T3.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T4 on T4.codigo_variavel = 'A004' and T1.cor_raca = T4.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T5 on T5.codigo_variavel = 'V1023' and T1.tipo_area = T5.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T6 on T6.codigo_variavel = 'A005' and T1.escolaridade = T6.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T12 on T12.codigo_variavel = 'B002' and T1.foi_posto_saude = T12.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T13 on T13.codigo_variavel = 'B0031' and T1.ficou_em_casa = T13.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T14 on T14.codigo_variavel = 'B005' and T1.ficou_internado = T14.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T15 on T15.codigo_variavel = 'B009B' and T1.resultado_covid = T15.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T16 on T16.codigo_variavel = 'B007' and T1.tem_plano_saude = T16.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T17 on T17.codigo_variavel = 'C01011' and T1.faixa_rendimento = T17.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T18 on T18.codigo_variavel = 'F001' and T1.situacao_domicilio = T18.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T19 on T19.codigo_variavel = 'B009D' and T1.resultado_covid_2 = T19.categoria_tipo
LEFT JOIN tbx002_dimensao_geral T20 on T20.codigo_variavel = 'B009F' and T1.resultado_covid_3 = T20.categoria_tipo

"""

In [71]:
# Executar a consulta SQL
df_joined = spark.sql(query)

In [72]:
print(len(df_joined.columns))

18


In [73]:
df_joined.printSchema()

root
 |-- uf: string (nullable = true)
 |-- ano: string (nullable = true)
 |-- mes: string (nullable = true)
 |-- semana: string (nullable = true)
 |-- ano_nascimento: string (nullable = true)
 |-- sexo: string (nullable = true)
 |-- cor_raca: string (nullable = true)
 |-- tipo_area: string (nullable = true)
 |-- escolaridade: string (nullable = true)
 |-- teve_sintomas_covid: string (nullable = true)
 |-- foi_posto_saude: string (nullable = true)
 |-- ficou_em_casa: string (nullable = true)
 |-- ficou_internado: string (nullable = true)
 |-- teve_covid: string (nullable = true)
 |-- resultado_covid: string (nullable = true)
 |-- tem_plano_saude: string (nullable = true)
 |-- faixa_rendimento: string (nullable = true)
 |-- situacao_domicilio: string (nullable = true)



In [45]:
df_joined.show()

24/09/28 18:48:07 INFO DirectBigQueryRelation: |Querying table my-project-1508437523553.SOR.tbx001_data, parameters sent from Spark:|requiredColumns=[UF,Ano,V1013,V1012,A001B3,A003,A004,V1023,A005,B002,B0031,B005,B009B,B009D,B009F,B007,C01011,F001,B0011,B0012,B0013,B0014,B0015,B0016,B0017,B0018,B0019,B00110,B00111,B00112,B00113],|filters=[]
24/09/28 18:48:08 INFO ReadSessionCreator: Read session:{"readSessionName":"projects/my-project-1508437523553/locations/us-east1/sessions/CAISDG5vQVJTeUE0YTF1NhoCdngaAnVo","readSessionCreationStartTime":"2024-09-28T21:48:07.001Z","readSessionCreationEndTime":"2024-09-28T21:48:08.434Z","readSessionPrepDuration":608,"readSessionCreationDuration":825,"readSessionDuration":1433}
24/09/28 18:48:08 INFO ReadSessionCreator: Requested 20000 max partitions, but only received 4 from the BigQuery Storage API for session projects/my-project-1508437523553/locations/us-east1/sessions/CAISDG5vQVJTeUE0YTF1NhoCdngaAnVo. Notice that the number of streams in actual ma

+--------+----+---+------+--------------+------+--------+---------+--------------------+-------------------+---------------+-------------+---------------+----------+---------------+---------------+----------------+--------------------+
|      uf| ano|mes|semana|ano_nascimento|  sexo|cor_raca|tipo_area|        escolaridade|teve_sintomas_covid|foi_posto_saude|ficou_em_casa|ficou_internado|teve_covid|resultado_covid|tem_plano_saude|faixa_rendimento|  situacao_domicilio|
+--------+----+---+------+--------------+------+--------+---------+--------------------+-------------------+---------------+-------------+---------------+----------+---------------+---------------+----------------+--------------------+
|Rondônia|2020| 07|     3|          1958| Homem|   Parda|  Capital|       Sem instrução|                Não|           NULL|         NULL|           NULL|      NULL|           NULL|           Não |            NULL|Cedido por familiar |
|Rondônia|2020| 07|     4|          9999| Homem|   Parda

24/09/28 18:48:25 INFO Executor: Finished task 0.0 in stage 94.0 (TID 97). 3625 bytes result sent to driver
24/09/28 18:48:25 INFO TaskSetManager: Finished task 0.0 in stage 94.0 (TID 97) in 1838 ms on 10.0.2.15 (executor driver) (1/1)
24/09/28 18:48:25 INFO TaskSchedulerImpl: Removed TaskSet 94.0, whose tasks have all completed, from pool 
24/09/28 18:48:25 INFO DAGScheduler: ResultStage 94 (showString at NativeMethodAccessorImpl.java:0) finished in 1,874 s
24/09/28 18:48:25 INFO DAGScheduler: Job 94 is finished. Cancelling potential speculative or zombie tasks for this job
24/09/28 18:48:25 INFO TaskSchedulerImpl: Killing all running tasks in stage 94: Stage finished
24/09/28 18:48:25 INFO DAGScheduler: Job 94 finished: showString at NativeMethodAccessorImpl.java:0, took 1,979117 s
                                                                                

In [74]:
save_to_bigquery(df_joined, "SOT", "tbx001_data")

24/09/28 19:29:31 INFO DirectBigQueryRelation: |Querying table my-project-1508437523553.SOR.tbx001_data, parameters sent from Spark:|requiredColumns=[UF,Ano,V1013,V1012,A001B3,A003,A004,V1023,A005,B002,B0031,B005,B009B,B009D,B009F,B007,C01011,F001,B0011,B0012,B0013,B0014,B0015,B0016,B0017,B0018,B0019,B00110,B00111,B00112,B00113],|filters=[]
24/09/28 19:29:32 INFO ReadSessionCreator: Read session:{"readSessionName":"projects/my-project-1508437523553/locations/us-east1/sessions/CAISDEhpYnRzRktJZ2ZjMxoCdngaAnVo","readSessionCreationStartTime":"2024-09-28T22:29:31.070Z","readSessionCreationEndTime":"2024-09-28T22:29:32.464Z","readSessionPrepDuration":446,"readSessionCreationDuration":948,"readSessionDuration":1394}
24/09/28 19:29:32 INFO ReadSessionCreator: Requested 20000 max partitions, but only received 4 from the BigQuery Storage API for session projects/my-project-1508437523553/locations/us-east1/sessions/CAISDEhpYnRzRktJZ2ZjMxoCdngaAnVo. Notice that the number of streams in actual ma

24/09/29 13:58:07 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 33532964 ms exceeds timeout 120000 ms
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: true
24/09/29 13:58:07 WARN NettyRpcEnv: Ignored message: tru