In [26]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
import os
from pyspark.sql.window import Window

In [None]:
# Create the Spark session.
# Maven coordinates must be supplied as ONE comma-separated "spark.jars.packages"
# value: the builder keeps a single value per key, so configuring the key twice
# keeps only the last value and drops the BigQuery connector coordinate.
spark_packages = ",".join([
    "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.23.2",
    "com.crealytics:spark-excel_2.12:0.13.5",
])

spark = (
    SparkSession.builder
    .appName("PySpark BigQuery Connection")
    .config("spark.jars.packages", spark_packages)
    .config("spark.jars", "/usr/local/lib/spark-connectors/bigquery-connector-hadoop2-latest.jar")
    .getOrCreate()
)

In [3]:
# Grab the SparkContext and set its log level to INFO.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Set the gs:// filesystem implementation class and the service-account JSON
# key file on the context's Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem'),
    ('fs.gs.auth.service.account.json.keyfile', '/usr/local/lib/gcp/credentials/my-project-1508437523553-e9bafe7e3368.json'),
):
    hadoop_conf.set(conf_key, conf_value)

In [4]:
# Helper that writes a Spark DataFrame to a BigQuery table.
def save_to_bigquery(
    df,
    dataset,
    table_name,
    mode="overwrite",
    temporary_gcs_bucket="meu-bucket-temporario-spark",
    credentials_file="/usr/local/lib/gcp/credentials/my-project-1508437523553-e9bafe7e3368.json",
):
    """Write ``df`` to BigQuery through the ``bigquery`` data source.

    Args:
        df: DataFrame to write (anything exposing a Spark-style ``.write``).
        dataset: Dataset name; it is upper-cased before being used.
        table_name: Table name, used as given.
        mode: Save mode passed to the writer. Defaults to ``"overwrite"``,
            which replaces the destination table contents.
        temporary_gcs_bucket: Value for the writer's ``temporaryGcsBucket``
            option.
        credentials_file: Value for the writer's ``credentialsFile`` option.

    Returns:
        None. The call triggers the write as a side effect.
    """
    # The destination is addressed as "<DATASET>.<table_name>".
    destination = f"{dataset.upper()}.{table_name}"

    (
        df.write
        .format("bigquery")
        .option("table", destination)
        .option("temporaryGcsBucket", temporary_gcs_bucket)
        .option("credentialsFile", credentials_file)
        .mode(mode)
        .save()
    )

24/09/28 12:03:52 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [177]:
# Read the CSV data at ./data/raw: comma-delimited, first line used as header,
# column types inferred by Spark.
csv_reader = spark.read.options(delimiter=',', header=True, inferSchema=True)
df_data = csv_reader.csv('./data/raw')

# Persist the DataFrame to BigQuery.
save_to_bigquery(df=df_data, dataset="sor", table_name="tbx001_data")

24/09/28 13:43:09 INFO InMemoryFileIndex: It took 11 ms to list leaf files for 1 paths.
24/09/28 13:43:09 INFO InMemoryFileIndex: It took 12 ms to list leaf files for 7 paths.
24/09/28 13:43:10 INFO FileSourceStrategy: Pushed Filters: 
24/09/28 13:43:10 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#6952, None)) > 0)
24/09/28 13:43:10 INFO CodeGenerator: Code generated in 23.779078 ms
24/09/28 13:43:10 INFO MemoryStore: Block broadcast_128 stored as values in memory (estimated size 207.0 KiB, free 433.6 MiB)
24/09/28 13:43:11 INFO MemoryStore: Block broadcast_128_piece0 stored as bytes in memory (estimated size 36.9 KiB, free 433.6 MiB)
24/09/28 13:43:11 INFO BlockManagerInfo: Added broadcast_128_piece0 in memory on 10.0.2.15:32869 (size: 36.9 KiB, free: 434.2 MiB)
24/09/28 13:43:11 INFO SparkContext: Created broadcast 128 from csv at NativeMethodAccessorImpl.java:0
24/09/28 13:43:11 INFO FileSourceScanExec: Planning scan with bin packing, max size: 134217728 bytes, ope

In [168]:
# Directory holding the Excel dimension files.
directory_path = './data/raw/dimensoes/'

# List every Excel file in the directory.
# os.listdir() returns entries in arbitrary order; sort them so the DataFrames
# are always loaded (and later combined) in the same, reproducible order.
excel_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith(('.xls', '.xlsx'))
)

# One DataFrame per Excel file, in the same order as excel_files.
dataframes = []
for file in excel_files:
    # Every column is read as-is (no schema inference), with a header row.
    # NOTE(review): "dataAddress" = "A4" presumably makes the reader start at
    # cell A4 of the sheet — confirm against the spark-excel documentation.
    df_dimensao = spark.read \
        .format("com.crealytics.spark.excel") \
        .option("header", "true") \
        .option("inferSchema", "false") \
        .option("dataAddress", "A4") \
        .load(file)
    dataframes.append(df_dimensao)

In [169]:
# Combine all per-file DataFrames into a single DataFrame.
# Fail with a clear message instead of a bare IndexError when nothing was loaded.
if not dataframes:
    raise ValueError("No Excel DataFrames were loaded; there is nothing to union.")

# union() matches columns by position, not by name.
df_dimensoes = dataframes[0]
for df in dataframes[1:]:
    df_dimensoes = df_dimensoes.union(df)

# Replace every run of carriage returns / line feeds with a single space in all
# columns. A single select() is used instead of calling withColumn() once per
# column, which grows the query plan with every iteration.
df_dimensoes = df_dimensoes.select(
    [
        F.regexp_replace(F.col(column_name), r"[\r\n]+", " ").alias(column_name)
        for column_name in df_dimensoes.columns
    ]
)

In [170]:
# Remove the two columns that are not kept, then give the remaining
# positional columns descriptive names.
df_dimensoes = df_dimensoes.drop("Parte 1 - Identificação e Controle", "_c2")

for old_name, new_name in (
    ("_c1", "codigo_variavel"),
    ("_c3", "quesito_descricao"),
    ("_c4", "categoria_tipo"),
    ("_c5", "categoria_descricao"),
):
    df_dimensoes = df_dimensoes.withColumnRenamed(old_name, new_name)

In [171]:
# Forward-fill 'codigo_variavel' and 'quesito_descricao': each null takes the
# last non-null value seen in the preceding rows.
#
# A window without an ORDER BY gives no guarantee about which rows count as
# "preceding", so the row order is captured in an explicit column first and the
# window is ordered by it.
# NOTE(review): the window still has no partitioning, so Spark moves all rows to
# a single partition for this step (see the WindowExec warning in the logs).
df_dimensoes = df_dimensoes.withColumn("_row_order", F.monotonically_increasing_id())

window_spec_fill = (
    Window
    .orderBy("_row_order")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df_dimensoes = (
    df_dimensoes
    .withColumn(
        "codigo_variavel",
        F.last("codigo_variavel", ignorenulls=True).over(window_spec_fill),
    )
    .withColumn(
        "quesito_descricao",
        F.last("quesito_descricao", ignorenulls=True).over(window_spec_fill),
    )
    # The helper ordering column is not part of the output.
    .drop("_row_order")
)

In [172]:
# Drop exact duplicate rows.
df_dimensoes = df_dimensoes.drop_duplicates()

# Discard rows whose category description is "não aplicável" or "não aplicado"
# (compared in lower case).
# NOTE(review): rows with a null categoria_descricao presumably do not pass this
# predicate either (SQL null semantics), which would make the null count on that
# column below always zero — confirm whether that is intended.
descricao_minuscula = F.lower(F.col("categoria_descricao"))
df_dimensoes = df_dimensoes.filter(~descricao_minuscula.isin("não aplicável", "não aplicado"))


def _null_count(column_name, alias):
    """Aggregate expression counting the null values of ``column_name``."""
    return F.sum(F.col(column_name).isNull().cast("int")).alias(alias)


# For each codigo_variavel, count the nulls in every relevant column.
nulos_por_variavel = df_dimensoes.groupBy("codigo_variavel").agg(
    _null_count("quesito_descricao", "Qtd_Nulos_Quesito_Descricao"),
    _null_count("categoria_tipo", "Qtd_Nulos_Categoria_Tipo"),
    _null_count("categoria_descricao", "Qtd_Nulos_Categoria_Descricao"),
)

# Keep only the groups that have at least one null in any of those columns.
tem_nulo_quesito = F.col("Qtd_Nulos_Quesito_Descricao") > 0
tem_nulo_tipo = F.col("Qtd_Nulos_Categoria_Tipo") > 0
tem_nulo_descricao = F.col("Qtd_Nulos_Categoria_Descricao") > 0
variaveis_com_nulos = nulos_por_variavel.filter(
    tem_nulo_quesito | tem_nulo_tipo | tem_nulo_descricao
)

# Bring the offending codigo_variavel values to the driver as a plain list.
# NOTE(review): if a null codigo_variavel group ends up here, the list contains
# None and the isin() filter below may then reject every row — verify.
codigos_variaveis_com_nulos = [
    row[0] for row in variaveis_com_nulos.select("codigo_variavel").collect()
]

# Exclude every row that belongs to one of those variables.
df_dimensoes = df_dimensoes.filter(~F.col("codigo_variavel").isin(codigos_variaveis_com_nulos))

24/09/28 13:31:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/28 13:31:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/28 13:31:12 INFO DAGScheduler: Registering RDD 1214 (javaToPython at NativeMethodAccessorImpl.java:0) as input to shuffle 29
24/09/28 13:31:12 INFO DAGScheduler: Got map stage job 121 (javaToPython at NativeMethodAccessorImpl.java:0) with 28 output partitions
24/09/28 13:31:12 INFO DAGScheduler: Final stage: ShuffleMapStage 149 (javaToPython at NativeMethodAccessorImpl.java:0)
24/09/28 13:31:12 INFO DAGScheduler: Parents of final stage: List()
24/09/28 13:31:12 INFO DAGScheduler: Missing parents: List()
24/09/28 13:31:12 INFO DAGScheduler: Submitting ShuffleMapStage 149 (MapPartitionsRDD[1214] at javaToPython at NativeMethodAccessorImpl.java:0), which has 

In [176]:
# Persist the cleaned dimension DataFrame to BigQuery.
save_to_bigquery(
    df=df_dimensoes,
    dataset="sor",
    table_name="tbx002_dimensao_geral",
)

24/09/28 13:35:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/28 13:35:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/28 13:35:46 INFO DAGScheduler: Registering RDD 1248 (save at BigQueryWriteHelper.java:105) as input to shuffle 30
24/09/28 13:35:46 INFO DAGScheduler: Got map stage job 124 (save at BigQueryWriteHelper.java:105) with 28 output partitions
24/09/28 13:35:46 INFO DAGScheduler: Final stage: ShuffleMapStage 153 (save at BigQueryWriteHelper.java:105)
24/09/28 13:35:46 INFO DAGScheduler: Parents of final stage: List()
24/09/28 13:35:46 INFO DAGScheduler: Missing parents: List()
24/09/28 13:35:46 INFO DAGScheduler: Submitting ShuffleMapStage 153 (MapPartitionsRDD[1248] at save at BigQueryWriteHelper.java:105), which has no missing parents
24/09/28 13:35:46 INFO Me