### Setting up file paths

In [None]:
# Base directories used by the notebook (the trailing slash is part of each value).
workdir = './work/'
parquetdir = './parquet/'

# Source CSV file name and its location inside the 'extracted/' folder of workdir.
source_csv = 'votacao_candidato_munzona_2022_BRASIL.csv'
extracted_file = f"{workdir}extracted/{source_csv}"

### Creating Spark session

In [None]:
from pyspark.sql import SparkSession

# Get the SparkSession for this notebook (application name 'spark'),
# creating it if one does not exist yet.
spark_session = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()
)

### Reading file

In [None]:
# Load the extracted CSV: first line is the header, fields are separated by ';'
# and the file is decoded as ISO-8859-1.
df = (
    spark_session.read
    .options(
        header="true",
        delimiter=";",
        encoding="ISO-8859-1",
    )
    .csv(extracted_file)
)

### Selecting relevant columns

In [None]:
# Columns kept from the raw file; the select below discards every other column.
relevant_columns = [
    "NR_TURNO", "DS_ELEICAO", "TP_ABRANGENCIA",
    "SG_UF", "NM_MUNICIPIO", "NR_ZONA",
    "DS_CARGO", "NR_CANDIDATO", "NM_CANDIDATO", "NM_URNA_CANDIDATO",
    "DS_SITUACAO_CANDIDATURA",
    "NR_PARTIDO", "SG_PARTIDO", "NM_PARTIDO",
    "NM_COLIGACAO", "DS_COMPOSICAO_COLIGACAO",
    "ST_VOTO_EM_TRANSITO", "QT_VOTOS_NOMINAIS", "NM_TIPO_DESTINACAO_VOTOS",
    "QT_VOTOS_NOMINAIS_VALIDOS", "DS_SIT_TOT_TURNO",
]

# Project the DataFrame onto those columns, in the order listed above.
selected_columns_df = df.select(*relevant_columns)

### Printing first lines

In [None]:
# Print a preview of the selected columns (show()'s defaults, spelled out).
selected_columns_df.show(n=20, truncate=True)

### Defining transformation functions

In [None]:
import pyspark.sql.functions as F

def transform_spark_dataframe_into_star_schema(
    dataframe,
    colunas_fato=None,
    tabela_fato_nome="tabela_fato",
    mapping_colunas_dimensao=None,
):
    """Split a flat Spark DataFrame into dimension tables plus one fact table.

    For every dimension, the distinct combinations of its columns are extracted
    and given a surrogate key column. The fact table is the input restricted to
    the fact and dimension columns, with each group of dimension columns
    replaced by the matching surrogate key.

    Args:
        dataframe: Spark DataFrame containing all fact and dimension columns.
        colunas_fato: Names of the columns that stay on the fact table.
            Defaults to the placeholder ``["col1", "col2"]``.
        tabela_fato_nome: Name returned alongside the fact DataFrame.
        mapping_colunas_dimensao: Mapping of dimension name to the list of
            columns that make up that dimension. Defaults to the placeholder
            ``{"dim1": ["col3", "col4"], "dim2": ["col5", "col6"]}``.

    Returns:
        A list of ``(name, DataFrame)`` tuples: one per dimension, in the
        iteration order of ``mapping_colunas_dimensao``, followed by the fact
        table as the last element.
    """
    # None sentinels avoid sharing mutable default objects between calls; the
    # effective default values are unchanged.
    if colunas_fato is None:
        colunas_fato = ["col1", "col2"]
    if mapping_colunas_dimensao is None:
        mapping_colunas_dimensao = {
            "dim1": ["col3", "col4"],
            "dim2": ["col5", "col6"],
        }

    colunas_dimensao = [
        col for cols in mapping_colunas_dimensao.values() for col in cols
    ]
    dataframe = dataframe.select(*colunas_fato, *colunas_dimensao)

    # Build one dimension table per mapping entry.
    dimensions = []
    for dim, cols in mapping_colunas_dimensao.items():
        # NOTE: replace() is case-sensitive, so only an upper-case 'DIM_' is
        # removed; a lower-case 'dim_' prefix stays in the key column name.
        sk_name = f"sk_{dim.replace('DIM_', '')}"

        # monotonically_increasing_id() is non-deterministic (it depends on
        # partitioning and row order), and this DataFrame is evaluated twice:
        # once when joined into the fact table and once when the dimension
        # itself is consumed. Persisting makes both uses see the same keys.
        df_dimension = (
            dataframe.select(*cols)
            .distinct()
            .withColumn(sk_name, F.monotonically_increasing_id())
            .persist()
        )
        dimensions.append((dim, df_dimension))

    # Replace the dimension columns with the respective surrogate key in the
    # fact table.
    for dim, df_dimension in dimensions:
        cols = mapping_colunas_dimensao[dim]

        # Null-safe equality: distinct() keeps combinations containing NULL as
        # dimension rows, but a plain `==` never matches NULL and would leave
        # those fact rows with a NULL surrogate key.
        dataframe = dataframe.join(
            df_dimension,
            on=[dataframe[col].eqNullSafe(df_dimension[col]) for col in cols],
            how="left",
        )

        # Drop the original dimension columns (both sides of the join).
        dataframe = dataframe.drop(*cols)

    return dimensions + [(tabela_fato_nome, dataframe)]

### Executing transformation

In [None]:
# Columns that remain on the fact table.
fact_columns = ["QT_VOTOS_NOMINAIS_VALIDOS", "QT_VOTOS_NOMINAIS"]

# Dimension name -> source columns forming that dimension.
# NOTE: the names are later used as table names when writing to the DW, so
# their spelling is kept exactly as is.
dimension_columns = {
    "dim_municipio": ["SG_UF", "NM_MUNICIPIO"],
    "dim_cargo": ["DS_CARGO"],
    "dim_ds_eleicao": ["DS_ELEICAO"],
    "dim_partido": ["SG_PARTIDO", "NM_PARTIDO", "NR_PARTIDO"],
    "dim_candidato": ["NM_CANDIDATO", "NR_CANDIDATO", "NM_URNA_CANDIDATO"],
    "dim_turno": ["NR_TURNO"],
    "dim_tp_agrangencia": ["TP_ABRANGENCIA"],
    "dim_zona": ["NR_ZONA"],
    "dim_situacao_candidatura": ["DS_SITUACAO_CANDIDATURA"],
    "dim_coligacao": ["NM_COLIGACAO", "DS_COMPOSICAO_COLIGACAO"],
    "dim_voto_transito": ["ST_VOTO_EM_TRANSITO"],
    "dim_situacaof_turno": ["DS_SIT_TOT_TURNO"],
    "dim_destinacao_voto": ["NM_TIPO_DESTINACAO_VOTOS"],
}

# Build the list of (table name, DataFrame) pairs making up the star schema.
star_schema = transform_spark_dataframe_into_star_schema(
    selected_columns_df,
    colunas_fato=fact_columns,
    tabela_fato_nome="tabela_fato",
    mapping_colunas_dimensao=dimension_columns,
)

### Setting up connection parameters

In [None]:
# Location of the PostgreSQL data warehouse and the target schema.
hostname_or_ip = "dw"
port = "5432"
db = "star"
schema = "star"

# Credentials used for the JDBC connection.
user = "star"
password = "star"

# JDBC URL in the form jdbc:postgresql://<host>:<port>/<database>.
db_url = f"jdbc:postgresql://{hostname_or_ip}:{port}/{db}"

# Connection properties handed to the JDBC writer.
properties = dict(
    user=user,
    password=password,
    driver="org.postgresql.Driver",
)


### Writing to DW

In [None]:
# Write every table of the star schema to the DW over JDBC, using
# mode="overwrite" for each "<schema>.<table>" target.
for table_name, dataframe in star_schema:
    print(f"Writing {table_name} to DW")
    dataframe.write.jdbc(
        url=db_url,
        table=f"{schema}.{table_name}",
        mode="overwrite",
        properties=properties,
    )

### Cleaning up

In [None]:
# Stop the Spark session created earlier in this notebook.
spark_session.stop()

# Optional clean-up of the extracted files (left disabled).
# Uncomment the two lines below to delete the directory and all its contents:
# import shutil

# shutil.rmtree(workdir+'extracted/')