### Setting up file paths

In [1]:
# Directories used by this notebook (both relative to the current working directory).
workdir = './work/'
parquetdir = './parquet/'

# Source CSV file name, and its path inside the work directory's extracted/ folder.
source_csv = 'votacao_candidato_munzona_2022_BRASIL.csv'
extracted_file = f'{workdir}extracted/{source_csv}'

### Creating Spark session

In [2]:
from pyspark.sql import SparkSession

# Get the SparkSession for this notebook (application name 'spark');
# getOrCreate() hands back an existing session if one is already active.
spark_session = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()
)

### Reading file

In [3]:
# Load the extracted CSV: the first row is the header, fields are separated
# by ';', and the file is decoded as ISO-8859-1. No schema is supplied here.
df = (
    spark_session.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("encoding", "ISO-8859-1")
    .csv(extracted_file)
)

### Selecting relevant columns

In [4]:
# Columns kept from the source file, in output order.
# (The grouping per line is only visual, based on the column-name prefixes/subjects.)
relevant_columns = [
    "NR_TURNO", "DS_ELEICAO", "TP_ABRANGENCIA",
    "SG_UF", "NM_MUNICIPIO", "NR_ZONA",
    "DS_CARGO", "NR_CANDIDATO", "NM_CANDIDATO", "NM_URNA_CANDIDATO",
    "DS_SITUACAO_CANDIDATURA",
    "NR_PARTIDO", "SG_PARTIDO", "NM_PARTIDO",
    "NM_COLIGACAO", "DS_COMPOSICAO_COLIGACAO",
    "ST_VOTO_EM_TRANSITO", "QT_VOTOS_NOMINAIS", "NM_TIPO_DESTINACAO_VOTOS",
    "QT_VOTOS_NOMINAIS_VALIDOS", "DS_SIT_TOT_TURNO",
]

# Project the loaded DataFrame down to the columns listed in relevant_columns
selected_columns_df = df.select(*relevant_columns)

### Printing first lines

In [5]:
# Preview the selection: up to 20 rows, with long cell values truncated
selected_columns_df.show(n=20, truncate=True)

+--------+--------------------+--------------+-----+--------------------+-------+-----------------+------------+--------------------+--------------------+-----------------------+----------+-------------+--------------------+---------------+-----------------------+-------------------+-----------------+------------------------+-------------------------+----------------+
|NR_TURNO|          DS_ELEICAO|TP_ABRANGENCIA|SG_UF|        NM_MUNICIPIO|NR_ZONA|         DS_CARGO|NR_CANDIDATO|        NM_CANDIDATO|   NM_URNA_CANDIDATO|DS_SITUACAO_CANDIDATURA|NR_PARTIDO|   SG_PARTIDO|          NM_PARTIDO|   NM_COLIGACAO|DS_COMPOSICAO_COLIGACAO|ST_VOTO_EM_TRANSITO|QT_VOTOS_NOMINAIS|NM_TIPO_DESTINACAO_VOTOS|QT_VOTOS_NOMINAIS_VALIDOS|DS_SIT_TOT_TURNO|
+--------+--------------------+--------------+-----+--------------------+-------+-----------------+------------+--------------------+--------------------+-----------------------+----------+-------------+--------------------+---------------+------------------

### Setting up connection parameters

In [15]:
import os

# Connection settings for the PostgreSQL target. Each value can be overridden
# through an environment variable so that credentials do not have to be edited
# into (and saved with) the notebook. The fallbacks are the values this
# notebook previously hard-coded, so behaviour is unchanged when nothing is set.
hostname_or_ip = os.environ.get("DW_HOST", "dw")
port = os.environ.get("DW_PORT", "5432")
db = os.environ.get("DW_DB", "star")
user = os.environ.get("DW_USER", "star")
password = os.environ.get("DW_PASSWORD", "password")
schema = os.environ.get("DW_SCHEMA", "star")

# JDBC URL of the form jdbc:postgresql://<host>:<port>/<database>
url = f"jdbc:postgresql://{hostname_or_ip}:{port}/{db}"

# Properties passed to the JDBC writer: login plus the PostgreSQL driver class
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver",
}

# Schema-qualified target table: <schema>.star
table = f"{schema}.star"

### Writing to DW

In [16]:
# Write the selected columns to the target table over JDBC, overwriting existing data
selected_columns_df.write.mode("overwrite").jdbc(url=url, table=table, properties=properties)

IllegalArgumentException: requirement failed: The driver could not open a JDBC connection. Check the URL: jbdc:postgresql://dw:5432/star

### Cleaning up

In [None]:
# Stop the Spark session used throughout this notebook
spark_session.stop()

# Optional file cleanup: delete the directory of extracted files and all its contents.
# Left commented out, so running this cell removes nothing;
# uncomment the two lines below to enable it.
# import shutil

# shutil.rmtree(workdir+'extracted/')