In [None]:
# %pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.4.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.4-py2.py3-none-any.whl size=317849827 sha256=2ac7a4d4c384d29b7abc26339592be8345c6f9e4ed375dd3d7a107c71e872c9b
  Stored in directory: /home/spark/.cache/pip/wheels/13/92/64/da92a3521323cc629fdf25dd56eb26938e08014c1b57ad3759
Successfully built pyspark
Installing collected packages: py4j, p

In [1]:
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date

In [2]:
class SparkPostgres:
    """Load CSV files with Spark and write them to PostgreSQL tables over JDBC.

    Each processed file has its column names normalised, every column cast
    to string, a ``dt_pst`` column holding the current date appended, and is
    then written to ``<schema>.<table>`` in overwrite mode.
    """

    def __init__(self, spark, host, port, database, user, password):
        """Store the Spark session and build the JDBC URL and properties.

        Args:
            spark: Spark session used to read the CSV files.
            host: PostgreSQL host name.
            port: PostgreSQL port.
            database: Target database name.
            user: Database user.
            password: Database password.
        """
        self.spark = spark
        self.url = f"jdbc:postgresql://{host}:{port}/{database}"
        self.properties = {
            "user": user,
            "password": password,
            "driver": "org.postgresql.Driver"
        }

    def load_csv(self, file_path):
        """Read a CSV file (with header, inferred schema) into a DataFrame.

        Args:
            file_path: Path checked with ``os.path.exists`` before reading.

        Returns:
            The loaded DataFrame, or ``None`` (after printing an error) when
            the path does not exist.
        """
        if not os.path.exists(file_path):
            print(f"Error: O arquivo {file_path} não existe.")
            return None
        return self.spark.read.csv(file_path, header=True, inferSchema=True)

    def save_to_postgres(self, df, schema, table):
        """Write ``df`` to ``<schema>.<table>``, replacing any existing data.

        A ``dt_pst`` column holding the current date is added before the
        write. When ``df`` is ``None`` an error is printed and nothing is
        written.
        """
        if df is None:
            print(f"Error: DataFrame vazio para a tabela {schema}.{table}")
            return

        df = df.withColumn('dt_pst', current_date())
        df.write.jdbc(
            url=self.url,
            table=f"{schema}.{table}",
            mode="overwrite",
            properties=self.properties
        )

    def run(self, base_path, mapping, schema):
        """Process every ``relative file path -> table name`` pair in ``mapping``."""
        for file_path, table_name in mapping.items():
            self.process_file(base_path, file_path, table_name, schema)

    def clean_column_name(self, column_name):
        """Normalise a column name to lowercase ``[a-z0-9_]`` characters.

        Leading/trailing whitespace is trimmed, each internal whitespace run
        becomes a single underscore, and every remaining character outside
        ``[a-zA-Z0-9_]`` is dropped.

        Returns:
            The cleaned column name.
        """
        column_name = column_name.lower().strip()
        # Whitespace must be converted BEFORE the strip below; otherwise the
        # strip removes the whitespace first and words get glued together
        # ("nome aluno" -> "nomealuno" instead of "nome_aluno").
        column_name = re.sub(r"\s+", "_", column_name)
        column_name = re.sub(r"[^a-zA-Z0-9_]", "", column_name)
        return column_name

    def process_file(self, base_path, file_path, table_name, schema):
        """Load one CSV, clean its columns, cast them to string and save it.

        Args:
            base_path: Directory joined with ``file_path`` to locate the CSV.
            file_path: CSV path relative to ``base_path``.
            table_name: Destination table name.
            schema: Destination schema name.
        """
        full_path = os.path.join(base_path, file_path)

        df = self.load_csv(full_path)
        if df is None:
            print(f"Erro ao processar o arquivo: {file_path}")
            return

        cleaned_names = [self.clean_column_name(name) for name in df.columns]
        # Rename positionally and cast in a single projection each, instead
        # of one withColumnRenamed/withColumn call per column, which adds a
        # projection to the query plan for every column.
        df = df.toDF(*cleaned_names)
        df = df.select(
            *[col(name).cast("string").alias(name) for name in cleaned_names]
        )

        self.save_to_postgres(df, schema, table_name)
        print(f"Tabela {table_name} processada com sucesso.")

In [None]:
if __name__ == "__main__":
    # Validate the connection settings before starting Spark: an unset
    # variable would otherwise be interpolated as the literal "None" into the
    # JDBC URL / table name and only fail later, inside the database write.
    required_vars = (
        "DB_HOST", "DB_PORT", "DB_DATABASE", "DB_USER", "DB_PASSWORD", "DB_SCHEMA"
    )
    missing_vars = [name for name in required_vars if os.getenv(name) is None]
    if missing_vars:
        raise EnvironmentError(
            "Missing required environment variables: " + ", ".join(missing_vars)
        )

    host = os.getenv("DB_HOST")
    port = os.getenv("DB_PORT")
    database = os.getenv("DB_DATABASE")
    user = os.getenv("DB_USER")
    password = os.getenv("DB_PASSWORD")
    schema = os.getenv("DB_SCHEMA")

    # Directory that every key in `mapping` is relative to.
    base_path = "source/"

    # Relative CSV path -> destination table name.
    mapping = {
        "auxiliares/TbAbatimento/Merge/merged_data.csv": "tb_abatimento_mrg",
        "auxiliares/TbAluno/Merge/merged_data.csv": "tb_aluno_mrg",
        "auxiliares/TbCampoDinamico/Merge/merged_data.csv": "tb_campo_dinamico_mrg",
        "auxiliares/TbFase/Merge/merged_data.csv":  "tb_fase_mrg",
        "auxiliares/TbHistorico/Merge/merged_data.csv" : "tb_historico_mrg",
        "auxiliares/TbProfessor/Merge/merged_data.csv" : "tb_professor_mrg",
        "auxiliares/TbSerie/Merge/merged_data.csv" : "tb_serie_mrg",
        "auxiliares/TbTurma/Merge/merged_data.csv" : "tb_turma_mrg"
    }

    spark = SparkSession.builder.appName("CSV to PostgreSQL").getOrCreate()
    try:
        processar = SparkPostgres(spark, host, port, database, user, password)
        processar.run(base_path, mapping, schema)
    finally:
        # Always release the Spark session, even when a load or write fails.
        spark.stop()


25/02/09 15:49:26 WARN Utils: Your hostname, spark-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/02/09 15:49:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/02/09 15:49:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 15:50:48 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.a

Tabela tb_abatimento_mrg processada com sucesso.


                                                                                

Tabela tb_aluno_mrg processada com sucesso.


                                                                                

Tabela tb_campo_dinamico_mrg processada com sucesso.


                                                                                

Tabela tb_fase_mrg processada com sucesso.


                                                                                

Tabela tb_historico_mrg processada com sucesso.


                                                                                

Tabela tb_professor_mrg processada com sucesso.


                                                                                

Tabela tb_serie_mrg processada com sucesso.
