# Data LakeHouse de Acidentes de Recife

## Configurações de Ambiente

### Importação dos dados utilizados

In [1]:
import os
import requests

# Source CSV for each year, keyed by the year used in the local file name.
urls = {
  "2019": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/3531bafe-d47d-415e-b154-a881081ac76c/download/acidentes-2019.csv",
  "2020": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/fc1c8460-0406-4fff-b51a-e79205d1f1ab/download/acidentes_2020-novo.csv",
  "2021": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/2caa8f41-ccd9-4ea5-906d-f66017d6e107/download/acidentes2021.csv",
  "2022": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/971e0228-fa9c-4a42-b360-c842b29d2f56/download/acidentes2022.csv",
  "2023": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/d26b864b-0f7b-403e-b142-fd9989acaaf5/download/acidentes2023.csv",
  "2024": "http://dados.recife.pe.gov.br/dataset/44087d2d-73b5-4ab3-9bd8-78da7436eed1/resource/29afbf42-a36c-475c-8b75-761e17e67679/download/acidentes2024.csv"
}

output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

# Without a timeout, requests.get can block forever if the server stops responding.
REQUEST_TIMEOUT_SECONDS = 60

# Names of the files that could not be fetched, so the final summary is accurate.
failed_downloads = []

for year, url in urls.items():
  file_name = f"acidentes_{year}.csv"
  file_path = os.path.join(output_dir, file_name)

  try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
  except requests.RequestException as error:
    # Connection errors and timeouts: report and move on to the next year.
    print(f"Failed to download {file_name}. Error: {error}")
    failed_downloads.append(file_name)
    continue

  if response.status_code == 200:
    with open(file_path, 'wb') as file:
      file.write(response.content)
    print(f"Archive {file_name} downloaded successfully.")
  else:
    print(f"Failed to download {file_name}. Status code: {response.status_code}")
    failed_downloads.append(file_name)

# Only claim overall success when every file was actually written.
if failed_downloads:
  print(f"Download finished with {len(failed_downloads)} failure(s): {', '.join(failed_downloads)}")
else:
  print("All files downloaded successfully.")



Archive acidentes_2019.csv downloaded successfully.
Archive acidentes_2020.csv downloaded successfully.
Archive acidentes_2021.csv downloaded successfully.
Archive acidentes_2022.csv downloaded successfully.
Archive acidentes_2023.csv downloaded successfully.
Archive acidentes_2024.csv downloaded successfully.
All files downloaded successfully.


### Configurando o ambiente Spark



In [2]:
!pip install pyspark==3.5.1 delta-spark==3.2.0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (pyproject.toml) ... [?25l[?25hdone


In [3]:
from pyspark.sql import SparkSession
from delta import *

# Start from a named session builder, then register the Delta SQL extension
# and the Delta catalog one setting at a time.
builder = SparkSession.builder.appName("EtlAcidentesRecife")
builder = builder.config(
  "spark.sql.extensions",
  "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
  "spark.sql.catalog.spark_catalog",
  "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# configure_spark_with_delta_pip is provided by the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("SparkSession and Delta Lake configured successfully!")

SparkSession and Delta Lake configured successfully!


## Entendimento dos dados

### 1. Inspeção Inicial de um Arquivo (Amostra de 2019)

In [5]:
# Read the 2019 file as a sample: first row is the header, fields are
# separated by ';', and column types are inferred by Spark.
df_acidentes = (
  spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("delimiter", ";")
  .csv('data/acidentes_2019.csv')
)

In [6]:
# Print the column names and the types Spark inferred for the 2019 sample.
df_acidentes.printSchema()

root
 |-- DATA: date (nullable = true)
 |-- hora: string (nullable = true)
 |-- natureza_acidente: string (nullable = true)
 |-- situacao: string (nullable = true)
 |-- bairro: string (nullable = true)
 |-- endereco: string (nullable = true)
 |-- numero: string (nullable = true)
 |-- detalhe_endereco_acidente: string (nullable = true)
 |-- complemento: string (nullable = true)
 |-- endereco_cruzamento: string (nullable = true)
 |-- numero_cruzamento: string (nullable = true)
 |-- referencia_cruzamento: string (nullable = true)
 |-- bairro_cruzamento: string (nullable = true)
 |-- num_semaforo: integer (nullable = true)
 |-- sentido_via: string (nullable = true)
 |-- tipo: string (nullable = true)
 |-- descricao: string (nullable = true)
 |-- auto: integer (nullable = true)
 |-- moto: integer (nullable = true)
 |-- ciclom: integer (nullable = true)
 |-- ciclista: integer (nullable = true)
 |-- pedestre: integer (nullable = true)
 |-- onibus: integer (nullable = true)
 |-- caminhao: inte

In [8]:
# Show the first 5 rows; truncate=False prints full cell values instead of clipping them.
df_acidentes.show(5, truncate=False)

+----------+--------+-----------------+----------+-----------+-------------------------------+------+---------------------------+-----------------------------------------------------------+-------------------------------+-----------------+-----------------------------------------------------------+-----------------+------------+-----------+------------------------+-----------------------------------------------------------------------------------+----+----+------+--------+--------+------+--------+-------+------+-------+-------------+-------------------+-----------+-----------------+---------------+------------+---------------+-----------------+--------------+------------------+-----------+----------------+------------+------------+
|DATA      |hora    |natureza_acidente|situacao  |bairro     |endereco                       |numero|detalhe_endereco_acidente  |complemento                                                |endereco_cruzamento            |numero_cruzamento|referencia_cruzamen

### 2. Verificação de Consistência de Schema entre Arquivos

In [30]:
import os
import csv

data_dir = 'data/'


def read_header(csv_path):
  """Return the column names Spark reads from the header row of a ';'-delimited CSV."""
  return (
    spark.read
    .option("delimiter", ";")
    .option("header", "true")
    .csv(csv_path)
    .columns
  )


# Sorted by file name, so the first name alphabetically is the reference file.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

if not csv_files:
  print(f"No CSV files found in the directory '{data_dir}'")
else:
  base_path = os.path.join(data_dir, csv_files[0])
  base_header = read_header(base_path)
  # The reference set never changes, so build it once outside the loop.
  base_set = set(base_header)

  equals = True

  for f in csv_files[1:]:
    current_path = os.path.join(data_dir, f)
    current_header = read_header(current_path)
    if current_header == base_header:
      continue

    equals = False
    print(f"\n!!! ALERT: The header of '{f}' is DIFFERENT! Analysis:")
    current_set = set(current_header)

    # sorted() keeps the report stable between runs; plain list(set) order is arbitrary.
    removed_columns = sorted(base_set - current_set)
    if removed_columns:
      print(f"  - Missing columns in this file: {removed_columns}")
    added_columns = sorted(current_set - base_set)
    if added_columns:
      print(f"  - Extra columns found in this file: {added_columns}")

    same_length = len(base_header) == len(current_header)
    if not same_length:
      print(f"  - Column count diverges: {len(base_header)} in reference vs. {len(current_header)} in this file.")

    # The lists differ but no name was added or removed: explain the alert.
    if same_length and not removed_columns and not added_columns:
      print("  - Same column names, but in a different order.")
    print("-" * 30)

  if equals:
    print("\nGreat news! All CSV files have the same header.")


!!! ALERT: The header of 'acidentes_2020.csv' is DIFFERENT! Analysis:
  - Missing columns in this file: ['endereco_cruzamento', 'numero_cruzamento', 'referencia_cruzamento', 'DATA']
  - Extra columns found in this file: ['data']
  - Column count diverges: 41 in reference vs. 38 in this file.
------------------------------

!!! ALERT: The header of 'acidentes_2021.csv' is DIFFERENT! Analysis:
  - Missing columns in this file: ['numero_cruzamento', 'referencia_cruzamento', 'DATA', 'endereco_cruzamento', 'descricao']
  - Extra columns found in this file: ['data']
  - Column count diverges: 41 in reference vs. 37 in this file.
------------------------------

!!! ALERT: The header of 'acidentes_2022.csv' is DIFFERENT! Analysis:
  - Missing columns in this file: ['natureza_acidente', 'numero_cruzamento', 'referencia_cruzamento', 'DATA', 'endereco_cruzamento', 'descricao']
  - Extra columns found in this file: ['Protocolo', 'natureza', 'data']
  - Column count diverges: 41 in reference vs. 3

### 3. Definição do Schema Unificado para a Camada Bronze

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

# Unified schema for the Bronze layer.
# NOTE(review): this is still a placeholder; no fields are listed, so the
# StructType below is empty until the column definitions are filled in.
schema_acidentes = StructType([
    # TODO: add one StructField per column of the unified schema.
])

# Print the schema definition for a quick visual check.
print(schema_acidentes)