# Gerador Automático de Data Schemas - GECOB

Processa 5 tabelas e gera schemas completos

In [None]:
# SETTINGS
# Schema name used as the prefix of every table reference in the SQL below.
DATABASE = 'gecob'
# Tables to document, one per line so additions/removals diff cleanly.
TABELAS = [
    'prior_master_consolidado',
    'prior_score_priorizacao',
    'prior_score_componentes',
    'prior_clusters_empresas',
    'prior_outliers_identificados',
]
# Directory (relative to the working directory) that receives all outputs.
OUTPUT_DIR = 'data_schemas'
# Number of rows fetched for each table's sample files.
SAMPLE_LIMIT = 10
print('Config OK')

In [None]:
import sys
sys.path.append('/home/tsevero/notebooks/SAT_BIG_DATA/data-pipeline/batch/dags')

import os
import json
import builtins
from datetime import datetime

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Optional in-house helper used to build the Spark session.
# Only a failed import is tolerated here: a bare ``except`` would also hide
# KeyboardInterrupt/SystemExit and unrelated errors raised while importing.
# NOTE: when the import fails, ``utils`` stays undefined, so the Spark session
# cell (which calls ``utils.DBASparkAppSession``) must be replaced by a
# manually created SparkSession.
try:
    from utils import spark_utils_session as utils
    print('Utils importado')
except ImportError:
    print('Sem utils - use SparkSession manual')

print('Imports OK')

In [None]:
# SPARK SESSION
# The session is built with the project's DBASparkAppSession builder; the
# Hive/Hudi options are presumably what make the catalog tables readable
# (TODO confirm against the utils module).
profile = 'prod'
app_name = 'tsevero_gerar_data_schemas'

spark_builder = (
    utils.DBASparkAppSession.builder
    .setAppName(app_name)
    .usingProcessProfile(profile)
    .enableHiveSupport()
    .enableHudiSupport()
)
spark_session = spark_builder.build()
# The wrapper exposes the underlying session object through ``.spark``.
spark = spark_session.spark

print(f'Spark OK: {app_name}')

In [None]:
# VERIFY ACCESS
# Run a COUNT(*) on every configured table; a failure is reported (message
# truncated to 60 characters) and the loop moves on to the next table.
print('Verificando tabelas...')
for t in TABELAS:
    try:
        linhas = spark.sql(f'SELECT COUNT(*) as c FROM {DATABASE}.{t}').collect()
        cnt = linhas[0]['c']
        print(f'  OK {t:40s} {cnt:,} registros')
    except Exception as e:
        resumo = str(e)[:60]
        print(f'  ERRO {t}: {resumo}')

In [None]:
# FUNÇÕES AUXILIARES

def criar_dir():
    """Create OUTPUT_DIR (with any missing parents) when it does not exist.

    Prints a confirmation only when the directory was actually created.
    """
    if os.path.exists(OUTPUT_DIR):
        return
    os.makedirs(OUTPUT_DIR)
    print(f'Dir criado: {OUTPUT_DIR}')

def salvar_csv(df, nome):
    """Write *df* to OUTPUT_DIR/*nome* as CSV and return the file path.

    The file is encoded as UTF-8 with a BOM ('utf-8-sig') and the index is
    not written.
    """
    destino = os.path.join(OUTPUT_DIR, nome)
    df.to_csv(destino, encoding='utf-8-sig', index=False)
    return destino

def salvar_json(df, nome):
    """Write *df* to OUTPUT_DIR/*nome* as JSON and return the file path.

    Rows are emitted as a list of records, indented by two spaces, with
    non-ASCII characters kept as-is instead of being escaped.
    """
    destino = os.path.join(OUTPUT_DIR, nome)
    opcoes = {'orient': 'records', 'indent': 2, 'force_ascii': False}
    df.to_json(destino, **opcoes)
    return destino

def salvar_md(df, nome, titulo):
    """Write *df* to OUTPUT_DIR/*nome* as a Markdown table and return the path.

    The file starts with a level-1 heading containing *titulo*, followed by
    the table rendered by ``DataFrame.to_markdown`` (index omitted).
    """
    path = os.path.join(OUTPUT_DIR, nome)
    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail or corrupt accented text on non-UTF-8 locales. The CSV/JSON
    # writers already force UTF-8 / non-ASCII output, so this keeps all three
    # artefacts consistent.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f'# {titulo}\n\n')
        f.write(df.to_markdown(index=False))
    return path

print('Funcoes OK')

In [None]:
# FUNÇÃO PROCESSAR TABELA

def _salvar_formatos(df, prefixo, titulo):
    """Export *df* as <prefixo>.csv, <prefixo>.json and <prefixo>.md."""
    salvar_csv(df, f'{prefixo}.csv')
    salvar_json(df, f'{prefixo}.json')
    salvar_md(df, f'{prefixo}.md', titulo)


def processar_tabela(tabela):
    """Document one table of DATABASE and return a summary dict.

    Three steps run in order: DESCRIBE FORMATTED (exported to CSV/JSON/MD),
    a SAMPLE_LIMIT-row sample (exported to CSV/JSON/MD) and a COUNT(*).

    Returns:
        A dict with 'database', 'tabela' and 'timestamp', plus 'row_count'
        when every step succeeded or 'error' (the exception text) when any
        step failed. Files written before a failure are left in place.
    """
    barra = '=' * 60
    print(f'\n{barra}')
    print(f'Processando: {DATABASE}.{tabela}')
    print(barra)

    resultado = {
        'database': DATABASE,
        'tabela': tabela,
        'timestamp': datetime.now().isoformat(),
    }

    try:
        # Step 1: table metadata.
        print('  1. DESCRIBE FORMATTED...')
        descricao = spark.sql(f'DESCRIBE FORMATTED {DATABASE}.{tabela}').toPandas()
        _salvar_formatos(
            descricao,
            f'{tabela}_describe_formatted',
            f'DESCRIBE {DATABASE}.{tabela}',
        )
        print(f'     OK - {len(descricao)} linhas')

        # Step 2: a few example rows.
        print(f'  2. SELECT * LIMIT {SAMPLE_LIMIT}...')
        amostra = spark.sql(f'SELECT * FROM {DATABASE}.{tabela} LIMIT {SAMPLE_LIMIT}').toPandas()
        _salvar_formatos(
            amostra,
            f'{tabela}_sample_{SAMPLE_LIMIT}',
            f'SAMPLE {DATABASE}.{tabela}',
        )
        print(f'     OK - {len(amostra)} x {len(amostra.columns)} colunas')

        # Step 3: total row count.
        print('  3. COUNT(*) ...')
        total = spark.sql(f'SELECT COUNT(*) as c FROM {DATABASE}.{tabela}').collect()[0]['c']
        resultado['row_count'] = int(total)
        print(f'     OK - {total:,} registros')
    except Exception as exc:
        # Per-table boundary: record the failure so the caller can go on
        # with the remaining tables.
        mensagem = str(exc)
        print(f'  ERRO: {mensagem}')
        resultado['error'] = mensagem

    return resultado

print('Funcao processar OK')

In [None]:
# RUN THE GENERATOR
# Processes every table in TABELAS, then writes a consolidated JSON with all
# per-table results and a README listing the row count of each table that
# was processed without error.
print('='*60)
print('EXECUTANDO GERADOR')
print('='*60)

criar_dir()
resultados = []

for i, tabela in enumerate(TABELAS, 1):
    print(f'\n[{i}/{len(TABELAS)}] {tabela}')
    resultado = processar_tabela(tabela)
    resultados.append(resultado)

# A result carries an 'error' key only when processing that table failed.
sucessos = [r for r in resultados if 'error' not in r]
erros = [r for r in resultados if 'error' in r]

# Consolidated JSON. ensure_ascii=False emits raw non-ASCII characters, so the
# file must be opened with an explicit UTF-8 encoding instead of relying on
# the platform default.
with open(os.path.join(OUTPUT_DIR, 'data_schemas_completo.json'), 'w', encoding='utf-8') as f:
    json.dump(resultados, f, indent=2, ensure_ascii=False)

# README (UTF-8 for the same reason); tables that failed are not listed.
with open(os.path.join(OUTPUT_DIR, 'README.md'), 'w', encoding='utf-8') as f:
    f.write('# Data Schemas - GECOB\n\n')
    f.write(f'Database: {DATABASE}\n\n')
    for r in sucessos:
        f.write(f"## {r['tabela']}\n")
        f.write(f"- Registros: {r['row_count']:,}\n\n")

print(f'\nCONCLUÍDO!')
print(f'Sucessos: {len(sucessos)}')
print(f'Erros: {len(erros)}')
print(f'Output: {OUTPUT_DIR}/')

In [None]:
# LIST GENERATED FILES
# Shows every entry matched by OUTPUT_DIR/* in sorted order with its size in KB.
import glob

arquivos = sorted(glob.glob(f'{OUTPUT_DIR}/*'))
print(f'Total: {len(arquivos)} arquivos\n')

for arq in arquivos:
    nome, tamanho = os.path.basename(arq), os.path.getsize(arq) / 1024
    print(f'  {nome:50s} {tamanho:8.1f} KB')

In [None]:
# PREVIEW OF THE RESULTS
# Reads back the CSV files generated for the first configured table and
# renders them with the notebook's display().
import pandas as pd

tabela = TABELAS[0]

# DESCRIBE output: first 20 rows only.
caminho_describe = f'{OUTPUT_DIR}/{tabela}_describe_formatted.csv'
df = pd.read_csv(caminho_describe)
print(f'DESCRIBE {tabela}:\n')
display(df.head(20))

# Sample rows: shown in full.
caminho_sample = f'{OUTPUT_DIR}/{tabela}_sample_{SAMPLE_LIMIT}.csv'
df_sample = pd.read_csv(caminho_sample)
print(f'\nSAMPLE {tabela}:\n')
display(df_sample)