In [None]:
%%configure

{
    "conf": {
        "spark.driver.maxResultSize": "8g",
        "spark.driver.memory": "54g",
        "spark.driver.cores": 8,
        "spark.executor.instances": 0,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.pyspark.selfDestruct.enabled": "true"
    }
}

# Export Bases — Treino, OOS e OOT para Lakehouse (Parquet)

Notebook de entrega que exporta as bases utilizadas no treinamento do modelo v6 para o Lakehouse em formato **Parquet** (formato de entrega) e como **Delta Tables** (acesso estruturado), garantindo rastreabilidade e reprodutibilidade.

### Bases exportadas
| Base | Descricao | SAFRAs | Uso |
|------|-----------|--------|-----|
| **treino** | Amostra estratificada 25% (train + val) | 202410-202501 | Treino e validacao do modelo |
| **oos** | Out-of-Sample (75% restante) | 202410-202501 | Validacao fora da amostra |
| **oot** | Out-of-Time | 202502-202503 | Teste temporal (producao simulada) |

### Destinos
- **Parquet**: `/lakehouse/default/Files/projeto-final/datasets/v6/`
- **Delta Tables**: `Gold.feature_store.{treino_v6, oos_v6, oot_v6}`

### Pipeline
1. Leitura do Gold Feature Store (5-layer memory optimization)
2. Limpeza de dados (mesmas 7 funcoes do modelo v6)
3. Split temporal + amostragem estratificada 25%
4. Export Parquet + Delta (particionado por SAFRA)
5. Validacao de integridade (contagens, schema, taxa de FPD)

In [None]:
# =============================================================================
# 1. IMPORTS E CONFIGURACAO
# =============================================================================
import pandas as pd
import numpy as np
import gc
import os

from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, IntegerType, DoubleType, LongType

import sys; sys.path.insert(0, '/lakehouse/default/Files/projeto-final')
from config.pipeline_config import (
    PATH_FEATURE_STORE, SAFRAS, LEAKAGE_BLACKLIST, GOLD_BASE
)

# Export destinations.
# PARQUET_DIR is the folder that receives one sub-folder per exported base;
# DELTA_BASE is built from GOLD_BASE (imported from config.pipeline_config).
# NOTE(review): PARQUET_DIR is used below with os.makedirs (local file API) and
# later as a Spark write target — confirm Spark resolves this mount-style path
# to the same Lakehouse location.
PARQUET_DIR = "/lakehouse/default/Files/projeto-final/datasets/v6"
DELTA_BASE = f"{GOLD_BASE}/Tables/feature_store"
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(PARQUET_DIR, exist_ok=True)

print('Imports OK')
print(f'Parquet dir: {PARQUET_DIR}')
print(f'Delta base:  {DELTA_BASE}')

## 2. Leitura do Gold Feature Store

Mesma estrategia de 5 camadas do modelo v6 para contornar o limite de `spark.driver.maxResultSize`.

In [None]:
# =============================================================================
# 2. OPTIMIZED READ OF THE GOLD FEATURE STORE (Spark -> Pandas)
# =============================================================================
# Purpose: load the Delta feature store, trim and downcast it in Spark, then
# bring it to the driver as one pandas DataFrame, converting one SAFRA at a time.

# Arrow-based Spark -> pandas transfer. The same two keys are also set in the
# %%configure cell at the top of the notebook.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true")

print(f'Lendo feature store de: {PATH_FEATURE_STORE}\n')

df_spark = spark.read.format("delta").load(PATH_FEATURE_STORE)
# count() is an action: it scans the table once just to report the row count.
n_original = df_spark.count()
print(f'Original: {n_original:,} rows x {len(df_spark.columns)} cols')

# Drop audit + leakage columns in Spark (only those actually present).
cols_audit = ['_execution_id', '_data_inclusao', '_data_alteracao_silver', 'DT_PROCESSAMENTO']
cols_drop = [c for c in cols_audit + LEAKAGE_BLACKLIST if c in df_spark.columns]
if cols_drop:
    df_spark = df_spark.drop(*cols_drop)
    print(f'Drop {len(cols_drop)} colunas (audit+leakage): {cols_drop}')

# Keep only rows with FLAG_INSTALACAO == 1 (filtered in Spark), then drop the flag.
# NOTE(review): n_pos is derived by subtracting the FLAG_INSTALACAO == 0 count,
# so rows whose flag is null (or any value other than 0/1) are removed by the
# == 1 filter but are not reflected in the printed numbers — confirm the flag
# is always 0 or 1.
n_reprovados = 0
if 'FLAG_INSTALACAO' in df_spark.columns:
    n_reprovados = df_spark.filter(F.col('FLAG_INSTALACAO') == 0).count()
    df_spark = df_spark.filter(F.col('FLAG_INSTALACAO') == 1).drop('FLAG_INSTALACAO')
    n_pos = n_original - n_reprovados
    print(f'FLAG_INSTALACAO: {n_original:,} -> {n_pos:,} ({n_reprovados:,} reprovados removidos)')
else:
    n_pos = n_original

# Downcast types through a single .select() (one flat projection):
# Double -> Float and Long -> Integer; every other column passes through.
# NOTE(review): Long -> Integer narrows to 32 bits. Any Long column holding
# values outside the 32-bit range (e.g. an 11-digit identifier, if NUM_CPF is
# stored as Long) would not survive this cast — confirm against the schema.
cast_exprs = []
n_double, n_long = 0, 0
for field in df_spark.schema.fields:
    if isinstance(field.dataType, DoubleType):
        cast_exprs.append(F.col(field.name).cast(FloatType()).alias(field.name))
        n_double += 1
    elif isinstance(field.dataType, LongType):
        cast_exprs.append(F.col(field.name).cast(IntegerType()).alias(field.name))
        n_long += 1
    else:
        cast_exprs.append(F.col(field.name))
df_spark = df_spark.select(*cast_exprs)
print(f'Cast tipos: {n_double} Double->Float, {n_long} Long->Int')

# Chunked conversion: one toPandas() call per SAFRA, so each collect transfers
# only that SAFRA's rows to the driver.
safras_disponiveis = sorted([row.SAFRA for row in df_spark.select('SAFRA').distinct().collect()])
print(f'\nSAFRAs: {safras_disponiveis} | Colunas: {len(df_spark.columns)}')
print('Convertendo por SAFRA...')

chunks = []
for safra in safras_disponiveis:
    chunk = df_spark.filter(F.col('SAFRA') == safra).toPandas()
    mem_mb = chunk.memory_usage(deep=True).sum() / 1e6
    print(f'  SAFRA {safra}: {len(chunk):,} rows | {mem_mb:.0f} MB')
    chunks.append(chunk)
    gc.collect()

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)
# Release the per-SAFRA frames now that the combined copy exists.
del chunks
gc.collect()

print(f'\nDataset carregado: {df.shape}')
print(f'Memory: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB')

## 3. Limpeza de Dados

Mesmas 7 funcoes de limpeza do modelo v6 para garantir que as bases exportadas correspondam **exatamente** ao que foi usado no treinamento.

In [None]:
# =============================================================================
# 3. FUNCOES DE LIMPEZA (identicas ao modelo v6)
# =============================================================================

def clean_empty_keys(df):
    """Return `df` without the rows where NUM_CPF or SAFRA is missing.

    The input frame is not modified; surviving rows keep their index labels.
    """
    key_columns = ['NUM_CPF', 'SAFRA']
    return df.dropna(subset=key_columns)

def convert_cep3_uf_regiao(df):
    """Replace CEP_3_digitos with two categorical columns, UF and REGIAO.

    The first two characters of the string form of CEP_3_digitos are looked up
    in a fixed prefix table; any prefix not in the table (including missing
    values) yields 'OUTROS' for both columns.

    Args:
        df: pandas DataFrame. If it has no 'CEP_3_digitos' column it is
            returned unchanged (same object).

    Returns:
        A new DataFrame without 'CEP_3_digitos' and with 'UF' and 'REGIAO'
        set. The input frame is left untouched.
    """
    # Prefix table kept exactly as used for model v6 so exports stay identical.
    # NOTE(review): prefixes 10-19 and 25-28 are absent and therefore map to
    # 'OUTROS'; they look like valid SP/RJ ranges — confirm whether the
    # omission is intentional before changing anything here.
    cep_map = {
        '01':('SP','SUDESTE'),'02':('SP','SUDESTE'),'03':('SP','SUDESTE'),
        '04':('SP','SUDESTE'),'05':('SP','SUDESTE'),'06':('SP','SUDESTE'),
        '07':('SP','SUDESTE'),'08':('SP','SUDESTE'),'09':('SP','SUDESTE'),
        '20':('RJ','SUDESTE'),'21':('RJ','SUDESTE'),'22':('RJ','SUDESTE'),
        '23':('RJ','SUDESTE'),'24':('RJ','SUDESTE'),'29':('ES','SUDESTE'),
        '30':('MG','SUDESTE'),'31':('MG','SUDESTE'),'32':('MG','SUDESTE'),
        '33':('MG','SUDESTE'),'34':('MG','SUDESTE'),'35':('MG','SUDESTE'),
        '36':('MG','SUDESTE'),'37':('MG','SUDESTE'),'38':('MG','SUDESTE'),
        '39':('MG','SUDESTE'),
        '40':('BA','NORDESTE'),'41':('BA','NORDESTE'),'42':('BA','NORDESTE'),
        '43':('BA','NORDESTE'),'44':('BA','NORDESTE'),'45':('BA','NORDESTE'),
        '46':('BA','NORDESTE'),'47':('BA','NORDESTE'),'48':('BA','NORDESTE'),
        '49':('SE','NORDESTE'),
        '50':('PE','NORDESTE'),'51':('PE','NORDESTE'),'52':('PE','NORDESTE'),
        '53':('PE','NORDESTE'),'54':('PE','NORDESTE'),'55':('PE','NORDESTE'),
        '56':('AL','NORDESTE'),'57':('AL','NORDESTE'),
        '58':('PB','NORDESTE'),'59':('RN','NORDESTE'),
        '60':('CE','NORDESTE'),'61':('CE','NORDESTE'),'62':('CE','NORDESTE'),
        '63':('PI','NORDESTE'),'64':('PI','NORDESTE'),'65':('MA','NORDESTE'),
        '66':('PA','NORTE'),'67':('PA','NORTE'),'68':('AC','NORTE'),
        '69':('AM','NORTE'),'77':('TO','NORTE'),
        '70':('DF','CENTRO-OESTE'),'71':('DF','CENTRO-OESTE'),
        '72':('GO','CENTRO-OESTE'),'73':('GO','CENTRO-OESTE'),
        '74':('GO','CENTRO-OESTE'),'75':('GO','CENTRO-OESTE'),
        '76':('GO','CENTRO-OESTE'),
        '78':('MT','CENTRO-OESTE'),'79':('MS','CENTRO-OESTE'),
        '80':('PR','SUL'),'81':('PR','SUL'),'82':('PR','SUL'),
        '83':('PR','SUL'),'84':('PR','SUL'),'85':('PR','SUL'),
        '86':('PR','SUL'),'87':('PR','SUL'),
        '88':('SC','SUL'),'89':('SC','SUL'),
        '90':('RS','SUL'),'91':('RS','SUL'),'92':('RS','SUL'),
        '93':('RS','SUL'),'94':('RS','SUL'),'95':('RS','SUL'),
        '96':('RS','SUL'),'97':('RS','SUL'),'98':('RS','SUL'),'99':('RS','SUL'),
    }
    if 'CEP_3_digitos' not in df.columns:
        return df

    # Split the (UF, REGIAO) tuples into two flat lookups so Series.map can do
    # the work without a per-row Python lambda.
    uf_by_prefix = {prefix: pair[0] for prefix, pair in cep_map.items()}
    regiao_by_prefix = {prefix: pair[1] for prefix, pair in cep_map.items()}

    cep2 = df['CEP_3_digitos'].astype(str).str[:2]

    # Drop first (this already yields a new frame), then add the new columns to
    # that result so the caller's DataFrame is never modified.
    out = df.drop(columns=['CEP_3_digitos'])
    out['UF'] = cep2.map(uf_by_prefix).fillna('OUTROS')
    out['REGIAO'] = cep2.map(regiao_by_prefix).fillna('OUTROS')
    return out

def adjust_and_drop_date_cols(df):
    """Turn date columns into day-count features and drop every datetime column.

    The reference date of each row is the first day of its SAFRA month
    (SAFRA parsed with format '%Y%m'). When present:
      * 'var_12' (parsed as '%d/%m/%Y', invalid values become NaT) yields
        'DIAS_VAR_12' = reference date - var_12, in days;
      * 'PAG_DT_PRIMEIRA_FATURA' (parsed with pandas' default inference,
        invalid values become NaT) yields 'PAG_DIAS_DESDE_PRIMEIRA_FATURA'.
    All datetime-typed columns, including the two parsed ones, are removed
    from the result.

    Args:
        df: pandas DataFrame with a 'SAFRA' column.

    Returns:
        A new DataFrame. The input frame is left untouched.
    """
    derived = {}        # new feature name -> day-count Series
    parsed_cols = []    # source columns that become datetime and must be dropped

    # Parse in the same order as before: var_12, then SAFRA, then the invoice date.
    var_12 = None
    if 'var_12' in df.columns:
        var_12 = pd.to_datetime(df['var_12'], format='%d/%m/%Y', errors='coerce')
        parsed_cols.append('var_12')

    data_ref_safra = pd.to_datetime(df['SAFRA'].astype(str), format='%Y%m')

    if var_12 is not None:
        derived['DIAS_VAR_12'] = (data_ref_safra - var_12).dt.days

    if 'PAG_DT_PRIMEIRA_FATURA' in df.columns:
        primeira_fatura = pd.to_datetime(df['PAG_DT_PRIMEIRA_FATURA'], errors='coerce')
        derived['PAG_DIAS_DESDE_PRIMEIRA_FATURA'] = (data_ref_safra - primeira_fatura).dt.days
        parsed_cols.append('PAG_DT_PRIMEIRA_FATURA')

    # Columns to remove: anything already datetime-typed, the columns parsed
    # above, and a pre-existing 'DATA_REF_SAFRA' helper if one is present.
    # Names that are about to be (re)written as day counts are kept in place.
    already_datetime = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    candidates = dict.fromkeys(already_datetime + parsed_cols + ['DATA_REF_SAFRA'])
    to_drop = [c for c in candidates if c in df.columns and c not in derived]

    # drop() returns a new frame; the derived columns are added to that result
    # so the caller's DataFrame is never modified.
    out = df.drop(columns=to_drop)
    for col_name, values in derived.items():
        out[col_name] = values
    return out

def remove_high_missing(df, threshold=0.75):
    """Drop every column whose share of missing values is >= `threshold`.

    Prints how many columns were removed and returns a new DataFrame.
    """
    missing_share = df.isnull().mean()
    cols_to_drop = [col for col, share in missing_share.items() if share >= threshold]
    print(f'  High missing (>= {threshold:.0%}): {len(cols_to_drop)} colunas removidas')
    return df.drop(columns=cols_to_drop)

def remove_low_cardinality(df):
    """Drop columns with at most one distinct non-null value.

    Columns that are entirely null count as zero distinct values and are
    dropped too. Prints how many columns were removed and returns a new
    DataFrame.
    """
    low_card = []
    for col in df.columns:
        if df[col].nunique() <= 1:
            low_card.append(col)
    print(f'  Low cardinality (== 1): {len(low_card)} colunas removidas')
    return df.drop(columns=low_card)

def remove_high_correlation(df, threshold=0.8, safras_train=None):
    """Greedily drop numeric columns whose absolute correlation is >= `threshold`.

    Correlations are estimated on a 25% sample stratified by (SAFRA, FPD),
    optionally restricted to the rows whose SAFRA is in `safras_train`. On
    each pass the column with the largest remaining upper-triangle correlation
    is dropped, until no remaining pair reaches the threshold.

    Args:
        df: pandas DataFrame with 'SAFRA' and 'FPD' columns.
        threshold: absolute-correlation cut-off; pairs at or above it trigger
            a drop.
        safras_train: optional iterable of SAFRA values used to estimate the
            correlations. When None, all rows are used.

    Returns:
        A new DataFrame without the dropped columns (all rows are kept).
    """
    if safras_train is not None:
        df_corr_base = df[df['SAFRA'].isin(safras_train)]
    else:
        df_corr_base = df
    # Keep this exact sampling call: each group is sampled with its own
    # random_state=42, so rewriting it (e.g. as GroupBy.sample) would change
    # which rows are drawn and, potentially, which columns get dropped.
    df_sample = df_corr_base.groupby(['SAFRA', 'FPD'], group_keys=False).apply(
        lambda x: x.sample(frac=0.25, random_state=42))
    # NOTE(review): every int/float column except FPD takes part, which
    # includes SAFRA (and any numeric key) when stored as a number — confirm
    # this is intended.
    num_cols = df_sample.select_dtypes(include=['int32','int64','float32','float64']).columns
    num_cols = [c for c in num_cols if c != 'FPD']
    corr_matrix = df_sample[num_cols].corr().abs()
    # Strict upper triangle: each pair appears once, the diagonal is masked out.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = []
    while True:
        col_max = upper.max()
        max_corr = col_max.max()
        # Stop when nothing reaches the threshold. The comparison is written so
        # that NaN also stops the loop: NaN is what remains once a single
        # column is left (or there are no numeric columns at all), and
        # continuing would drop an uncorrelated column and then fail on the
        # emptied matrix.
        if not (max_corr >= threshold):
            break
        # sort_values places NaN last, so index[0] is the column holding the
        # current maximum.
        col_to_drop = col_max.sort_values(ascending=False).index[0]
        to_drop.append(col_to_drop)
        upper = upper.drop(index=col_to_drop, columns=col_to_drop)
    print(f'  High correlation (> {threshold}): {len(to_drop)} colunas removidas')
    return df.drop(columns=to_drop)

def remove_misused_columns(df):
    """Drop the fixed list of columns flagged as misused, when present.

    Prints the removed names only if at least one was found; returns a new
    DataFrame either way.
    """
    existing = []
    for candidate in ['PROD', 'flag_mig2', 'FAT_VLR_FPD', 'FAT_FLAG_MIG2_AQUISICAO']:
        if candidate in df.columns:
            existing.append(candidate)
    if len(existing) > 0:
        print(f'  Misused columns removed: {existing}')
    return df.drop(columns=existing, errors='ignore')

print('Funcoes de limpeza carregadas')

In [None]:
# =============================================================================
# 3.2 APPLY THE CLEANING STEPS
# =============================================================================
# The first four configured SAFRAs are the ones used to estimate correlations.
safras_train_val = SAFRAS[:4]

print('Aplicando limpezas...')
print(f'Shape original: {df.shape}')

# Ordered pipeline; `df` is rebound after every step so the previous frame can
# be released before the next step allocates its own copy.
cleaning_steps = (
    clean_empty_keys,
    convert_cep3_uf_regiao,
    adjust_and_drop_date_cols,
    remove_high_missing,
    remove_low_cardinality,
    lambda frame: remove_high_correlation(frame, threshold=0.8, safras_train=safras_train_val),
    remove_misused_columns,
)
for cleaning_step in cleaning_steps:
    df = cleaning_step(df)

print(f'Shape apos limpezas: {df.shape}')

## 4. Split Temporal + Amostragem Estratificada

Mesma logica do modelo v6:
- **Treino (sample)**: 25% estratificado por (SAFRA, FPD) das SAFRAs 202410-202501
- **OOS**: 75% restante das SAFRAs 202410-202501
- **OOT**: SAFRAs 202502-202503 (100%)

In [None]:
# =============================================================================
# 4. TEMPORAL SPLIT + STRATIFIED SAMPLING
# =============================================================================
# Purpose: split the cleaned frame into treino (25% stratified sample of the
# first four SAFRAs), OOS (the rest of those SAFRAs) and OOT (later SAFRAs).

# The split is positional on the sorted SAFRA values present in `df`.
# NOTE(review): the cleaning cell picks its SAFRAs from the config list
# (SAFRAS[:4]) while this cell uses the data itself — confirm both always
# resolve to the same four values.
safras_ord = sorted(df['SAFRA'].unique())
safras_train_oos = safras_ord[:4]  # expected: 202410-202501
safras_oot = safras_ord[4:]        # expected: 202502-202503

df_4_safras = df[df['SAFRA'].isin(safras_train_oos)]
df_oot_full = df[df['SAFRA'].isin(safras_oot)]

# 25% stratified sample: each (SAFRA, FPD) group is sampled independently with
# the same random_state, which is what makes the split reproducible.
# NOTE(review): with pandas' default groupby behaviour rows whose FPD is NaN
# belong to no group, so they are never sampled and all end up in OOS —
# confirm FPD has no missing values at this point.
df_sample = df_4_safras.groupby(['SAFRA', 'FPD'], group_keys=False).apply(
    lambda x: x.sample(frac=0.25, random_state=42))
# OOS = everything not sampled, selected by index label (this relies on the
# index labels of `df` being unique; the load cell builds `df` with
# ignore_index=True).
df_oos = df_4_safras.drop(df_sample.index)

# Exact duplicate rows are removed inside each base separately, so the final
# sizes can be slightly below 25% / 75% of the source rows.
df_treino = df_sample.reset_index(drop=True).drop_duplicates()
df_oos = df_oos.reset_index(drop=True).drop_duplicates()
df_oot = df_oot_full.reset_index(drop=True).drop_duplicates()

# Free the large intermediates. After this line `df` no longer exists, so this
# cell cannot be re-run without re-running the load and cleaning cells first.
del df, df_4_safras, df_oot_full, df_sample
gc.collect()

print(f'Treino (sample 25%): {df_treino.shape}')
print(f'OOS (75% restante):  {df_oos.shape}')
print(f'OOT (temporal):      {df_oot.shape}')

# Detailed volumes: per-SAFRA row counts and FPD=1 share for each base.
for name, data in [('Treino', df_treino), ('OOS', df_oos), ('OOT', df_oot)]:
    print(f'\n--- {name} ---')
    total = len(data)
    for safra in sorted(data['SAFRA'].unique()):
        s = data[data['SAFRA'] == safra]
        n_bad = (s['FPD'] == 1).sum()
        n_good = (s['FPD'] == 0).sum()
        print(f'  SAFRA {safra}: {len(s):>8,} rows | FPD=0: {n_good:>7,} | FPD=1: {n_bad:>6,} ({n_bad/len(s):.2%})')
    n_bad_total = (data['FPD'] == 1).sum()
    print(f'  TOTAL:       {total:>8,} rows | FPD=1: {n_bad_total:>6,} ({n_bad_total/total:.2%})')

## 5. Export para Lakehouse (Parquet + Delta)

Converte cada base de Pandas para Spark e grava em:
1. **Parquet**: `/lakehouse/default/Files/projeto-final/datasets/v6/` — formato de entrega
2. **Delta Table**: `Gold.feature_store.{treino_v6, oos_v6, oot_v6}` — acesso estruturado

Todas as tabelas sao particionadas por `SAFRA` para otimizar leituras parciais.

In [None]:
# =============================================================================
# 5. EXPORT TO THE LAKEHOUSE — PARQUET + DELTA
# =============================================================================
# For each base: convert pandas -> Spark, write a Parquet copy and a Delta copy
# (both partitioned by SAFRA), and record the expected counts in `results` for
# the validation and summary cells.
bases = {
    'treino_v6': df_treino,
    'oos_v6': df_oos,
    'oot_v6': df_oot,
}

results = []

for name, df_base in bases.items():
    print(f'\n{"="*60}')
    print(f'Exportando: {name} ({df_base.shape[0]:,} rows x {df_base.shape[1]} cols)')
    print(f'{"="*60}')

    # pandas -> Spark
    df_spark = spark.createDataFrame(df_base)

    # --- 5.1 Parquet ---
    parquet_path = f'{PARQUET_DIR}/{name}'
    df_spark.write.mode('overwrite').partitionBy('SAFRA').parquet(parquet_path)
    print(f'  Parquet salvo: {parquet_path}')

    # --- 5.2 Delta Table ---
    # overwriteSchema lets a re-run replace a table left by an earlier run even
    # when the cleaning steps produced a different set of columns; without it,
    # mode('overwrite') rejects the write on a schema mismatch.
    delta_path = f'{DELTA_BASE}/{name}'
    (
        df_spark.write
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .partitionBy('SAFRA')
        .format('delta')
        .save(delta_path)
    )
    print(f'  Delta salvo:   {delta_path}')

    # Expected figures, computed from the pandas frame that was exported.
    results.append({
        'base': name,
        'rows': df_base.shape[0],
        'cols': df_base.shape[1],
        'safras': sorted(df_base['SAFRA'].unique().tolist()),
        'fpd_rate': (df_base['FPD'] == 1).mean() * 100,
        'parquet_path': parquet_path,
        'delta_path': delta_path,
    })

    print(f'  OK!')

print(f'\n{"="*60}')
print('Export concluido!')
print(f'{"="*60}')

## 6. Validacao de Integridade

Recarrega cada base exportada e verifica:
- Contagem de registros (deve bater com original)
- Schema (colunas e tipos)
- FPD rate (deve bater com original)

In [None]:
# =============================================================================
# 6. INTEGRITY VALIDATION
# =============================================================================
# Reloads every exported base from Parquet and from Delta and compares it with
# what was exported: row count, column count, column names and FPD rate.

# Absolute tolerance, in percentage points, when comparing FPD rates.
FPD_RATE_TOL = 1e-6


def _fpd_rate_pct(sdf, n_rows):
    """Percentage of rows with FPD == 1 in a Spark DataFrame (NaN when it has no rows)."""
    if n_rows == 0:
        return float('nan')
    return sdf.filter(F.col('FPD') == 1).count() / n_rows * 100


print('Validando bases exportadas...\n')

all_ok = True

for res in results:
    name = res['base']
    expected_rows = res['rows']
    expected_cols = res['cols']
    expected_fpd = res['fpd_rate']
    # Column names of the pandas frame that was exported in the previous cell.
    expected_names = set(bases[name].columns)

    # Read Parquet back
    df_check_parquet = spark.read.parquet(res['parquet_path'])
    n_parquet = df_check_parquet.count()
    cols_parquet = len(df_check_parquet.columns)
    fpd_parquet = _fpd_rate_pct(df_check_parquet, n_parquet)

    # Read Delta back
    df_check_delta = spark.read.format('delta').load(res['delta_path'])
    n_delta = df_check_delta.count()
    cols_delta = len(df_check_delta.columns)
    fpd_delta = _fpd_rate_pct(df_check_delta, n_delta)

    # Checks
    ok_rows_pq = n_parquet == expected_rows
    ok_rows_dt = n_delta == expected_rows
    ok_cols = cols_parquet == expected_cols and cols_delta == expected_cols
    # Names are compared as sets: the partition column may come back in a
    # different position after a partitioned write.
    ok_schema = (set(df_check_parquet.columns) == expected_names
                 and set(df_check_delta.columns) == expected_names)
    # equal_nan=True so an empty base (NaN rate on both sides) still matches.
    ok_fpd = bool(
        np.isclose(fpd_parquet, expected_fpd, rtol=0, atol=FPD_RATE_TOL, equal_nan=True)
        and np.isclose(fpd_delta, expected_fpd, rtol=0, atol=FPD_RATE_TOL, equal_nan=True)
    )

    passed = ok_rows_pq and ok_rows_dt and ok_cols and ok_schema and ok_fpd
    status = 'PASS' if passed else 'FAIL'
    if status == 'FAIL':
        all_ok = False

    print(f'--- {name} [{status}] ---')
    print(f'  Esperado:    {expected_rows:>10,} rows x {expected_cols} cols')
    print(f'  Parquet:     {n_parquet:>10,} rows x {cols_parquet} cols {"OK" if ok_rows_pq else "FALHA"}')
    print(f'  Delta:       {n_delta:>10,} rows x {cols_delta} cols {"OK" if ok_rows_dt else "FALHA"}')
    print(f'  Schema:      {"OK" if (ok_cols and ok_schema) else "FALHA"}')
    print(f'  FPD rate:    {expected_fpd:.2f}% | Parquet {fpd_parquet:.2f}% | Delta {fpd_delta:.2f}% {"OK" if ok_fpd else "FALHA"}')
    print(f'  SAFRAs:      {res["safras"]}')
    print()

if all_ok:
    print('=== TODAS AS VALIDACOES PASSARAM ===')
else:
    print('!!! ALGUMA VALIDACAO FALHOU — VERIFICAR ACIMA !!!')

In [None]:
# =============================================================================
# 7. FINAL SUMMARY
# =============================================================================
# Prints one line per exported base (from `results`), the total row count and
# the fixed description of destinations and method.
banner = '=' * 70
divider = '  ' + '-' * 55

print(banner)
print('  EXPORT BASES v6 — RESUMO')
print(banner)

print(f'\n  {"Base":<15} {"Rows":>10} {"Cols":>6} {"FPD%":>7} {"SAFRAs"}')
print(divider)
total_rows = 0
for entry in results:
    safras_str = ', '.join(map(str, entry['safras']))
    print(f'  {entry["base"]:<15} {entry["rows"]:>10,} {entry["cols"]:>6} {entry["fpd_rate"]:>6.2f}% {safras_str}')
    total_rows += entry['rows']

print(divider)
print(f'  {"TOTAL":<15} {total_rows:>10,}')

print(f'\n  Parquet: {PARQUET_DIR}/')
print('  Delta:   Gold.feature_store.treino_v6 / oos_v6 / oot_v6')
print('\n  Particao: SAFRA')
print('  Limpezas: 7 funcoes (identicas modelo v6)')
print('  Split: 25% estratificado (SAFRA x FPD) | random_state=42')

print('\n' + banner)
print('  Bases prontas para entrega do projeto')
print(banner)