# 0.0 Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_raw = pd.read_csv('../data/raw_data/df_raw.csv')

# 1.0 Tratamento de dados faltantes

In [3]:
print("Duplicatas:", df_raw.duplicated().sum())
df_raw = df_raw.drop_duplicates()

print("Idades inv√°lidas:", df_raw[df_raw['idade'] < 0].shape[0])
print("Dependentes inv√°lidos:", df_raw[df_raw['dependentes'] < 0].shape[0])
print("Renda inv√°lida:", df_raw[df_raw['renda_mensal'] < 0].shape[0])


Duplicatas: 0
Idades inv√°lidas: 0
Dependentes inv√°lidos: 0
Renda inv√°lida: 0


In [4]:

target_col = 'inadipl_90dias_ult2anos'

print("=" * 80)
print("üîç ESTRAT√âGIA DE TRATAMENTO DE DADOS FALTANTES")
print("=" * 80)

missing_analysis = pd.DataFrame({
    'Feature': df_raw.columns,
    'Missing_Count': df_raw.isnull().sum(),
    'Missing_Pct': (df_raw.isnull().sum() / len(df_raw) * 100).round(2),
    'Dtype': df_raw.dtypes
})

missing_analysis = missing_analysis[missing_analysis['Missing_Count'] > 0]
missing_analysis = missing_analysis.sort_values('Missing_Pct', ascending=False)

print(missing_analysis.to_string())

# Decis√£o de tratamento
print("\nüìã ESTRAT√âGIA POR COLUNA:")
for idx, row in missing_analysis.iterrows():
    col = row['Feature']
    pct = row['Missing_Pct']
    
    if pct > 50:
        strategy = "‚ùå REMOVER coluna (>50% missing)"
    elif pct > 20:
        strategy = "‚ö†Ô∏è Criar flag + imputar ou remover"
    elif pct > 5:
        strategy = "üìä Imputar pela mediana/moda"
    else:
        strategy = "‚úÖ Imputar pela mediana/moda"
    
    print(f"  {col:30s} ({pct:5.2f}%) -> {strategy}")

üîç ESTRAT√âGIA DE TRATAMENTO DE DADOS FALTANTES
                                     Feature  Missing_Count  Missing_Pct    Dtype
comprometimento_renda  comprometimento_renda          29083        19.49  float64
renda_mensal                    renda_mensal          29065        19.48  float64
faixa_etaria                    faixa_etaria            658         0.44      str
utilizacao_credito        utilizacao_credito            114         0.08  float64
divida_ratio                    divida_ratio             18         0.01  float64

üìã ESTRAT√âGIA POR COLUNA:
  comprometimento_renda          (19.49%) -> üìä Imputar pela mediana/moda
  renda_mensal                   (19.48%) -> üìä Imputar pela mediana/moda
  faixa_etaria                   ( 0.44%) -> ‚úÖ Imputar pela mediana/moda
  utilizacao_credito             ( 0.08%) -> ‚úÖ Imputar pela mediana/moda
  divida_ratio                   ( 0.01%) -> ‚úÖ Imputar pela mediana/moda


## 1.1 Imputa√ß√£o de valores segmentada por faixa et√°ria (mediana dentro de cada grupo et√°rio)

In [5]:
df_clean = df_raw.copy()

# 1. Criar flags de missing (vari√°veis cr√≠ticas e adicionais)
df_clean['renda_mensal_missing'] = df_clean['renda_mensal'].isna().astype(int)
df_clean['comprometimento_renda_missing'] = df_clean['comprometimento_renda'].isna().astype(int)
df_clean['faixa_etaria_missing'] = df_clean['faixa_etaria'].isna().astype(int)
df_clean['utilizacao_credito_missing'] = df_clean['utilizacao_credito'].isna().astype(int)
df_clean['divida_ratio_missing'] = df_clean['divida_ratio'].isna().astype(int)

# 2. Imputa√ß√£o segmentada por faixa et√°ria (mediana dentro de cada grupo)
for col in ['renda_mensal', 'comprometimento_renda']:
    df_clean[col] = df_clean.groupby('faixa_etaria')[col].transform(
        lambda x: x.fillna(x.median())
    )
    # Se ainda restarem NaN (grupos sem mediana), imputar com a mediana geral
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# 3. Faixa et√°ria: imputar como "n√£o informado"
df_clean['faixa_etaria'] = df_clean['faixa_etaria'].fillna('n√£o informado')

# 4. Vari√°veis com baixo missing ‚Üí imputa√ß√£o simples
df_clean['utilizacao_credito'] = df_clean['utilizacao_credito'].fillna(df_clean['utilizacao_credito'].median())
df_clean['divida_ratio'] = df_clean['divida_ratio'].fillna(df_clean['divida_ratio'].median())

# Checar se ainda restam NaN
print(df_clean.isnull().sum())


inadipl_90dias_ult2anos          0
utilizacao_credito               0
idade                            0
atrasos_30dias                   0
divida_ratio                     0
renda_mensal                     0
linhas_credito_abertas           0
atrasos_90dias                   0
emprestimos_imobiliarioss        0
dependentes                      0
comprometimento_renda            0
faixa_etaria                     0
renda_mensal_missing             0
comprometimento_renda_missing    0
faixa_etaria_missing             0
utilizacao_credito_missing       0
divida_ratio_missing             0
dtype: int64


In [6]:
df_clean

Unnamed: 0,inadipl_90dias_ult2anos,utilizacao_credito,idade,atrasos_30dias,divida_ratio,renda_mensal,linhas_credito_abertas,atrasos_90dias,emprestimos_imobiliarioss,dependentes,comprometimento_renda,faixa_etaria,renda_mensal_missing,comprometimento_renda_missing,faixa_etaria_missing,utilizacao_credito_missing,divida_ratio_missing
0,True,0.7661,45,2,0.8030,9120.0,13,0,6,2,7323.36,36-45,0,0,0,0,0
1,False,0.9572,40,0,0.1219,2600.0,4,0,0,1,316.94,36-45,0,0,0,0,0
2,False,0.6582,38,1,0.0851,3042.0,2,1,0,0,258.87,36-45,0,0,0,0,0
3,False,0.2338,30,0,0.0360,3300.0,5,0,0,0,118.80,26-35,0,0,0,0,0
4,False,0.9072,49,1,0.0249,63588.0,7,0,1,0,1583.34,46-60,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149229,False,0.0407,74,0,0.2251,2100.0,4,0,1,0,472.71,60+,0,0,0,0,0
149230,False,0.2997,44,0,0.7166,5584.0,4,0,1,2,4001.49,36-45,0,0,0,0,0
149231,False,0.2460,58,0,3870.0000,6291.0,18,0,1,0,2096.84,46-60,1,1,0,0,0
149232,False,0.0000,30,0,0.0000,5716.0,4,0,0,0,0.00,26-35,0,0,0,0,0


## 1.2 Tratamento de outliers

In [7]:
print("="*80)
print("üîß TRATAMENTO DE OUTLIERS ORIENTADO AO NEG√ìCIO DE CR√âDITO")
print("="*80)

# Idade fora de faixa (regras de elegibilidade)
df_clean.loc[df_clean['idade'] < 18, 'idade'] = 18
df_clean.loc[df_clean['idade'] > 90, 'idade'] = 90
print("‚úÖ Idade truncada para [18, 90]")

# Comprometimento de renda (limite de aceita√ß√£o orientado ao neg√≥cio)
# At√© 30% = saud√°vel, 30‚Äì50% = alerta, >50% = alto risco, >100% = cr√≠tico
df_clean.loc[df_clean['comprometimento_renda'] > 100, 'comprometimento_renda'] = 100
print("‚úÖ Comprometimento de renda truncado em 100% (inadimpl√™ncia cr√≠tica)")

# Winsorization para renda mensal (evitar valores absurdos)
# Regras de neg√≥cio: m√≠nimo ~ sal√°rio m√≠nimo, m√°ximo ~ R$ 50 mil
renda_min = 1412   # sal√°rio m√≠nimo 2026
renda_max = 50000  # limite plaus√≠vel para PF em cr√©dito massificado
df_clean['renda_mensal'] = np.clip(df_clean['renda_mensal'], renda_min, renda_max)
print(f"‚úÖ Renda mensal truncada entre {renda_min:.2f} e {renda_max:.2f}")

# Opcional: sinalizar registros alterados
df_clean['idade_outlier_flag'] = ((df_raw['idade'] < 18) | (df_raw['idade'] > 90)).astype(int)
df_clean['comprometimento_outlier_flag'] = (df_raw['comprometimento_renda'] > 100).astype(int)
df_clean['renda_outlier_flag'] = ((df_raw['renda_mensal'] < renda_min) | (df_raw['renda_mensal'] > renda_max)).astype(int)

print("\n‚úÖ Flags de outliers criadas para auditoria/modelagem")


üîß TRATAMENTO DE OUTLIERS ORIENTADO AO NEG√ìCIO DE CR√âDITO
‚úÖ Idade truncada para [18, 90]
‚úÖ Comprometimento de renda truncado em 100% (inadimpl√™ncia cr√≠tica)
‚úÖ Renda mensal truncada entre 1412.00 e 50000.00

‚úÖ Flags de outliers criadas para auditoria/modelagem


In [8]:
df_clean.describe()

Unnamed: 0,utilizacao_credito,idade,atrasos_30dias,divida_ratio,renda_mensal,linhas_credito_abertas,atrasos_90dias,emprestimos_imobiliarioss,dependentes,comprometimento_renda,renda_mensal_missing,comprometimento_renda_missing,faixa_etaria_missing,utilizacao_credito_missing,divida_ratio_missing,idade_outlier_flag,comprometimento_outlier_flag,renda_outlier_flag
count,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0,149234.0
mean,6.077182,52.284989,0.394294,354.805859,6318.768773,8.487838,0.238344,1.023467,2.758976,93.164707,0.194761,0.194882,0.004409,0.000764,0.000121,0.003129,0.7115,0.046692
std,250.395211,14.684844,3.854958,2042.885057,4558.551071,5.134498,3.828162,1.130303,36.689326,23.092184,0.396018,0.396111,0.066255,0.027628,0.010982,0.055853,0.453066,0.210979
min,0.0,18.0,0.0,0.0,1412.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0304,41.0,0.0,0.1782,3878.0,5.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.1547,52.0,0.0,0.3686,5394.0,8.0,0.0,1.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.5561,63.0,0.0,0.877,7400.0,11.0,0.0,2.0,2.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,50708.0,90.0,98.0,329664.0,50000.0,58.0,98.0,54.0,985.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Salvando em excel .csv
import os

caminho_dir = r"C:\Users\wesle\anaconda3\envs\Credit_Score_Project\data"
os.makedirs(caminho_dir, exist_ok=True)

df_clean.to_csv(os.path.join(caminho_dir, "df_cleaned.csv"), index=False, encoding="utf-8-sig")
print(f"   Shape original: {df_raw.shape}")
print(f"   Shape limpo: {df_clean.shape}")

   Shape original: (149234, 12)
   Shape limpo: (149234, 20)
