# Modelo de Risco de Crédito

In [1]:
import pandas as pd
from lib.feature_eng import find_class_columns

### Extraição de dados usando Kaggle CLI


In [None]:
%%bash
mkdir -p data && \
cd data && \ 
kaggle competitions download -c home-credit-credit-risk-model-stability && \
unzip home-credit-credit-risk-model-stability.zip && \
rm -rf parquet_files \

### Exploração e pré-processamento dos dados

PD na base de treino = 3,14%

In [2]:
df_base_train = pd.read_csv('data/csv_files/train/train_base.csv')
df_base_train.head(10)

count = df_base_train.target.value_counts()

# porcentagem de inadimplencia
not_default_rate = count[1] / (count[0] + count[1])
print(not_default_rate)

0.03143727577671242


### Seleção de arquivos relacionados com as classes escolhidas

In [2]:
train_depth_0 = ['train_static_0_0', 'train_static_0_1','train_static_cb_0']
train_depth_1 = [
  'train_applprev_1_0', 'train_applprev_1_1','train_other_1', 'train_tax_registry_a_1', 
  'train_tax_registry_b_1', 'train_tax_registry_c_1', 'train_credit_bureau_a_1_0', 
  'train_credit_bureau_a_1_1', 'train_credit_bureau_a_1_2', 'train_credit_bureau_a_1_3', 
  'train_credit_bureau_b_1', 'train_deposit_1', 'train_person_1', 'train_debitcard_1',
]
train_depth_2 = [
  'train_applprev_2', 'train_person_2', 'train_credit_bureau_a_2_0', 'train_credit_bureau_a_2_1',
  'train_credit_bureau_a_2_2', 'train_credit_bureau_a_2_3', 'train_credit_bureau_a_2_4',
  'train_credit_bureau_a_2_5', 'train_credit_bureau_a_2_6', 'train_credit_bureau_a_2_7',
  'train_credit_bureau_a_2_8', 'train_credit_bureau_a_2_9', 'train_credit_bureau_a_2_10',
  'train_credit_bureau_b_2',
]

classes_list = [
  'amount_416A', 'amtdepositbalance_4809441A','amtdepositincoming_4809444A', 'amtdepositoutgoing_4809442A',
  'amtinstpaidbefduel24m_4187115A', 'days360_512L','maxdpdlast12m_727P',
  'birthdate_87D', 'cntpmts24_3658933L', 'credacc_actualbalance_314A','credacc_maxhisbal_375A', 'credacc_minhisbal_90A',
  'currdebt_22A', 'currdebtcredtyperange_828A', 'contractsum_5085717L', 'downpmt_116A', 'education_88M',
  'for3years_128L', 'gender_992L'
]

depth_0_occurrences = find_class_columns(feature_cols=classes_list, file_list=train_depth_0)
depth_1_occurrences = find_class_columns(feature_cols=classes_list, file_list=train_depth_1)
depth_2_occurrences = find_class_columns(feature_cols=classes_list, file_list=train_depth_2)

# Dicionarios com arquivos e suas respectivas colunas (classes)
print(f'depth_0 = {depth_0_occurrences}') 
print(f'depth_1 = {depth_1_occurrences}')
print(f'depth_2 = {depth_2_occurrences}')

depth_0 = {'train_static_0_0': {'amtinstpaidbefduel24m_4187115A', 'maxdpdlast12m_727P', 'currdebtcredtyperange_828A', 'cntpmts24_3658933L', 'downpmt_116A', 'currdebt_22A'}, 'train_static_0_1': {'amtinstpaidbefduel24m_4187115A', 'maxdpdlast12m_727P', 'currdebtcredtyperange_828A', 'cntpmts24_3658933L', 'downpmt_116A', 'currdebt_22A'}, 'train_static_cb_0': {'days360_512L', 'education_88M', 'for3years_128L'}}
depth_1 = {'train_applprev_1_0': {'credacc_actualbalance_314A', 'credacc_maxhisbal_375A', 'credacc_minhisbal_90A'}, 'train_applprev_1_1': {'credacc_actualbalance_314A', 'credacc_maxhisbal_375A', 'credacc_minhisbal_90A'}, 'train_other_1': {'amtdepositincoming_4809444A', 'amtdepositoutgoing_4809442A', 'amtdepositbalance_4809441A'}, 'train_credit_bureau_a_1_0': {'contractsum_5085717L'}, 'train_credit_bureau_a_1_1': {'contractsum_5085717L'}, 'train_credit_bureau_a_1_2': {'contractsum_5085717L'}, 'train_credit_bureau_a_1_3': {'contractsum_5085717L'}, 'train_deposit_1': {'amount_416A'}, 'tr

### Pré-processamento dos dados selecionados

In [12]:

train_static_0_0 = list(depth_0_occurrences['train_static_0_0'])
train_static_0_0.insert(0, 'case_id')

# Meses com renda : cntpmts24_3658933L
df_base_info = pd.read_csv('data/csv_files/train/train_base.csv', usecols=['case_id', 'target'])
df_static = pd.read_csv('data/csv_files/train/train_static_0_0.csv', usecols=["case_id", "cntpmts24_3658933L"])
df_renda = pd.merge(df_base_info, df_static, how='left', on='case_id')
df_renda.fillna(0, inplace=True)

result = df_renda.groupby('cntpmts24_3658933L')['target'].value_counts().unstack(fill_value=0)

# Add necessary columns
result.columns = ['non_events', 'events']  # target=0, target=1
result['total_obs'] = result['non_events'] + result['events']

# Calculate distributions
total_events = result['events'].sum()
total_non_events = result['non_events'].sum()

result['pct_events'] = result['events'] / total_events
result['pct_non_events'] = result['non_events'] / total_non_events

# Calculate WoE and Information Value (IV)
result['woe'] = np.log(result['pct_events'] / result['pct_non_events'])
result['iv'] = (result['pct_events'] - result['pct_non_events']) * result['woe']

# total_iv = result['iv'].sum()

result

NameError: name 'np' is not defined