### Importação das bibliotecas necessárias

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import  GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

### Carregamento e exploração inicial dos dados

In [41]:
# Load the raw train and test splits (paths are relative to the notebook's working directory)
train_data = pd.read_csv("../data/raw/credit_score_train.csv")
test_data = pd.read_csv("../data/raw/credit_score_test.csv")
# Report (rows, columns) of each split
print("Shape train:", train_data.shape)
print("Shape test:", test_data.shape)

Shape train: (100000, 28)
Shape test: (50000, 27)


In [42]:
train_data.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [43]:
train_data.columns.tolist()


['ID',
 'Customer_ID',
 'Month',
 'Name',
 'Age',
 'SSN',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'Credit_Score']

In [44]:
test_data.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,0x160a,CUS_0xd40,September,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,2022.0,Good,809.98,35.030402,22 Years and 9 Months,No,49.574949,236.64268203272132,Low_spent_Small_value_payments,186.26670208571767
1,0x160b,CUS_0xd40,October,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.053114,22 Years and 10 Months,No,49.574949,21.465380264657146,High_spent_Medium_value_payments,361.444003853782
2,0x160c,CUS_0xd40,November,Aaron Maashoh,24,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,33.811894,,No,49.574949,148.23393788500923,Low_spent_Medium_value_payments,264.67544623343
3,0x160d,CUS_0xd40,December,Aaron Maashoh,24_,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,32.430559,23 Years and 0 Months,No,49.574949,39.08251089460281,High_spent_Medium_value_payments,343.82687322383634
4,0x1616,CUS_0x21b1,September,Rick Rothackerj,28,004-07-5839,_______,34847.84,3037.986667,2,...,5.0,Good,605.03,25.926822,27 Years and 3 Months,No,18.816215,39.684018417945296,High_spent_Large_value_payments,485.2984336755923


In [45]:
test_data.columns.tolist()

['ID',
 'Customer_ID',
 'Month',
 'Name',
 'Age',
 'SSN',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance']

In [46]:
# Compare the column sets of both splits (the test split has no Credit_Score)
train_cols = set(train_data.columns)
test_cols = set(test_data.columns)
print("Colunas apenas no treino:", train_cols.difference(test_cols))
print("Colunas apenas no teste:", test_cols.difference(train_cols))

Colunas apenas no treino: {'Credit_Score'}
Colunas apenas no teste: set()


### Análise exploratória de dados (EDA)

In [47]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [48]:
train_data.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


In [49]:
train_data.isnull().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64

In [50]:
train_data['Credit_Score'].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

In [51]:
# Class balance of the target variable: absolute counts first, then percentages
print("\nDistribuição da variável alvo (Credit_Score):")
print(train_data['Credit_Score'].value_counts())
print(train_data['Credit_Score'].value_counts(normalize=True) * 100)


Distribuição da variável alvo (Credit_Score):
Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64
Credit_Score
Standard    53.174
Poor        28.998
Good        17.828
Name: proportion, dtype: float64


### Pré-processamento

In [52]:
def duplicate_values(df):
    """Report fully duplicated rows of ``df`` and drop them in place.

    A row counts as duplicated when every column matches an earlier row;
    the first occurrence is kept. The frame passed in is mutated and
    nothing is returned.
    """
    print("Validação de duplicação.")
    dup_count = df.duplicated(subset=None, keep='first').sum()

    # Nothing to remove: report it and leave the frame untouched
    if dup_count == 0:
        print("Não existem duplicadas")
        return

    print("Existem", dup_count, "duplicadas.")
    df.drop_duplicates(keep='first', inplace=True)
    print(dup_count, "duplicadas excluidas")

# Run the duplicate check on both splits; duplicated rows, if any, are removed from the frame itself
duplicate_values(train_data)
duplicate_values(test_data)

Validação de duplicação.
Não existem duplicadas
Validação de duplicação.
Não existem duplicadas


In [53]:
# Scan the int64/float64 columns for values whose text form is not made of digits only
# once '.' and '-' are removed (NaN is reported too, because its text form is 'nan')
print("Verificando valores numéricos inválidos...")
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numeric_cols:
    col_as_text = train_data[col].astype(str)
    digits_only = col_as_text.str.replace('.', '').str.replace('-', '').str.isdigit()
    invalid_values = train_data[~digits_only]
    if invalid_values.empty:
        continue
    print(f"Coluna {col} contém valores não numéricos: {invalid_values[col].unique()}")

Verificando valores numéricos inválidos...
Coluna Monthly_Inhand_Salary contém valores não numéricos: [nan]
Coluna Num_Credit_Inquiries contém valores não numéricos: [nan]


Aqui pudemos notar que existem valores que não fazem sentido para o campo Idade (`Age`), como valores negativos, muito altos ou com o caractere `_`.

In [54]:
train_data['Age'].unique()

array(['23', '-500', '28_', ..., '4808_', '2263', '1342'],
      shape=(1788,), dtype=object)

In [55]:
train_data['Occupation'].unique()

array(['Scientist', '_______', 'Teacher', 'Engineer', 'Entrepreneur',
       'Developer', 'Lawyer', 'Media_Manager', 'Doctor', 'Journalist',
       'Manager', 'Accountant', 'Musician', 'Mechanic', 'Writer',
       'Architect'], dtype=object)

Então criamos uma função para limpar e converter valores

In [56]:
# Função para limpar e converter valores
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw credit-score frame; ``df`` itself is not modified.

    Steps, in order:
      1. Drop the identifier / unused columns.
      2. Turn the placeholder tokens '_', '!@9#%8' and '#F%$D@*&8' into NaN.
      3. Fill missing ``Credit_Mix`` with 'Unknown' and missing or placeholder
         ``Occupation`` with 'Other'.
      4. Parse ``Age`` (underscores stripped), blank out values outside 0-100
         and impute with the median of the remaining ages.
      5. Convert ``Credit_History_Age`` ("N Years and M Months") to a number of
         months and impute with the median.
      6. Strip non-numeric characters from the numeric columns and cast them.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must contain the dropped columns plus ``Credit_Mix``,
        ``Occupation``, ``Age`` and ``Credit_History_Age``; the remaining
        numeric columns are converted only when present.

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns.

    Notes
    -----
    Medians are computed from the frame passed in, so each split is imputed
    with its own median.
    """
    df_copy = df.copy()

    # Columns that are not used downstream
    df_copy = df_copy.drop(['ID','Customer_ID','Month','Name','SSN', 'Type_of_Loan', 'Changed_Credit_Limit', 'Monthly_Inhand_Salary'], axis = 1)

    # Placeholder tokens are treated as missing values (exact matches only)
    df_copy = df_copy.replace(['_', '!@9#%8', '#F%$D@*&8'], np.nan)

    # Missing Credit_Mix becomes its own category.
    # Assigned back instead of `fillna(..., inplace=True)` on the column: that
    # chained in-place form does not update the frame under pandas Copy-on-Write.
    df_copy['Credit_Mix'] = df_copy['Credit_Mix'].fillna('Unknown').astype('object')

    # Occupation: both the '_______' placeholder and NaN collapse into 'Other'
    df_copy['Occupation'] = df_copy['Occupation'].replace('_______', 'Other').fillna('Other')

    # Age: strip underscores, parse, keep only 0..100, impute with the median of valid ages.
    # `where` is used so that inserting NaN never depends on upcasting an integer column.
    age = pd.to_numeric(df_copy['Age'].astype(str).str.replace('_', ''), errors='coerce')
    age = age.where(age.between(0, 100))
    df_copy['Age'] = age.fillna(age.median())

    def convert_credit_history(x):
        """Convert 'N Years and M Months' to a total number of months (NaN if unparseable)."""
        if pd.isna(x) or x == 'NA':
            return np.nan
        text = str(x)
        try:
            years = int(text.split('Years')[0].strip()) if 'Years' in text else 0
            months = int(text.split('Months')[0].split('and')[-1].strip()) if 'Months' in text else 0
        except ValueError:
            # Malformed text: treat it as missing instead of aborting the whole cleaning
            return np.nan
        return years * 12 + months

    # Credit_History_Age in months, imputed with the median of the parsed values
    history_months = df_copy['Credit_History_Age'].apply(convert_credit_history)
    df_copy['Credit_History_Age'] = history_months.fillna(history_months.median())

    # Columns expected to be numeric
    numeric_cols = ['Age','Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Credit_History_Age',
                  'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
                  'Num_of_Delayed_Payment', 'Num_Credit_Inquiries',
                  'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month',
                  'Amount_invested_monthly', 'Monthly_Balance']

    # NOTE(review): only stray characters are stripped here, so a sentinel that is
    # numeric apart from its decoration still parses as a number (after cleaning,
    # Monthly_Balance contains -3.33e+26) — presumably such values should be nulled; confirm.
    for col in numeric_cols:
        if col in df_copy.columns:
            # Keep digits, '.' and '-' only; anything that still fails to parse becomes NaN
            stripped = df_copy[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True)
            df_copy[col] = pd.to_numeric(stripped, errors='coerce')

    return df_copy

In [57]:
# Apply the cleaning step to both splits.
# NOTE: the raw frames are overwritten here, so this cell cannot be re-run on its own —
# clean_data drops columns that no longer exist after the first pass. Reload the CSVs first.
train_data = clean_data(train_data)
test_data = clean_data(test_data)

In [58]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       50000 non-null  float64
 1   Occupation                50000 non-null  object 
 2   Annual_Income             50000 non-null  float64
 3   Num_Bank_Accounts         50000 non-null  int64  
 4   Num_Credit_Card           50000 non-null  int64  
 5   Interest_Rate             50000 non-null  int64  
 6   Num_of_Loan               50000 non-null  int64  
 7   Delay_from_due_date       50000 non-null  int64  
 8   Num_of_Delayed_Payment    46502 non-null  float64
 9   Num_Credit_Inquiries      48965 non-null  float64
 10  Credit_Mix                50000 non-null  object 
 11  Outstanding_Debt          50000 non-null  float64
 12  Credit_Utilization_Ratio  50000 non-null  float64
 13  Credit_History_Age        50000 non-null  float64
 14  Paymen

In [59]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Age                       100000 non-null  float64
 1   Occupation                100000 non-null  object 
 2   Annual_Income             100000 non-null  float64
 3   Num_Bank_Accounts         100000 non-null  int64  
 4   Num_Credit_Card           100000 non-null  int64  
 5   Interest_Rate             100000 non-null  int64  
 6   Num_of_Loan               100000 non-null  int64  
 7   Delay_from_due_date       100000 non-null  int64  
 8   Num_of_Delayed_Payment    92998 non-null   float64
 9   Num_Credit_Inquiries      98035 non-null   float64
 10  Credit_Mix                100000 non-null  object 
 11  Outstanding_Debt          100000 non-null  float64
 12  Credit_Utilization_Ratio  100000 non-null  float64
 13  Credit_History_Age        100000 non-null  fl

In [60]:
# Ordinal encoding of the target labels: Poor (0) < Standard (1) < Good (2)
mapeamento = {'Good': 2, 'Standard': 1, 'Poor': 0}

# Map only while the column still holds text labels. Series.map returns NaN for
# values that are not keys of the dict, so re-running this cell on an already
# encoded column would otherwise wipe out the whole target.
if not pd.api.types.is_numeric_dtype(train_data['Credit_Score']):
    train_data['Credit_Score'] = train_data['Credit_Score'].map(mapeamento)

In [61]:
# Remaining missing values per column after cleaning, for each split
print("\nValores nulos após limpeza (treino):")
print(train_data.isnull().sum())
print("\nValores nulos após limpeza (teste):")
print(test_data.isnull().sum())


Valores nulos após limpeza (treino):
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Delay_from_due_date            0
Num_of_Delayed_Payment      7002
Num_Credit_Inquiries        1965
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age             0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     4479
Payment_Behaviour           7600
Monthly_Balance             1200
Credit_Score                   0
dtype: int64

Valores nulos após limpeza (teste):
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Delay_from_due_date  

In [62]:
train_data['Credit_Mix'].unique()

array(['Unknown', 'Good', 'Standard', 'Bad'], dtype=object)

In [63]:
# Pairwise correlation matrix over the int64/float64 columns (computed and stored; not displayed in this cell)
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
corr_matrix = train_data[numeric_cols].corr()

In [64]:
print(train_data['Delay_from_due_date'].unique())

[ 3 -1  5  6  8  7 13 10  0  4  9  1 12 11 30 31 34 27 14  2 -2 16 17 15
 23 22 21 18 19 52 51 48 53 26 43 28 25 20 47 46 49 24 61 29 50 58 45 59
 55 56 57 54 62 65 64 67 36 41 33 32 39 44 42 60 35 38 -3 63 40 37 -5 -4
 66]


Isso mostra que o atraso no pagamento auxilia na decisão de aprovar ou não o crédito

In [65]:
print(train_data['Outstanding_Debt'].unique())

[ 809.98  605.03 1303.01 ...  620.64 3571.7   502.38]


In [66]:
# List the distinct values of each categorical column before encoding them
colunas_obj = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']
for coluna in colunas_obj:
    print(f"Valores únicos na coluna {coluna}: {train_data[coluna].unique()}")


Valores únicos na coluna Occupation: ['Scientist' 'Other' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']
Valores únicos na coluna Credit_Mix: ['Unknown' 'Good' 'Standard' 'Bad']
Valores únicos na coluna Payment_of_Min_Amount: ['No' 'NM' 'Yes']
Valores únicos na coluna Payment_Behaviour: ['High_spent_Small_value_payments' 'Low_spent_Large_value_payments'
 'Low_spent_Medium_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' nan 'High_spent_Large_value_payments']


In [67]:
# Fitted encoders, stored per column name
encoders = {}

# Categorical columns to be encoded
colunas_categoricas = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']

# Label-encode every categorical column of the training frame
for coluna in colunas_categoricas:
    le = LabelEncoder()
    # NaN becomes the explicit category 'Missing' so it receives a code of its own
    train_data[coluna] = le.fit_transform(train_data[coluna].fillna('Missing'))
    encoders[coluna] = le

# NOTE(review): only train_data is encoded in this cell; in the visible cells test_data
# keeps its text labels — confirm that it is transformed with these encoders before scoring.

# Show the resulting codes for a quick check
for coluna in colunas_categoricas:
    print(f"Valores codificados na coluna {coluna}: {train_data[coluna].unique()}")


Valores codificados na coluna Occupation: [13 12 14  4  5  2  7 10  3  6  8  0 11  9 15  1]
Valores codificados na coluna Credit_Mix: [3 1 2 0]
Valores codificados na coluna Payment_of_Min_Amount: [1 0 2]
Valores codificados na coluna Payment_Behaviour: [2 3 4 5 1 6 0]


In [68]:
# Correlation between each encoded categorical column and 'Credit_Score'.
# NOTE: the codes come from label encoding, so for nominal columns the sign and
# size of these coefficients depend on the arbitrary order of the codes.
correlacoes = {
    coluna: train_data[coluna].corr(train_data['Credit_Score'])
    for coluna in colunas_categoricas
}

# Print one line per column
for coluna, correlacao in correlacoes.items():
    print(f"Correlação entre {coluna} e Credit_Score: {correlacao}")


Correlação entre Occupation e Credit_Score: -0.01030381716828363
Correlação entre Credit_Mix e Credit_Score: 0.10632627078934266
Correlação entre Payment_of_Min_Amount e Credit_Score: -0.2781269555652287
Correlação entre Payment_Behaviour e Credit_Score: -0.10301659049414222


In [69]:
# Correlation of every column with 'Credit_Score', shown from the most positive to the most negative
correlacoes_todas = train_data.corr()['Credit_Score']
correlacoes_todas.sort_values(ascending=False)

Credit_Score                1.000000
Credit_History_Age          0.371374
Age                         0.157543
Credit_Mix                  0.106326
Credit_Utilization_Ratio    0.045793
Amount_invested_monthly     0.011631
Annual_Income               0.008015
Total_EMI_per_month         0.004382
Monthly_Balance            -0.001585
Interest_Rate              -0.002591
Num_Credit_Card            -0.007624
Num_of_Delayed_Payment     -0.008103
Occupation                 -0.010304
Num_Bank_Accounts          -0.010491
Num_of_Loan                -0.010915
Num_Credit_Inquiries       -0.011235
Payment_Behaviour          -0.103017
Payment_of_Min_Amount      -0.278127
Outstanding_Debt           -0.386525
Delay_from_due_date        -0.431591
Name: Credit_Score, dtype: float64

In [70]:
# Same scan as before cleaning: flag values of the int64/float64 columns whose text form
# is not digits only once '.' and '-' are removed (NaN and exponent notation are flagged too)
print("\nVerificando valores numéricos inválidos...")
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numeric_cols:
    col_as_text = train_data[col].astype(str)
    digits_only = col_as_text.str.replace('.', '').str.replace('-', '').str.isdigit()
    invalid_values = train_data[~digits_only]
    if invalid_values.empty:
        continue
    print(f"Coluna {col} contém valores não numéricos: {invalid_values[col].unique()}")


Verificando valores numéricos inválidos...
Coluna Num_of_Delayed_Payment contém valores não numéricos: [nan]
Coluna Num_Credit_Inquiries contém valores não numéricos: [nan]
Coluna Amount_invested_monthly contém valores não numéricos: [nan]
Coluna Monthly_Balance contém valores não numéricos: [            nan -3.33333333e+26]


In [71]:
print(train_data.isnull().sum())

Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Delay_from_due_date            0
Num_of_Delayed_Payment      7002
Num_Credit_Inquiries        1965
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age             0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly     4479
Payment_Behaviour              0
Monthly_Balance             1200
Credit_Score                   0
dtype: int64


In [72]:
# Split the training frame into the target vector and the feature matrix
y = train_data['Credit_Score']
X = train_data.drop(columns=['Credit_Score'])

In [73]:
# Persist the processed frames (the ../data/processed directory must already exist).
# NOTE(review): at this point train_data has label-encoded categorical columns and an
# encoded target, while test_data has only been through clean_data in the visible cells —
# confirm the two files are meant to differ in encoding.
train_data.to_csv("../data/processed/quantum_finance_train_processed.csv", index=False)
test_data.to_csv("../data/processed/quantum_finance_test_processed.csv", index=False)
print("Arquivos salvos como quantum_finance_train_processed.csv e quantum_finance_test_processed.csv")

Arquivos salvos como quantum_finance_train_processed.csv e quantum_finance_test_processed.csv
