# Exploração e Processamento de Dados para Classificação de Score de Crédito

In [6]:

import pandas as pd
import numpy as np
import re

# Carregar dados
df_train = pd.read_csv("../data/raw/credit_score_train.csv")
df_test = pd.read_csv("../data/raw/credit_score_test.csv")
print("Shape train:", df_train.shape)
print("Shape test:", df_test.shape)


  df_train = pd.read_csv("../data/raw/credit_score_train.csv")


Shape train: (100000, 28)
Shape test: (50000, 27)


## Conversões de tipos e limpeza de colunas numéricas

In [7]:

def to_float(val):
    try:
        return float(str(val).replace("_", "").replace(",", ""))
    except:
        return np.nan

def extract_months(age_str):
    if isinstance(age_str, str):
        match = re.search(r"(\d+)\s*Years?.*?(\d+)?\s*Months?", age_str)
        if match:
            years = int(match.group(1))
            months = int(match.group(2)) if match.group(2) else 0
            return years * 12 + months
    return np.nan

def to_numeric(val):
    try:
        return pd.to_numeric(val)
    except:
        return np.nan

# Conversões
cols_to_float = ['Annual_Income', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance']
for col in cols_to_float:
    df_train[col] = df_train[col].apply(to_float)
    df_test[col] = df_test[col].apply(to_float)

df_train['Age'] = df_train['Age'].apply(to_numeric)
df_test['Age'] = df_test['Age'].apply(to_numeric)

df_train['Num_of_Loan'] = df_train['Num_of_Loan'].apply(to_numeric)
df_test['Num_of_Loan'] = df_test['Num_of_Loan'].apply(to_numeric)

df_train['Num_of_Delayed_Payment'] = df_train['Num_of_Delayed_Payment'].apply(to_numeric)
df_test['Num_of_Delayed_Payment'] = df_test['Num_of_Delayed_Payment'].apply(to_numeric)

df_train['Credit_History_Age'] = df_train['Credit_History_Age'].apply(extract_months)
df_test['Credit_History_Age'] = df_test['Credit_History_Age'].apply(extract_months)


## Tratamento de valores ausentes

In [8]:

# Preencher valores ausentes com média (numéricos) e 'Unknown' (categóricos)
num_cols = df_train.select_dtypes(include=[np.number]).columns
cat_cols = df_train.select_dtypes(include=['object']).columns.drop(['Credit_Score'])

for col in num_cols:
    mean_val = df_train[col].mean()
    df_train[col].fillna(mean_val, inplace=True)
    if col in df_test.columns:
        df_test[col].fillna(mean_val, inplace=True)

for col in cat_cols:
    df_train[col].fillna("Unknown", inplace=True)
    if col in df_test.columns:
        df_test[col].fillna("Unknown", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(mean_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(mean_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alwa

## Transformação da variável alvo

In [9]:

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_train['Credit_Score_Label'] = label_encoder.fit_transform(df_train['Credit_Score'])

# Exibir classes
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))


{'Good': np.int64(0), 'Poor': np.int64(1), 'Standard': np.int64(2)}

## Seleção de colunas relevantes e exportação

In [None]:

# Remover colunas irrelevantes
cols_to_remove = ['ID', 'Occupation', 'Customer_ID', 'Name', 'SSN', 'Month', 'Credit_Score']
df_train_cleaned = df_train.drop(columns=cols_to_remove)
df_test_cleaned = df_test.drop(columns=[col for col in cols_to_remove if col in df_test.columns])

# Salvar processados
df_train_cleaned.to_csv("../data/processed/quantum_finance_train_processed.csv", index=False)
df_test_cleaned.to_csv("../data/processed/quantum_finance_test_processed.csv", index=False)
print("Arquivos salvos como quantum_finance_train_processed.csv e quantum_finance_test_processed.csv")


Arquivos salvos como train_processed.csv e test_processed.csv
