In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

In [4]:
df = pd.read_parquet("dataset.pq")

In [5]:
df

Unnamed: 0,customer_id,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,...,num_credit_inquiries,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,CUS_0xd40,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,3,...,4.0,809.98,23.933795,,No,49.574949,24.785217,High_spent_Medium_value_payments,358.124168,0
1,CUS_0x21b1,28.0,Teacher,34847.84,3037.986667,2,4,6,1.0,3,...,2.0,605.03,32.933856,27.0,No,18.816215,218.904344,Low_spent_Small_value_payments,356.078109,0
2,CUS_0x2dbc,34.0,Engineer,143162.64,12187.220000,1,5,8,3.0,8,...,3.0,1303.01,38.374753,18.0,No,246.992319,10000.000000,High_spent_Small_value_payments,895.494583,0
3,CUS_0xb891,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,-100.0,4,...,4.0,632.46,27.332515,17.0,No,16.415452,125.617251,High_spent_Small_value_payments,379.216381,0
4,CUS_0x1cdb,21.0,Developer,35547.71,2853.309167,7,5,5,-100.0,1,...,4.0,943.86,25.862922,31.0,Yes,0.000000,181.330901,High_spent_Small_value_payments,364.000016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,CUS_0x372c,19.0,Lawyer,42903.79,3468.315833,0,4,6,1.0,9,...,1.0,1079.48,35.716618,28.0,No,34.975457,115.184984,High_spent_Medium_value_payments,,0
12496,CUS_0xf16,45.0,Media_Manager,16680.35,,1,1,5,4.0,1,...,8.0,897.16,41.212367,,No,41.113561,70.805550,Low_spent_Large_value_payments,,0
12497,CUS_0xaf61,50.0,Writer,37188.10,3097.008333,1,4,5,3.0,7,...,3.0,620.64,39.300980,30.0,No,84.205949,42.935566,High_spent_Medium_value_payments,,0
12498,CUS_0x8600,29.0,Architect,20002.88,1929.906667,10,8,29,5.0,33,...,9.0,3571.70,37.140784,6.0,Yes,60.964772,34.662906,High_spent_Large_value_payments,,0


In [6]:
df["payment_of_min_amount"].unique().sum()

'NoYesNM'

In [9]:
def clean_dataset(df: pd.DataFrame, erase_nm: bool= True, erase_null: bool= True) -> pd.DataFrame:
    """Preprocesses the data"""
    query_string = (
        "age > 0 and "
        "age <= 120 and "
        "num_bank_accounts >= 0 and "
        "num_of_loan >= 0 and "
        "num_of_delayed_payment >= 0 and "
        "monthly_balance >= 0"
    )
    df = df.query(query_string)
    df.loc[:, "occupation"] = df["occupation"].astype(str)
    if erase_nm:
        df.loc[:, "payment_of_min_amount"] = df["payment_of_min_amount"].replace("NM", "no")
    if erase_null:
        df = df[df["payment_behaviour"] != "9#%8"]
    return df

In [16]:
# Definir las listas de características numéricas y categóricas
numeric_features = [
    "age",
    "annual_income",
    "monthly_inhand_salary",
    "num_bank_accounts",
    "num_credit_card",
    "interest_rate",
    "num_of_loan",
    "delay_from_due_date",
    "num_of_delayed_payment",
    "changed_credit_limit",
    "num_credit_inquiries",
    "outstanding_debt",
    "credit_utilization_ratio",
    "credit_history_age",
    "total_emi_per_month",
    "amount_invested_monthly",
    "monthly_balance",
]

categorical_features = ["occupation", "payment_behaviour", "payment_of_min_amount"]

# Definir los transformadores para datos numéricos y categóricos
numeric_transformer = RobustScaler()
categorical_transformer = OneHotEncoder(sparse=False)

# Crear el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

# Convertir la función clean_dataset en un transformador compatible
cleaning_transformer = FunctionTransformer(clean_dataset)

# Crear el pipeline completo
pipeline = Pipeline([
    ('cleaning', cleaning_transformer),  # Aplicar la limpieza del dataset
    ('preprocessing', preprocessor),  # Aplicar el preprocesamiento
])




In [17]:
X_processed = pipeline.fit_transform(df)



In [18]:
X_processed

array([[-5.88235294e-01, -3.37256945e-01, -2.84350861e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.94117647e-01, -3.76039408e-02, -9.73483666e-04, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 5.88235294e-02,  2.02528108e+00,  2.13619009e+00, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-5.88235294e-02, -1.52847273e-01,             nan, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-2.35294118e-01, -4.23252641e-02, -2.33182527e-02, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.94117647e-01,  7.98617846e-02,  2.58836479e-02, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00]])