In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
0,9580,0.668999,58,2,0.449504,3425.0,9,1,1,1,1.0,0
1,39755,0.015922,71,0,6.000000,,5,0,0,0,0.0,0
2,118799,0.183062,52,1,0.035593,5000.0,9,0,0,0,0.0,0
3,16489,0.162301,77,0,0.227886,2000.0,8,0,0,0,0.0,0
4,149857,0.404199,30,0,0.026010,5843.0,4,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
104995,79500,0.842886,33,0,0.182561,9300.0,10,0,0,0,1.0,0
104996,84928,0.805186,68,0,0.229466,5429.0,7,0,0,0,0.0,0
104997,56301,0.811494,51,2,3.709314,3016.0,26,0,4,0,0.0,1
104998,41912,0.412590,62,1,0.173290,14166.0,7,1,1,0,0.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105000 entries, 0 to 104999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    105000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  105000 non-null  float64
 2   Age                                   105000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  105000 non-null  int64  
 4   DebtRatio                             105000 non-null  float64
 5   MonthlyIncome                         84164 non-null   float64
 6   NumberOfOpenCreditLinesAndLoans       105000 non-null  int64  
 7   NumberOfTimes90DaysLate               105000 non-null  int64  
 8   NumberRealEstateLoansOrLines          105000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  105000 non-null  int64  
 10  NumberOfDependents                    102236 non-null  float64
 11  

In [4]:
df.describe()

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
count,105000.0,105000.0,105000.0,105000.0,105000.0,84164.0,105000.0,105000.0,105000.0,105000.0,102236.0,105000.0
mean,75006.458152,5.378324,52.32561,0.409352,352.044192,6703.641,8.459952,0.254619,1.01901,0.228762,0.757933,0.066514
std,43315.742022,201.573457,14.766425,4.056717,1820.229318,16222.88,5.134329,4.032506,1.131065,4.017864,1.115273,0.24918
min,1.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37590.75,0.029974,41.0,0.0,0.17495,3400.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,74941.5,0.154252,52.0,0.0,0.366061,5400.0,8.0,0.0,1.0,0.0,0.0,0.0
75%,112542.5,0.556035,63.0,0.0,0.870083,8250.0,11.0,0.0,2.0,0.0,1.0,0.0
max,149999.0,29110.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0,1.0


In [5]:
df['SeriousDlqin2yrs'].value_counts(normalize=True)

SeriousDlqin2yrs
0    0.933486
1    0.066514
Name: proportion, dtype: float64

In [6]:
print((df['MonthlyIncome'].isnull().sum())/(len(df['MonthlyIncome']))*100) #porcentaje de los valores NaN en la columna MonthlyIncome

19.843809523809526


In [7]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

cols_to_use = ['RevolvingUtilizationOfUnsecuredLines', 'Age', 'NumberOfTime30-59DaysPastDueNotWorse',
               'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
               'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

df_impute = df[cols_to_use]

# Escalar las variables (muy importante para KNN)
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_impute), columns=cols_to_use)

imputer = KNNImputer(n_neighbors=5)

# Imputar los valores faltantes
df_imputed_scaled = pd.DataFrame(imputer.fit_transform(df_scaled), columns=cols_to_use)

# Volver a escala original
df_imputed = pd.DataFrame(scaler.inverse_transform(df_imputed_scaled), columns=cols_to_use)

# Reemplazar en el DataFrame original
df[cols_to_use] = df_imputed

In [8]:
def preprocess(df):
    df = df.copy()

    df["RevolvingUtilizationOfUnsecuredLines"] = df["RevolvingUtilizationOfUnsecuredLines"].clip(upper=1)

    df["TotalPastDue"] = (
        df["NumberOfTime30-59DaysPastDueNotWorse"] +
        df["NumberOfTime60-89DaysPastDueNotWorse"] +
        df["NumberOfTimes90DaysLate"]
    )

    df["IncomePerDependent"] = df["MonthlyIncome"] / (df["NumberOfDependents"] + 1)
    df["IncomeToDebtRatio"] = df["MonthlyIncome"] / (df["DebtRatio"] + 1e-5)
    df["AgeBin"] = pd.cut(df["Age"], bins=[0, 25, 35, 50, 65, 120], labels=False)
    df["HasDependents"] = (df["NumberOfDependents"] > 0).astype(int)

    # Log transforms para estabilizar distribución
    df["LogMonthlyIncome"] = np.log1p(df["MonthlyIncome"])
    df["LogDebtRatio"] = np.log1p(df["DebtRatio"])
    df["LogUtilization"] = np.log1p(df["RevolvingUtilizationOfUnsecuredLines"])

    df["PastDueWeighted"] = (
        df["NumberOfTime30-59DaysPastDueNotWorse"] * 1 +
        df["NumberOfTime60-89DaysPastDueNotWorse"] * 2 +
        df["NumberOfTimes90DaysLate"] * 3
    )
    df["HasSeriousLate"] = (df["NumberOfTimes90DaysLate"] > 0).astype(int)
    df["DebtPerCreditLine"] = df["DebtRatio"] / (df["NumberOfOpenCreditLinesAndLoans"] + 1)
    df["RealEstateToCreditRatio"] = df["NumberRealEstateLoansOrLines"] / (df["NumberOfOpenCreditLinesAndLoans"] + 1)
    df["IncomeAge"] = df["MonthlyIncome"] * df["Age"]

    return df



In [9]:
df = preprocess(df)

In [10]:
df

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,...,AgeBin,HasDependents,LogMonthlyIncome,LogDebtRatio,LogUtilization,PastDueWeighted,HasSeriousLate,DebtPerCreditLine,RealEstateToCreditRatio,IncomeAge
0,9580,0.668999,58.0,2.0,0.449504,3425.0,9.0,1.0,1.0,1.0,...,3,1,8.139149,0.371221,0.512224,7.0,1,0.044950,0.100000,198650.0
1,39755,0.015922,71.0,0.0,6.000000,5262.4,5.0,0.0,0.0,0.0,...,4,0,8.568532,1.945910,0.015796,0.0,0,1.000000,0.000000,373630.4
2,118799,0.183062,52.0,1.0,0.035593,5000.0,9.0,0.0,0.0,0.0,...,3,0,8.517393,0.034974,0.168106,1.0,0,0.003559,0.000000,260000.0
3,16489,0.162301,77.0,0.0,0.227886,2000.0,8.0,0.0,0.0,0.0,...,4,0,7.601402,0.205294,0.150401,0.0,0,0.025321,0.000000,154000.0
4,149857,0.404199,30.0,0.0,0.026010,5843.0,4.0,0.0,0.0,0.0,...,1,0,8.673171,0.025677,0.339467,0.0,0,0.005202,0.000000,175290.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104995,79500,0.842886,33.0,0.0,0.182561,9300.0,10.0,0.0,0.0,0.0,...,1,1,9.137877,0.167682,0.611333,0.0,0,0.016596,0.000000,306900.0
104996,84928,0.805186,68.0,0.0,0.229466,5429.0,7.0,0.0,0.0,0.0,...,4,0,8.599694,0.206580,0.590664,0.0,0,0.028683,0.000000,369172.0
104997,56301,0.811494,51.0,2.0,3.709314,3016.0,26.0,0.0,4.0,0.0,...,3,0,8.012018,1.549542,0.594152,2.0,0,0.137382,0.148148,153816.0
104998,41912,0.412590,62.0,1.0,0.173290,14166.0,7.0,1.0,1.0,0.0,...,3,0,9.558671,0.159812,0.345425,4.0,1,0.021661,0.125000,878292.0


In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

X = df.drop(columns=["SeriousDlqin2yrs","ID"], axis=1)
y = df["SeriousDlqin2yrs"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [171]:
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

param_grid_rf = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [8, 10, 12, 15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
}

grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid_rf,
                           cv=5,  
                           scoring='roc_auc',
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor score ROC AUC:", grid_search.best_score_)

Mejores parámetros: {'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 500}
Mejor score ROC AUC: 0.8592497374071225


In [12]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='auc',
    random_state=42,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()  # Para clases desbalanceadas
)

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
}



In [13]:
grid_search_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

grid_search_xgb.fit(X_train, y_train)

print(grid_search_xgb.best_score_)
print(grid_search_xgb.best_params_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.8636768076547208
{'colsample_bytree': 0.8, 'gamma': 0.3, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


In [29]:
print(grid_search_xgb.best_params_)
print(grid_search_xgb.best_score_)

{'colsample_bytree': 0.8, 'gamma': 0.3, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
0.8636768076547208


In [30]:
# best_rf = grid_search.best_estimator_
best_xgb = grid_search_xgb.best_estimator_

In [31]:
df_test = pd.read_csv('test.csv')
df_test

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,129460,1.000000,21,0,8.000000,,0,0,0,0,
1,134018,0.009878,38,0,0.229978,10500.0,10,0,1,0,1.0
2,86523,0.276836,70,0,1914.000000,,23,0,1,0,0.0
3,138466,0.045413,75,0,452.000000,,4,0,0,0,0.0
4,143905,0.000000,82,0,0.000000,,5,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
44995,124596,0.055997,70,0,51.000000,,2,0,0,0,0.0
44996,75895,1.000000,62,0,2796.000000,,5,0,1,0,3.0
44997,92453,0.673065,56,1,0.511132,7500.0,9,0,2,0,4.0
44998,139288,1.000000,22,0,0.000000,2500.0,0,0,0,0,0.0


In [32]:
df_impute = df_test[cols_to_use]

scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_impute), columns=cols_to_use)

imputer = KNNImputer(n_neighbors=5)


df_imputed_scaled = pd.DataFrame(imputer.fit_transform(df_scaled), columns=cols_to_use)


df_imputed = pd.DataFrame(scaler.inverse_transform(df_imputed_scaled), columns=cols_to_use)


df_test[cols_to_use] = df_imputed

df_test = preprocess(df_test)

In [33]:
df_test

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,Age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,...,AgeBin,HasDependents,LogMonthlyIncome,LogDebtRatio,LogUtilization,PastDueWeighted,HasSeriousLate,DebtPerCreditLine,RealEstateToCreditRatio,IncomeAge
0,129460,1.000000,21.0,0.0,8.000000,910.0,0.0,0.0,0.0,0.0,...,0.0,0,6.814543,2.197225,0.693147,0.0,0,8.000000,0.000000,19110.0
1,134018,0.009878,38.0,0.0,0.229978,10500.0,10.0,0.0,1.0,0.0,...,2.0,1,9.259226,0.206996,0.009829,0.0,0,0.020907,0.090909,399000.0
2,86523,0.276836,70.0,0.0,1914.000000,7392.0,23.0,0.0,1.0,0.0,...,4.0,0,8.908289,7.557473,0.244385,0.0,0,79.750000,0.041667,517440.0
3,138466,0.045413,75.0,0.0,452.000000,1761.8,4.0,0.0,0.0,0.0,...,4.0,0,7.474659,6.115892,0.044412,0.0,0,90.400000,0.000000,132135.0
4,143905,0.000000,82.0,0.0,0.000000,2556.0,5.0,0.0,0.0,0.0,...,4.0,0,7.846590,0.000000,0.000000,0.0,0,0.000000,0.000000,209592.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,124596,0.055997,70.0,0.0,51.000000,3250.0,2.0,0.0,0.0,0.0,...,4.0,0,8.086718,3.951244,0.054485,0.0,0,17.000000,0.000000,227500.0
44996,75895,1.000000,62.0,0.0,2796.000000,8743.4,5.0,0.0,1.0,0.0,...,3.0,1,9.076169,7.936303,0.693147,0.0,0,466.000000,0.166667,542090.8
44997,92453,0.673065,56.0,1.0,0.511132,7500.0,9.0,0.0,2.0,0.0,...,3.0,1,8.922792,0.412859,0.514658,1.0,0,0.051113,0.200000,420000.0
44998,139288,1.000000,22.0,0.0,0.000000,2500.0,0.0,0.0,0.0,0.0,...,0.0,0,7.824446,0.000000,0.693147,0.0,0,0.000000,0.000000,55000.0


In [None]:
# pred_rf = best_rf.predict(df_test.drop(columns=['ID']))

pred_xgb = best_xgb.predict(df_test.drop(columns=['ID']))
pred_xgb = best_xgb.predict_proba(df_test.drop(columns=['ID']))

In [35]:
entrega_xgb = df_test[['ID']]
entrega_xgb['SeriousDlqin2yrs'] = pred_xgb
entrega_xgb.to_csv('entrega_xgb.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entrega_xgb['SeriousDlqin2yrs'] = pred_xgb


In [41]:
entrega_xgb_proba = df_test[['ID']]
entrega_xgb_proba['SeriousDlqin2yrs'] = pred_xgb[:, 1]
entrega_xgb_proba.to_csv('entrega_xgb_proba.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entrega_xgb_proba['SeriousDlqin2yrs'] = pred_xgb[:, 1]


In [36]:
best_xgb.feature_importances_

array([0.0813166 , 0.01716296, 0.03895262, 0.00989377, 0.01079832,
       0.01272228, 0.0495859 , 0.02345325, 0.01128103, 0.00685165,
       0.43745658, 0.00709599, 0.01010098, 0.02739917, 0.00680001,
       0.00588137, 0.01346348, 0.06969664, 0.13360026, 0.00466277,
       0.00609093, 0.00736077, 0.00837257], dtype=float32)

In [37]:
importances = best_xgb.feature_importances_

cols = ['RevolvingUtilizationOfUnsecuredLines', 'Age', 'NumberOfTime30-59DaysPastDueNotWorse',
        'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
        'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
        'TotalPastDue', 'IncomePerDependent', 'IncomeToDebtRatio', 'AgeInverse', 'AgeBin',
        'RealEstateToCreditRatio', 'DebtPerCreditLine', 'HasDependents', 'LogMonthlyIncome',
        'LogDebtRatio', 'LogUtilization']

feat_imp_df = pd.DataFrame({
    'feature': cols,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(feat_imp_df)

# Gráfico
plt.figure(figsize=(10,6))
plt.barh(feat_imp_df['feature'], feat_imp_df['importance'])
plt.gca().invert_yaxis()
plt.xlabel('Importancia')
plt.title('Importancia de características en XGBoost')
plt.show()

ValueError: All arrays must be of the same length

In [None]:

def __():
    import pickle
    import base64
    from IPython import get_ipython
    ip = get_ipython()
    if ip is not None:
        ip.user_ns.update(pickle.loads(base64.b64decode("gAN9cQAoWBYAAABSYW5kb21Gb3Jlc3RDbGFzc2lmaWVycQFjc2tsZWFybi5lbnNlbWJsZS5fZm9yZXN0ClJhbmRvbUZvcmVzdENsYXNzaWZpZXIKcQJYCgAAAEtOTkltcHV0ZXJxA2Nza2xlYXJuLmltcHV0ZS5fa25uCktOTkltcHV0ZXIKcQRYDQAAAHJvY19hdWNfc2NvcmVxBWNza2xlYXJuLm1ldHJpY3MuX3JhbmtpbmcKcm9jX2F1Y19zY29yZQpxBlgQAAAAdHJhaW5fdGVzdF9zcGxpdHEHY3NrbGVhcm4ubW9kZWxfc2VsZWN0aW9uLl9zcGxpdAp0cmFpbl90ZXN0X3NwbGl0CnEIWAwAAABHcmlkU2VhcmNoQ1ZxCWNza2xlYXJuLm1vZGVsX3NlbGVjdGlvbi5fc2VhcmNoCkdyaWRTZWFyY2hDVgpxClgOAAAAU3RhbmRhcmRTY2FsZXJxC2Nza2xlYXJuLnByZXByb2Nlc3NpbmcuX2RhdGEKU3RhbmRhcmRTY2FsZXIKcQx1Lg==".strip())))
__()
del __
