In [2]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd 
import numpy as np 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,f1_score,roc_auc_score
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pickle
import os

In [3]:
# Directorio base
base_dir = os.path.join('..','data', 'train_test_val')

# Cargar los DataFrames
train_df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(base_dir, 'test.csv'))
val_df = pd.read_csv(os.path.join(base_dir, 'val.csv'))

# Declaracion de la variable objetivo
target = 'income'

In [7]:
train_df.shape, test_df.shape, val_df.shape

((33131, 44), (7100, 44), (7100, 44))

In [4]:
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_val = val_df.drop(columns=[target])
y_val = val_df[target]

X_test = test_df.drop(columns=[target])
y_test = test_df[target]

In [10]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

Nos fijaremos en la metrica de F1 al tratarse de un dataset desbalanceado

In [36]:
y_train_pred = model.predict(X_train)
print("--- Entrenamiento ---")
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))
y_val_pred = model.predict(X_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1:", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25170
           1       1.00      1.00      1.00      7961

    accuracy                           1.00     33131
   macro avg       1.00      1.00      1.00     33131
weighted avg       1.00      1.00      1.00     33131

F1 (train): 0.9998743718592965
[[5023  371]
 [ 606 1100]]
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      5394
           1       0.75      0.64      0.69      1706

    accuracy                           0.86      7100
   macro avg       0.82      0.79      0.80      7100
weighted avg       0.86      0.86      0.86      7100

F1: 0.6924771797293043


In [20]:
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': [None] + list(range(5, 30)),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=30,scoring="f1", cv=3, verbose=1, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1:", f1_score(y_val, y_val_pred))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters found:  {'max_depth': 29, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 141}
Best score found:  0.6811130341397842
[[5097  297]
 [ 632 1074]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      5394
           1       0.78      0.63      0.70      1706

    accuracy                           0.87      7100
   macro avg       0.84      0.79      0.81      7100
weighted avg       0.86      0.87      0.86      7100

F1: 0.6980825479363016


In [None]:
model2= RandomForestClassifier(class_weight='balanced',n_estimators=100, max_depth=25, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42)
model2.fit(X_train, y_train)

In [27]:
y_val_pred2 = model2.predict(X_val)
y_train_pred = model2.predict(X_train)
print("--- Entrenamiento ---")
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))

y_val_pred = model2.predict(X_val)
print("\n--- Validación ---")
print(classification_report(y_val, y_val_pred))
print("F1 (val):", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     25170
           1       0.81      0.99      0.89      7961

    accuracy                           0.94     33131
   macro avg       0.90      0.96      0.93     33131
weighted avg       0.95      0.94      0.94     33131

F1 (train): 0.8915949252378794

--- Validación ---
              precision    recall  f1-score   support

           0       0.93      0.88      0.90      5394
           1       0.68      0.78      0.72      1706

    accuracy                           0.86      7100
   macro avg       0.80      0.83      0.81      7100
weighted avg       0.87      0.86      0.86      7100

F1 (val): 0.7243449781659389


Claro overfitting, vamos a probar a reducir el numero de profundidad del arbol

In [29]:
model3= RandomForestClassifier(class_weight='balanced',n_estimators=100, max_depth=25, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42)
model3.fit(X_train, y_train)

In [30]:

y_train_pred3 = model3.predict(X_train)
print("--- Entrenamiento ---")
print(classification_report(y_train, y_train_pred3))
print("F1 (train):", f1_score(y_train, y_train_pred3))

y_val_pred3 = model3.predict(X_val)
print("\n--- Validación ---")
print(classification_report(y_val, y_val_pred3))
print("F1 (val):", f1_score(y_val, y_val_pred3))

--- Entrenamiento ---
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     25170
           1       0.81      0.99      0.89      7961

    accuracy                           0.94     33131
   macro avg       0.90      0.96      0.93     33131
weighted avg       0.95      0.94      0.94     33131

F1 (train): 0.8915949252378794

--- Validación ---
              precision    recall  f1-score   support

           0       0.93      0.88      0.90      5394
           1       0.68      0.78      0.72      1706

    accuracy                           0.86      7100
   macro avg       0.80      0.83      0.81      7100
weighted avg       0.87      0.86      0.86      7100

F1 (val): 0.7243449781659389


In [5]:
model4 = RandomForestClassifier(class_weight='balanced',n_estimators=150, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42)
model4.fit(X_train, y_train)

In [6]:

y_train_pred4 = model4.predict(X_train)
print("--- Entrenamiento ---")
print(classification_report(y_train, y_train_pred4))
print("F1 (train):", f1_score(y_train, y_train_pred4))

y_val_pred4 = model4.predict(X_val)
print("\n--- Validación ---")
print(classification_report(y_val, y_val_pred4))
print("F1 (val):", f1_score(y_val, y_val_pred4))

--- Entrenamiento ---
              precision    recall  f1-score   support

           0       0.96      0.78      0.86     25170
           1       0.57      0.90      0.69      7961

    accuracy                           0.81     33131
   macro avg       0.76      0.84      0.78     33131
weighted avg       0.87      0.81      0.82     33131

F1 (train): 0.6936696580160078

--- Validación ---
              precision    recall  f1-score   support

           0       0.96      0.78      0.86      5394
           1       0.56      0.89      0.68      1706

    accuracy                           0.80      7100
   macro avg       0.76      0.83      0.77      7100
weighted avg       0.86      0.80      0.82      7100

F1 (val): 0.6843057440072365


### Optimizacion de parametros

#### Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV

from scipy.stats import randint

In [8]:
param_grid = {
    'n_estimators': [100, 150, 200, 250],  
    'max_depth': [5, 10, 15],     
    'min_samples_split': [2, 5],       
    'min_samples_leaf': [1, 2],         
    'class_weight': ['balanced', None]     
}
grid_search = GridSearchCV(model4, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1:", f1_score(y_val, y_val_pred))
print("AUC:", roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1]))

y_train_pred = best_model.predict(X_train)
print("--- Entrenamiento ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))
print("AUC (train):", roc_auc_score(y_train, best_model.predict_proba(X_train)[:, 1]))



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters found:  {'class_weight': None, 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best score found:  0.8611874762898406
[[5132  262]
 [ 672 1034]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      5394
           1       0.80      0.61      0.69      1706

    accuracy                           0.87      7100
   macro avg       0.84      0.78      0.80      7100
weighted avg       0.86      0.87      0.86      7100

F1: 0.6888740839440373
AUC: 0.9260118598190601
--- Entrenamiento ---
[[5132  262]
 [ 672 1034]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92     25170
           1       0.83      0.62      0.71      7961

    accuracy                           0.88     33131
   macro avg       0.86      0.79      0.82     33131
weighted avg       0.87      0.88      0.87     

Evaluamos con test

In [9]:
y_test_pred = best_model.predict(X_test)

# Métricas
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("F1 (test):", f1_score(y_test, y_test_pred))

# AUC
y_test_proba = best_model.predict_proba(X_test)[:, 1]
print("AUC (test):", roc_auc_score(y_test, y_test_proba))

[[5129  265]
 [ 713  993]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      5394
           1       0.79      0.58      0.67      1706

    accuracy                           0.86      7100
   macro avg       0.83      0.77      0.79      7100
weighted avg       0.86      0.86      0.85      7100

F1 (test): 0.6700404858299596
AUC (test): 0.9125759440931502


In [None]:
# Guardar el modelo en la carpeta 'artifacts'
with open(r'../artifacts/random_forest.pkl', "wb") as f:
    pickle.dump(best_model, f)

## Conclusiones

- Se dividió el dataset en entrenamiento (70%), validación (15%) y test (15%) de forma estratificada.
- Se utilizó un modelo de Random Forest, ajustando hiperparámetros con RandomizedSearchCV y GridSearchCV.
- El modelo final tuvo un buen rendimiento general:
  - F1 en test: 0.669
  - AUC en test: 0.913
  - Accuracy en test: 0.86
- No se detectó overfitting, ya que el rendimiento en entrenamiento, validación y test fue consistente.
- La clase minoritaria (1) sigue siendo más difícil de predecir, mejoro respecto a los primero modelos.
- Se usó `class_weight='balanced'` para compensar el desbalanceo de clases, pero resulto dar una mejor metrica con `class_weight='None'`.
