In [1]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV 

In [2]:
# Directorio base
base_dir = os.path.join('..','data', 'train_test_val')

# Cargar los DataFrames
train_df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(base_dir, 'test.csv'))
val_df = pd.read_csv(os.path.join(base_dir, 'val.csv'))

# Declaracion de la variable objetivo
target = 'income'

In [4]:
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_val = val_df.drop(columns=[target])
y_val = val_df[target]

X_test = test_df.drop(columns=[target])
y_test = test_df[target]

In [5]:
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
y_train_pred = log_reg.predict(X_train)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))
# Evaluación en validación
y_val_pred = log_reg.predict(X_val)
print("--- Validación ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1 (val):", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
[[17870  7300]
 [ 1046  6915]]
              precision    recall  f1-score   support

           0       0.94      0.71      0.81     25170
           1       0.49      0.87      0.62      7961

    accuracy                           0.75     33131
   macro avg       0.72      0.79      0.72     33131
weighted avg       0.83      0.75      0.77     33131

F1 (train): 0.6236471861471862
--- Validación ---
[[3856 1538]
 [ 202 1504]]
              precision    recall  f1-score   support

           0       0.95      0.71      0.82      5394
           1       0.49      0.88      0.63      1706

    accuracy                           0.75      7100
   macro avg       0.72      0.80      0.72      7100
weighted avg       0.84      0.75      0.77      7100

F1 (val): 0.6335299073294018


Probamos a escalar los el df de train con standardScaler

In [9]:
from sklearn.preprocessing import StandardScaler

Usamos fit_transform solo en el dataset de train para prevenir que no haya data leakage en los datasets de validation y test

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [11]:
log_reg_scaled = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_reg_scaled.fit(X_train_scaled, y_train)

In [12]:
y_train_pred = log_reg_scaled.predict(X_train_scaled)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))

# 4. Evaluación en validación
y_val_pred = log_reg_scaled.predict(X_val_scaled)
print("--- Validación ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1 (val):", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
[[19760  5410]
 [ 1255  6706]]
              precision    recall  f1-score   support

           0       0.94      0.79      0.86     25170
           1       0.55      0.84      0.67      7961

    accuracy                           0.80     33131
   macro avg       0.75      0.81      0.76     33131
weighted avg       0.85      0.80      0.81     33131

F1 (train): 0.6680280918463914
--- Validación ---
[[4259 1135]
 [ 230 1476]]
              precision    recall  f1-score   support

           0       0.95      0.79      0.86      5394
           1       0.57      0.87      0.68      1706

    accuracy                           0.81      7100
   macro avg       0.76      0.83      0.77      7100
weighted avg       0.86      0.81      0.82      7100

F1 (val): 0.6838082001389854


In [15]:
log_reg_scaled = LogisticRegression(class_weight='balanced', max_iter=500, random_state=42, solver="liblinear", penalty="l1", )
log_reg_scaled.fit(X_train_scaled, y_train)

In [16]:
y_train_pred = log_reg_scaled.predict(X_train_scaled)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))

# 4. Evaluación en validación
y_val_pred = log_reg_scaled.predict(X_val_scaled)
print("--- Validación ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1 (val):", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
[[19756  5414]
 [ 1256  6705]]
              precision    recall  f1-score   support

           0       0.94      0.78      0.86     25170
           1       0.55      0.84      0.67      7961

    accuracy                           0.80     33131
   macro avg       0.75      0.81      0.76     33131
weighted avg       0.85      0.80      0.81     33131

F1 (train): 0.6678286852589641
--- Validación ---
[[4257 1137]
 [ 230 1476]]
              precision    recall  f1-score   support

           0       0.95      0.79      0.86      5394
           1       0.56      0.87      0.68      1706

    accuracy                           0.81      7100
   macro avg       0.76      0.83      0.77      7100
weighted avg       0.86      0.81      0.82      7100

F1 (val): 0.6834915489696689


Observamos que empeora levemente los resultados este tipo de solver.

### Optimizacion de hiperparametros

In [17]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 500,700]
}

grid_search= GridSearchCV(
    estimator=log_reg_scaled,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)
print("Mejor puntuación F1:")
print(grid_search.best_score_)
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train_scaled)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))
# Evaluación en validación
y_val_pred = best_model.predict(X_val_scaled)
print("--- Validación ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1 (val):", f1_score(y_val, y_val_pred))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


45 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Victor Casas\.conda\envs\proyecto_pred\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Victor Casas\.conda\envs\proyecto_pred\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Victor Casas\.conda\envs\proyecto_pred\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penal

Mejores parámetros encontrados:
{'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Mejor puntuación F1:
0.6668408437450404
--- Entrenamiento ---
[[19749  5421]
 [ 1253  6708]]
              precision    recall  f1-score   support

           0       0.94      0.78      0.86     25170
           1       0.55      0.84      0.67      7961

    accuracy                           0.80     33131
   macro avg       0.75      0.81      0.76     33131
weighted avg       0.85      0.80      0.81     33131

F1 (train): 0.6677949228471877
--- Validación ---
[[4258 1136]
 [ 230 1476]]
              precision    recall  f1-score   support

           0       0.95      0.79      0.86      5394
           1       0.57      0.87      0.68      1706

    accuracy                           0.81      7100
   macro avg       0.76      0.83      0.77      7100
weighted avg       0.86      0.81      0.82      7100

F1 (val): 0.6836498378879111


In [21]:
#metricas en test
y_test_pred = best_model.predict(X_test_scaled)
print("--- Test ---")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))
print("F1 (test):", f1_score(y_test, y_test_pred))
print("Accuracy (test):", accuracy_score(y_test, y_test_pred))
print("ROC AUC (test):", roc_auc_score(y_test, y_test_pred))


--- Test ---
[[4210 1184]
 [ 265 1441]]
              precision    recall  f1-score   support

           0       0.94      0.78      0.85      5394
           1       0.55      0.84      0.67      1706

    accuracy                           0.80      7100
   macro avg       0.74      0.81      0.76      7100
weighted avg       0.85      0.80      0.81      7100

F1 (test): 0.6654352343569614
Accuracy (test): 0.7959154929577464
ROC AUC (test): 0.8125813667306951


## Conclusiones

- Se dividió el dataset en entrenamiento (70%), validación (15%) y test (15%) de forma estratificada por el desbalanceo.
- Se utilizó un modelo de Regresion Logistica, ajustando hiperparámetros con GridSearchCV.
- El modelo final tuvo un buen rendimiento general:
  - F1 en test: 0.665
  - AUC en test: 0.81
  - Accuracy en test: 0.79
- No se detectó overfitting, ya que el rendimiento en entrenamiento, validación y test fue consistente.
- La precision en la clase uno es bastante baja.

In [20]:
import pickle
with open(r'../artifacts/logisticRegression.pkl', "wb") as f:
    pickle.dump(best_model, f)