# Punto 2 Taller Estadística



In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from tqdm import tqdm



In [2]:
# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


Using device: cpu


In [4]:
# Custom dataset
class CreditCardDataset(Dataset):
    """In-memory dataset pairing a feature tensor with a label tensor.

    Both tensors are built once in ``__init__`` and moved to the module-level
    ``device``; indexing simply slices them.
    """

    def __init__(self, X, y, is_multiclass=False):
        """Convert ``X`` and ``y`` to tensors and move them to ``device``.

        Args:
            X: Feature data accepted by ``torch.FloatTensor``.
            y: Label data. Stored as a ``LongTensor`` when ``is_multiclass``
                is true, otherwise as a ``FloatTensor``.
            is_multiclass: Selects the label dtype (integer labels for the
                multiclass case, float labels for the binary case).
        """
        features = torch.FloatTensor(X)
        if is_multiclass:
            labels = torch.LongTensor(y)
        else:
            labels = torch.FloatTensor(y)
        self.X = features.to(device)
        self.y = labels.to(device)

    def __len__(self):
        """Return the number of samples (size of the first feature axis)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Linear-regression model used for classification
class LinearClassification(nn.Module):
    """One linear layer with a single output unit, passed through a sigmoid.

    NOTE(review): this is structurally the same network as
    ``LogisticRegression`` in this file (``nn.Linear(input_dim, 1)`` followed
    by a sigmoid); only the class name differs.
    """

    def __init__(self, input_dim):
        """Create the layer mapping ``input_dim`` features to one output."""
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        projected = self.linear(x)
        return projected.sigmoid()

# Logistic-regression model
class LogisticRegression(nn.Module):
    """Binary logistic regression: a linear layer with one output and a sigmoid.

    The forward pass already applies the sigmoid, so the output is meant to
    be consumed as a probability rather than as a raw logit.
    """

    def __init__(self, input_dim):
        """Create the layer mapping ``input_dim`` features to one output."""
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return ``sigmoid(linear(x))``."""
        z = self.linear(x)
        probabilities = torch.sigmoid(z)
        return probabilities

# Multiclass logistic-regression model
class MulticlassLogisticRegression(nn.Module):
    """Linear layer producing one raw score (logit) per class."""

    def __init__(self, input_dim, num_classes):
        """Create the layer mapping ``input_dim`` features to ``num_classes`` scores."""
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        # No softmax here: CrossEntropyLoss applies it internally.
        logits = self.linear(x)
        return logits

# LDA model
class LDAModel(nn.Module):
    """Batch-normalises the inputs, then applies a linear layer giving class scores.

    NOTE(review): despite the name, this is a ``BatchNorm1d`` followed by an
    ``nn.Linear`` layer; it is not the closed-form Linear Discriminant
    Analysis estimator.
    """

    def __init__(self, input_dim, num_classes):
        """Create the linear layer and the batch-norm layer over ``input_dim`` features."""
        super().__init__()
        # Attribute order is kept (linear first) so parameter order is unchanged.
        self.linear = nn.Linear(input_dim, num_classes)
        self.batch_norm = nn.BatchNorm1d(input_dim)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        normalised = self.batch_norm(x)
        # No softmax here: CrossEntropyLoss applies it internally.
        logits = self.linear(normalised)
        return logits

def _forward_loss(model, criterion, batch_X, batch_y):
    """Run the model on one batch and return ``(outputs, loss)``."""
    outputs = model(batch_X)
    if not isinstance(criterion, nn.CrossEntropyLoss):
        # A bare squeeze() would also drop the batch axis of a single-sample
        # batch, leaving a 0-d tensor that no longer matches the target.
        # Strip only trailing singleton dims beyond the target's rank.
        while outputs.dim() > batch_y.dim() and outputs.size(-1) == 1:
            outputs = outputs.squeeze(-1)
    return outputs, criterion(outputs, batch_y)


def _positive_scores(outputs, criterion):
    """Return the per-sample score of the positive class (class index 1)."""
    if isinstance(criterion, nn.CrossEntropyLoss):
        # Outputs are raw logits; take the softmax probability of class 1.
        return torch.softmax(outputs, dim=1)[:, 1]
    # Otherwise the model output is used directly as the score.
    return outputs


def _binary_metrics(true_labels, scores):
    """Compute ROC AUC, PR AUC and a 0.5-threshold classification report."""
    return {
        'roc_auc': roc_auc_score(true_labels, scores),
        'pr_auc': average_precision_score(true_labels, scores),
        'classification_report': classification_report(
            true_labels,
            (scores > 0.5).astype(int)
        )
    }


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10, model_name=""):
    """Train ``model`` and return validation metrics of its best epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by an
    evaluation pass over ``val_loader``. Losses are reported as the mean of
    the per-batch losses. Metrics are recomputed whenever the mean validation
    loss improves on the best value seen so far.

    Args:
        model: Module to train; called as ``model(batch_X)``.
        train_loader: Iterable of ``(batch_X, batch_y)`` training batches.
        val_loader: Iterable of ``(batch_X, batch_y)`` validation batches.
        criterion: Loss callable. With ``nn.CrossEntropyLoss`` the outputs are
            treated as two-class logits; with any other criterion the
            (squeezed) outputs are used directly as positive-class scores.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        model_name: Label used in the progress bar and printed messages.

    Returns:
        A dict with keys ``'roc_auc'``, ``'pr_auc'`` and
        ``'classification_report'`` for the epoch with the lowest mean
        validation loss, or ``None`` if no epoch produced a validation loss
        below infinity (for example ``epochs == 0`` or a NaN loss). The model
        weights are not rolled back to that epoch.

    Raises:
        ZeroDivisionError: If either loader has length zero.
    """
    best_val_loss = float('inf')
    best_metrics = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_X, batch_y in tqdm(train_loader, desc=f'{model_name} Epoch {epoch+1}/{epochs}'):
            optimizer.zero_grad()
            _, loss = _forward_loss(model, criterion, batch_X, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        score_chunks = []
        label_chunks = []

        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs, loss = _forward_loss(model, criterion, batch_X, batch_y)
                val_loss += loss.item()
                scores = _positive_scores(outputs, criterion)
                # Keep one array per batch; they are concatenated only when needed.
                score_chunks.append(scores.reshape(-1).cpu().numpy())
                label_chunks.append(batch_y.reshape(-1).cpu().numpy())

        val_loss /= len(val_loader)
        train_loss /= len(train_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_metrics = _binary_metrics(
                np.concatenate(label_chunks),
                np.concatenate(score_chunks),
            )

        print(f'{model_name} Epoch {epoch+1}:')
        print(f'Train Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')

    return best_metrics

def main(csv_path='card_transdata.csv', batch_size=1024, epochs=10, learning_rate=0.001):
    """Train and evaluate the four classifiers on the card-transaction data.

    The CSV is split 80/20 (stratified on the ``fraud`` column), the features
    are standardised, and each model is trained with Adam and evaluated via
    ``train_model``.

    Args:
        csv_path: Path to the CSV file; it must contain a ``fraud`` column.
        batch_size: Batch size for both data loaders.
        epochs: Number of training epochs per model.
        learning_rate: Learning rate passed to Adam.

    Returns:
        Dict mapping each model name to the metrics dict returned by
        ``train_model`` (or ``None`` if it produced no metrics).
    """
    # Load and prepare the data
    df = pd.read_csv(csv_path)
    X = df.drop('fraud', axis=1).values
    y = df['fraud'].values

    # Split first so the scaler never sees the held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling: statistics come from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Models, their loss functions and the label dtype each one needs
    models = {
        'Linear Classification': {
            'model': LinearClassification(X_train.shape[1]),
            'criterion': nn.BCELoss(),
            'is_multiclass': False
        },
        'Logistic Regression': {
            'model': LogisticRegression(X_train.shape[1]),
            'criterion': nn.BCELoss(),
            'is_multiclass': False
        },
        'Multiclass Logistic': {
            'model': MulticlassLogisticRegression(X_train.shape[1], 2),
            'criterion': nn.CrossEntropyLoss(),
            'is_multiclass': True
        },
        'LDA': {
            'model': LDAModel(X_train.shape[1], 2),
            'criterion': nn.CrossEntropyLoss(),
            'is_multiclass': True
        }
    }

    # Train and evaluate each model
    results = {}

    for name, config in models.items():
        print(f"\nEntrenando {name}...")
        model = config['model'].to(device)
        criterion = config['criterion']
        is_multiclass = config['is_multiclass']

        # Datasets are rebuilt per model because the label dtype depends on the loss.
        train_dataset = CreditCardDataset(X_train, y_train, is_multiclass)
        test_dataset = CreditCardDataset(X_test, y_test, is_multiclass)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # NOTE(review): the test split doubles as the validation set, so the
        # reported metrics come from the epoch selected on that same data and
        # are therefore optimistic.
        results[name] = train_model(
            model,
            train_loader,
            test_loader,
            criterion,
            optimizer,
            epochs=epochs,
            model_name=name
        )

        metrics = results[name]
        if metrics is None:
            # train_model returns None when no epoch improved the validation loss.
            print(f"\nNo se obtuvieron métricas de validación para {name}.")
            continue

        print(f"\nResultados para {name}:")
        print("Classification Report:")
        print(metrics['classification_report'])
        print(f"ROC AUC: {metrics['roc_auc']:.4f}")
        print(f"PR AUC: {metrics['pr_auc']:.4f}")

    return results

if __name__ == "__main__":
    main()


Entrenando Linear Classification...


Linear Classification Epoch 1/10: 100%|██████████| 782/782 [00:12<00:00, 60.19it/s]


Linear Classification Epoch 1:
Train Loss: 0.6360
Validation Loss: 0.4792


Linear Classification Epoch 2/10: 100%|██████████| 782/782 [00:10<00:00, 72.93it/s]


Linear Classification Epoch 2:
Train Loss: 0.3977
Validation Loss: 0.3318


Linear Classification Epoch 3/10: 100%|██████████| 782/782 [00:09<00:00, 84.77it/s]


Linear Classification Epoch 3:
Train Loss: 0.2924
Validation Loss: 0.2554


Linear Classification Epoch 4/10: 100%|██████████| 782/782 [00:11<00:00, 66.55it/s] 


Linear Classification Epoch 4:
Train Loss: 0.2369
Validation Loss: 0.2139


Linear Classification Epoch 5/10: 100%|██████████| 782/782 [00:13<00:00, 59.25it/s]


Linear Classification Epoch 5:
Train Loss: 0.2062
Validation Loss: 0.1906


Linear Classification Epoch 6/10: 100%|██████████| 782/782 [00:15<00:00, 50.15it/s]


Linear Classification Epoch 6:
Train Loss: 0.1883
Validation Loss: 0.1769


Linear Classification Epoch 7/10: 100%|██████████| 782/782 [00:10<00:00, 72.07it/s]


Linear Classification Epoch 7:
Train Loss: 0.1777
Validation Loss: 0.1701


Linear Classification Epoch 8/10: 100%|██████████| 782/782 [00:09<00:00, 84.80it/s]


Linear Classification Epoch 8:
Train Loss: 0.1711
Validation Loss: 0.1647


Linear Classification Epoch 9/10: 100%|██████████| 782/782 [00:11<00:00, 66.20it/s]


Linear Classification Epoch 9:
Train Loss: 0.1678
Validation Loss: 0.1622


Linear Classification Epoch 10/10: 100%|██████████| 782/782 [00:12<00:00, 60.95it/s]


Linear Classification Epoch 10:
Train Loss: 0.1654
Validation Loss: 0.1619

Resultados para Linear Classification:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97    182519
         1.0       0.86      0.53      0.66     17481

    accuracy                           0.95    200000
   macro avg       0.91      0.76      0.82    200000
weighted avg       0.95      0.95      0.95    200000

ROC AUC: 0.9746
PR AUC: 0.7779

Entrenando Logistic Regression...


Logistic Regression Epoch 1/10: 100%|██████████| 782/782 [00:10<00:00, 73.69it/s]


Logistic Regression Epoch 1:
Train Loss: 0.5562
Validation Loss: 0.4251


Logistic Regression Epoch 2/10: 100%|██████████| 782/782 [00:09<00:00, 79.60it/s] 


Logistic Regression Epoch 2:
Train Loss: 0.3583
Validation Loss: 0.3017


Logistic Regression Epoch 3/10: 100%|██████████| 782/782 [00:12<00:00, 63.74it/s]


Logistic Regression Epoch 3:
Train Loss: 0.2698
Validation Loss: 0.2383


Logistic Regression Epoch 4/10: 100%|██████████| 782/782 [00:13<00:00, 56.46it/s]


Logistic Regression Epoch 4:
Train Loss: 0.2237
Validation Loss: 0.2044


Logistic Regression Epoch 5/10: 100%|██████████| 782/782 [00:14<00:00, 53.43it/s]


Logistic Regression Epoch 5:
Train Loss: 0.1987
Validation Loss: 0.1836


Logistic Regression Epoch 6/10: 100%|██████████| 782/782 [00:13<00:00, 59.74it/s]


Logistic Regression Epoch 6:
Train Loss: 0.1840
Validation Loss: 0.1726


Logistic Regression Epoch 7/10: 100%|██████████| 782/782 [00:11<00:00, 69.13it/s]


Logistic Regression Epoch 7:
Train Loss: 0.1747
Validation Loss: 0.1680


Logistic Regression Epoch 8/10: 100%|██████████| 782/782 [00:11<00:00, 71.01it/s]


Logistic Regression Epoch 8:
Train Loss: 0.1699
Validation Loss: 0.1626


Logistic Regression Epoch 9/10: 100%|██████████| 782/782 [00:13<00:00, 57.97it/s]


Logistic Regression Epoch 9:
Train Loss: 0.1671
Validation Loss: 0.1620


Logistic Regression Epoch 10/10: 100%|██████████| 782/782 [00:13<00:00, 56.93it/s]


Logistic Regression Epoch 10:
Train Loss: 0.1649
Validation Loss: 0.1630

Resultados para Logistic Regression:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97    182519
         1.0       0.85      0.52      0.65     17481

    accuracy                           0.95    200000
   macro avg       0.90      0.76      0.81    200000
weighted avg       0.95      0.95      0.95    200000

ROC AUC: 0.9751
PR AUC: 0.7747

Entrenando Multiclass Logistic...


Multiclass Logistic Epoch 1/10: 100%|██████████| 782/782 [00:14<00:00, 53.95it/s]


Multiclass Logistic Epoch 1:
Train Loss: 0.4951
Validation Loss: 0.3157


Multiclass Logistic Epoch 2/10: 100%|██████████| 782/782 [00:10<00:00, 75.35it/s]


Multiclass Logistic Epoch 2:
Train Loss: 0.2545
Validation Loss: 0.2114


Multiclass Logistic Epoch 3/10: 100%|██████████| 782/782 [00:10<00:00, 71.42it/s]


Multiclass Logistic Epoch 3:
Train Loss: 0.1927
Validation Loss: 0.1758


Multiclass Logistic Epoch 4/10: 100%|██████████| 782/782 [00:12<00:00, 62.25it/s] 


Multiclass Logistic Epoch 4:
Train Loss: 0.1687
Validation Loss: 0.1594


Multiclass Logistic Epoch 5/10: 100%|██████████| 782/782 [00:13<00:00, 57.82it/s]


Multiclass Logistic Epoch 5:
Train Loss: 0.1566
Validation Loss: 0.1503


Multiclass Logistic Epoch 6/10: 100%|██████████| 782/782 [00:12<00:00, 64.47it/s]


Multiclass Logistic Epoch 6:
Train Loss: 0.1494
Validation Loss: 0.1446


Multiclass Logistic Epoch 7/10: 100%|██████████| 782/782 [00:16<00:00, 47.07it/s]


Multiclass Logistic Epoch 7:
Train Loss: 0.1449
Validation Loss: 0.1408


Multiclass Logistic Epoch 8/10: 100%|██████████| 782/782 [00:10<00:00, 76.40it/s]


Multiclass Logistic Epoch 8:
Train Loss: 0.1418
Validation Loss: 0.1382


Multiclass Logistic Epoch 9/10: 100%|██████████| 782/782 [00:10<00:00, 72.17it/s]


Multiclass Logistic Epoch 9:
Train Loss: 0.1398
Validation Loss: 0.1365


Multiclass Logistic Epoch 10/10: 100%|██████████| 782/782 [00:12<00:00, 60.68it/s] 


Multiclass Logistic Epoch 10:
Train Loss: 0.1385
Validation Loss: 0.1354

Resultados para Multiclass Logistic:
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98    182519
           1       0.89      0.56      0.69     17481

    accuracy                           0.96    200000
   macro avg       0.92      0.78      0.83    200000
weighted avg       0.95      0.96      0.95    200000

ROC AUC: 0.9696
PR AUC: 0.7978

Entrenando LDA...


LDA Epoch 1/10: 100%|██████████| 782/782 [00:13<00:00, 58.18it/s]


LDA Epoch 1:
Train Loss: 0.2610
Validation Loss: 0.1475


LDA Epoch 2/10: 100%|██████████| 782/782 [00:12<00:00, 61.83it/s]


LDA Epoch 2:
Train Loss: 0.1398
Validation Loss: 0.1368


LDA Epoch 3/10: 100%|██████████| 782/782 [00:10<00:00, 72.82it/s]


LDA Epoch 3:
Train Loss: 0.1349
Validation Loss: 0.1341


LDA Epoch 4/10: 100%|██████████| 782/782 [00:10<00:00, 73.85it/s]


LDA Epoch 4:
Train Loss: 0.1334
Validation Loss: 0.1341


LDA Epoch 5/10: 100%|██████████| 782/782 [00:13<00:00, 57.97it/s]


LDA Epoch 5:
Train Loss: 0.1331
Validation Loss: 0.1341


LDA Epoch 6/10: 100%|██████████| 782/782 [00:13<00:00, 57.92it/s]


LDA Epoch 6:
Train Loss: 0.1335
Validation Loss: 0.1336


LDA Epoch 7/10: 100%|██████████| 782/782 [00:13<00:00, 58.49it/s]


LDA Epoch 7:
Train Loss: 0.1336
Validation Loss: 0.1345


LDA Epoch 8/10: 100%|██████████| 782/782 [00:13<00:00, 56.25it/s]


LDA Epoch 8:
Train Loss: 0.1336
Validation Loss: 0.1336


LDA Epoch 9/10: 100%|██████████| 782/782 [00:10<00:00, 71.71it/s]


LDA Epoch 9:
Train Loss: 0.1333
Validation Loss: 0.1362


LDA Epoch 10/10: 100%|██████████| 782/782 [00:13<00:00, 59.61it/s]


LDA Epoch 10:
Train Loss: 0.1332
Validation Loss: 0.1348

Resultados para LDA:
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98    182519
           1       0.89      0.62      0.73     17481

    accuracy                           0.96    200000
   macro avg       0.93      0.81      0.85    200000
weighted avg       0.96      0.96      0.96    200000

ROC AUC: 0.9670
PR AUC: 0.8049
