In [1]:
print("hello world")

hello world


In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

print("imports complete")

imports complete


In [3]:
class MLP(nn.Module):
    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_classes)
        )

    def forward(self, x):
        return self.net(x)


In [4]:
# Shrink data function

def filter_data(X, y, numeric_cols, gamma, use_mahalanobis, path_name=None):
    if use_mahalanobis:
        # Mahalanobis Distance function
        def mahalanobis(x):
            return np.sqrt((x - mean) @ inv_cov @ (x - mean).T)
        

        mean = X.mean(axis=0)
        cov = np.cov(X, rowvar=False)
        inv_cov = np.linalg.pinv(cov)


        X["md"] = [mahalanobis(x) for x in X.values]

        print(X["md"])

        # Trim based on percentile thresholds
        low_p, high_p = gamma, 1 - gamma
        low_val = X["md"].quantile(low_p)
        high_val = X["md"].quantile(high_p)

        mask = pd.Series([True] * X.shape[0])
        for i in range(X.shape[0]):
            mask[i] = low_val <= X["md"][i]  <= high_val
        
        X = X.drop(columns=["md"])
        

    else:
        def compute_shrunk_bounds(df, numeric_cols, gamma=0.1, low_pct=1.0, high_pct=99.0):
            # df: raw data
            mins = df[numeric_cols].quantile(low_pct/100.0).values
            maxs = df[numeric_cols].quantile(high_pct/100.0).values
            spans = maxs - mins
            # avoid zero spans
            spans[spans == 0] = 1e-6
            low = mins + gamma * spans
            high = maxs - gamma * spans
            return low, high
        
        low_bounds, high_bounds = compute_shrunk_bounds(X, numeric_cols, gamma=gamma)

        mask = pd.Series([True] * X.shape[0])
        for i, col in enumerate(numeric_cols):
            mask &= (X[col] >= low_bounds[i]) & (X[col] <= high_bounds[i])


    # Do the shrinking
    X_shrunk, y_shrunk = X[mask], y[mask]
    X_out, y_out = X[~mask], y[~mask]

    X_shrunk = X_shrunk.reset_index(drop=True)
    y_shrunk = y_shrunk.reset_index(drop=True)

    # Save to file
    if path_name:
        in_distribution_file = path_name + "in_distribution.csv"
        out_distribution_file = path_name + "out_of_distribution.csv"

        output_data = pd.concat([X_shrunk, y_shrunk], axis=1)
        output_data.to_csv(in_distribution_file, index=False)

        output_data = pd.concat([X_out, y_out], axis=1)
        output_data.to_csv(out_distribution_file, index=False)


    return X_shrunk, y_shrunk, X_out, y_out


# Training model function

def train_model(X, y, model, criterion, optimizer, filename=None):
    # Ready data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # "fit" learns the mean/variance of each feature
    X_test = scaler.transform(X_test)  # fit isn't necessary because the scaler already learned the features

    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.values.squeeze(), dtype=torch.long)
    y_test = torch.tensor(y_test.values.squeeze(), dtype=torch.long)

    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train.argmax(dim=1)) # convert to class indicies
        loss.backward()     # backprop
        optimizer.step()    # update gradients

        print(f"Epoch {epoch+1}/{num_epochs}: loss {loss}")

    # Output test accuracy
    predictions = model(X_test)
    _, predicted_classes = torch.max(predictions, 1)
    accuracy = (predicted_classes == y_test.argmax(dim=1)).float().mean()
    print(f"Test Accuracy: {accuracy.item()*100:.2f}%")

    # Verify that weights can be saved
    if filename:
        torch.save(model.state_dict(), filename)
        print("Model weights saved to " + filename)

In [5]:
# Research infrastructure
# Given dataset name OR pure data file upload, this file needs to create an MLP, train it, and output the weights to a file
# There should also be a parameter for shrunk data or not (this takes a parameter for how much data from each side to filter)


###### 1. LOAD DATASET ######

# Option 1: UCI Dataset
uci_function = datasets.load_breast_cancer                                  # Parameter
dataset = uci_function()
class_names = dataset.target_names
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['target'])
y = pd.get_dummies(y['target'])
y.columns = class_names


# Option 2: Load data from file
# dataset = pd.read_csv('bc_shrunk_data.csv')                                  # Parameter
# class_names = ["malignant", "benign"]                                        # Parameter
# X = dataset.drop(columns=class_names)                                        # May need to undo one-hot encoding here...
# y = dataset[[class_names]]


categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print(X.head())
print(y.head())
print(class_names)

print(categorical_features)
print(numeric_features)

print("\nData loaded")


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [None]:
###### 2. SHRINK DATASET ######
# Optionally filter out data that is outside some percentage range

gamma = 0.05
use_mahalanobis = False
filename = "../data/breast_cancer/datasets/baseline/90_percent/"

X_shrunk, y_shrunk, X_out, y_out = filter_data(X, y, numeric_features, gamma, use_mahalanobis, filename)

print("Old X size: ", X.shape)
print("New X size: ", X_shrunk.shape)
print("Old y size: ", y.shape)
print("New y size: ", y_shrunk.shape)

# don't do one-hot encoding...?


Old X size:  (569, 30)
New X size:  (231, 30)
Old y size:  (569, 2)
New y size:  (231, 2)


In [7]:
###### 3. TRAIN MODEL ######

# X_model, y_model = X, y
X_model, y_model = X_shrunk, y_shrunk

input_dim = X_model.shape[1]
num_classes = 2
model = MLP(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

filename = "../data/breast_cancer/model_weights/baseline/90_percent.pth"

train_model(X_model, y_model, model, criterion, optimizer, filename)


Epoch 1/100: loss 0.7023522853851318
Epoch 2/100: loss 0.6962097883224487
Epoch 3/100: loss 0.6901255249977112
Epoch 4/100: loss 0.6841297745704651
Epoch 5/100: loss 0.678066611289978
Epoch 6/100: loss 0.6718850135803223
Epoch 7/100: loss 0.665648341178894
Epoch 8/100: loss 0.6593338251113892
Epoch 9/100: loss 0.6529463529586792
Epoch 10/100: loss 0.6464577317237854
Epoch 11/100: loss 0.6399446129798889
Epoch 12/100: loss 0.6332703232765198
Epoch 13/100: loss 0.6264426112174988
Epoch 14/100: loss 0.6194862723350525
Epoch 15/100: loss 0.6123668551445007
Epoch 16/100: loss 0.6051577925682068
Epoch 17/100: loss 0.5977831482887268
Epoch 18/100: loss 0.5902435183525085
Epoch 19/100: loss 0.5825263261795044
Epoch 20/100: loss 0.5745843052864075
Epoch 21/100: loss 0.5664354562759399
Epoch 22/100: loss 0.5580757260322571
Epoch 23/100: loss 0.5495439767837524
Epoch 24/100: loss 0.5408124327659607
Epoch 25/100: loss 0.5318822264671326
Epoch 26/100: loss 0.5227730870246887
Epoch 27/100: loss 0.51

In [13]:
###### 4. EVALUATE AGAINST BASELINE ######
# X, y store the baseline data


X_tensor = StandardScaler().fit_transform(X)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"Baseline Accuracy: {accuracy.item()*100:.2f}%")


# Also run against in-distribution and out-of-distribution data
X_tensor = StandardScaler().fit_transform(X_shrunk)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_shrunk.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"In-distribution Accuracy: {accuracy.item()*100:.2f}%")


X_tensor = StandardScaler().fit_transform(X_out)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_out.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"Out-of-distribution Accuracy: {accuracy.item()*100:.2f}%")



Baseline Accuracy: 95.25%
In-distribution Accuracy: 97.40%
Out-of-distribution Accuracy: 89.94%
