In [1]:
print("hello world")

hello world


In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

print("imports complete")

imports complete


In [3]:
class MLP(nn.Module):
    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_classes)
        )

    def forward(self, x):
        return self.net(x)


In [4]:
# Shrink data function

def filter_data(X, y, numeric_cols, gamma, use_mahalanobis, path_name=None):
    if use_mahalanobis:
        # Mahalanobis Distance function
        def mahalanobis(x):
            return np.sqrt((x - mean) @ inv_cov @ (x - mean).T)
        

        mean = X.mean(axis=0)
        cov = np.cov(X, rowvar=False)
        inv_cov = np.linalg.pinv(cov)


        X["md"] = [mahalanobis(x) for x in X.values]

        print(X["md"])

        # Trim based on percentile thresholds
        low_p, high_p = gamma, 1 - gamma
        low_val = X["md"].quantile(low_p)
        high_val = X["md"].quantile(high_p)

        mask = pd.Series([True] * X.shape[0])
        for i in range(X.shape[0]):
            mask[i] = low_val <= X["md"][i]  <= high_val
        
        X = X.drop(columns=["md"])
        

    else:
        def compute_shrunk_bounds(df, numeric_cols, gamma=0.1, low_pct=1.0, high_pct=99.0):
            # df: raw data
            mins = df[numeric_cols].quantile(low_pct/100.0).values
            maxs = df[numeric_cols].quantile(high_pct/100.0).values
            spans = maxs - mins
            # avoid zero spans
            spans[spans == 0] = 1e-6
            low = mins + gamma * spans
            high = maxs - gamma * spans
            return low, high
        
        low_bounds, high_bounds = compute_shrunk_bounds(X, numeric_cols, gamma=gamma)

        mask = pd.Series([True] * X.shape[0])
        for i, col in enumerate(numeric_cols):
            mask &= (X[col] >= low_bounds[i]) & (X[col] <= high_bounds[i])


    # Do the shrinking
    X_shrunk, y_shrunk = X[mask], y[mask]
    X_out, y_out = X[~mask], y[~mask]

    X_shrunk = X_shrunk.reset_index(drop=True)
    y_shrunk = y_shrunk.reset_index(drop=True)

    # Save to file
    if path_name:
        in_distribution_file = path_name + "in_distribution.csv"
        out_distribution_file = path_name + "out_of_distribution.csv"

        output_data = pd.concat([X_shrunk, y_shrunk], axis=1)
        output_data.to_csv(in_distribution_file, index=False)

        output_data = pd.concat([X_out, y_out], axis=1)
        output_data.to_csv(out_distribution_file, index=False)


    return X_shrunk, y_shrunk, X_out, y_out


# Training model function

def train_model(X, y, model, criterion, optimizer, filename=None):
    # Ready data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # "fit" learns the mean/variance of each feature
    X_test = scaler.transform(X_test)  # fit isn't necessary because the scaler already learned the features

    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.values.squeeze(), dtype=torch.long)
    y_test = torch.tensor(y_test.values.squeeze(), dtype=torch.long)

    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train.argmax(dim=1)) # convert to class indicies
        loss.backward()     # backprop
        optimizer.step()    # update gradients

        print(f"Epoch {epoch+1}/{num_epochs}: loss {loss}")

    # Output test accuracy
    predictions = model(X_test)
    _, predicted_classes = torch.max(predictions, 1)
    accuracy = (predicted_classes == y_test.argmax(dim=1)).float().mean()
    print(f"Test Accuracy: {accuracy.item()*100:.2f}%")

    # Verify that weights can be saved
    if filename:
        torch.save(model.state_dict(), filename)
        print("Model weights saved to " + filename)

In [5]:
# Research infrastructure
# Given dataset name OR pure data file upload, this file needs to create an MLP, train it, and output the weights to a file
# There should also be a parameter for shrunk data or not (this takes a parameter for how much data from each side to filter)


###### 1. LOAD DATASET ######

# Option 1: UCI Dataset
uci_function = datasets.load_wine                                  # Parameter
dataset = uci_function()
class_names = dataset.target_names
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['target'])
y = pd.get_dummies(y['target'])
y.columns = class_names


# Option 2: Load data from file
# dataset = pd.read_csv('bc_shrunk_data.csv')                                  # Parameter
# class_names = ["malignant", "benign"]                                        # Parameter
# X = dataset.drop(columns=class_names)                                        # May need to undo one-hot encoding here...
# y = dataset[[class_names]]


categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print(X.head())
print(y.head())
print(class_names)

print(categorical_features)
print(numeric_features)

print("\nData loaded")


   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                  

In [6]:
###### 2. SHRINK DATASET ######
# Optionally filter out data that is outside some percentage range

gamma = 0.1
use_mahalanobis = False
filename = "../data/wine/datasets/baseline/80_percent/"

X_shrunk, y_shrunk, X_out, y_out = filter_data(X, y, numeric_features, gamma, use_mahalanobis, filename)

print("Old X size: ", X.shape)
print("New X size: ", X_shrunk.shape)
print("Old y size: ", y.shape)
print("New y size: ", y_shrunk.shape)

# don't do one-hot encoding...?


Old X size:  (178, 13)
New X size:  (35, 13)
Old y size:  (178, 3)
New y size:  (35, 3)


In [7]:
###### 3. TRAIN MODEL ######

# X_model, y_model = X, y
X_model, y_model = X_shrunk, y_shrunk

input_dim = X_model.shape[1]
num_classes = 3
model = MLP(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

filename = "../data/wine/model_weights/baseline/80_percent.pth"

train_model(X_model, y_model, model, criterion, optimizer, filename)


Epoch 1/100: loss 1.1263524293899536
Epoch 2/100: loss 1.116843581199646
Epoch 3/100: loss 1.1075656414031982
Epoch 4/100: loss 1.0983203649520874
Epoch 5/100: loss 1.0890705585479736
Epoch 6/100: loss 1.0797736644744873
Epoch 7/100: loss 1.0705145597457886
Epoch 8/100: loss 1.0613768100738525
Epoch 9/100: loss 1.0521339178085327
Epoch 10/100: loss 1.043121576309204
Epoch 11/100: loss 1.033964991569519
Epoch 12/100: loss 1.0246137380599976
Epoch 13/100: loss 1.015146255493164
Epoch 14/100: loss 1.0054153203964233
Epoch 15/100: loss 0.9954060316085815
Epoch 16/100: loss 0.9851688146591187
Epoch 17/100: loss 0.9748384356498718
Epoch 18/100: loss 0.9642773866653442
Epoch 19/100: loss 0.9535003304481506
Epoch 20/100: loss 0.9425334930419922
Epoch 21/100: loss 0.9313855767250061
Epoch 22/100: loss 0.9199113845825195
Epoch 23/100: loss 0.9083814024925232
Epoch 24/100: loss 0.8966102004051208
Epoch 25/100: loss 0.884552538394928
Epoch 26/100: loss 0.8723201155662537
Epoch 27/100: loss 0.86000

In [8]:
###### 4. EVALUATE AGAINST BASELINE ######
# X, y store the baseline data


X_tensor = StandardScaler().fit_transform(X)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"Baseline Accuracy: {accuracy.item()*100:.2f}%")


# Also run against in-distribution and out-of-distribution data
X_tensor = StandardScaler().fit_transform(X_shrunk)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_shrunk.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"In-distribution Accuracy: {accuracy.item()*100:.2f}%")


X_tensor = StandardScaler().fit_transform(X_out)
X_tensor = torch.tensor(X_tensor, dtype=torch.float32)
y_tensor = torch.tensor(y_out.values.squeeze(), dtype=torch.long)

predictions = model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_tensor.argmax(dim=1)).float().mean()
print(f"Out-of-distribution Accuracy: {accuracy.item()*100:.2f}%")



Baseline Accuracy: 65.73%
In-distribution Accuracy: 97.14%
Out-of-distribution Accuracy: 54.55%
