In [1]:
import torch
import torchvision
from torchvision import transforms
import numpy as np
from dvutils.Data_Shapley import Data_Shapley
from dvutils.models_defined import Model_Train, MLP_MNIST

In [2]:
# Seed both NumPy's and PyTorch's global random number generators with the same
# value. The torch call stays last so the cell still displays the returned Generator.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x130300650>

In [3]:
# ---- general arguments ----
# device options: 'cpu', 'cuda'
device = 'cpu'
# metric options: 'neg_loss', 'acc'
metric = 'neg_loss'

# ---- neural-network training parameters ----
# model constructor (project-local MLP for MNIST)
model_fn = MLP_MNIST
# optimizer and loss are kept as classes, not instances; they are handed to Model_Train later
optimizer_fn, loss_fn = torch.optim.Adam, torch.nn.CrossEntropyLoss
# learning rate, mini-batch size, number of training epochs
lr, batch_size, epochs = 0.1, 200, 5


In [4]:
# Load the MNIST training and test splits (downloaded into ./data if missing).
# Both splits share the same root directory and tensor conversion.
mnist_kwargs = dict(root='./data', download=True, transform=transforms.ToTensor())
data = torchvision.datasets.MNIST(train=True, **mnist_kwargs)
data_val = torchvision.datasets.MNIST(train=False, **mnist_kwargs)

# The validation loader uses batches five times larger than the training batches
# and keeps the dataset order.
val_loader = torch.utils.data.DataLoader(
    data_val,
    batch_size=batch_size * 5,
    shuffle=False,
)

# Draw 1000 distinct training indices at random; these are the points to be valued.
n_selected = 1000
selected_idxs = np.random.choice(len(data), n_selected, replace=False)
data_selected = torch.utils.data.Subset(data, selected_idxs)
# shuffle=False keeps the selected points in a fixed order inside the loader
train_loader = torch.utils.data.DataLoader(
    data_selected,
    batch_size=batch_size,
    shuffle=False,
)

# Bundle the model/optimizer/loss constructors and hyper-parameters into a training process.
model_train = Model_Train(
    model_fn,
    optimizer_fn,
    loss_fn,
    lr,
    batch_size,
    epochs,
    device,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:24<00:00, 408957.16it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 415953.68it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:02<00:00, 602476.27it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 9141328.58it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [5]:
# Create the data Shapley calculator from the training loader (points to be valued),
# the validation loader, and the training process defined above.
dshap = Data_Shapley(
    train_loader,
    val_loader,
    model_train,
)

In [6]:
# Compute the data Shapley value of one data point (i.e., the data point with idx=0)
# using the Monte Carlo method with 10 Monte Carlo iterations.
# The metric is taken from the `metric` setting in the arguments cell rather than a
# hard-coded string, so changing that setting actually takes effect here.
data_value = dshap.run_idx(idx=0, method="mc", mc_iteration=10, metric=metric)

[Runing Monte Carlo for one data point]: 100%|██████████| 10/10 [00:53<00:00,  5.32s/it]


In [7]:
# Convert the result to a plain Python number via .item() before printing it.
data_value_scalar = data_value.item()
print('data value: ', data_value_scalar)

data value:  0.0010454746661707759


In [None]:
# Compute data Shapley values with the Monte Carlo method for all data points
# (i.e., get the data value of each of the 1000 selected data points).
# This can take a very long time.
# The metric is taken from the `metric` setting in the arguments cell rather than a
# hard-coded string, so changing that setting actually takes effect here.
data_values = dshap.run_all(method="mc", mc_iteration=10, metric=metric)

In [None]:
# Show the values returned by dshap.run_all for the selected data points.
print('data values: ', data_values)

In [None]:
# Run exact data Shapley for one data point (i.e., the data point with idx=0).
# The exact method only works when the total number of data points is around 10;
# with more points it raises an error. The main `dshap` object holds 1000 points,
# so a separate small valuation problem is built here from the first 10 selected
# points (idx=0 is therefore the same underlying MNIST sample as in `train_loader`).
# NOTE: the resulting value is relative to this 10-point training set, so it is not
# directly comparable with Monte Carlo estimates computed on the 1000-point set.
# NOTE(review): assumes `model_train` can be shared between Data_Shapley objects — confirm.
n_exact = 10
data_exact = torch.utils.data.Subset(data, selected_idxs[:n_exact])
train_loader_exact = torch.utils.data.DataLoader(data_exact, batch_size=batch_size, shuffle=False)
dshap_exact = Data_Shapley(train_loader_exact, val_loader, model_train)
data_value = dshap_exact.run_idx(idx=0, method="exact", metric=metric)

In [None]:
# Show the result of the exact data Shapley run from the previous cell.
print('data value: ', data_value)