# Estimating Training Data Influence by Tracing Gradient Descent

1) Import of necessary libraries and California Housing dataset fetching function:

In [1]:
!pip install torchmetrics
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from tqdm import tqdm
from sklearn.datasets import fetch_california_housing
from torchmetrics import ExplainedVariance

Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[K     |████████████████████████████████| 519 kB 1.2 MB/s eta 0:00:01
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


2) Setup to enable saving checkpoints in a dedicated folder, to reuse them in the implementation of TracInCP and in other eventualities:

In [2]:
def savecheckpoint(checkpoint, filename="chpcheckpoint.pth.tar", dir_name="batch"):

  print("=> saving checkpoint")
  ph = F"./{dir_name}/{filename}"
  torch.save(checkpoint, ph)

3: Definition of the model: to replicate the results of the paper, a 3 hidden layer neural network has been defined with 162K parameters.

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(8, 400)
        self.fc2 = nn.Linear(400, 400)
        self.fc3 = nn.Linear(400, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create an instance of the network
model = Net()

# Count the total number of parameters in the network
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total Parameters:", total_params)

Total Parameters: 164401


5) Dataset is fetched and separated into features X and labels y:

In [9]:
data = fetch_california_housing()
X = data.data.astype(np.float32)
y = data.target

5) Then it is further processed: first the features are standardized with a standard scaler then they are splitted into Train and Test sets. The tradeoff between train and test is set to 8:2 as in the paper.

In [10]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

5) The splitted sets are processed to convert them into tensors and batched. The model is loaded into CUDA if it is available, else it will run on cpu.

In [11]:
# Convert numpy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)
Y_train = torch.tensor(Y_train, dtype=torch.float).view(-1, 1)
Y_test = torch.tensor(Y_test, dtype=torch.float).view(-1, 1)

datasets = torch.utils.data.TensorDataset(X_train, Y_train)
datasets_test = torch.utils.data.TensorDataset(X_test, Y_test)

batch_size = 8

train_iter = torch.utils.data.DataLoader(datasets, batch_size=batch_size, shuffle=True)
test_iter = torch.utils.data.DataLoader(datasets_test, batch_size=batch_size, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

# Create an instance of the model
model = Net()
model.to(device)

cuda


Net(
  (fc1): Linear(in_features=8, out_features=400, bias=True)
  (fc2): Linear(in_features=400, out_features=400, bias=True)
  (fc3): Linear(in_features=400, out_features=1, bias=True)
  (relu): ReLU()
)

6) Training: in order to replicate also here the process of the paper, an Adam optimizer is exploited and the network is trained for 200 epochs.

In [13]:
import os
if not os.path.exists("CHP_checkpoints/batch_"+str(batch_size)):
    os.makedirs("CHP_checkpoints/batch_"+str(batch_size))

# Define the loss function and optimizer

loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_list = []

num_epochs = 200

for epoch in tqdm(range(num_epochs)):
    if epoch % 10 == 0:
        savecheckpoint(model.state_dict(), str(epoch) + "chpcheckpoint.pth.tar", "CHP_checkpoints/batch_"+str(batch_size))
    for x, y in train_iter:
        x = x.to(device)
        y = y.to(device)
        output = model(x)
        l = loss(output, y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    loss_list.append(l.data)
    print("epoch {} loss: {:.4f}".format(epoch + 1, l.item()))

  0%|                                                   | 0/200 [00:00<?, ?it/s]

=> saving checkpoint


  0%|▏                                          | 1/200 [00:02<09:30,  2.87s/it]

epoch 1 loss: 0.0938


  1%|▍                                          | 2/200 [00:05<09:24,  2.85s/it]

epoch 2 loss: 0.1826


  2%|▋                                          | 3/200 [00:08<09:28,  2.89s/it]

epoch 3 loss: 0.1430


  2%|▊                                          | 4/200 [00:11<09:29,  2.90s/it]

epoch 4 loss: 0.2051


  2%|█                                          | 5/200 [00:14<09:25,  2.90s/it]

epoch 5 loss: 0.0298


  3%|█▎                                         | 6/200 [00:17<09:18,  2.88s/it]

epoch 6 loss: 0.0989


  4%|█▌                                         | 7/200 [00:20<09:14,  2.87s/it]

epoch 7 loss: 0.0616


  4%|█▋                                         | 8/200 [00:23<09:10,  2.86s/it]

epoch 8 loss: 0.0748


  4%|█▉                                         | 9/200 [00:25<09:11,  2.89s/it]

epoch 9 loss: 0.0708


  5%|██                                        | 10/200 [00:28<09:08,  2.89s/it]

epoch 10 loss: 0.1408
=> saving checkpoint


  6%|██▎                                       | 11/200 [00:31<09:03,  2.88s/it]

epoch 11 loss: 0.0829


  6%|██▌                                       | 12/200 [00:34<09:00,  2.87s/it]

epoch 12 loss: 0.3976


  6%|██▋                                       | 13/200 [00:37<08:57,  2.87s/it]

epoch 13 loss: 0.0313


  7%|██▉                                       | 14/200 [00:40<08:53,  2.87s/it]

epoch 14 loss: 0.0558


  8%|███▏                                      | 15/200 [00:43<08:49,  2.86s/it]

epoch 15 loss: 0.1902


  8%|███▎                                      | 16/200 [00:45<08:45,  2.85s/it]

epoch 16 loss: 0.3117


  8%|███▌                                      | 17/200 [00:48<08:41,  2.85s/it]

epoch 17 loss: 0.2924


  9%|███▊                                      | 18/200 [00:51<08:41,  2.86s/it]

epoch 18 loss: 0.1564


 10%|███▉                                      | 19/200 [00:54<08:37,  2.86s/it]

epoch 19 loss: 0.0336


 10%|████▏                                     | 20/200 [00:57<08:29,  2.83s/it]

epoch 20 loss: 0.2780
=> saving checkpoint


 10%|████▍                                     | 21/200 [01:00<08:26,  2.83s/it]

epoch 21 loss: 0.0828


 11%|████▌                                     | 22/200 [01:03<08:26,  2.84s/it]

epoch 22 loss: 0.3424


 12%|████▊                                     | 23/200 [01:05<08:25,  2.85s/it]

epoch 23 loss: 0.1494


 12%|█████                                     | 24/200 [01:08<08:21,  2.85s/it]

epoch 24 loss: 0.0869


 12%|█████▎                                    | 25/200 [01:11<08:18,  2.85s/it]

epoch 25 loss: 0.1337


 13%|█████▍                                    | 26/200 [01:14<08:15,  2.84s/it]

epoch 26 loss: 0.3806


 14%|█████▋                                    | 27/200 [01:17<08:13,  2.85s/it]

epoch 27 loss: 0.2543


 14%|█████▉                                    | 28/200 [01:20<08:15,  2.88s/it]

epoch 28 loss: 0.2849


 14%|██████                                    | 29/200 [01:23<08:15,  2.90s/it]

epoch 29 loss: 0.1374


 15%|██████▎                                   | 30/200 [01:26<08:15,  2.91s/it]

epoch 30 loss: 0.0769
=> saving checkpoint


 16%|██████▌                                   | 31/200 [01:29<08:13,  2.92s/it]

epoch 31 loss: 0.1588


 16%|██████▋                                   | 32/200 [01:32<08:03,  2.88s/it]

epoch 32 loss: 0.2605





KeyboardInterrupt: 

7) Results: the model built achieved a variance of 0.72 on test set and 0.93 on train set, which is very similar to the variances obtained in the paper (respectively 0.72 and 0.70)

In [27]:
# Evaluate the model
model.eval()
with torch.no_grad():
    X_test = X_test.to(device)
    Y_test = Y_test.to(device)
    predicted = model(X_test)
    mse = loss(predicted, Y_test)
    mae = torch.mean(torch.abs(predicted - Y_test))

    explained_variance = ExplainedVariance()
    exp_var_test = explained_variance(predicted, Y_test)

    X_train = X_train.to(device)
    Y_train = Y_train.to(device)
    predicted_train = model(X_train)

    exp_var_train = explained_variance(predicted_train, Y_train)

    print("Mean Squared Error: {:.4f}".format(mse))
    print("Mean Absolute Error: {:.4f}".format(mae))
    print("Explained Variance on test set: {:.4f}".format(exp_var_test))
    print("Explained Variance on train set: {:.4f}".format(exp_var_train))

Mean Squared Error: 0.3043
Mean Absolute Error: 0.3598
Explained Variance on test set: 0.7678
Explained Variance on train set: 0.9305
