# Example for:
Transfer Learning Empirical Experiment from all 10 classes of MNIST into all 10 classes of FashionMNIST \
Note that here we are using the same architecture we used in FashionMNIST folder, with 5 CNN layers and 1 linear layer

### Setup and Hyperparams

In [1]:
# Specify which gpu
import os
gpu_id = 0
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

import sys
sys.path.append('/home/arnisaf/mp-tl-study')
from functions.utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)  # if using multi-GPU

In [2]:
# cuts=0 means: end-to-end model if we are reinitializing
cuts = [0,1,2,3,4,5.6]

In [3]:
# Changes Here for the experiments
params = {
      # MODEL ARCHITECTURE PARAMS
      'depth': 6,
      'num_channels': 64,
      'kernel_size': 3,
      'activation_function': nn.ReLU,
      # TRAINING PARAMS
      'device': device,
      'lr_pretrain': 0.001,   
      'lr_fine_tune': 0.001, 
      'num_train': 40,
      'early_stop_patience': 6,
      'batch_size':4096,
      # DATASET PARAMS
      'pre_train_classes': list(range(10)),
      'fine_tune_classes': list(range(10)),
      # EXPERIMENT SETTING PARAMS
      'use_pooling': True,   # CHANGE
      'pooling_every_n_layers': 2, # add pooling after every n layers specified here. For only one pooling after all the CNN layers, this equals params['depth']
      # default value for pooling_every_n_layers is 1 (after each cnn layer)
      'pooling_stride': 2,
      'freeze': True,         # CHANGE: freeze the conv layers before the cut
      'reinit': True         # CHANGE: reinit the conv lyers only after the cut
      }

In [4]:
root_dir = './data'  # Specify your data directory here
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
dataloader_wrapped = TransferLearningWrapper(params, datasets.MNIST, datasets.FashionMNIST, root_dir, transform=transform)

In [5]:
dataloader_wrapped.pretrain_train_loader.dataset.__len__()

54000

In [6]:
train_loader_reduced = reduce_dataset(dataloader_wrapped.finetune_train_loader, 0.5, seed = 0)
train_loader_reduced.dataset.__len__()

27000

## Pretraining

In [7]:
#Create DNN model
pretrained_model = CustomCNN(params, dataloader_wrapped.output_dim)
pretrained_model.to(device)

CustomCNN(
  (conv0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act0): ReLU()
  (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act1): ReLU()
  (pool1): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act2): ReLU()
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act3): ReLU()
  (pool3): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act4): ReLU()
  (conv5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act5): ReLU()
  (pool5): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (fc): Linear(in_features=576, out_features=10, bias=True)
)

In [9]:
# Train and evaluate
trainer = Trainer(pretrained_model, dataloader_wrapped, params["lr_pretrain"], params)
train_acc, test_acc, effective_epochs, checkpoints = trainer.train()

print(f"Final Training Accuracy: {train_acc:.4f}")
print(f"Final Test Accuracy: {test_acc:.4f}")

Epoch: 0 	Training Accuracy: 47.74%
Validation Accuracy: 47.67%
Epoch: 1 	Training Accuracy: 75.33%
Validation Accuracy: 74.87%
Epoch: 2 	Training Accuracy: 82.54%
Validation Accuracy: 81.30%
Epoch: 3 	Training Accuracy: 87.35%
Validation Accuracy: 86.38%
Epoch: 4 	Training Accuracy: 89.96%
Validation Accuracy: 89.22%
Epoch: 5 	Training Accuracy: 92.27%
Validation Accuracy: 91.60%
Epoch: 6 	Training Accuracy: 93.92%
Validation Accuracy: 93.58%
Epoch: 7 	Training Accuracy: 95.48%
Validation Accuracy: 95.22%
Epoch: 8 	Training Accuracy: 95.81%
Validation Accuracy: 95.62%
Epoch: 9 	Training Accuracy: 94.51%
Validation Accuracy: 94.42%
Epoch: 10 	Training Accuracy: 96.38%
Validation Accuracy: 96.10%
Epoch: 11 	Training Accuracy: 97.17%
Validation Accuracy: 96.97%
Epoch: 12 	Training Accuracy: 97.34%
Validation Accuracy: 97.03%
Epoch: 13 	Training Accuracy: 97.67%
Validation Accuracy: 97.30%
Epoch: 14 	Training Accuracy: 97.27%
Validation Accuracy: 97.13%
Epoch: 15 	Training Accuracy: 97.97

In [10]:
#torch.save(pretrained_model.state_dict(), 'pretrained_models/pretrained_model_only_numbers_new.pth')

In [8]:
pretrained_model.load_state_dict(torch.load('pretrained_models/pretrained_model_only_numbers_new.pth'))
pretrained_model.to(device)

CustomCNN(
  (conv0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act0): ReLU()
  (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act1): ReLU()
  (pool1): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act2): ReLU()
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act3): ReLU()
  (pool3): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act4): ReLU()
  (conv5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act5): ReLU()
  (pool5): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (fc): Linear(in_features=576, out_features=10, bias=True)
)

In [9]:
eval(pretrained_model, device, dataloader_wrapped.test_loader, debug=True, classification_report_flag=True, is_cnn=True)


Average loss: 0.0307, Accuracy: 59436.0/60000 (99%)

              precision    recall  f1-score   support

     Class 0       0.99      1.00      1.00      5923
     Class 1       0.99      1.00      0.99      6742
     Class 2       0.99      0.99      0.99      5958
     Class 3       0.99      0.99      0.99      6131
     Class 4       0.99      0.99      0.99      5842
     Class 5       0.99      0.99      0.99      5421
     Class 6       0.99      0.99      0.99      5918
     Class 7       0.99      1.00      0.99      6265
     Class 8       0.99      0.98      0.99      5851
     Class 9       0.98      0.99      0.98      5949

    accuracy                           0.99     60000
   macro avg       0.99      0.99      0.99     60000
weighted avg       0.99      0.99      0.99     60000



0.9906

In [12]:
# save model for later use
foldername = "pretrained_models/pretrained_mnist_to_fashion_all_classes"
os.mkdir(foldername)
torch.save(pretrained_model.state_dict(), os.path.join(foldername, 'pretrained_model.pth'))

params_tmp = copy.deepcopy(params)
del params_tmp["device"]
params_tmp["activation_function"] = str(params_tmp.get("activation_function", nn.ReLU))
#save params as well
with open(os.path.join(foldername, 'params.json'), 'w') as fp:
    json.dump(params_tmp, fp)

## Fine-tuning Experiments

In [None]:
# load results: to continue from a checkpoint (actually don't run)
with open('results.json', 'r') as f:
    results = json.load(f)

### Baselines (End-to-end models trained on subsets of fine-tuning dataset)
We also reuse the baselines a lot! so skip if we already have the jsons

In [8]:
baselines_results = []
percentages = [0.001, 0.01, 0.1, 0.5, 1.0]

In [9]:
dataloader_wrapped.update_phase('finetune')

for sampled_percentage in percentages:      
    if sampled_percentage <= 0.01:
        repeats = 25
    elif sampled_percentage < 0.5:
        repeats = 20
    else:
        repeats = 5
    
    for repeat in range(repeats):
        # Print or log the sampled values for transparency
        print(f"\nSampled Percentage: {sampled_percentage}, Lr: {params['lr_fine_tune']}, Repeat: {repeat}")

        # Reduce the dataset
        train_loader_reduced = reduce_dataset(dataloader_wrapped.train_loader, sampled_percentage, seed = repeat)
        torch.manual_seed(repeat)
        #train_loader_reduced = reduce_dataset(dataloader_wrapped.train_loader, sampled_percentage, seed = repeat)
        dataset_namespace_new = SimpleNamespace(train_loader=train_loader_reduced, test_loader=dataloader_wrapped.test_loader, val_loader=dataloader_wrapped.val_loader)

        # Copy and then cut the model - we already deepcopy it in the function: pretrained_model
        params_tmp = copy.deepcopy(params)
        params_tmp["reinit"] = True
        model_new = cut_custom_cnn_model(pretrained_model, cut_point=0, params=params_tmp, output_dim=dataloader_wrapped.output_dim)
        model_new.to(device)

        # Train and evaluate
        trainer = Trainer(model_new, dataset_namespace_new, params['lr_fine_tune'], params)
        train_acc, test_acc, effective_epochs, checkpoints = trainer.train(verbose=0)
        print(f"Training Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")

        # Store the results
        baselines_results.append({"lr":params['lr_fine_tune'], "sampled_percentage":sampled_percentage, "sampled_cut_point":-1, "repeat":repeat, "train_acc":train_acc, "test_acc":test_acc}) # -1 for the cut point means it's baseline


Sampled Percentage: 0.001, Lr: 0.001, Repeat: 0


In [None]:
print(baselines_results)

In [21]:
# save baseline results
params_tmp = copy.deepcopy(params)
del params_tmp["device"]
params_tmp["activation_function"] = str(params_tmp["activation_function"])
results = [params_tmp] + results

with open(f'results_jsons/baselines_freeze_{params["freeze"]}_pool_{params["use_pooling"]}_lr_{params["lr_fine_tune"]}_dummy_run.json', 'w') as f:
    json.dump(results, f)

### Fine-tuning

In [10]:
results = []
#percentages = [0.001, 0.01, 0.1, 0.5, 1.0]
percentages = [0.01]

In [11]:
dataloader_wrapped.update_phase('finetune')

for sampled_percentage in percentages:

    if sampled_percentage <= 0.01:
        repeats = 25
    elif sampled_percentage < 0.5:
        repeats = 20
    else:
        repeats = 5
        
    for sampled_cut_point in cuts:

        for repeat in range(repeats):
            # Add the combination to the tested set
            # tested_combinations.add((sampled_percentage, sampled_cut_point))

            # Print or log the sampled values for transparency
            print(f"\nSampled Percentage: {sampled_percentage}, Sampled Cut Point: {sampled_cut_point}, Lr: {params['lr_fine_tune']}, Repeat: {repeat}")

            # Reduce the dataset
            train_loader_reduced = reduce_dataset(dataloader_wrapped.train_loader, sampled_percentage, seed=repeat)
            dataset_namespace_new = SimpleNamespace(train_loader=train_loader_reduced, test_loader=dataloader_wrapped.test_loader, val_loader=dataloader_wrapped.val_loader)
            torch.manual_seed(repeat) # because in the cut function we reinitialize some layers too (at least the dense layers)
            
            # Copy and then cut the model - we already deepcopy it in the function: pretrained_model
            model_new = cut_custom_cnn_model(pretrained_model, cut_point=sampled_cut_point, params=params, output_dim=dataloader_wrapped.output_dim)
            model_new.to(device)
            
            # Train and evaluate
            trainer = Trainer(model_new, dataset_namespace_new, params['lr_fine_tune'], params)
            train_acc, test_acc, effective_epochs, checkpoints = trainer.train(verbose=0)
            print(f"Training Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")

            # Store the results
            results.append({"lr":params['lr_fine_tune'], "sampled_percentage":sampled_percentage, "sampled_cut_point":sampled_cut_point, "repeat":repeat, "train_acc":train_acc, "test_acc":test_acc})


Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 0
Training Accuracy: 0.7630, Test Accuracy: 0.7379

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 1
Early stopping invoked.
Training Accuracy: 0.5704, Test Accuracy: 0.5429

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 2
Training Accuracy: 0.6870, Test Accuracy: 0.6838

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 3
Training Accuracy: 0.7130, Test Accuracy: 0.6703

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 4
Training Accuracy: 0.7167, Test Accuracy: 0.6886

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 5
Training Accuracy: 0.7222, Test Accuracy: 0.7058

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 6
Training Accuracy: 0.7667, Test Accuracy: 0.7047

Sampled Percentage: 0.01, Sampled Cut Point: 0, Lr: 0.001, Repeat: 7
Training Accuracy: 0.7370, Test Accuracy: 0.6763

Sampled Percentage: 0.0

TypeError: slice indices must be integers or None or have an __index__ method

In [12]:
print(results)

[{'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 0, 'train_acc': 0.762962962962963, 'test_acc': 0.73785}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 1, 'train_acc': 0.5703703703703704, 'test_acc': 0.5428666666666667}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 2, 'train_acc': 0.687037037037037, 'test_acc': 0.6838333333333333}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 3, 'train_acc': 0.7129629629629629, 'test_acc': 0.6703}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 4, 'train_acc': 0.7166666666666667, 'test_acc': 0.6885666666666667}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 5, 'train_acc': 0.7222222222222222, 'test_acc': 0.7058}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 6, 'train_acc': 0.7666666666666667, 'test_acc': 0.7047}, {'lr': 0.001, 'sampled_percentage

In [14]:
cuts = [5,6]

In [15]:
dataloader_wrapped.update_phase('finetune')

for sampled_percentage in percentages:

    if sampled_percentage <= 0.01:
        repeats = 25
    elif sampled_percentage < 0.5:
        repeats = 20
    else:
        repeats = 5
        
    for sampled_cut_point in cuts:

        for repeat in range(repeats):
            # Add the combination to the tested set
            # tested_combinations.add((sampled_percentage, sampled_cut_point))

            # Print or log the sampled values for transparency
            print(f"\nSampled Percentage: {sampled_percentage}, Sampled Cut Point: {sampled_cut_point}, Lr: {params['lr_fine_tune']}, Repeat: {repeat}")

            # Reduce the dataset
            train_loader_reduced = reduce_dataset(dataloader_wrapped.train_loader, sampled_percentage, seed=repeat)
            dataset_namespace_new = SimpleNamespace(train_loader=train_loader_reduced, test_loader=dataloader_wrapped.test_loader, val_loader=dataloader_wrapped.val_loader)
            torch.manual_seed(repeat) # because in the cut function we reinitialize some layers too (at least the dense layers)
            
            # Copy and then cut the model - we already deepcopy it in the function: pretrained_model
            model_new = cut_custom_cnn_model(pretrained_model, cut_point=sampled_cut_point, params=params, output_dim=dataloader_wrapped.output_dim)
            model_new.to(device)
            
            # Train and evaluate
            trainer = Trainer(model_new, dataset_namespace_new, params['lr_fine_tune'], params)
            train_acc, test_acc, effective_epochs, checkpoints = trainer.train(verbose=0)
            print(f"Training Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")

            # Store the results
            results.append({"lr":params['lr_fine_tune'], "sampled_percentage":sampled_percentage, "sampled_cut_point":sampled_cut_point, "repeat":repeat, "train_acc":train_acc, "test_acc":test_acc})


Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 0
Training Accuracy: 0.8519, Test Accuracy: 0.7451

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 1
Training Accuracy: 0.8463, Test Accuracy: 0.7377

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 2
Training Accuracy: 0.7981, Test Accuracy: 0.7270

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 3
Training Accuracy: 0.7981, Test Accuracy: 0.7214

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 4
Training Accuracy: 0.8278, Test Accuracy: 0.7417

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 5
Training Accuracy: 0.8519, Test Accuracy: 0.7493

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 6
Training Accuracy: 0.8093, Test Accuracy: 0.7254

Sampled Percentage: 0.01, Sampled Cut Point: 5, Lr: 0.001, Repeat: 7
Training Accuracy: 0.8648, Test Accuracy: 0.7503

Sampled Percentage: 0.01, Sampled Cut Point: 5,

In [16]:
print(results)

[{'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 0, 'train_acc': 0.762962962962963, 'test_acc': 0.73785}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 1, 'train_acc': 0.5703703703703704, 'test_acc': 0.5428666666666667}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 2, 'train_acc': 0.687037037037037, 'test_acc': 0.6838333333333333}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 3, 'train_acc': 0.7129629629629629, 'test_acc': 0.6703}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 4, 'train_acc': 0.7166666666666667, 'test_acc': 0.6885666666666667}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 5, 'train_acc': 0.7222222222222222, 'test_acc': 0.7058}, {'lr': 0.001, 'sampled_percentage': 0.01, 'sampled_cut_point': 0, 'repeat': 6, 'train_acc': 0.7666666666666667, 'test_acc': 0.7047}, {'lr': 0.001, 'sampled_percentage

In [12]:
# save fine-tuning results
params_tmp = copy.deepcopy(params)
del params_tmp["device"]
params_tmp["activation_function"] = str(params_tmp["activation_function"])
results = [params_tmp] + results

with open(f'results_jsons/results_freeze_{params["freeze"]}_reinit_{params["reinit"]}_pool_{params["use_pooling"]}_lr_{params["lr_fine_tune"]}_MNIST_to_Fashion_{percentages[0]}_to_{percentages[-1]}.json', 'w') as f:
    json.dump(results, f)
results = results[1:]