In [1]:
# prevent random files being included in dataset
!rm -rf `find -type d -name .ipynb_checkpoints`

In [8]:
import papermill as pm
import mlflow
import torch
from utils import md5_dir, set_seed
from torch.utils.data import DataLoader
from functools import partial
from tqdm import tqdm
import os
# tqdm = partial(tqdm, position = 0, leave = True)

from loss_functions import kd_loss
from datasets import TrainImageNetDataset

import datetime

import torch
import torch.nn.functional as F

from utils import *

# if using pretrained model
from torchvision.models import ResNet50_Weights

In [3]:
set_seed(42)
torch.cuda.empty_cache()

In [4]:
# Default Parameters
run_id = "f13020b4e2304bd7837e10d17c6ea8ea"
train_data_path = "../data/ImageNet/ILSVRC/Data/CLS-LOC/train/"
# test_data_path = "data/ImageNet/ILSVRC/Data/CLS-LOC/val/"

train_data_labels_path = "../data/ImageNet/LOC_train_solution.csv"
# test_data_labels_path = "data/ImageNet/LOC_val_solution.csv"

label_mapping_path = "../data/ImageNet/LOC_synset_mapping.txt"

resnet50_weights = ResNet50_Weights.DEFAULT

preprocess = resnet50_weights.transforms()

device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
train_dataset = TrainImageNetDataset(train_data_path, train_data_labels_path, label_mapping_path, preprocess, data_reduction = 0)

# test_dataset = TestImageNetDataset(test_data_path, test_data_labels_path, label_mapping_path, preprocess)

train_dataloader = DataLoader(train_dataset, batch_size=80, shuffle=True)

In [6]:
# if using pretrained model
from torchvision.models import resnet50, ResNet50_Weights, resnet18

resnet50_pretrained_weights = ResNet50_Weights.DEFAULT

epochs = 6
lr = 0.0001

In [16]:
os.listdir()

['__pycache__',
 'run_experiment.ipynb',
 'datasets.py',
 'utils.py',
 'output',
 'run_preprocessing.ipynb',
 'run_evaluation.ipynb',
 'run_training.ipynb',
 'mlruns',
 'test.txt',
 'mdoel_cherckpionts',
 '.ipynb_checkpoints',
 'environment.txt',
 'loss_functions.py']

In [24]:
def train_student(student, teacher, train_dataloader, criterion, optimizer, epochs, device, model_save_path):
    """
    - student: The smaller, untrained model that uses the teacher's output as an additional label
    - teacher: The pretrained model used to help the student model learn
    - train_dataloader: Dataloader for training data
    - criterion: The loss function
    - optimizer: The optimization algorithm
    - epochs: Number of training epochs
    - device: Device to run training
    """
    teacher.eval()
    teacher.to(device)
    student.train()
    student.to(device)
    epoch_print = 6000
    
    for epoch in range(epochs):
        running_loss = 0.0
        loss_cntr = 0
        previous_loss = 0.0

        for inputs, labels in tqdm(train_dataloader, position = 0, leave = True):
            inputs, labels = inputs.to(device), labels.to(device)
            labels = F.one_hot(labels, num_classes=1000).float()

            # Zero the gradients 
            optimizer.zero_grad()

            teacher_predictions = teacher(inputs)
            student_predictions = student(inputs)

            loss = criterion(student_predictions, labels, teacher_predictions, 0.5, 0.5)

            loss.backward()

            optimizer.step()

            previous_loss += loss
            if loss_cntr > 0 and loss_cntr % epoch_print == 0:
                print("Current loss:", loss)
                print("Running loss:", running_loss)
                print("Loss delta:", previous_loss - loss)
                print("Avg loss per epoch", previous_loss/epoch_print)
                previous_loss = 0.0
            loss_cntr += 1
            
            running_loss += loss.item()

            print(loss.item())
            if epoch == 0 and loss_cntr == 15:
                if model_save_path:
                    torch.save(student.state_dict(), model_save_path)
                return

        average_loss = running_loss / len(train_dataloader)
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {average_loss:.4f}')

        with mlflow.start_run(run_id=run_id) as run:
            mlflow.log_metric("student_training_loss", average_loss)

    with mlflow.start_run(run_id=run_id) as run:
        mlflow.pytorch.log_model(
            pytorch_model=teacher.to("cpu"),
            artifact_path="teacher",)

        mlflow.pytorch.log_model(
            pytorch_model=student.to("cpu"),
            artifact_path="student")

from torchvision.models import resnet50, ResNet50_Weights, resnet18

resnet50_pretrained_weights = ResNet50_Weights.DEFAULT

teacher = resnet50(weights=resnet50_pretrained_weights)


student = resnet18(weights=None)
pathh = "03_19_034125"
model_load_path = "./mdoel_cherckpionts/" + pathh
if model_load_path:
    student.load_state_dict(torch.load(model_load_path))


optimizer = torch.optim.Adam(student.parameters(), lr = 0.001)
epochs = 6

current_time = datetime.datetime.now().strftime('%m_%d_%H%M%S')
model_save_path = "./mdoel_cherckpionts/" + current_time

train_student(student, teacher, train_dataloader, kd_loss, optimizer, epochs, device, model_save_path)

  0%|                                       | 1/16015 [00:00<2:52:05,  1.55it/s]

4.660658836364746


  0%|                                       | 2/16015 [00:01<2:54:02,  1.53it/s]

4.621598243713379


  0%|                                       | 3/16015 [00:01<2:43:27,  1.63it/s]

4.47307825088501


  0%|                                       | 4/16015 [00:02<2:40:59,  1.66it/s]

4.615926742553711


  0%|                                       | 5/16015 [00:03<2:36:36,  1.70it/s]

4.51435661315918


  0%|                                       | 6/16015 [00:03<2:34:13,  1.73it/s]

4.277093887329102


  0%|                                       | 7/16015 [00:04<2:33:29,  1.74it/s]

4.480215549468994


  0%|                                       | 8/16015 [00:04<2:37:20,  1.70it/s]

4.319109916687012


  0%|                                       | 9/16015 [00:05<2:34:59,  1.72it/s]

4.301661491394043


  0%|                                      | 10/16015 [00:05<2:37:03,  1.70it/s]

4.417448043823242


  0%|                                      | 11/16015 [00:06<2:43:24,  1.63it/s]

4.419055938720703


  0%|                                      | 12/16015 [00:07<2:41:32,  1.65it/s]

4.527471542358398


  0%|                                      | 13/16015 [00:07<2:40:04,  1.67it/s]

4.4186296463012695


  0%|                                      | 14/16015 [00:08<2:38:10,  1.69it/s]

4.379304885864258


  0%|                                      | 14/16015 [00:09<2:52:36,  1.55it/s]

4.38444709777832





In [7]:
teacher = resnet50(weights=resnet50_pretrained_weights)
student = resnet18(weights=None)
independent_student = resnet18(weights=None)


ind_optimizer = torch.optim.Adam(student.parameters(), lr)
optimizer = torch.optim.Adam(independent_student.parameters(), lr)

ind_criterion = torch.nn.CrossEntropyLoss()
criterion = kd_loss

ind_name = "independent_student"
std_name = "student"

components = {
    ind_name: {"model": independent_student, 
               "opt": ind_optimizer, 
               "criterion": ind_criterion, 
               "running_loss": 0,
               "previous_loss": 0.0
              },
    std_name: {"model": student, 
               "opt": optimizer, 
               "criterion": criterion,
               "running_loss": 0,
               "previous_loss": 0.0
              }
}

train_models(components, teacher, train_dataloader, epochs, device, run_id)
# train_student(student, teacher, train_dataloader, kd_loss, optimizer, epochs, device)
# train_independent_student(independent_student, train_dataloader, criterion, optimizer, epochs, device)

 25%|████████▉                           | 4000/16015 [43:13<2:10:09,  1.54it/s]

student
Current loss: tensor(7.0426, device='cuda:0', grad_fn=<DivBackward1>)
Loss delta: tensor(7.0426, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch 0.0


 25%|████████▉                           | 4001/16015 [43:13<2:10:58,  1.53it/s]

student
Current loss: tensor(4.3942, device='cuda:0', grad_fn=<AddBackward0>)
Loss delta: tensor(-2.6484, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0018, device='cuda:0', grad_fn=<DivBackward0>)


 50%|████████████████▉                 | 8000/16015 [1:26:13<1:28:18,  1.51it/s]

student
Current loss: tensor(7.1135, device='cuda:0', grad_fn=<DivBackward1>)
Loss delta: tensor(2.7193, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0011, device='cuda:0', grad_fn=<DivBackward0>)


 50%|████████████████▉                 | 8001/16015 [1:26:14<1:26:25,  1.55it/s]

student
Current loss: tensor(4.3814, device='cuda:0', grad_fn=<AddBackward0>)
Loss delta: tensor(-2.7321, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0018, device='cuda:0', grad_fn=<DivBackward0>)


 75%|██████████████████████████▏        | 12000/16015 [2:09:34<44:21,  1.51it/s]

student
Current loss: tensor(6.9600, device='cuda:0', grad_fn=<DivBackward1>)
Loss delta: tensor(2.5785, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0011, device='cuda:0', grad_fn=<DivBackward0>)


 75%|██████████████████████████▏        | 12001/16015 [2:09:34<45:22,  1.47it/s]

student
Current loss: tensor(4.4683, device='cuda:0', grad_fn=<AddBackward0>)
Loss delta: tensor(-2.4916, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0017, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████████████████████████████▉| 16000/16015 [2:52:51<00:09,  1.53it/s]

student
Current loss: tensor(7.0803, device='cuda:0', grad_fn=<DivBackward1>)
Loss delta: tensor(2.6120, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0011, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████████████████████████████▉| 16001/16015 [2:52:52<00:08,  1.57it/s]

student
Current loss: tensor(4.5043, device='cuda:0', grad_fn=<AddBackward0>)
Loss delta: tensor(-2.5760, device='cuda:0', grad_fn=<SubBackward0>)
Avg loss per epoch tensor(0.0018, device='cuda:0', grad_fn=<DivBackward0>)


100%|███████████████████████████████████| 16015/16015 [2:53:01<00:00,  1.54it/s]


Epoch [1/6], independent_student: 7.0242


MlflowException: Run '5363b9a6f6954354b7bb68d535b5ea88' not found

In [1]:
def train_student(student, teacher, train_dataloader, criterion, optimizer, epochs, device):
    """
    - student: The smaller, untrained model that uses the teacher's output as an additional label
    - teacher: The pretrained model used to help the student model learn
    - train_dataloader: Dataloader for training data
    - criterion: The loss function
    - optimizer: The optimization algorithm
    - epochs: Number of training epochs
    - device: Device to run training
    """
    teacher.eval()
    teacher.to(device)
    student.train()
    student.to(device)

    # with tqdm as epoch:
        # for i in tqdm(range(epochs)):
    # for epoch in tqdm(range(epochs), leave = True, position = 0):
    # in tqdm(train_dataloader, leave = True, position = 0):
    for epoch in tqdm(range(epochs), leave = True, position = 0):
        running_loss = 0.0
        sampleNum = 0
        currentLoss = 0

        for inputs, labels in tqdm(train_dataloader, leave = True, position = 0):
        # for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            labels = F.one_hot(labels, num_classes=1000).float()

            # Zero the gradients 
            optimizer.zero_grad()

            teacher_predictions = teacher(inputs)
            student_predictions = student(inputs)

            loss = criterion(student_predictions, labels, teacher_predictions, 0.5, 0.5)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            currentLoss += loss.item()

            # if sampleNum % 100 == 0 and sampleNum > 0: # write this to a file somewhere else so tqdm doesnt mess up
            #     print("loss: ", str(currentLoss/100))
            #     print("total loss: ", str(running_loss/sampleNum))
            #     currentLoss = 0
            sampleNum += 1
            # print(str(running_loss) + " ", end = '')
            # break

        average_loss = running_loss / len(train_dataloader)
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {average_loss:.4f}')

        # save training loss in mlflow
        with mlflow.start_run(run_id=run_id) as run:
            mlflow.log_metric("student_training_loss", average_loss)

    with mlflow.start_run(run_id=run_id) as run:
        mlflow.pytorch.log_model(
            pytorch_model=teacher.to("cpu"),
            artifact_path="teacher",)

        mlflow.pytorch.log_model(
            pytorch_model=student.to("cpu"),
            artifact_path="student")
        

optimizer = torch.optim.Adam(student.parameters(), lr)
train_student(student, teacher, train_dataloader, kd_loss, optimizer, epochs, device)

NameError: name 'torch' is not defined

In [8]:
def train_independent_student(independent_student, train_dataloader, criterion, optimizer, epochs, device):
    """
    - teacher: The pretrained model used to help the student model learn
    - student: The smaller, untrained model that uses the teacher's output as an additional label
    - criterion: The loss function
    - optimizer: The optimization algorithm
    - epochs: Number of training epochs
    - device: Device to run training
    """
    independent_student.train()
    independent_student.to(device)
    
    for epoch in tqdm.tqdm(range(epochs)):
        running_loss = 0.0

        for inputs, labels in tqdm(train_dataloader, leave = True, position = 0):
            inputs, labels = inputs.to(device), labels.to(device)
            labels = F.one_hot(labels, num_classes=1000).float()

            # Zero the gradients 
            optimizer.zero_grad()

            independent_student_predictions = independent_student(inputs)

            loss = criterion(independent_student_predictions, labels)

            loss.backward()

            optimizer.step()

            running_loss += loss.item()
            break

        average_loss = running_loss / len(train_dataloader)
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {average_loss:.4f}')


        # save training loss in mlflow
        with mlflow.start_run(run_id=run_id) as run:
            mlflow.log_metric("independent_student_training_loss", average_loss)

    with mlflow.start_run(run_id=run_id) as run:
        mlflow.pytorch.log_model(
            pytorch_model=independent_student.to("cpu"),
            artifact_path="independent_student"
        )

optimizer = torch.optim.Adam(independent_student.parameters(), lr)
criterion = torch.nn.CrossEntropyLoss()
train_independent_student(independent_student, train_dataloader, criterion, optimizer, epochs, device)

AttributeError: type object 'tqdm' has no attribute 'tqdm'