# co-training

In [1]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet50

In [2]:
from math import floor
import numpy as np
import copy
import random
import pickle
import os

In [3]:
def train(loader, model, loss_fn, optimizer, device):
    size = len(loader.dataset)
    model.train()
    i = 0
    for batch, (X, y) in enumerate(loader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        loss, current = loss, (batch + 1) * len(X)
        print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")
        i += 1
        
        # if batch % 10 == 0:
        #     loss, current = loss, (batch + 1) * len(X)
        #     print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")

In [4]:
def test(loader, model, loss_fn, device):
  size = len(loader.dataset)
  num_batches = len(loader)
  model.eval()
  test_loss, correct = 0, 0
  with torch.no_grad():
    for X, y in loader:
      X, y = X.to(device), y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
      
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return test_loss

In [5]:
def predict(loader, model, device):
    print(f"Number of instances: {len(loader.dataset)}")
    model.eval()
    predictions = []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            predictions.append(output)
    return torch.cat(predictions) # output shape (# instances, # outputs)

In [6]:
# takes in a Tensor of shape e.g. (# instances, # prob outputs) and returns a tuple
# (Tensor[top probabilities], Tensor[predicted labels], Tensor[instance indexes])
def get_topk_pred(pred, k):
    prob, label = torch.max(pred, 1)
    idx = torch.argsort(prob, descending=True)[:k]
    return prob[idx].cpu(), label[idx].cpu(), idx.cpu()

In [7]:
def add_to_imagefolder(paths, labels, dataset):
    """
    Adds the paths with the labels to an image classification dataset

    :list paths: a list of absolute image paths to add to the dataset
    :list labels: a list of labels for each path
    :Dataset dataset: the dataset to add the samples to
    """

    new_samples = list(zip(paths, labels))

    dataset.samples += new_samples

    return dataset.samples

In [8]:
def remove_collisions(lbl_model0, lbl_model1, idx_model0, idx_model1):
    # find instances and indices of instances that have
    # been labeled as most confident by both model0, model1
    inter, idx_inter0, idx_inter1 = np.intersect1d(
                                        idx_model0,
                                        idx_model1,
                                        return_indices=True)

    print(f"Number of predictions (model0): {len(idx_model0)}")
    print(f"Number of predictions (model1): {len(idx_model1)}")
    print(f"Found {len(inter)} potential conflicting predictions")

    # bool mask to identify the conflicting predictions (collision)
    mask_coll = lbl_model0[idx_inter0] != lbl_model1[idx_inter1]
    collisions = inter[mask_coll]

    print(f"Found {len(collisions)} conflicting predictions")

    if (len(collisions) > 0):
        print(f"Collisions: {collisions}")
        # find where these collisions are actually at
        # in their respective lists, and remove them...
        # maybe want to return this as well? ...
        idx_coll0 = idx_inter0[mask_coll]
        idx_coll1 = idx_inter1[mask_coll]

        # masks to remove the instances with conflicting predictions
        mask0 = np.ones(len(idx_model0), dtype=bool)
        mask0[idx_coll0] = False
        mask1 = np.ones(len(idx_model1), dtype=bool)
        mask1[idx_coll1] = False

        lbl_model0 = lbl_model0[mask0]
        lbl_model1 = lbl_model1[mask1]
        idx_model0 = idx_model0[mask0]
        idx_model1 = idx_model1[mask1]

    return lbl_model0, lbl_model1, idx_model0, idx_model1

In [9]:
# train two models on two different views
# then add top k% of predictions on the unlabeled set
# to the labeled datasets
def cotrain(loader0, loader1, loader_unlbl0, loader_unlbl1,
            model0, model1, k, device):

    # make predictions
    print("Making predictions with model0...")
    pred_model0 = predict(loader_unlbl0, model0, device)
    print("Making predictions with model1...")
    pred_model1 = predict(loader_unlbl1, model1, device)

    # get top-k predictions (labels, instance indexes in the dataset)
    print("Finding top-k predictions...")
    _, lbl_topk0, idx_topk0 = get_topk_pred(
                                    pred_model0,
                                    k if k <= len(pred_model0) else len(pred_model0))
    _, lbl_topk1, idx_topk1 = get_topk_pred(
                                    pred_model1, 
                                    k if k <= len(pred_model1) else len(pred_model1))

    print(f"Number of unlabeled instances: {len(loader_unlbl0.dataset)}")

    # what if two models predict confidently on the same instance?
    # find and remove conflicting predictions from the lists
    # may want to return the indices of the collisions too...?
    lbl_topk0, lbl_topk1, idx_topk0, idx_topk1 = \
    remove_collisions(lbl_topk0, lbl_topk1, idx_topk0, idx_topk1)

    # convert from list to array for the convenient numpy indexing 
    samples_unlbl0 = np.stack([np.array(a) for a in loader_unlbl0.dataset.samples])
    samples_unlbl1 = np.stack([np.array(a) for a in loader_unlbl1.dataset.samples])

    print(f"Shape of samples_unlbl0: {samples_unlbl0.shape}")
    print(f"last element: {samples_unlbl0[-1][:]}") 

    list_samples0 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl0[idx_topk0])]
    list_samples1 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl1[idx_topk1])]
    
    print(f"Number of elements in list_samples0: {len(list_samples0)}")
    print(f"last element: {list_samples0[-1]}")
    
    paths0 = [i for i, _ in list_samples0]
    paths1 = [i for i, _ in list_samples1]

    print(f"Number of elements in paths0: {len(paths0)}")
    print(f"last element: {paths0[-1]}")

    print(f"Number of elements in lbl_topk0: {len(lbl_topk0)}")
    print(f"last element: {lbl_topk0[-1]}")

    # add pseudolabeled instances to the labeled datasets
    loader0.dataset.samples = add_to_imagefolder(paths1, lbl_topk1.tolist(), loader0.dataset)
    loader1.dataset.samples = add_to_imagefolder(paths0, lbl_topk0.tolist(), loader1.dataset)

    print(f"loader0 number of samples: {len(loader0.dataset.samples)}")
    print(f"last element: {loader0.dataset.samples[-1]}")

    # remove instances from unlabeled dataset
    mask_unlbl0 = np.ones(len(loader_unlbl0.dataset), dtype=bool)
    mask_unlbl1 = np.ones(len(loader_unlbl1.dataset), dtype=bool)

    mask_unlbl0[idx_topk0] = False
    mask_unlbl1[idx_topk1] = False

    print(f"Number of unlabeled instances to remove: {(~mask_unlbl0).sum()}")

    samples_unlbl0 = samples_unlbl0[mask_unlbl0]
    samples_unlbl1 = samples_unlbl1[mask_unlbl1]

    list_unlbl0 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl0)]
    list_unlbl1 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl1)]

    loader_unlbl0.dataset.samples = list_unlbl0
    loader_unlbl1.dataset.samples = list_unlbl1

    print(f"loader_unlbl0 number of samples: {len(loader_unlbl0.dataset.samples)}")

In [10]:
# function to split the datasets of the two views so that
# the samples in the views are still aligned, time-wise, by index
def train_test_split_samples(samples0, samples1, test_size, random_state=None):
    if random_state is not None:
        random.seed(random_state)

    assert test_size > 0 and test_size < 1, \
        'test_size should be a float between (0, 1)'

    assert len(samples0) == len(samples1), \
        'number of samples in samples0, samples1 are not equal'
    
    idx_samples = list(range(len(samples0)))
    idx_test = random.sample(idx_samples, floor(test_size * len(samples0)))
    idx_train = list(set(idx_samples) - set(idx_test))

    # convert to np array for convenient array indexing shenanigans
    samples0_np = np.stack([np.array(a) for a in samples0])
    samples1_np = np.stack([np.array(a) for a in samples1])
    
    samples_train0 = [(str(a[0]), int(a[1])) for a in list(samples0_np[idx_train])]
    samples_test0 = [(str(a[0]), int(a[1])) for a in list(samples0_np[idx_test])]
    samples_train1 = [(str(a[0]), int(a[1])) for a in list(samples1_np[idx_train])]
    samples_test1 = [(str(a[0]), int(a[1])) for a in list(samples1_np[idx_test])]

    return samples_train0, samples_train1, samples_test0, samples_test1

In [11]:
def create_imagefolder(data, samples, path, transform, new_path=None):
    imgfolder = datasets.ImageFolder(path, transform=transform)
    imgfolder.class_to_idx = data['class_map']
    imgfolder.classes = list(data['class_map'].keys())
    imgfolder.samples = samples

    if new_path is not None:
        imgfolder.root = new_path

    return imgfolder

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}")

using cuda


In [13]:
# with open('cotraining_samples_lists.pkl', 'rb') as fp:
#     dict = pickle.load(fp)
with open('cotraining_samples_lists_fixed.pkl', 'rb') as fp:
    dict = pickle.load(fp)

In [14]:
dict.keys()

dict_keys(['labeled', 'inferred', 'class_map'])

In [15]:
dict['class_map']

{'dry': 0, 'snow': 1, 'wet': 2}

In [16]:
print(dict['labeled'][0])
print(dict['inferred'][0])

('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/snow/NYSDOT_uyomtjhwsay_2022-01-29-06-51-02.jpg', 1)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220129/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-29-06:50:09.jpg', 1)


In [17]:
# split data into labeled/unlabeled
# split the data so we get 70/10/20 train/val/test
samples_train0, samples_train1, samples_unlbl0, samples_unlbl1 = \
    train_test_split_samples(dict['labeled'], dict['inferred'],
                             test_size=0.75, random_state=13)

In [18]:
print(len(samples_unlbl0))
print(len(samples_train0))

3227
1076


In [19]:
# split the data so we get 70/10/20 train/val/test

# split the data so we get 70/10/20 train/val/test
samples_train0, samples_train1, samples_test0, samples_test1 = \
        train_test_split_samples(samples_train0, samples_train1,
                                 test_size=0.2, random_state=13)

samples_train0, samples_train1, samples_val0, samples_val1 = \
        train_test_split_samples(samples_train0, samples_train1,
                                 test_size=0.125, random_state=13)

print(len(samples_train0))
print(len(samples_val0))
print(len(samples_unlbl0))
print(len(samples_test0))

754
107
3227
215


In [20]:
trans = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor()
    ])

In [21]:
    # Create ImageFolder objects for first view
    dummy_path = '/ourdisk/hpc/ai2es/jroth/data/labeled' 
    data_train0 = create_imagefolder(dict, samples_train0, dummy_path, trans)
    data_unlbl0 = create_imagefolder(dict, samples_unlbl0, dummy_path, trans)
    data_val0 = create_imagefolder(dict, samples_val0, dummy_path, trans)
    data_test0 = create_imagefolder(dict, samples_test0, dummy_path, trans)

    # Create ImageFolder objects for second view (we will also update the root/path)
    new_path = '/ourdisk/hpc/ai2es'
    data_train1 = create_imagefolder(dict, samples_train1, dummy_path, trans, new_path)
    data_unlbl1 = create_imagefolder(dict, samples_unlbl1, dummy_path, trans, new_path)
    data_val1 = create_imagefolder(dict, samples_val1, dummy_path, trans, new_path)
    data_test1 = create_imagefolder(dict, samples_test1, dummy_path, trans, new_path)

In [22]:
# sanity check
print(len(data_train0.samples))
print(len(data_unlbl0.samples))
print(len(data_val0.samples))
print(len(data_test0.samples))

754
3227
107
215


In [23]:
loader_kwargs = {'batch_size': 64, 'shuffle': False}

loader_train0 = DataLoader(data_train0, **loader_kwargs)
loader_unlbl0 = DataLoader(data_unlbl0, **loader_kwargs)
loader_val0 = DataLoader(data_val0, **loader_kwargs)
loader_test0 = DataLoader(data_test0, **loader_kwargs)

loader_train1 = DataLoader(data_train1, **loader_kwargs)
loader_unlbl1 = DataLoader(data_unlbl1, **loader_kwargs)
loader_val1 = DataLoader(data_val1, **loader_kwargs)
loader_test1 = DataLoader(data_test1, **loader_kwargs)

In [24]:
# sanity checks
print(len(loader_train0.dataset))
print(len(loader_unlbl0.dataset))
print(len(loader_val0.dataset))
print(len(loader_test0.dataset))

754
3227
107
215


In [25]:
model0, model1 = resnet50().to(device), resnet50().to(device)

In [26]:
# define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer0 = torch.optim.SGD(model0.parameters(), lr=1e-3,momentum=0.9)
optimizer1 = torch.optim.SGD(model1.parameters(), lr=1e-3, momentum=0.9)

# we also need to define some sort of learning rate/early stopping scheduler
scheduler0 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer0)
scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer1)

In [27]:
train(loader_train0, model0, loss_fn, optimizer0, device)
train(loader_train1, model1, loss_fn, optimizer1, device)

loss: 7.918698 [   64 /   754]
loss: 7.163846 [  128 /   754]
loss: 5.569634 [  192 /   754]
loss: 3.425819 [  256 /   754]
loss: 2.244082 [  320 /   754]
loss: 1.429785 [  384 /   754]
loss: 1.638918 [  448 /   754]
loss: 1.192577 [  512 /   754]
loss: 1.884185 [  576 /   754]
loss: 1.425311 [  640 /   754]
loss: 1.524723 [  704 /   754]
loss: 1.206422 [  600 /   754]
loss: 7.396282 [   64 /   754]
loss: 6.639572 [  128 /   754]
loss: 4.940058 [  192 /   754]
loss: 3.292403 [  256 /   754]
loss: 2.244731 [  320 /   754]
loss: 1.600625 [  384 /   754]
loss: 1.776936 [  448 /   754]
loss: 1.153889 [  512 /   754]
loss: 1.636461 [  576 /   754]
loss: 1.696192 [  640 /   754]
loss: 1.559057 [  704 /   754]
loss: 1.121512 [  600 /   754]


In [28]:
k = int(len(loader_unlbl0.dataset) * 0.05)
cotrain(loader_train0, loader_train1, loader_unlbl0, loader_unlbl1, model0, model1, k, device)

Making predictions with model0...
Number of instances: 3227
Making predictions with model1...
Number of instances: 3227
Finding top-k predictions...
Number of unlabeled instances: 3227
Number of predictions (model0): 161
Number of predictions (model1): 161
Found 2 potential conflicting predictions
Found 0 conflicting predictions
Shape of samples_unlbl0: (3227, 2)
last element: ['/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_uyomtjhwsay_2022-02-14-20-31-06.jpg'
 '0']
Number of elements in list_samples0: 161
last element: ('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_uyomtjhwsay_2022-02-02-04-31-12.jpg', 0)
Number of elements in paths0: 161
last element: /ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_uyomtjhwsay_2022-02-02-04-31-12.jpg
Number of elements in lbl_topk0: 161
last element: 0
loader0 number of samples: 915
last element: ('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220126/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road

In [41]:
# once again, many sanity checks
print(len(loader_train0.dataset))
print(len(loader_train1.dataset))

print(loader_train0.dataset.samples[-1])
print(loader_train1.dataset.samples[-1])

print(len(loader_unlbl0.dataset))
print(len(loader_unlbl1.dataset))

print(loader_unlbl0.dataset.samples[1])
print(loader_unlbl1.dataset.samples[1])

915
915
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220126/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-26-21:30:19.jpg', 0)
('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_uyomtjhwsay_2022-02-02-04-31-12.jpg', 0)
3066
3066
('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/wet/NYSDOT_m4er5dez4ab_2022-02-08-02-41-03.jpg', 2)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220208/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-02-08-02:40:29.jpg', 2)


In [None]:
##################################################

In [22]:
for b in range(64, 512 + 1, 32):
    model0 = resnet50().to(device)
    loader_batch0 = DataLoader(data_train0, b, shuffle=False)
    print(f"Batch size: {b}")
    train(loader_batch0, model0, loss_fn, optimizer0, device)

Batch size: 64
loss: 7.275452 [   64 /  3443]
loss: 7.242462 [  128 /  3443]
loss: 7.271658 [  192 /  3443]
Batch size: 96
loss: 6.953737 [   96 /  3443]
loss: 6.971457 [  192 /  3443]
loss: 6.974708 [  288 /  3443]
Batch size: 128
loss: 7.107782 [  128 /  3443]
loss: 7.115186 [  256 /  3443]
loss: 7.094270 [  384 /  3443]
Batch size: 160
loss: 7.628152 [  160 /  3443]
loss: 7.727857 [  320 /  3443]
loss: 7.776735 [  480 /  3443]
Batch size: 192
loss: 6.342412 [  192 /  3443]
loss: 6.226246 [  384 /  3443]
loss: 6.290689 [  576 /  3443]
Batch size: 224
loss: 6.562408 [  224 /  3443]
loss: 6.534523 [  448 /  3443]
loss: 6.540992 [  672 /  3443]
Batch size: 256
loss: 7.188202 [  256 /  3443]
loss: 7.134872 [  512 /  3443]
loss: 7.188458 [  768 /  3443]
Batch size: 288
loss: 7.491960 [  288 /  3443]
loss: 7.488218 [  576 /  3443]
loss: 7.497365 [  864 /  3443]
Batch size: 320
loss: 7.076617 [  320 /  3443]
loss: 7.073470 [  640 /  3443]
loss: 7.090246 [  960 /  3443]
Batch size: 352
loss:

OutOfMemoryError: CUDA out of memory. Tried to allocate 294.00 MiB (GPU 0; 31.74 GiB total capacity; 29.09 GiB already allocated; 120.88 MiB free; 30.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF