# co-training

In [1]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet50

In [2]:
from math import floor
import numpy as np
import copy
import random
import pickle
import os

In [3]:
def train(loader, model, loss_fn, optimizer, device):
    size = len(loader.dataset)
    model.train()
    i = 0
    for batch, (X, y) in enumerate(loader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        loss, current = loss, (batch + 1) * len(X)
        print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")
        i += 1
        
        # if batch % 10 == 0:
        #     loss, current = loss, (batch + 1) * len(X)
        #     print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")

In [4]:
def test(loader, model, loss_fn, device):
  size = len(loader.dataset)
  num_batches = len(loader)
  model.eval()
  test_loss, correct = 0, 0
  with torch.no_grad():
    for X, y in loader:
      X, y = X.to(device), y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
      
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return test_loss

In [5]:
def predict(loader, model, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            predictions.append(output)
    return torch.cat(predictions) # output shape (# instances, # outputs)

In [6]:
# takes in a Tensor of shape e.g. (# instances, # prob outputs) and returns a tuple
# (Tensor[top probabilities], Tensor[predicted labels], Tensor[instance indexes])
def get_topk_pred(pred, k):
    prob, label = torch.max(pred, 1)
    idx = torch.argsort(prob, descending=True)[:k]
    return prob[idx].cpu(), label[idx].cpu(), idx.cpu()

In [7]:
def add_to_imagefolder(paths, labels, dataset):
    """
    Adds the paths with the labels to an image classification dataset

    :list paths: a list of absolute image paths to add to the dataset
    :list labels: a list of labels for each path
    :Dataset dataset: the dataset to add the samples to
    """

    new_samples = list(zip(paths, labels))

    dataset.samples += new_samples

    return dataset.samples

In [8]:
def remove_collisions(lbl_model0, lbl_model1, idx_model0, idx_model1):
    # find instances and indices of instances that have
    # been labeled as most confident by both model0, model1
    inter, idx_inter0, idx_inter1 = np.intersect1d(
                                        idx_model0,
                                        idx_model1,
                                        return_indices=True)

    print(f"Number of predictions (model0): {len(idx_model0)}")
    print(f"Number of predictions (model1): {len(idx_model1)}")
    print(f"Found {len(inter)} potential conflicting predictions")

    # bool mask to identify the conflicting predictions (collision)
    mask_coll = lbl_model0[idx_inter0] != lbl_model1[idx_inter1]
    collisions = inter[mask_coll]

    print(f"Found {len(collisions)} conflicting predictions")

    if (len(collisions) > 0):
        print(f"Collisions: {collisions}")
        # find where these collisions are actually at
        # in their respective lists, and remove them...
        # maybe want to return this as well? ...
        idx_coll0 = idx_inter0[mask_coll]
        idx_coll1 = idx_inter1[mask_coll]

        # masks to remove the instances with conflicting predictions
        mask0 = np.ones(len(idx_model0), dtype=bool)
        mask0[idx_coll0] = False
        mask1 = np.ones(len(idx_model1), dtype=bool)
        mask1[idx_coll1] = False

        lbl_model0 = lbl_model0[mask0]
        lbl_model1 = lbl_model1[mask1]
        idx_model0 = idx_model0[mask0]
        idx_model1 = idx_model1[mask1]

    return lbl_model0, lbl_model1, idx_model0, idx_model1

In [9]:
# train two models on two different views
# then add top k% of predictions on the unlabeled set
# to the labeled datasets
def cotrain(loader0, loader1, loader_unlbl,
            model0, model1, loss_fn, optimizer0, optimizer1,
            k, device):

    # get top-k predictions (labels, instance indexes in the dataset)
    _, lbl_topk0, idx_topk0 = get_topk_pred(
                                    pred_model0,
                                    k if k <= len(pred_model0) else len(pred_model0))
    _, lbl_topk1, idx_topk1 = get_topk_pred(
                                    pred_model1, 
                                    k if k <= len(pred_model1) else len(pred_model1))

    print(f"Number of unlabeled instances: {len(loader_unlbl.dataset)}")

    # what if two models predict confidently on the same instance?
    # find and remove conflicting predictions from the lists
    # may want to return the indices of the collisions too...?
    lbl_topk0, lbl_topk1, idx_topk0, idx_topk1 = \
    remove_collisions(lbl_topk0, lbl_topk1, idx_topk0, idx_topk1)

    # convert from list to array for the convenient numpy indexing
    samples_unlbl = np.stack([np.array(a) for a in loader_unlbl.dataset.samples])
    list_samples0 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl[idx_topk0])]
    list_samples1 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl[idx_topk1])] 

    paths0 = [i for i, _ in list_samples0]
    paths1 = [i for i, _ in list_samples1]

    # add pseudolabeled instances to the labeled datasets
    loader0.dataset.samples = add_to_imagefolder(paths1, list(lbl_topk1), loader0.dataset)
    loader1.dataset.samples = add_to_imagefolder(paths0, list(lbl_topk0), loader1.dataset)

    # remove instances from unlabeled dataset
    mask_unlbl = np.ones(len(loader_unlbl.dataset), dtype=bool)
    mask_unlbl[idx_topk0] = False
    mask_unlbl[idx_topk1] = False
    print(f"Number of unlabeled instances to remove: {(~mask_unlbl).sum()}")
    samples_unlbl = samples_unlbl[mask_unlbl]
    list_unlbl = [(str(a[0]), int(a[1])) for a in list(samples_unlbl)]
    loader_unlbl.dataset.samples = list_unlbl

In [18]:
# function to split the datasets of the two views so that
# the samples in the views are still aligned, time-wise, by index
def train_test_split_samples(samples0, samples1, test_size, random_state=None):
    if random_state is not None:
        random.seed(random_state)

    assert test_size > 0 and test_size < 1, \
        'test_size should be a float between (0, 1)'

    assert len(samples0) == len(samples1), \
        'number of samples in samples0, samples1 are not equal'
    
    idx_samples = list(range(len(samples0)))
    idx_test = random.sample(idx_samples, floor(test_size * len(samples0)))
    idx_train = list(set(idx_samples) - set(idx_test))

    # convert to np array for convenient array indexing shenanigans
    samples0_np = np.stack([np.array(a) for a in samples0])
    samples1_np = np.stack([np.array(a) for a in samples1])
    
    samples_train0 = [(str(a[0]), int(a[1])) for a in list(samples0_np[idx_train])]
    samples_test0 = [(str(a[0]), int(a[1])) for a in list(samples0_np[idx_test])]
    samples_train1 = [(str(a[0]), int(a[1])) for a in list(samples1_np[idx_train])]
    samples_test1 = [(str(a[0]), int(a[1])) for a in list(samples1_np[idx_test])]

    return samples_train0, samples_train1, samples_test0, samples_test1

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}")

using cuda


In [12]:
# with open('cotraining_samples_lists.pkl', 'rb') as fp:
#     dict = pickle.load(fp)
with open('cotraining_samples_lists_fixed.pkl', 'rb') as fp:
    dict = pickle.load(fp)

In [13]:
dict.keys()

dict_keys(['labeled', 'inferred', 'class_map'])

In [14]:
dict['class_map']

{'dry': 0, 'snow': 1, 'wet': 2}

In [15]:
print(dict['labeled'][0])
print(dict['inferred'][0])

('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/snow/NYSDOT_uyomtjhwsay_2022-01-29-06-51-02.jpg', 1)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220129/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-29-06:50:09.jpg', 1)


In [19]:
# split data into labeled/unlabeled
samples_unlbl0, samples_unlbl1, samples_train0, samples_train1 = \
    train_test_split_samples(dict['labeled'], dict['inferred'],
                  test_size=0.75, random_state=13)

In [20]:
print(len(samples_unlbl0))
print(len(samples_train0))

1076
3227


In [22]:
# split the data so we get 70/10/20 train/val/test

# split labeled data into 80/20 train/test
samples_train0, samples_train1, samples_test0, samples_test1 = \
    train_test_split_samples(samples_train0, samples_train1,
                  test_size=0.2, random_state=13)

print(len(samples_train0))
print(len(samples_test0))

# split it again so we get 70/10 train/val (surely this is right...)
samples_train0, samples_train1, samples_val0, samples_val1 = \
    train_test_split_samples(samples_train0, samples_train1,
                  test_size=.125, random_state=13)

print(len(samples_train0))
print(len(samples_val0))

2582
645
2260
322


In [None]:
##################################################

In [16]:
trans = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor()
    ])

In [17]:
# make some dummies and update the classmap
data_train0 = datasets.ImageFolder('/ourdisk/hpc/ai2es/jroth/data/labeled', transform=trans)
data_train0.class_to_idx = dict['class_map']
data_train0.classes = list(dict['class_map'].keys())

# make train, val sets with the actual data from above
data_train1 = copy.deepcopy(data_train0)
data_val0 = copy.deepcopy(data_train0)
data_val1 = copy.deepcopy(data_train0)

data_train0.samples = samples_train0
data_train1.samples = samples_train1
data_val0.samples = samples_val0
data_val1.samples = samples_val1

# update root for train1, val1 as it's different
# don't know if this is needed, but, whatever
data_train1.root = '/ourdisk/hpc/ai2es'
data_val1.root = '/ourdisk/hpc/ai2es'

In [21]:
# sanity check
print(data_train0.samples[0])
print(data_train1.samples[0])
print(data_val0.samples[0])
print(data_val1.samples[0])

('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/snow/NYSDOT_uyomtjhwsay_2022-01-29-06-51-02.jpg', 1)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220129/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-29-06:50:09.jpg', 1)
('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_m4er5dez4ab_2022-02-25-22-31-02.jpg', 0)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220225/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-02-25-22:30:25.jpg', 0)


In [19]:
model0, model1 = resnet50().to(device), resnet50().to(device)

In [20]:
# define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer0 = torch.optim.SGD(model0.parameters(), lr=1e-3,momentum=0.9)
optimizer1 = torch.optim.SGD(model1.parameters(), lr=1e-3, momentum=0.9)

# we also need to define some sort of learning rate/early stopping scheduler
scheduler0 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer0)
scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer1)

In [23]:
# we probably want to test for bigger batches until it can't fit into mem
# .........?
batch_size = 64

loader_train0 = DataLoader(data_train0, batch_size, shuffle=False)
loader_val0 = DataLoader(data_val0, batch_size, shuffle=False)
loader_train1 = DataLoader(data_train1, batch_size, shuffle=False)
loader_val1 = DataLoader(data_val1, batch_size, shuffle=False)

In [24]:
print(loader_train0.dataset)
print(loader_val0.dataset)
print(loader_train1.dataset)
print(loader_val1.dataset)

Dataset ImageFolder
    Number of datapoints: 3443
    Root location: /ourdisk/hpc/ai2es/jroth/data/labeled
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 860
    Root location: /ourdisk/hpc/ai2es/jroth/data/labeled
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 3443
    Root location: /ourdisk/hpc/ai2es
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 860
    Root loca

In [22]:
for b in range(64, 512 + 1, 32):
    model0 = resnet50().to(device)
    loader_batch0 = DataLoader(data_train0, b, shuffle=False)
    print(f"Batch size: {b}")
    train(loader_batch0, model0, loss_fn, optimizer0, device)

Batch size: 64
loss: 7.275452 [   64 /  3443]
loss: 7.242462 [  128 /  3443]
loss: 7.271658 [  192 /  3443]
Batch size: 96
loss: 6.953737 [   96 /  3443]
loss: 6.971457 [  192 /  3443]
loss: 6.974708 [  288 /  3443]
Batch size: 128
loss: 7.107782 [  128 /  3443]
loss: 7.115186 [  256 /  3443]
loss: 7.094270 [  384 /  3443]
Batch size: 160
loss: 7.628152 [  160 /  3443]
loss: 7.727857 [  320 /  3443]
loss: 7.776735 [  480 /  3443]
Batch size: 192
loss: 6.342412 [  192 /  3443]
loss: 6.226246 [  384 /  3443]
loss: 6.290689 [  576 /  3443]
Batch size: 224
loss: 6.562408 [  224 /  3443]
loss: 6.534523 [  448 /  3443]
loss: 6.540992 [  672 /  3443]
Batch size: 256
loss: 7.188202 [  256 /  3443]
loss: 7.134872 [  512 /  3443]
loss: 7.188458 [  768 /  3443]
Batch size: 288
loss: 7.491960 [  288 /  3443]
loss: 7.488218 [  576 /  3443]
loss: 7.497365 [  864 /  3443]
Batch size: 320
loss: 7.076617 [  320 /  3443]
loss: 7.073470 [  640 /  3443]
loss: 7.090246 [  960 /  3443]
Batch size: 352
loss:

OutOfMemoryError: CUDA out of memory. Tried to allocate 294.00 MiB (GPU 0; 31.74 GiB total capacity; 29.09 GiB already allocated; 120.88 MiB free; 30.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF