# co-training

In [1]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet50

In [2]:
from tqdm import tqdm
from math import floor
import numpy as np
import copy
import random
import pickle
import os

In [3]:
def train(loader, model, loss_fn, optimizer, device):
    size = len(loader.dataset)
    model.train()
    for batch, (X, y) in enumerate(loader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        loss, current = loss, (batch + 1) * len(X)
        print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")
        
        # if batch % 10 == 0:
        #     loss, current = loss, (batch + 1) * len(X)
        #     print(f"loss: {loss:>7f} [{current:5d} / {size:>5d}]")

In [4]:
def test(loader, model, loss_fn, device):
  size = len(loader.dataset)
  num_batches = len(loader)
  model.eval()
  test_loss, correct = 0, 0
  with torch.no_grad():
    for X, y in loader:
      X, y = X.to(device), y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
      
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return test_loss

In [5]:
def predict(loader, model, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            predictions.append(output)
    return torch.cat(predictions) # output shape (# instances, # outputs)

In [6]:
# takes in a Tensor of shape e.g. (# instances, # prob outputs) and returns a tuple
# (Tensor[top probabilities], Tensor[predicted labels], Tensor[instance indexes])
def get_topk_pred(pred, k):
    prob, label = torch.max(pred, 1)
    idx = torch.argsort(prob, descending=True)[:k]
    return prob[idx].cpu(), label[idx].cpu(), idx.cpu() # ...?

In [7]:
def add_to_imagefolder(paths, labels, dataset):
    """
    Adds the paths with the labels to an image classification dataset

    :list paths: a list of absolute image paths to add to the dataset
    :list labels: a list of labels for each path
    :Dataset dataset: the dataset to add the samples to
    """

    new_samples = list(zip(paths, labels))

    dataset.samples += new_samples

    return dataset.samples

In [8]:
def remove_collisions(lbl_model0, lbl_model1, idx_model0, idx_model1):
    # find instances and indices of instances that have
    # been labeled as most confident by both model0, model1
    inter, idx_inter0, idx_inter1 = np.intersect1d(
                                        idx_model0,
                                        idx_model1,
                                        return_indices=True)

    print(f"Number of predictions (model0): {len(idx_model0)}")
    print(f"Number of predictions (model1): {len(idx_model1)}")
    print(f"Found {len(inter)} instances in predict(model0) INTERSECT predict(model1)")

    # bool mask to identify the conflicting predictions (collision)
    mask_coll = lbl_model0[idx_inter0] != lbl_model1[idx_inter1]
    collisions = inter[mask_coll]

    print(f"Found {len(collisions)} conflicting predictions")

    if (len(collisions) > 0):
        print(f"Collisions: {collisions}")
        # find where these collisions are actually at
        # in their respective lists, and remove them...
        idx_coll0 = idx_inter0[mask_coll]
        idx_coll1 = idx_inter1[mask_coll]

        # TODO we probably want to keep some log some of
        # these of things... something like that...
        # it may be better if we get the image paths
        # as these are just indices. hmmm

        # masks to remove the instances with conflicting predictions
        mask0 = np.ones(len(idx_model0), dtype=bool)
        mask0[idx_coll0] = False
        mask1 = np.ones(len(idx_model1), dtype=bool)
        mask1[idx_coll1] = False

        lbl_model0 = lbl_model0[mask0]
        lbl_model1 = lbl_model1[mask1]
        idx_model0 = idx_model0[mask0]
        idx_model1 = idx_model1[mask1]

    return lbl_model0, lbl_model1, idx_model0, idx_model1


In [9]:
# train two models on two different views
# then add top k% of predictions on the unlabeled set
# to the labeled datasets
def cotrain(
        loader0, loader1, loader_unlbl,
        model0, model1, loss_fn, optimizer0, optimizer1,
        k, device):

    # TODO early stopping stuffs (???)
    # we don't want to do just one forward pass!
    # PyTorch has ReduceLROnPlateau, but...
    # anyways move this outside this function, maybe.
    print("training model0 ...")
    train(loader0, model0, loss_fn, optimizer0, device)
    print("------------------------------\ntraining model1 ...")
    train(loader1, model1, loss_fn, optimizer1, device)

    print("------------------------------\nmaking predictions with model0 ...")
    pred_model0 = predict(loader_unlbl, model0, device)
    print("making predictions with model1 ...")
    pred_model1 = predict(loader_unlbl, model1, device)

    # get top-k predictions (labels, instance indexes in the dataset)
    _, lbl_topk0, idx_topk0 = get_topk_pred(
                                    pred_model0,
                                    k if k <= len(pred_model0) else len(pred_model0))
    _, lbl_topk1, idx_topk1 = get_topk_pred(
                                    pred_model1, 
                                    k if k <= len(pred_model1) else len(pred_model1))

    print(f"Number of unlabeled instances: {len(loader_unlbl.dataset)}")

    # what if two models predict confidently on the same instance?
    # find and remove conflicting predictions from the lists
    lbl_topk0, lbl_topk1, idx_topk0, idx_topk1 = \
    remove_collisions(lbl_topk0, lbl_topk1, idx_topk0, idx_topk1)

    # convert from list to array for the convenient numpy indexing
    samples_unlbl = np.stack([np.array(a) for a in loader_unlbl.dataset.samples])
    list_samples0 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl[idx_topk0])]
    list_samples1 = [(str(a[0]), int(a[1])) for a in list(samples_unlbl[idx_topk1])] 

    paths0 = [i for i, _ in list_samples0]
    paths1 = [i for i, _ in list_samples1]

    # add pseudolabeled instances to the labeled datasets
    loader0.dataset.samples = add_to_imagefolder(paths1, list(lbl_topk1), loader0.dataset)
    loader1.dataset.samples = add_to_imagefolder(paths0, list(lbl_topk0), loader1.dataset)

    # remove instances from unlabeled dataset
    mask_unlbl = np.ones(len(loader_unlbl.dataset), dtype=bool)
    mask_unlbl[idx_topk0] = False
    mask_unlbl[idx_topk1] = False
    print(f"Number of unlabeled instances to remove: {(~mask_unlbl).sum()}")
    samples_unlbl = samples_unlbl[mask_unlbl]
    list_unlbl = [(str(a[0]), int(a[1])) for a in list(samples_unlbl)]
    loader_unlbl.dataset.samples = list_unlbl

In [10]:
random.seed(13)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device}")

using cuda


In [12]:
# with open('cotraining_samples_lists.pkl', 'rb') as fp:
#     dict = pickle.load(fp)
with open('cotraining_samples_lists_fixed.pkl', 'rb') as fp:
    dict = pickle.load(fp)

In [13]:
dict.keys()

dict_keys(['labeled', 'inferred', 'class_map'])

In [14]:
dict['class_map']

{'dry': 0, 'snow': 1, 'wet': 2}

In [15]:
print(dict['labeled'][0])
print(dict['inferred'][0])

('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/snow/NYSDOT_uyomtjhwsay_2022-01-29-06-51-02.jpg', 1)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220129/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-29-06:50:09.jpg', 1)


In [16]:
data_labeled0 = dict['labeled']
data_labeled1 = dict['inferred']

In [17]:
# holdout -- 20% for a validation set
p = 0.20

idx_samples = list(range(len(data_labeled0)))
idx_val = random.sample(idx_samples, floor(p * len(data_labeled0)))
idx_train = list(set(idx_samples) - set(idx_val))

lab0_samples = np.stack([np.array(a) for a in data_labeled0])
lab1_samples = np.stack([np.array(a) for a in data_labeled1])


samples_train0 = [(str(a[0]), int(a[1])) for a in list(lab0_samples[idx_train])]
samples_val0 = [(str(a[0]), int(a[1])) for a in list(lab0_samples[idx_val])]
samples_train1 = [(str(a[0]), int(a[1])) for a in list(lab1_samples[idx_train])]
samples_val1 = [(str(a[0]), int(a[1])) for a in list(lab1_samples[idx_val])]

In [18]:
print(len(samples_train0))
print(len(samples_train1))
print(len(samples_val0))
print(len(samples_val1))

3443
3443
860
860


In [19]:
trans = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor()
    ])

In [20]:
# make some dummies and update the classmap
data_train0 = datasets.ImageFolder('/ourdisk/hpc/ai2es/jroth/data/labeled', transform=trans)
data_train0.class_to_idx = dict['class_map']
data_train0.classes = list(dict['class_map'].keys())

# make train, val sets with the actual data from above
data_train1 = copy.deepcopy(data_train0)
data_val0 = copy.deepcopy(data_train0)
data_val1 = copy.deepcopy(data_train0)

data_train0.samples = samples_train0
data_train1.samples = samples_train1
data_val0.samples = samples_val0
data_val1.samples = samples_val1

# update root for train1, val1 as it's different
# don't know if this is needed, but, whatever
data_train1.root = '/ourdisk/hpc/ai2es'
data_val1.root = '/ourdisk/hpc/ai2es'

In [21]:
# sanity check
print(data_train0.samples[0])
print(data_train1.samples[0])
print(data_val0.samples[0])
print(data_val1.samples[0])

('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/snow/NYSDOT_uyomtjhwsay_2022-01-29-06-51-02.jpg', 1)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220129/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-01-29-06:50:09.jpg', 1)
('/ourdisk/hpc/ai2es/jroth/data/labeled/bronx_allsites/dry/NYSDOT_m4er5dez4ab_2022-02-25-22-31-02.jpg', 0)
('/ourdisk/hpc/ai2es/datasets/DOT/Skyline_6464/20220225/I_87_at_Interchange_3_(Yonkers_Mile_Square_Road)__Northbound__Skyline_6464_2022-02-25-22:30:25.jpg', 0)


In [22]:
model0, model1 = resnet50().to(device), resnet50().to(device)

In [23]:
# define loss function and optimizer (trusty ol' SGD)
loss_fn = nn.CrossEntropyLoss()
optimizer0 = torch.optim.SGD(model0.parameters(), lr=1e-3,momentum=0.9)
optimizer1 = torch.optim.SGD(model1.parameters(), lr=1e-3, momentum=0.9)

# we also need to define some sort of learning rate/early stopping scheduler
scheduler0 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer0)
scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer1)

In [24]:
# we probably want to test for bigger batches until it can't fit into mem
# as well. ahh
batch_size = 64

loader_train0 = DataLoader(data_train0, batch_size, shuffle=False)
loader_val0 = DataLoader(data_val0, batch_size, shuffle=False)
loader_train1 = DataLoader(data_train1, batch_size, shuffle=False)
loader_val1 = DataLoader(data_val1, batch_size, shuffle=False)

In [25]:
print(loader_train0.dataset)
print(loader_val0.dataset)
print(loader_train1.dataset)
print(loader_val1.dataset)

Dataset ImageFolder
    Number of datapoints: 3443
    Root location: /ourdisk/hpc/ai2es/jroth/data/labeled
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 860
    Root location: /ourdisk/hpc/ai2es/jroth/data/labeled
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 3443
    Root location: /ourdisk/hpc/ai2es
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=warn)
               CenterCrop(size=(224, 224))
               ToTensor()
           )
Dataset ImageFolder
    Number of datapoints: 860
    Root loca

In [26]:
train(loader_train0, model0, loss_fn, optimizer0, device)

loss: 6.988656 [   64 /  3443]
loss: 6.233513 [  128 /  3443]
loss: 5.145735 [  192 /  3443]
loss: 3.044444 [  256 /  3443]
loss: 1.923487 [  320 /  3443]
loss: 1.382159 [  384 /  3443]
loss: 1.552302 [  448 /  3443]
loss: 1.758082 [  512 /  3443]
loss: 1.628604 [  576 /  3443]
loss: 1.205660 [  640 /  3443]
loss: 1.675334 [  704 /  3443]
loss: 2.002286 [  768 /  3443]
loss: 1.344727 [  832 /  3443]
loss: 1.128285 [  896 /  3443]
loss: 0.981537 [  960 /  3443]
loss: 0.846240 [ 1024 /  3443]
loss: 0.942550 [ 1088 /  3443]
loss: 0.878120 [ 1152 /  3443]
loss: 0.966138 [ 1216 /  3443]
loss: 1.078527 [ 1280 /  3443]
loss: 0.973281 [ 1344 /  3443]
loss: 0.949259 [ 1408 /  3443]
loss: 0.859360 [ 1472 /  3443]
loss: 0.751226 [ 1536 /  3443]
loss: 0.741047 [ 1600 /  3443]
loss: 0.689999 [ 1664 /  3443]
loss: 0.976324 [ 1728 /  3443]
loss: 1.019539 [ 1792 /  3443]
loss: 0.849389 [ 1856 /  3443]
loss: 1.203480 [ 1920 /  3443]
loss: 0.880234 [ 1984 /  3443]
loss: 0.703524 [ 2048 /  3443]
loss: 0.

In [27]:
test(loader_val0, model0, loss_fn, device)

Test Error: 
 Accuracy: 62.7%, Avg loss: 0.924459


NameError: name 'tess_loss' is not defined