In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#from comet_ml import Experiment
from torch.utils.data.sampler import SubsetRandomSampler
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Sampler
from torchvision import datasets, transforms, models
from torch import optim
from torch.optim import Adam
from torch.utils.data.sampler import SubsetRandomSampler

import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d, Axes3D
import numpy as np
import PIL
from PIL import Image
from pathlib import Path
import time

import numba
import umap 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


ModuleNotFoundError: No module named 'umap'

In [None]:
def load_split_train_test(num_workers, datadir, valid_size = .2, batch_size=None, seed=None):
    """Build train/test DataLoaders over one ImageFolder using a random index split.

    Both loaders read the same directory; they differ only in which shuffled
    indices their SubsetRandomSampler draws from.

    Parameters
    ----------
    num_workers : int
        Worker count passed to both DataLoaders.
    datadir : str
        Root directory handed to ``datasets.ImageFolder``.
    valid_size : float
        Fraction of the samples assigned to the test loader.
    batch_size : int, optional
        Batch size for both loaders. When omitted, falls back to the
        notebook-level ``hyper_params['batch_size']`` (the original behaviour).
    seed : int, optional
        When given, the index shuffle uses a private ``RandomState(seed)`` so
        the split is reproducible. When omitted, the global NumPy RNG is used
        exactly as before.

    Returns
    -------
    (trainloader, testloader)
    """
    if batch_size is None:
        # Backward-compatible fallback to the notebook-level configuration dict.
        batch_size = hyper_params['batch_size']

    def _make_transform():
        # Train and test use the same preprocessing pipeline.
        # NOTE(review): Resize(224) with an int only fixes the shorter edge, so
        # non-square images would produce differently sized tensors -- confirm
        # that the source images are square.
        return transforms.Compose([transforms.Resize(224),
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    train_data = datasets.ImageFolder(datadir, transform=_make_transform())
    test_data = datasets.ImageFolder(datadir, transform=_make_transform())

    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(seed).shuffle(indices)

    # The first `split` shuffled indices become the test set, the rest train.
    train_idx, test_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)
    print('number of workers to load data', num_workers)
    print('batch size: ', batch_size)
    trainloader = torch.utils.data.DataLoader(train_data,
                   sampler=train_sampler, batch_size=batch_size, num_workers=num_workers)
    testloader = torch.utils.data.DataLoader(test_data,
                   sampler=test_sampler, batch_size=batch_size, num_workers=num_workers)
    return trainloader, testloader

In [None]:
def convert_to_cpu(trainloader, testloader):
    """Take the first batch of each loader and return it as flat NumPy arrays.

    Only ``next(iter(loader))`` is consumed, so any samples beyond the first
    batch of a loader are not included in the result.

    Parameters
    ----------
    trainloader, testloader : iterable
        Each must yield ``(images, labels)`` pairs of torch tensors, with
        ``images`` being 4-dimensional (samples, nz, nx, ny).

    Returns
    -------
    (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np)
        Image arrays are reshaped to ``(samples, nz*nx*ny)``; label arrays are
        returned as converted.
    """
    train_imgs, train_label = next(iter(trainloader))
    test_imgs, test_label = next(iter(testloader))

    # Move to host memory and convert to NumPy.
    train_imgs_np = train_imgs.to('cpu').numpy()
    test_imgs_np = test_imgs.to('cpu').numpy()

    train_labs_np = train_label.to('cpu').numpy()
    test_labs_np = test_label.to('cpu').numpy()
    print(np.shape(train_imgs_np))

    # Unpacking four values also asserts that the batches are 4-dimensional.
    nsamples1, nz, nx, ny = np.shape(train_imgs_np)
    nsamples2, nz, nx, ny = np.shape(test_imgs_np)
    print(nz, nx, ny)

    # Flatten every image into a single feature row.
    train_imgs_np = train_imgs_np.reshape((nsamples1, -1))
    test_imgs_np = test_imgs_np.reshape((nsamples2, -1))

    # The previous version also built an unused float64 vstack of both splits
    # here; it was never returned and only cost memory, so it is gone.
    return (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np)

In [None]:
def write_data(train_imgs_np, train_labs_np, test_imgs_np, test_labs_np):
    """Pickle the four arrays to separate files in the current working directory.

    Files written: ``train_imgs``, ``train_labs``, ``test_imgs``, ``test_labs``
    (no extension), each with pickle protocol 4.
    """
    # One file per array: the original author noted that only about 4 GB could
    # be written at a time, hence four separate dumps.
    outputs = (
        ('train_imgs', train_imgs_np),
        ('train_labs', train_labs_np),
        ('test_imgs', test_imgs_np),
        ('test_labs', test_labs_np),
    )
    for filename, array in outputs:
        with open(filename, 'wb') as handle:
            pickle.dump(array, handle, protocol=4)

In [None]:
def load_data():
    """Load the four arrays that ``write_data`` pickled into the working directory.

    The files are raw pickles (written with ``pickle.dump``), not ``.npy``
    archives. ``np.load`` refuses such files under its default
    ``allow_pickle=False``, so they are read with ``pickle.load`` instead.

    Returns
    -------
    (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np) as NumPy arrays.
    """
    def _read_pickled_array(path):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open(path, 'rb') as handle:
            return np.array(pickle.load(handle))

    train_imgs_np = _read_pickled_array('train_imgs')
    train_labs_np = _read_pickled_array('train_labs')
    test_imgs_np = _read_pickled_array('test_imgs')
    test_labs_np = _read_pickled_array('test_labs')

    print('done loading arrays')

    return (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np)


# MAIN

In [None]:
def load_data():
    """Load the four pickled arrays stored under ``cpi_data/``.

    This definition replaces any earlier ``load_data`` in the kernel; it
    differs only in reading from the ``cpi_data/`` directory.

    The files are raw pickles, not ``.npy`` archives. ``np.load`` refuses such
    files under its default ``allow_pickle=False``, so they are read with
    ``pickle.load`` instead.

    Returns
    -------
    (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np) as NumPy arrays.
    """
    def _read_pickled_array(path):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open(path, 'rb') as handle:
            return np.array(pickle.load(handle))

    train_imgs_np = _read_pickled_array('cpi_data/train_imgs')
    train_labs_np = _read_pickled_array('cpi_data/train_labs')
    test_imgs_np = _read_pickled_array('cpi_data/test_imgs')
    test_labs_np = _read_pickled_array('cpi_data/test_labs')

    print('done loading arrays')

    return (train_imgs_np, train_labs_np, test_imgs_np, test_labs_np)

# Load the cached arrays into the notebook namespace. This needs the four
# files under cpi_data/ to exist already.
train_imgs_np, train_labs_np, test_imgs_np, test_labs_np = load_data()


In [None]:
# Run configuration and one-off conversion of the image folder into flat
# NumPy arrays that are then pickled to disk.
num_workers = 10
data_dir = 'cpi_data/'
valid_size = 0.2
hyper_params = {
    "num_classes": 3,
    "input_size": 2048,
    "hidden_size": 512,   
    # convert_to_cpu only reads the first batch of each loader, so this is
    # presumably chosen to exceed the dataset size -- TODO confirm.
    "batch_size": 11100,
    "num_epochs": 5,
    "learning_rate": 0.0002   #The lower the value, the slower we travel along the downward slope
}
# ImageFolder numbers the class sub-directories in sorted (alphabetical)
# order, so the label-index -> name list must be alphabetical as well.
# The previous order ('aggs','junk','columns') swapped the names shown for
# labels 1 and 2.
classes=['aggs','columns','junk']

trainloader, testloader = load_split_train_test(num_workers=num_workers, datadir=data_dir, valid_size=valid_size)
train_imgs_np, train_labs_np, test_imgs_np, test_labs_np = convert_to_cpu(trainloader, testloader)
#print('Done converting to np on CPU')
write_data(train_imgs_np, train_labs_np, test_imgs_np, test_labs_np)
#print('Done writing')
#train_imgs_np, train_labs_np, test_imgs_np, test_labs_np = load_data()

#%time embedding = umap.UMAP(n_neighbors=50).fit_transform(train_imgs_np)
#print('Done with UMAP')



In [None]:
# Rebuild displayable images from the flattened feature rows.
nz, nx, ny = 3, 224, 224
# Each row was flattened from a channel-first (nz, nx, ny) tensor in
# convert_to_cpu. Restore that layout first, then move the channel axis last
# for matplotlib; reshaping straight to (nx, ny, nz) would scramble the pixels.
# -1 lets NumPy infer the sample count instead of hard-coding 8887.
train_imgs = train_imgs_np.reshape((-1, nz, nx, ny)).transpose(0, 2, 3, 1)

In [None]:
# Show up to the first 25 training images in a 5x5 grid, labelled by class.
print(np.shape(train_imgs_np))
class_names = ['aggs', 'columns','junk']
plt.figure(figsize=(10,10))
for i in range(min(25, len(train_imgs))):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # The loader applied Normalize(mean=0.5, std=0.5), so pixels lie in
    # [-1, 1]. Map them back to [0, 1], otherwise imshow clips the negative
    # half. No cmap is passed because it has no effect on RGB images.
    plt.imshow(np.clip(train_imgs[i] * 0.5 + 0.5, 0.0, 1.0))
    plt.xlabel(class_names[train_labs_np[i]])
plt.show()

In [None]:
# Fit a UMAP model (n_neighbors=50) on the flattened training images, passing
# the training labels as the second argument to fit(); %time reports the cost.
%time trans = umap.UMAP(n_neighbors=50).fit(train_imgs_np, train_labs_np)

In [None]:
# Reload a fitted model from the pickle file 'trans', replacing any `trans`
# already in the kernel. Nothing in this notebook writes that file, so it
# presumably comes from an earlier session -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('trans', 'rb') as file:
    # The context manager closes the handle, which the bare open() never did.
    trans = pickle.load(file)

In [None]:
# Project both splits with the fitted model `trans`, timing each call and
# printing the model type and the type of the transformed test output.
print(type(trans))
%time trans_train=trans.transform(train_imgs_np) 
%time trans_test=trans.transform(test_imgs_np) 
print(type(trans_test))

In [None]:
#filename = 'trans_test_50'
#filehandler = open(filename, 'wb')
#%time pickle.dump(trans_test, filehandler, protocol=4)
#filehandler.close()

#with open('trans_test_50.msgpack', 'wb') as outfile:
#    embedding = msgpack.pack(trans_test, outfile)
#with open('trans.msgpack', 'rb') as data_file:
#    embedding = msgpack.unpack(data_file)


In [None]:
# Sanity check: shapes of the flattened train and test image arrays.
print(np.shape(train_imgs_np), np.shape(test_imgs_np))

In [None]:
#%time trans = umap.UMAP(n_neighbors=5, random_state=42).fit(train_imgs_np, )
# Scatter the first two embedding coordinates of the training set, coloured
# by class label.
plt.scatter(
    x=trans.embedding_[:, 0],
    y=trans.embedding_[:, 1],
    c=train_labs_np,
    s=5,
    cmap='Spectral',
)
plt.title('Embedding of the training set by UMAP', fontsize=24);

In [None]:
#%time embedding = umap.UMAP(n_neighbors=15).fit_transform(train_imgs_np)
# Train an SVC and a k-nearest-neighbours classifier, both with default
# settings, on the embedded training data (trans.embedding_) rather than on
# raw pixels.
svc = SVC().fit(trans.embedding_, train_labs_np)
knn = KNeighborsClassifier().fit(trans.embedding_, train_labs_np)

In [None]:
# Project the test set once and score both classifiers on that same
# projection. Calling transform() separately for each score doubled the cost,
# and could hand each classifier a different embedding if transform() is not
# deterministic for this model.
test_embedding = trans.transform(test_imgs_np)
svc.score(test_embedding, test_labs_np), knn.score(test_embedding, test_labs_np)

In [None]:
# Use the fitted model's transform() method to project the test set into the
# learned embedding space; the result is kept in `test_embedding` for plotting.
%time test_embedding = trans.transform(test_imgs_np)

In [None]:
# Scatter the first two embedding coordinates of the test set, coloured by
# class label.
plt.scatter(
    x=test_embedding[:, 0],
    y=test_embedding[:, 1],
    c=test_labs_np,
    s=5,
    cmap='Spectral',
)
plt.title('Embedding of the test set by UMAP', fontsize=24);

In [None]:
# Combined train + test scatter, coloured by class.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
# `embedding` was never assigned anywhere. Build it from the fitted training
# embedding followed by the transformed test set, in the same order as `target`.
embedding = np.vstack([trans.embedding_, test_embedding])
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=50, c=target, alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Shift the boundaries by 0.5 so ticks 0, 1, 2 sit in the middle of their bins.
cbar = plt.colorbar(boundaries=np.arange(4) - 0.5)
cbar.set_ticks(np.arange(3))
cbar.set_ticklabels(classes)
plt.title('Habit Classification via UMAP');

In [None]:
# Combined train + test scatter, coloured by class.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
# `embedding` was never assigned anywhere. Build it from the fitted training
# embedding followed by the transformed test set, in the same order as `target`.
embedding = np.vstack([trans.embedding_, test_embedding])
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=50, c=target, alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Shift the boundaries by 0.5 so ticks 0, 1, 2 sit in the middle of their bins.
cbar = plt.colorbar(boundaries=np.arange(4) - 0.5)
cbar.set_ticks(np.arange(3))
cbar.set_ticklabels(classes)
plt.title('Habit Classification via UMAP');

In [None]:
# Combined train + test scatter, coloured by class.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
# `embedding` was never assigned anywhere. Build it from the fitted training
# embedding followed by the transformed test set, in the same order as `target`.
embedding = np.vstack([trans.embedding_, test_embedding])
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=50, c=target, alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Shift the boundaries by 0.5 so ticks 0, 1, 2 sit in the middle of their bins.
cbar = plt.colorbar(boundaries=np.arange(4) - 0.5)
cbar.set_ticks(np.arange(3))
cbar.set_ticklabels(classes)
plt.title('Habit Classification via UMAP');

In [None]:
# Combined train + test scatter, coloured by class.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
# `embedding` was never assigned anywhere. Build it from the fitted training
# embedding followed by the transformed test set, in the same order as `target`.
embedding = np.vstack([trans.embedding_, test_embedding])
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=50, c=target, alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Shift the boundaries by 0.5 so ticks 0, 1, 2 sit in the middle of their bins.
cbar = plt.colorbar(boundaries=np.arange(4) - 0.5)
cbar.set_ticks(np.arange(3))
cbar.set_ticklabels(classes)
plt.title('Habit Classification via UMAP');

In [None]:
# Combined train + test scatter, coloured by class.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
# `embedding` was never assigned anywhere. Build it from the fitted training
# embedding followed by the transformed test set, in the same order as `target`.
embedding = np.vstack([trans.embedding_, test_embedding])
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=50, c=target, alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
# Shift the boundaries by 0.5 so ticks 0, 1, 2 sit in the middle of their bins.
cbar = plt.colorbar(boundaries=np.arange(4) - 0.5)
cbar.set_ticks(np.arange(3))
cbar.set_ticklabels(classes)
plt.title('Habit Classification via UMAP');

In [None]:
def draw_umap(n_neighbors=150, min_dist=0.1, n_components=3, metric='euclidean', title='',
              data=None, target=None):
    """Fit UMAP with the given settings and scatter-plot the resulting embedding.

    Parameters
    ----------
    n_neighbors, min_dist, n_components, metric
        Forwarded unchanged to ``umap.UMAP``.
    title : str
        Figure title.
    data : array-like, optional
        Feature rows to embed. Defaults to the notebook's train and test image
        arrays stacked in that order. The function used to read a global
        ``data`` that no cell ever defined.
    target : array-like, optional
        One colour value per row of ``data``. Defaults to the notebook's train
        and test labels stacked in the same order.

    Only ``n_components`` of 1, 2 or 3 draws points; any other value leaves
    an empty titled figure. Returns nothing.
    """
    if data is None:
        data = np.vstack([train_imgs_np, test_imgs_np])
    if target is None:
        target = np.hstack([train_labs_np, test_labs_np])

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
    )
    u = reducer.fit_transform(data)
    fig = plt.figure()
    if n_components == 1:
        # 1-D embedding: plot the coordinate against the row index.
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], range(len(u)), c=target)
    elif n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], u[:,1], c=target)
    elif n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(u[:,0], u[:,1], u[:,2], c=target, s=100)
    plt.title(title, fontsize=18)

In [None]:
# Draw one embedding per n_neighbors value to compare neighbourhood sizes.
# The label arrays are named *_labs_np in this notebook; the *_labels_np
# names used before were never defined.
target=np.hstack([train_labs_np, test_labs_np])
for n in (5, 25, 50, 100, 200):
    draw_umap(n_neighbors=n, title='n_neighbors = {}'.format(n))

In [None]:
#embedding = umap.UMAP(n_neighbors=15).fit_transform(data)
%time svc = SVC().fit(train_imgs_np, train_labels_np)
%time knn = KNeighborsClassifier().fit(train_imgs_np, train_labels_np)

In [None]:
%time svc.score(test_imgs_np, test_labels_np), knn.score(test_imgs_np, test_labels_np)