# Read data: vectors and labels

In [None]:
# Folder that holds the dataset files; the reader functions below use the
# same path as their default ``data_folder`` argument.
data_folder = '../data'

In [None]:
import numpy as np
import pandas as pd

# MNIST, USPS and Pendigits are easy

We can use the sklearn API to fetch data for the Pendigits, MNIST and USPS datasets.

Of these datasets pendigits is the smallest, with only 1797 samples, and is only 64 dimensional. This makes a good first dataset to test things out on -- the dataset is small enough that practically anything should be able to run on this efficiently.

USPS provides a slightly more challenging dataset, with almost 10,000 samples and 256 dimensions, but is still small enough to be tractable for even naive clustering implementations.

MNIST provides a good basic scaling test with 70,000 samples in 784 dimensions. In practice this is not a very large dataset compared to many that people want to cluster, although the dimensionality may provide some challenges.

# Buildings and COIL are harder

The buildings and COIL-20 datasets provide some slightly more challenging image-based problems, with more complex images to be dealt with. Both are still small in number of samples, so should be easily tractable. COIL *should* be relatively easy to cluster since the different classes should provide fairly tight and distinct clusters (each class being 72 images of the same object taken from different angles). The buildings dataset, which has colour images from many angles and under different lighting conditions, should be a much more challenging problem to cluster if using simple Euclidean distance on the flattened vectors.

In [None]:
# Known dataset names. A dataset's id is its position in this list (see
# map_id_name), and read() dispatches on those same ids, so the order matters.
data_set_list = ['pendigits', 'coil', 'mnist', 'usps', 'buildings']

In [None]:
def read_pendigits(data_folder = '../data'):
    """Load the digits dataset bundled with scikit-learn.

    Parameters
    ----------
    data_folder : str
        Accepted so that every reader shares the same signature; it is not
        used here because the data comes from ``sklearn.datasets.load_digits``.

    Returns
    -------
    tuple
        ``(raw_data, labels)``: the ``data`` attribute cast to float32 and
        the ``target`` attribute, both as numpy arrays.
    """
    from sklearn.datasets import load_digits

    bunch = load_digits()
    vectors = bunch.data.astype(np.float32)
    return np.asarray(vectors), np.asarray(bunch.target)

In [None]:
def read_coil(data_folder = '../data'):
    """Read the COIL-20 images stored in ``{data_folder}/coil20.zip``.

    The whole archive is extracted into ``data_folder``; every member whose
    name ends in ``.png`` is then read from disk and flattened into one row.

    Parameters
    ----------
    data_folder : str
        Folder containing ``coil20.zip``; also the extraction target.

    Returns
    -------
    tuple
        ``(raw_coil, coil_20_target)``: a float32 array with one row per
        image, and an int32 array holding the number that follows ``obj``
        in each image's file name.
    """
    import re
    import zipfile
    import imageio.v2 as imageio

    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    png_pattern = re.compile(r".*\.png$")

    # The context manager guarantees the archive handle is closed, even if
    # extraction fails part-way through.
    with zipfile.ZipFile(f'{data_folder}/coil20.zip') as images_zip:
        filelist = list(filter(png_pattern.match, images_zip.namelist()))
        images_zip.extractall(data_folder + '/.')

    coil_feature_vectors = [
        imageio.imread(data_folder + '/' + filename).flatten()
        for filename in filelist
    ]
    coil_20_data = np.asarray(coil_feature_vectors)
    # Labels come from the file names, in the same order as the rows above.
    coil_20_target = pd.Series(filelist).str.extract("obj([0-9]+)", expand=False).values.astype(np.int32)

    raw_coil = coil_20_data.astype(np.float32)
    return(raw_coil, coil_20_target)

In [None]:
def read_mnist(data_folder = '../data'):
    """Fetch the OpenML dataset named ``MNIST_784`` through scikit-learn.

    Parameters
    ----------
    data_folder : str
        Accepted so that every reader shares the same signature; not used.

    Returns
    -------
    tuple
        ``(raw_mnist, targets)``: the data cast to float32 and the targets
        cast to ``int``, both as numpy arrays.
    """
    from sklearn.datasets import fetch_openml

    bunch = fetch_openml("MNIST_784")
    pixels = bunch.data.astype(np.float32)
    digit_labels = bunch.target.astype('int')
    return np.asarray(pixels), np.array(digit_labels)

In [None]:
def read_usps(data_folder = '../data'):
    """Fetch version 2 of the OpenML dataset named ``USPS`` via scikit-learn.

    Parameters
    ----------
    data_folder : str
        Accepted so that every reader shares the same signature; not used.

    Returns
    -------
    tuple
        ``(raw_usps, targets)``: the data cast to float32 and the targets
        cast to ``int``, both as numpy arrays.
    """
    from sklearn.datasets import fetch_openml

    bunch = fetch_openml("USPS", version=2)
    pixels = bunch.data.astype(np.float32)
    digit_labels = bunch.target.astype('int')
    return np.asarray(pixels), np.array(digit_labels)

In [None]:
def read_buildings(data_folder = '../data'):
    """Read the buildings images found under ``data_folder``.

    Images are expected in the sub-folders
    ``{data_folder}/sheffield_buildings/Dataset/1`` ... ``/40``. Every file
    in a sub-folder is opened as an image, resized to 96x96 and flattened
    into one row; the sub-folder number is used as the label.

    Parameters
    ----------
    data_folder : str
        Folder containing the ``sheffield_buildings`` directory.

    Returns
    -------
    tuple
        ``(buildings_data, buildings_target)``: the stacked image rows and
        an int32 array of labels, one per row.

    Raises
    ------
    ValueError
        If one of the 40 sub-folders contains no files.
    """
    from glob import glob
    from PIL import Image

    def _load_flat(path):
        # Close the underlying file as soon as the resized copy exists.
        with Image.open(path) as img:
            return np.asarray(img.resize((96, 96))).flatten()

    buildings_data = []
    buildings_target = []
    for i in range(1, 41):
        directory = f"{data_folder}/sheffield_buildings/Dataset/{i}"
        # Glob once so images and labels are built from the same listing, and
        # sort because glob's order is unspecified (keeps row order, and
        # therefore any top_n truncation, reproducible).
        filenames = sorted(glob(f"{directory}/*"))
        if not filenames:
            raise ValueError(f"no image files found in {directory}")
        images = np.vstack([_load_flat(filename) for filename in filenames])
        labels = np.full(len(filenames), i, dtype=np.int32)
        buildings_data.append(images)
        buildings_target.append(labels)
    buildings_data = np.vstack(buildings_data)
    buildings_target = np.hstack(buildings_target)
    return(buildings_data, buildings_target)

In [None]:
def map_id_name(dataset_id=-1, dataset_name=None, data_set_list=data_set_list):
    """Resolve a dataset to a consistent ``(dataset_id, dataset_name)`` pair.

    Either identifier may be supplied. When ``dataset_name`` is given it
    takes precedence and ``dataset_id`` is ignored; otherwise the name is
    looked up from ``dataset_id``.

    Parameters
    ----------
    dataset_id : int
        Position of the dataset in ``data_set_list``; ``-1`` means "not given".
    dataset_name : str or None
        Name of the dataset; ``None`` means "not given".
    data_set_list : list of str
        Known dataset names; a dataset's id is its index in this list.

    Returns
    -------
    tuple
        ``(dataset_id, dataset_name)`` where ``dataset_id`` is the index of
        ``dataset_name`` in ``data_set_list``.

    Raises
    ------
    ValueError
        If neither identifier is given, if ``dataset_id`` is out of range,
        or if ``dataset_name`` is not in ``data_set_list``.
    """
    n = len(data_set_list)
    if dataset_name is None:
        if dataset_id == -1:
            raise ValueError('Need to define dataset_name or dataset_id')
        if dataset_id not in range(n):
            raise ValueError(f'dataset_id must be an integer between 0 and {n-1}')
        dataset_name = data_set_list[dataset_id]
    elif dataset_name not in data_set_list:
        # Checked whatever dataset_id is, so an unknown name always gets this
        # message instead of the bare error raised by list.index().
        raise ValueError(f'dataset_name must be in {data_set_list}')
    # Derive the id from the name so the returned pair is always consistent.
    return(data_set_list.index(dataset_name), dataset_name)

In [None]:
def read(dataset_id, data_folder = '../data'):
    """Load the raw vectors and labels for the dataset with the given id.

    Parameters
    ----------
    dataset_id : int
        0 = pendigits, 1 = coil, 2 = mnist, 3 = usps, 4 = buildings.
    data_folder : str
        Folder passed through to the selected reader function.

    Returns
    -------
    tuple
        ``(raw_data, labels)`` as returned by the selected reader.

    Raises
    ------
    ValueError
        If ``dataset_id`` is not one of the supported ids. Previously this
        fell through every branch and failed with an UnboundLocalError.
    """
    if dataset_id == 0:
        reader = read_pendigits
    elif dataset_id == 1:
        reader = read_coil
    elif dataset_id == 2:
        reader = read_mnist
    elif dataset_id == 3:
        reader = read_usps
    elif dataset_id == 4:
        reader = read_buildings
    else:
        raise ValueError(f'dataset_id must be an integer between 0 and 4, got {dataset_id!r}')

    raw_data, labels = reader(data_folder)
    return(raw_data, labels)

In [None]:
def get_dataset(dataset_id=-1, dataset_name=None, data_set_list=data_set_list, top_n=None, data_folder = '../data'):
    """Return the vectors and labels of a dataset chosen by id or by name.

    The resolved dataset name is printed before the data is read.

    Parameters
    ----------
    dataset_id : int
        Dataset id, or ``-1`` when selecting by name.
    dataset_name : str or None
        Dataset name, or ``None`` when selecting by id.
    data_set_list : list of str
        Accepted but not forwarded to ``map_id_name``, which therefore uses
        its own default list.
    top_n : int or None
        When given and smaller than the number of labels, only the first
        ``top_n`` rows and labels are returned.
    data_folder : str
        Folder handed to the reader.

    Returns
    -------
    tuple
        ``(raw_data, labels)``.

    Raises
    ------
    ValueError
        If the reader returns a different number of rows and labels (or if
        the id/name cannot be resolved by ``map_id_name``).
    """
    dataset_id, dataset_name = map_id_name(dataset_id, dataset_name)
    print(dataset_name)

    raw_data, labels = read(dataset_id, data_folder)
    n_labels = len(labels)
    if raw_data.shape[0] != n_labels:
        raise ValueError(f'data and labels of different lengths {raw_data.shape[0]} and {len(labels)}')

    # Nothing to truncate: hand back the full dataset.
    if top_n is None or not (top_n < n_labels):
        return raw_data, labels
    return raw_data[:top_n], labels[:top_n]

In [4]:
# Could set set_op_mix_ratio=0.0 to get a pure fuzzy intersection - similar to the exploration of bi-directional edges in graphs
# graph_type is 'nx' or 'ig' to designate Networkx or iGraph respectively.

def get_umap_graph(raw_data=None, dataset_id=-1, dataset_name=None, return_all=False, set_op_mix_ratio=1.0, graph_type='ig'):
    """Build the UMAP fuzzy simplicial set of a dataset and wrap it as a graph.

    Parameters
    ----------
    raw_data : array-like or None
        Vectors to build the graph from. When ``None`` they are loaded with
        ``get_dataset``. A dataset id or name is still required, because it
        selects the per-dataset ``n_neighbors`` / ``random_state`` settings.
    dataset_id : int
        Dataset id, or ``-1`` when selecting by name.
    dataset_name : str or None
        Dataset name, or ``None`` when selecting by id.
    return_all : bool
        When true, also return the adjacency matrix and the ``sigmas``,
        ``rhos`` and ``dists`` values produced by ``fuzzy_simplicial_set``.
    set_op_mix_ratio : float
        Forwarded unchanged to ``umap.umap_.fuzzy_simplicial_set``.
    graph_type : str
        ``'nx'`` builds a NetworkX graph; any other value builds an igraph
        graph.

    Returns
    -------
    G, or ``(G, A, sigmas, rhos, dists)`` when ``return_all`` is true.

    Raises
    ------
    ValueError
        If the dataset cannot be resolved or has no settings defined here.
    """
    import umap
    dataset_id, dataset_name = map_id_name(dataset_id, dataset_name)
    if raw_data is None:
        raw_data, _ = get_dataset(dataset_id)

    # Per-dataset (n_neighbors, random_state); everything else is shared.
    graph_params = {
        0: (15, 0),   # pendigits
        1: (5, 0),    # coil
        2: (10, 42),  # mnist
        3: (10, 42),  # usps
        4: (8, 42),   # buildings
    }
    if dataset_id not in graph_params:
        raise ValueError(f'no UMAP graph parameters defined for dataset_id {dataset_id!r}')
    n_neighbors, random_state = graph_params[dataset_id]

    A, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(
        X=raw_data,
        n_neighbors=n_neighbors,
        random_state=random_state,
        metric='euclidean',
        return_dists=True,
        set_op_mix_ratio=set_op_mix_ratio,
    )

    if graph_type == 'nx':
        # Imported here, like umap above, so the function does not rely on a
        # module-level ``nx`` name.
        import networkx as nx
        # from_scipy_sparse_matrix was removed in NetworkX 3.0; prefer its
        # replacement and fall back for older installations.
        build_graph = getattr(nx, 'from_scipy_sparse_array', None)
        if build_graph is None:
            build_graph = nx.from_scipy_sparse_matrix
        G = build_graph(A, edge_attribute='weight')
    else:
        import igraph as ig
        G = ig.Graph.Weighted_Adjacency(A)

    if return_all:
        return(G, A, sigmas, rhos, dists)
    return(G)

In [None]:
def get_umap_vectors(dataset_id=-1, dataset_name=None, raw_data=None):
    """Compute a 4-component UMAP embedding of a dataset.

    Parameters
    ----------
    dataset_id : int
        Dataset id, or ``-1`` when selecting by name.
    dataset_name : str or None
        Dataset name, or ``None`` when selecting by id.
    raw_data : array-like or None
        Vectors to embed. When ``None`` (the default) they are loaded with
        ``get_dataset``. This parameter was added because the body tested
        ``raw_data`` before it was ever assigned, which made every call fail
        with an UnboundLocalError.

    Returns
    -------
    The result of ``umap.UMAP(...).fit_transform(raw_data)`` using the
    settings chosen for the dataset.

    Raises
    ------
    ValueError
        If the dataset cannot be resolved or has no settings defined here.
    """
    import umap
    dataset_id, dataset_name = map_id_name(dataset_id, dataset_name)
    if raw_data is None:
        raw_data, _ = get_dataset(dataset_id)

    # Per-dataset settings; n_components and min_dist are shared by all.
    umap_params = {
        0: dict(n_neighbors=15, random_state=0),                  # pendigits
        1: dict(n_neighbors=5, random_state=0, n_epochs=1000),    # coil
        2: dict(n_neighbors=10, random_state=42, n_epochs=500),   # mnist
        3: dict(n_neighbors=10, random_state=42, n_epochs=500),   # usps
        4: dict(n_neighbors=8, random_state=42, n_epochs=1000),   # buildings
    }
    if dataset_id not in umap_params:
        raise ValueError(f'no UMAP parameters defined for dataset_id {dataset_id!r}')

    reducer = umap.UMAP(n_components=4, min_dist=1e-8, **umap_params[dataset_id])
    umap_rep = reducer.fit_transform(raw_data)
    return(umap_rep)
  