# Pulling data locally

This notebook will copy the datasets into the data_folder path. All the other notebooks will read the data from this folder. 

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Look up the download directory from the project's `paths` mapping.
# Every dataset fetched by the cells below is written under this directory.
# NOTE(review): later cells use `data_folder / 'name'`, so this is presumably a
# pathlib.Path (or another path-like supporting `/`) — confirm in `src`.
from src import paths
data_folder = paths['data_path']
print(data_folder)

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.datasets import load_digits

import numpy as np
import pandas as pd
import requests
import zipfile
import imageio
from PIL import Image
import os
from glob import glob
import re
import rarfile
import urllib.request

# MNIST, USPS and Pendigits are easy

We can use the sklearn API to fetch data for the Pendigits, MNIST and USPS datasets.

Of these datasets pendigits (loaded below with scikit-learn's `load_digits`) is the smallest, with only 1797 samples, and is only 64 dimensional. This makes it a good first dataset to test things out on -- the dataset is small enough that practically anything should be able to run on it efficiently.

USPS provides a slightly more challenging dataset, with almost 10,000 samples and 256 dimensions, but is still small enough to be tractable for even naive clustering implementations.

MNIST provides a good basic scaling test with 70,000 samples in 784 dimensions. In practice this is not a very large dataset compared to many that people want to cluster, although the dimensionality may provide some challenges.

In [None]:
# `load_digits` returns the digits dataset via scikit-learn's loader; the two
# `fetch_openml` calls look the datasets up on OpenML by name, so a network
# connection is presumably required the first time this cell runs.
# NOTE(review): no `data_home` argument is passed, so this cell does not write
# these three datasets under `data_folder`.
digits = load_digits()
mnist = fetch_openml("MNIST_784")
usps = fetch_openml("USPS")

# Buildings and COIL are harder

The buildings and COIL-20 datasets provide some slightly more challenging image based problems, with more complex images to be dealt with. Both are still small in number of samples, so should be easily tractable. COIL *should* be relatively easy to cluster since the different classes should provide fairly tight and distinct clusters (being 72 images of the same object from different angles for each class). The buildings dataset, which has colour images from many angles and different lighting conditions, should be a much more challenging problem to cluster if using simple Euclidean distance on the flattened vectors.

In [None]:
# Make sure the download directory exists before any cell writes into it.
# os.makedirs is used instead of shelling out to `mkdir` because it:
#   * works for paths containing spaces or shell metacharacters,
#   * creates missing parent directories,
#   * is portable across operating systems, and
#   * raises an exception on failure instead of silently continuing.
# exist_ok=True makes the cell safe to re-run.
os.makedirs(data_folder, exist_ok=True)

## Coil-20

In [None]:
%%time
if not os.path.isfile(data_folder / 'coil20.zip'):
    results = requests.get('http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip')
    with open(data_folder / 'coil20.zip', "wb") as code:
        code.write(results.content)

In [None]:
# Unpack the COIL-20 archive into data_folder and collect the archive member
# names that are PNG images. The names in `filelist` are the paths as stored
# in the archive, so they resolve relative to data_folder after extraction.
# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile(data_folder / 'coil20.zip') as images_zip:
    mylist = images_zip.namelist()
    images_zip.extractall(str(data_folder))

# Raw string literal: "\." in a plain string is an invalid escape sequence and
# triggers a warning on recent Python versions. The compiled pattern is unchanged.
r = re.compile(r".*\.png$")
filelist = list(filter(r.match, mylist))

In [None]:
%%time
coil_feature_vectors = []
for filename in filelist:
    im = imageio.imread(data_folder / filename)
    coil_feature_vectors.append(im.flatten())
coil_20_data = np.asarray(coil_feature_vectors)
coil_20_target = pd.Series(filelist).str.extract("obj([0-9]+)", expand=False).values.astype(np.int32)

## Buildings

In [None]:
# Download the buildings archive once; later runs reuse the local copy.
buildings_rar_path = data_folder / 'buildings.rar'
if not os.path.isfile(buildings_rar_path):
    # The timeout keeps the cell from hanging forever if the server stops responding.
    results = requests.get(
        'http://eprints.lincoln.ac.uk/id/eprint/16079/1/dataset.rar',
        timeout=60,
    )
    # Fail loudly on an HTTP error. Otherwise the error page would be saved as
    # buildings.rar and the isfile() guard above would prevent any later retry.
    results.raise_for_status()
    with open(buildings_rar_path, "wb") as archive_file:
        archive_file.write(results.content)

In [None]:
# Extract the buildings archive only when a known member file is missing,
# i.e. when the archive has not been unpacked into data_folder yet.
if not os.path.isfile(data_folder / 'sheffield_buildings/Dataset/Dataset/1/S1-01.jpeg'):
    # The context manager closes the archive once extraction finishes.
    with rarfile.RarFile(f'{data_folder}/buildings.rar') as rf:
        rf.extractall(f'{data_folder}/sheffield_buildings')

In [None]:
# Build the buildings dataset: one flattened 96x96-resized image per row, with
# the numeric sub-directory name (1..40) as the class label.
building_image_blocks = []
building_label_blocks = []
for i in range(1, 41):
    directory = data_folder / f"sheffield_buildings/Dataset/Dataset/{i}"
    # List the directory once and sort it: glob() returns entries in arbitrary
    # filesystem order, so sorting makes the row order reproducible.
    image_paths = sorted(glob(f"{directory}/*"))
    if not image_paths:
        # Without this check np.vstack fails with an opaque "need at least one array" error.
        raise FileNotFoundError(f"No images found in {directory}; was buildings.rar extracted?")
    flattened = []
    for filename in image_paths:
        # Close each image file as soon as its pixels have been read.
        with Image.open(filename) as img:
            flattened.append(np.asarray(img.resize((96, 96))).flatten())
    # NOTE(review): vstack requires every image to flatten to the same length,
    # i.e. all images share one colour mode — confirm for this dataset.
    images = np.vstack(flattened)
    labels = np.full(len(image_paths), i, dtype=np.int32)
    building_image_blocks.append(images)
    building_label_blocks.append(labels)
buildings_data = np.vstack(building_image_blocks)
buildings_target = np.hstack(building_label_blocks)

## Clusterable Data

In [None]:
# Fetch the clusterable_data.npy sample array once, then load it from the local copy.
clusterable_npy = f"{data_folder}/clusterable_data.npy"
already_downloaded = os.path.isfile(data_folder / 'clusterable_data.npy')
if not already_downloaded:
    git_repo_url = 'https://github.com/scikit-learn-contrib/hdbscan/blob/master/notebooks/clusterable_data.npy?raw=true'
    urllib.request.urlretrieve(git_repo_url, filename=clusterable_npy)
data = np.load(clusterable_npy)