## Setup

In [None]:
import pandas as pd
import numpy as np

## MNIST

In [None]:
from sklearn.datasets import fetch_openml

# Pin the OpenML dataset version so re-running the notebook always fetches
# the same data, and request a DataFrame explicitly because the export
# below relies on DataFrame.to_csv.
data = fetch_openml('mnist_784', version=1, as_frame=True)

# Write the feature matrix as a headerless, index-free CSV.
data.data.to_csv('mnist.csv', header=False, index=False)

In [None]:
# Show the dimensions of the fetched feature table as the cell output.
data.data.shape

(70000, 784)

In [None]:
# Write the class labels as a headerless, index-free CSV.
mnist_labels = data.target
mnist_labels.to_csv('mnistclasses.csv', index=False, header=False)

## MakeBlobs (large)

In [None]:
from sklearn.datasets import make_blobs

# Generate 250,000 samples with 400 features spread over 5 clusters.
# The fixed seed makes the exported CSV reproducible across kernel restarts.
X, y = make_blobs(n_samples=250000, centers=5, n_features=400, random_state=42)

In [None]:
# Show the dimensions of the generated feature matrix.
X.shape

(250000, 400)

In [None]:
# Show the dimensions of the generated label vector.
y.shape

(250000,)

In [None]:
# Write features and labels as headerless, index-free CSV files.
for array, csv_path in ((X, 'blobsLarge.csv'), (y, 'blobsLargeClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## Wine

In [None]:
from sklearn.datasets import load_wine

# Load the bundled wine dataset and pull out its features and labels.
data = load_wine()
X = data.data
y = data.target

In [None]:
# Show the dimensions of the wine feature matrix.
X.shape

(178, 13)

In [None]:
# Show the dimensions of the wine label vector.
y.shape

(178,)

In [None]:
# Write features and labels as headerless, index-free CSV files.
for array, csv_path in ((X, 'wine.csv'), (y, 'wineClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## Breast Cancer

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the bundled breast-cancer dataset and pull out its features and labels.
data = load_breast_cancer()
X = data.data
y = data.target

In [None]:
# Show the dimensions of the breast-cancer feature matrix.
X.shape

(569, 30)

In [None]:
# Show the dimensions of the breast-cancer label vector.
y.shape

(569,)

In [None]:
# Write features and labels as headerless, index-free CSV files.
for array, csv_path in ((X, 'cancer.csv'), (y, 'cancerClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## Digits

In [None]:
from sklearn.datasets import load_digits

# Load the bundled digits dataset and pull out its features and labels.
data = load_digits()
X = data.data
y = data.target

In [None]:
# Show the dimensions of the digits feature matrix.
X.shape

(1797, 64)

In [None]:
# Show the dimensions of the digits label vector.
y.shape

(1797,)

In [None]:
# Write features and labels as headerless, index-free CSV files.
for array, csv_path in ((X, 'digits.csv'), (y, 'digitsClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## CIFAR-10

In [None]:
from tensorflow.keras.datasets import cifar10
import numpy as np

# Fetch the predefined train/test partitions of CIFAR-10.
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Stack both partitions along the sample axis so the whole dataset lives in
# a single X / y pair; the labels are collapsed to a 1-D vector.
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test]).flatten()

In [None]:
# Flatten each sample into a single row, keeping one row per sample.
X1 = X.reshape(len(X), -1)

In [None]:
# Show the dimensions of the flattened CIFAR-10 feature matrix.
X1.shape

(60000, 3072)

In [None]:
# Show the dimensions of the flattened CIFAR-10 label vector.
y.shape

(60000,)

In [None]:
# Write flattened features and labels as headerless, index-free CSV files.
for array, csv_path in ((X1, 'cifar.csv'), (y, 'cifarClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## Fashion MNIST

In [None]:
from tensorflow.keras.datasets import fashion_mnist
import numpy as np

# Fetch the predefined train/test partitions of Fashion-MNIST.
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Stack both partitions along the sample axis into a single X / y pair.
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])

print(f"Fashion-MNIST: X shape = {X.shape}, y shape = {y.shape}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Fashion-MNIST: X shape = (70000, 28, 28), y shape = (70000,)


In [None]:
# Flatten each image into a single row, keeping one row per sample,
# then show the resulting dimensions.
X1 = X.reshape(len(X), -1)
X1.shape

(70000, 784)

In [None]:
# Show the dimensions of the combined Fashion-MNIST label vector.
y.shape

(70000,)

In [None]:
# Write flattened features and labels as headerless, index-free CSV files.
for array, csv_path in ((X1, 'fashion.csv'), (y, 'fashionClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## MakeBlobs (wide)

In [None]:
import pandas as pd

In [None]:
from sklearn.datasets import make_blobs
import numpy as np

# Dimensions of the wide synthetic dataset.
n_samples = 60_000
n_features = 6_668  # nearest multiple of 4 that brings the total to ~400M elements
n_classes = 10

# Report the overall element count before generating anything.
total_elements = n_features * n_samples
print(f"Total elements: {total_elements:,}")  # 400,080,000

# Draw the samples; the fixed seed keeps the generated data reproducible.
X, y = make_blobs(
    n_samples=n_samples,
    centers=n_classes,
    n_features=n_features,
    random_state=42,
)

print("X shape:", X.shape)  # (60000, 6668)
print("y shape:", y.shape)  # (60000,)

Total elements: 400,080,000
X shape: (60000, 6668)
y shape: (60000,)


In [None]:
# Write features and labels as headerless, index-free CSV files.
for array, csv_path in ((X, 'blobsWide.csv'), (y, 'blobsWideClasses.csv')):
    pd.DataFrame(array).to_csv(csv_path, header=False, index=False)

## Tiny ImageNet

TODO: export for this dataset has not been added yet.


## Testing corner

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from time import perf_counter_ns
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
def skKNN(train_data, train_labels, test_data, k):
    """Predict labels for test_data with a brute-force k-nearest-neighbours classifier.

    Args:
        train_data: Samples used to fit the classifier.
        train_labels: Labels corresponding to train_data.
        test_data: Samples to classify.
        k: Number of neighbours passed to KNeighborsClassifier.

    Returns:
        The predictions produced by the fitted classifier for test_data.
    """
    classifier = KNeighborsClassifier(algorithm="brute", n_neighbors=k)
    # fit() returns the estimator itself, so prediction can be chained.
    return classifier.fit(train_data, train_labels).predict(test_data)

In [None]:
    # Hold out 10% of the samples for evaluation. The fixed seed makes the
    # split, and therefore the reported accuracy, reproducible across runs.
    training, testing, trainingclasses, y_test = train_test_split(
        X, y, train_size=0.9, random_state=42)

    # Time only the fit + predict work done inside skKNN (k = 100 neighbours).
    start_time = perf_counter_ns()
    preds = skKNN(training, trainingclasses, testing, 100)
    end_time = perf_counter_ns()
    elapsed_time = end_time - start_time

    # perf_counter_ns reports nanoseconds; divide by 1e9 to show seconds too.
    print("Execution time: ", (elapsed_time/1000000000) , "seconds or ", elapsed_time, "nanoseconds")
    print(f"Accuracy: {accuracy_score(y_test, preds)}")

KeyboardInterrupt: 