handwritten digits

datasets:

optical recognition of handwritten digits: http://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits

MNIST: http://archive.ics.uci.edu/dataset/683/mnist+database+of+handwritten+digits

pen-based recognition of handwritten digits: http://archive.ics.uci.edu/dataset/81/pen+based+recognition+of+handwritten+digits

Semeion: http://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit



In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml, load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pickle
import os

In [2]:
RAW = "../data/raw/"
PROCESSED = "../data/processed/"

os.makedirs(RAW, exist_ok=True)
os.makedirs(PROCESSED, exist_ok=True)

## Importing datasets

### Optical

In [4]:
uci = load_digits()

X_uci = uci.data.astype(np.float32)
y_uci = uci.target.astype(int)

with open(PROCESSED + "uci_raw.pkl", "wb") as f:
    pickle.dump((X_uci, y_uci), f)

print(X_uci.shape, y_uci.shape)

(1797, 64) (1797,)


### MNIST

In [6]:
from tensorflow.keras.datasets import mnist
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = mnist.load_data()
X_train_mnist = X_train_mnist.reshape(-1, 28*28).astype("float32")
X_test_mnist = X_test_mnist.reshape(-1, 28*28).astype("float32")
import numpy as np
X_mnist = np.vstack([X_train_mnist, X_test_mnist])
y_mnist = np.hstack([y_train_mnist, y_test_mnist])

print(X_mnist.shape, y_mnist.shape)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
(70000, 784) (70000,)


### Pen-based

In [9]:
# code from UCI webpage:

from ucimlrepo import fetch_ucirepo

# fetch dataset
pen_based_recognition_of_handwritten_digits = fetch_ucirepo(id=81)

# data (as pandas dataframes)
X = pen_based_recognition_of_handwritten_digits.data.features
y = pen_based_recognition_of_handwritten_digits.data.targets

# metadata
print(pen_based_recognition_of_handwritten_digits.metadata)

# variable information
print(pen_based_recognition_of_handwritten_digits.variables)


{'uci_id': 81, 'name': 'Pen-Based Recognition of Handwritten Digits', 'repository_url': 'https://archive.ics.uci.edu/dataset/81/pen+based+recognition+of+handwritten+digits', 'data_url': 'https://archive.ics.uci.edu/static/public/81/data.csv', 'abstract': 'Digit database of 250 samples from 44 writers', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 10992, 'num_features': 16, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1996, 'last_updated': 'Wed Jul 01 1998', 'dataset_doi': '10.24432/C5MG6K', 'creators': ['E. Alpaydin', 'Fevzi. Alimoglu'], 'intro_paper': None, 'additional_info': {'summary': 'We create a digit database by collecting 250 samples from 44 writers. The samples written by 30 writers are used for training, cross-validation and writer dependent testing, and the digits written by t

In [12]:
X_pen = X.values.astype(np.float32)
y_pen = y.values.astype(int)

with open(PROCESSED + "pen_raw.pkl", "wb") as f:
    pickle.dump((X_pen, y_pen), f)

print(X_pen.shape, y_pen.shape)


(10992, 16) (10992, 1)


### Semeion

In [10]:
semeion_raw = np.loadtxt(RAW + "semeion.data")

X_semeion = semeion_raw[:, :256].astype(np.float32)
y_semeion = semeion_raw[:, 256:].argmax(axis=1).astype(int)

with open(PROCESSED + "semeion_raw.pkl", "wb") as f:
    pickle.dump((X_semeion, y_semeion), f)

print(X_semeion.shape, y_semeion.shape)


(1593, 256) (1593,)


## Preprocessing

In [13]:
scaler_mnist = StandardScaler()
X_mnist_scaled = scaler_mnist.fit_transform(X_mnist)

scaler_uci = StandardScaler()
X_uci_scaled = scaler_uci.fit_transform(X_uci)

scaler_pen = StandardScaler()
X_pen_scaled = scaler_pen.fit_transform(X_pen)

scaler_semeion = StandardScaler()
X_semeion_scaled = scaler_semeion.fit_transform(X_semeion)


In [14]:
datasets_scaled = {
    "mnist": (X_mnist_scaled, y_mnist),
    "uci": (X_uci_scaled, y_uci),
    "pen": (X_pen_scaled, y_pen),
    "semeion": (X_semeion_scaled, y_semeion),
}

for name, data in datasets_scaled.items():
    with open(PROCESSED + f"{name}_scaled.pkl", "wb") as f:
        pickle.dump(data, f)


In [15]:
for name, (X_data, y_data) in datasets_scaled.items():
    print(f"{name}: {X_data.shape}, labels: {np.unique(y_data)}")


mnist: (70000, 784), labels: [0 1 2 3 4 5 6 7 8 9]
uci: (1797, 64), labels: [0 1 2 3 4 5 6 7 8 9]
pen: (10992, 16), labels: [0 1 2 3 4 5 6 7 8 9]
semeion: (1593, 256), labels: [0 1 2 3 4 5 6 7 8 9]
