## Classification of Handwritten Digits

In [1]:
import pyspark
import numpy as np
from splearn.rdd import ArrayRDD
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split


import matplotlib
import matplotlib.pyplot as plt
# Use a 10x10 default figure size for every plot in this notebook.
matplotlib.rcParams['figure.figsize']=[10, 10]
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


# Reuse the already-running SparkContext when this cell is re-executed;
# constructing a second SparkContext in the same kernel raises an error.
# NOTE: if a context already exists, it is returned as-is and the app name
# below is not applied to it.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("MNIST Classification")
)

### Grab the MNIST Data

In [25]:
def load_mnist(data_dir):
    """
    Fetch the 'MNIST original' dataset and return its features and labels.

    Parameters:
    ----------
    * `data_dir` [str]
        Directory handed to `fetch_mldata` as `data_home`.
        - If it does not exist, the data will be downloaded there.

    Returns:
    -------
    * `X` [nd-array shape=(70000, 784)]
        Handwritten digits data (the `'data'` entry of the fetched dataset).
    * `y` [nd-array shape=(70000,)]
        Labels (the `'target'` entry of the fetched dataset).

    NOTE(review): `fetch_mldata` has been deprecated and removed in newer
    scikit-learn releases -- confirm the pinned version still provides it.
    """
    dataset = fetch_mldata('MNIST original', data_home=data_dir)
    return dataset['data'], dataset['target']


# Load the features and labels; the data directory is given relative to the
# notebook's working directory.
X, y = load_mnist(data_dir='../data')

### Creating Training and Test Splits

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the examples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Setting up RDDs and ArrayRDDs

In [26]:
def create_arrayrdd(array, num_partitions=4):
    """
    Distribute an array through the notebook's SparkContext and wrap it
    in an ArrayRDD.

    Relies on the module-level `sc` SparkContext being defined.

    Parameters:
    ----------
    * `array` [numpy nd-array]
        Array to be converted to ArrayRDD.

    * `num_partitions` [int default=4]
        Number of partitions requested from `sc.parallelize`.

    Returns:
    -------
    An ArrayRDD built from the parallelized array.
    """
    return ArrayRDD(sc.parallelize(array, num_partitions))


# Training split: features and labels, 4 partitions each.
X_trainArry = create_arrayrdd(X_train, num_partitions=4)
y_trainArry = create_arrayrdd(y_train, num_partitions=4)

# Test split: same partitioning as the training split.
X_testArry = create_arrayrdd(X_test, num_partitions=4)
y_testArry = create_arrayrdd(y_test, num_partitions=4)

### A Few Sanity Checks:

In [27]:
print('Number of distributed partitions in array: {}'.format(X_trainArry.getNumPartitions()))
print('Training set has shape {}'.format(X_trainArry.shape))

# Sum of the training set partitions == number of training set examples?
# Use the builtin `sum` over the blocks instead of a variable named `sum`:
# rebinding that name would shadow the builtin for every later cell in the
# kernel and break any subsequent `sum(...)` call.
total_examples = sum(len(block) for block in X_trainArry)

print('Total number of examples across partitions: {}'.format(total_examples))

Number of distributed partitions in array: 4
Training set has shape (46900, 784)
Total number of examples across partitions: 46900
