## Classification Of Hand Written Digits

In [1]:
import pyspark
import numpy as np
from splearn.rdd import ArrayRDD
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize']=[10, 10]
%matplotlib inline


#initialize spark session
spark = SparkSession\
        .builder\
        .appName("Test")\
        .config('spark.sql.warehouse.dir', 'file:///C:/')\
       .getOrCreate()
    
sc = spark.sparkContext

#sc = pyspark.SparkContext(appName="MNIST Classification")

### Grab the MNIST Data

In [2]:
def load_mnist(data_dir):
    """
    Load the MNIST dataset
    
    Parameters:
    ----------
    * `data_dir` [str]
        Location for the data.
        - If it does not exit, the data will be downloaded there.
        
    Returns:
    -------
    * `X` [nd-array shape=(70000, 784)]
        Handwritten digits data.
    * `y` [nd-array shape(70000,)]
        Labels.
    """
    mnist = fetch_mldata('MNIST original', data_home=data_dir)
    X = mnist['data']
    y = mnist['target']
    return X, y


X, y = load_mnist('../data')

In [3]:
type(X)

numpy.ndarray

### Creating Training and Test Splits

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Setting up RDDs and ArrayRDDs

In [5]:
def create_arrayrdd(array, num_partitions=4):
    """
    Create an ArrayRDD for Pyspark
    
    Parameters:
    ----------
    * `array` [numpy nd-array]
        Array to be converted to ArrayRDD.
        
    * `num_partitions` [int default=4]
        Number of distributed blocks to create.
    
    Returns:
    -------
    An ArrayRDD
    """
    rdd = sc.parallelize(array, num_partitions)
    return ArrayRDD(rdd)


X_trainArry = create_arrayrdd(X_train, 4)
y_trainArry = create_arrayrdd(y_train, 4)

X_testArry = create_arrayrdd(X_test, 4)
y_testArry = create_arrayrdd(y_test, 4)

### A Few Sanity Checks:

In [6]:
print('Number of distributed partitions in array: {}'.format(X_trainArry.getNumPartitions()))
print('Training set has shape {}'.format(X_trainArry.shape))

# Sum of the training set partitions == number of training set examples?
sum = 0
for block in X_trainArry:
    sum += len(block)
    
print('Total number of examples across partitions: {}'.format(sum))

Number of distributed partitions in array: 4
Training set has shape (46900, 784)
Total number of examples across partitions: 46900


In [7]:
from splearn.rdd import DictRDD


def create_dictRdd(X, y, num_partitions=4):
    """
    """
    X_rdd = sc.parallelize(X, num_partitions)
    y_rdd = sc.parallelize(y, num_partitions)

    Z = DictRDD((X_rdd, y_rdd), columns=('X', 'y'),
                dtype=[np.ndarray, np.ndarray])
    return Z


z = create_dictRdd(X_train, y_train, 4)

In [8]:
X_trainRDD = sc.parallelize(X_train, 4)
y_trainRDD = sc.parallelize(y_train, 4)
X_testRDD = sc.parallelize(X_test, 4)
y_testRDD = sc.parallelize(y_test, 4)

In [9]:
y = np.ones(5)
X = np.zeros((5, 5))

In [10]:
y

array([ 1.,  1.,  1.,  1.,  1.])

In [11]:
X.shape

(5, 5)

In [12]:
stacked = np.column_stack((y, X))
stacked.shape

(5, 6)

In [13]:
from pyspark.ml.linalg import Vectors

dff = map(lambda x: (int(x[0]), Vectors.dense(x[1:])), stacked)

In [14]:
mydf = spark.createDataFrame(dff,schema=["label", "features"])

In [15]:
mydf

DataFrame[label: bigint, features: vector]

In [32]:
#df = np.concatenate([y, X])
#df = np.stack((y, X), axis=-1)
X = np.column_stack((y, X))

In [37]:
X[0]

array([   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,

In [15]:
X.shape

(70000, 784)

In [17]:
import pandas as pd

In [18]:
df = pd.DataFrame(X)

In [23]:
df1 = pd.DataFrame(y)

In [28]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df1.shape

(70000, 1)

In [27]:
df1.join(df)

ValueError: columns overlap but no suffix specified: RangeIndex(start=0, stop=1, step=1)