### Spark example
Setup guide (installing Spark and PySpark on a Mac): https://medium.com/@GalarnykMichael/install-spark-on-mac-pyspark-453f395f240b#.l3vblzc9u

In [1]:
# SparkConf / SparkContext are not imported by any cell visible above this
# one, so on a plain Python kernel this cell would fail with NameError.
# Importing them here keeps the cell runnable after "Restart & Run All".
from pyspark import SparkConf, SparkContext

# Resource settings for the Spark application.
# NOTE(review): per the Spark configuration docs, spark.driver.memory set
# through SparkConf has no effect in client mode once the driver JVM is
# already running -- confirm whether it should be passed at launch instead.
conf = SparkConf().setAll([
    ('spark.executor.memory', '4g'),
    ('spark.executor.cores', '3'),
    ('spark.cores.max', '3'),
    ('spark.driver.memory', '10g'),
])

# Reuse the active SparkContext if one exists; otherwise create one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Show the identifier of the running Spark application (sanity check that the
# context created above is alive).
sc.applicationId

'local-1593008096258'

In [3]:
try:
    import sparktorch
except ModuleNotFoundError:
    !pip install sparktorch
    
try:
    import pyspark
except ModuleNotFoundError:
    !pip install pyspark

### Estimating the Value of Pi

In [4]:
import numpy as np

TOTAL = 1_000_000

# Build TOTAL random 2-D points on the driver, each coordinate drawn from
# [-1, 1), and distribute them as an RDD. The RDD is cached because two
# actions (count and stats) are run against it below.
dots = sc.parallelize(
    [2.0 * np.random.random(2) - 1.0 for _point in range(TOTAL)]
).cache()
print("Number of random points:", dots.count())

# Per-coordinate summary statistics of the sampled points.
stats = dots.stats()
print('Mean:', stats.mean())
print('stdev:', stats.stdev())

Number of random points: 1000000
Mean: [ 5.93738243e-04 -3.99781079e-05]
stdev: [0.57754099 0.57722272]


### Sparktorch
Project repository: https://github.com/dmmiller612/sparktorch

In [4]:
import numpy as np

In [5]:
from torchvision import datasets, transforms

# Preprocessing pipeline attached to the dataset: convert each image to a
# tensor, then normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Training split of MNIST, downloaded into the current directory if needed.
trainset = datasets.MNIST(
    './',
    train=True,
    download=True,
    transform=transform,
)

In [6]:
# Inspect the dataset object: its type, the type and shape of the image
# tensor, and the label tensor (one item per output line).
print(
    type(trainset),
    type(trainset.data),
    trainset.data.shape,
    trainset.targets,
    sep='\n',
)

<class 'torchvision.datasets.mnist.MNIST'>
<class 'torch.Tensor'>
torch.Size([60000, 28, 28])
tensor([5, 0, 4,  ..., 5, 6, 8])


In [7]:
# Flatten every image into a single row of x*y pixel values and move both
# the images and the labels from torch tensors to NumPy arrays.
# NOTE(review): this reads `trainset.data` directly; the printed values are
# integers, so the `transform` given to the dataset does not appear to be
# applied here -- confirm that raw pixel values are intended.
n_samples, x, y = trainset.data.shape
x_train = trainset.data.reshape((n_samples, x * y)).numpy()
y_train = trainset.targets.numpy()

print(x_train, y_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [5 0 4 ... 5 6 8]


In [13]:
# Append the labels to the pixel matrix as one extra, final column.
# The labels are first turned into a column vector so both arrays are 2-D.
y_train = y_train.reshape((-1, 1))
print(y_train.shape)
train_data = np.hstack((x_train, y_train))
print(train_data.shape)
train_data

(60000, 1)
(60000, 785)


array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 4],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 6],
       [0, 0, 0, ..., 0, 0, 8]])

In [9]:
# Width of the combined matrix (pixel columns plus the label column);
# used below to generate the DataFrame column names.
num_cols = train_data.shape[-1]
print(num_cols)

785


In [10]:
from sparktorch import serialize_torch_obj, SparkTorch
import torch
import torch.nn as nn
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.pipeline import Pipeline

# Get the active SparkSession, or create one named "examples" if none exists.
# NOTE(review): `spark` is not referenced by name in the later cells visible
# here, but PySpark attaches `RDD.toDF` when a SparkSession is created, so the
# later `toDF` call presumably relies on this cell having run -- confirm.
spark = SparkSession.builder.appName("examples").getOrCreate()

In [11]:
# Distribute the combined NumPy matrix as an RDD; each RDD element is one row
# of `train_data` (pixel values followed by the label).
train_data_sp = sc.parallelize(train_data)

In [12]:
# Count the RDD's elements (an action, so Spark actually runs a job here).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
train_data_sp.count()

KeyboardInterrupt: 

In [11]:
# Turn every value of every row into a built-in Python int, producing one
# plain list per row (instead of a NumPy row).
train_data_sp = train_data_sp.map(lambda row: list(map(int, row)))
train_data_sp

PythonRDD[2] at RDD at PythonRDD.scala:53

In [12]:
# Column names c0, c1, ..., one per column of the combined matrix.
cols = list(map('c{}'.format, range(num_cols)))
# Convert the RDD of rows into a DataFrame with those column names.
# NOTE(review): this rebinds `train_data_sp` from an RDD to a DataFrame, so
# the cell cannot simply be re-run without re-running the cells before it.
train_data_sp = train_data_sp.toDF(cols)

KeyboardInterrupt: 

In [15]:
# Print the DataFrame's row count and column count.
# NOTE(review): the recorded output shows 784 columns while `num_cols` above
# printed 785 -- the output looks stale; re-run to confirm.
print(train_data_sp.count(), len(train_data_sp.columns))

60000 784


In [16]:
# Peek at the first two column names to check the naming scheme.
train_data_sp.columns[:2]

['c0', 'c1']

In [90]:
# concatenate x_train_sp and y_train_sp
# NOTE(review): `x_train_sp` and `y_train_sp` are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel unless they
# come from elsewhere -- confirm or remove.
from pyspark.sql.types import StructType

# Combined schema: the fields of x_train_sp followed by the fields of y_train_sp.
# NOTE(review): `schema` is not used by any later cell visible here.
schema = StructType(x_train_sp.schema.fields + y_train_sp.schema.fields)
# Pair up the rows of the two DataFrames via their underlying RDDs.
# NOTE(review): the result of `zip` is an RDD of pairs, not a DataFrame.
df = x_train_sp.rdd.zip(y_train_sp.rdd)

In [91]:
def sp_concat(x, num_features):
    """Merge a (features, label) pair into one flat list.

    Args:
        x: A two-element pair. The first element is a sliceable sequence of
            feature values; the second is an indexable sequence whose first
            item is the label.
        num_features: Maximum number of leading feature values to keep.

    Returns:
        A new list holding the first ``num_features`` feature values followed
        by the first item of the second element.
    """
    features, label = x
    merged = list(features[:num_features])
    merged.append(label[0])
    return merged

In [None]:
# Fully connected classifier: 784 input values -> 10 class scores.
# The network ends on the raw scores (logits). nn.CrossEntropyLoss expects
# unnormalised logits and applies log-softmax itself, so the trailing
# nn.Softmax layer that used to be here normalised the outputs twice.
network = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Build the pytorch object
torch_obj = serialize_torch_obj(
    model=network,
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam,
    lr=0.0001
)

# Setup features
# `train_data_sp` is the Spark DataFrame built earlier in this notebook; the
# label was appended as the last column, every column before it is a pixel.
# (`train_data` is a NumPy array and has no `.columns`, and no column named
# '_c0' exists -- the columns are named c0, c1, ...)
feature_cols = train_data_sp.columns[:-1]
label_col = train_data_sp.columns[-1]
assert len(feature_cols) == 784, (
    "expected 784 feature columns to match the network input, got {}".format(
        len(feature_cols)
    )
)

vector_assembler = VectorAssembler(
    inputCols=feature_cols, outputCol='features')

# Create a SparkTorch Model with torch distributed. Barrier execution is on by default for this mode.
spark_model = SparkTorch(
    inputCol='features',
    labelCol=label_col,
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=50,
    verbose=1
)

# Can be used in a pipeline and saved.
# The pipeline is fitted on the DataFrame (Pipeline.fit needs a DataFrame,
# not the zipped RDD that `df` refers to).
p = Pipeline(stages=[vector_assembler, spark_model]).fit(train_data_sp)
p.save('simple_dnn')