In [1]:
from sparktorch import SparkTorch, serialize_torch_obj
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import rand
from pyspark.ml.pipeline import Pipeline
import torch
import torch.nn as nn

In [2]:
# Start (or reuse) the Spark session against the standalone master.
#
# Resource sizing:
#   - 2 executors, each with 24 cores and 30G of memory; 20G for the driver.
#   - Dynamic allocation is disabled so the executor count stays fixed.
#
# The original cell set "spark.executor.cores.max", which is not a documented
# Spark property and is therefore ignored. On a standalone cluster the
# application-wide core cap is "spark.cores.max"; 48 = 2 executors x 24 cores.
# NOTE(review): confirm 48 cores are actually available on this cluster.
spark = (
    SparkSession.builder
    .appName("SparkTorchTest")
    .master("spark://10.21.24.101:7077")
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "30G")
    .config("spark.executor.cores", "24")
    .config("spark.cores.max", "48")
    .config("spark.driver.memory", "20G")
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 12:07:32 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Read the MNIST training CSV (column types inferred), shuffle the rows and
# split them into 2 partitions.
# The shuffle is seeded so that re-running the notebook on the same input
# yields the same row order instead of a new random order each time.
# NOTE(review): a seeded rand() is only repeatable while the input is read
# with the same partition layout.
df = (
    spark.read
    .option("inferSchema", "true")
    .csv('mnist_train.csv')
    .orderBy(rand(seed=42))
    .repartition(2)
)

                                                                                

In [4]:
# Re-project every column as a double, keeping each column's original name.
double_cast_exprs = [f"cast({name} as double) as {name}" for name in df.columns]
df = df.selectExpr(double_cast_exprs)

In [9]:
# Discard every row that has a null in any column.
# NOTE(review): presumably this removes rows whose values could not be cast
# to double (e.g. a CSV header line) - confirm against the raw file.
df = df.dropna()

In [11]:
# Small fully connected classifier:
# 784 inputs -> 256 -> 256 -> 10 outputs, with ReLU after each hidden layer.
layers = [
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
]
network = nn.Sequential(*layers)

# Package the network together with its loss and optimizer settings; the
# result is what SparkTorch receives as `torchObj`.
# The optimizer is given as a class (not an instance), with the learning rate
# supplied alongside it.
torch_obj = serialize_torch_obj(
    optimizer=torch.optim.Adam,
    lr=1e-3,
    criterion=nn.CrossEntropyLoss(),
    model=network,
)

# Feature assembly: the first CSV column is the label, so columns 1..784 are
# combined into a single 'features' vector column.
pixel_columns = df.columns[1:785]
vector_assembler = VectorAssembler(outputCol='features', inputCols=pixel_columns)

# SparkTorch estimator that trains the serialized network on the assembled
# feature vectors, using '_c0' as the label.
# Note: this uses the barrier execution mode, which is sensitive to the
# number of partitions.
spark_model = SparkTorch(
    torchObj=torch_obj,
    inputCol='features',
    labelCol='_c0',
    iters=1000,
    # Internal mini-batch size
    miniBatch=256,
    # Fraction of the data held out for validation
    validationPct=0.2,
    # Early stopping based on validation loss
    earlyStopPatience=40,
)

# Chain feature assembly with the SparkTorch estimator and fit the pipeline
# on df; the fitted pipeline is kept in `model`.
pipeline = Pipeline(stages=[vector_assembler, spark_model])
model = pipeline.fit(df)

                                                                                

In [12]:
# Persist the fitted pipeline. A plain save() raises when the target path
# already exists, which breaks re-running the notebook; overwrite() replaces
# any previous copy at this path instead.
model.write().overwrite().save('sparktorch_mnist_model')

23/11/20 12:16:47 WARN TaskSetManager: Stage 32 contains a task of very large size (3849 KiB). The maximum recommended task size is 1000 KiB.
