In [1]:
%r
# Attach SparkR so that createDataFrame() and friends are available
library(SparkR)
# Turn the `faithful` data set into a Spark DataFrame
df <- createDataFrame(faithful)

# Displays the content of the DataFrame to stdout
head(df)

In [2]:
%r
library(SparkR)
# Read the diamonds CSV into a Spark DataFrame; the first line is treated as a
# header and column types are inferred from the data.
diamondsDF <- read.df("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv",
                    source = "csv", header="true", inferSchema = "true")
# Preview the DataFrame that was just loaded (it is named diamondsDF; no object
# called `diamonds` exists at this point).
head(diamondsDF)

In [3]:
%r
# Print the column names and the types that were inferred when the CSV was read
printSchema(diamondsDF)

In [4]:
%r
# Render the DataFrame with the notebook's display() helper
display(diamondsDF)

In [5]:
%r
# Write the DataFrame as Parquet to dbfs:/tmp/iris.parquet, overwriting existing output.
# NOTE(review): `irisDF2` is not created in any cell visible here -- confirm it is defined earlier.
write.df(irisDF2, path="dbfs:/tmp/iris.parquet", source="parquet", mode="overwrite")

In [6]:
%fs ls dbfs:/tmp/people.parquet

In [7]:
%r
# Register earlier df as temp table
# NOTE(review): `people` is not created in any cell visible here -- confirm it is defined earlier.
registerTempTable(people, "peopleTemp")

In [8]:
%r
# Query the "peopleTemp" table with Spark SQL and preview the resulting DataFrame
age <- sql("SELECT age FROM peopleTemp")
head(age)

In [9]:
%r
# Attach SparkR. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets later calls fail obscurely.
library(SparkR)

# Create DataFrame
df <- createDataFrame(faithful)
df

In [10]:
%r
# Select only the "eruptions" column
head(select(df, df$eruptions))

In [11]:
%r
# You can also pass in column name as strings
head(select(df, "eruptions"))

In [12]:
%r
# Filter the DataFrame to only retain rows with wait times shorter than 50 mins
head(filter(df, df$waiting < 50))

In [13]:
%r
# Group the rows by waiting time and count the rows in each group
head(count(groupBy(df, df$waiting)))

In [14]:
%r
# We can also sort the output from the aggregation to get the most common waiting times
waiting_counts <- count(groupBy(df, df$waiting))
# desc() on the count column puts the largest groups first
head(arrange(waiting_counts, desc(waiting_counts$count)))

In [15]:
%r
# Convert waiting time from minutes to seconds (multiply by 60).
# Note that we can assign this to a new column in the same DataFrame
df$waiting_secs <- df$waiting * 60
head(df)

In [16]:
%r
# Create the DataFrame. Uses the same createDataFrame(data) form as the earlier
# cells; passing sqlContext as the first argument is the deprecated signature.
df <- createDataFrame(iris)

# Fit a linear model over the dataset.
# The formula refers to the columns as Sepal_Length / Sepal_Width (underscores).
model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")

# Model coefficients are returned in a similar format to R's native glm().
summary(model)

In [17]:
%r
# Attach SparkR. library() fails loudly if the package is unavailable,
# whereas require() would only return FALSE.
library(SparkR)

# Read diamonds.csv dataset as SparkDataFrame
diamonds <- read.df("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv",
                  source = "com.databricks.spark.csv", header="true", inferSchema = "true")
# Give the column whose header is the empty string a usable name
diamonds <- withColumnRenamed(diamonds, "", "rowID")

# Split data into Training set and Test set
# sample() draws roughly 70% of the rows without replacement; the test set is
# every row of `diamonds` that did not land in the training sample.
trainingData <- sample(diamonds, FALSE, 0.7)
testData <- except(diamonds, trainingData)

# Exclude rowIDs
# (drops the first column by position, after it was used to tell rows apart in except())
trainingData <- trainingData[, -1]
testData <- testData[, -1]

# Row counts of the full data set and of the two splits
print(count(diamonds))
print(count(trainingData))
print(count(testData))

In [18]:
%r
# Preview the training split
head(trainingData)

In [19]:
%r
# Indicate family = "gaussian" to train a linear regression model
# `price ~ .` regresses price on every other column of trainingData
lrModel <- glm(price ~ ., data = trainingData, family = "gaussian")

# Print a summary of trained linear regression model
summary(lrModel)

In [20]:
%r
# Generate predictions using the trained Linear Regression model
predictions <- predict(lrModel, newData = testData)

# View predictions against the actual price column
display(select(predictions, "price", "prediction"))

In [21]:
%r
# Keep price and prediction and add their difference as a new "error" column
errors <- select(predictions, predictions$price, predictions$prediction, alias(predictions$price - predictions$prediction, "error"))
display(errors)

In [22]:
%r
# Calculate RMSE
# i.e. the square root of (sum of squared errors / number of rows), returned as a column named "RMSE"
head(select(errors, alias(sqrt(sum(errors$error^2 , na.rm = TRUE) / nrow(errors)), "RMSE")))

In [23]:
%r
# Subset data to include rows where diamond cut = "Premium" or diamond cut = "Very Good"
# (this leaves exactly two cut values, i.e. a two-class label for the model below)
trainingDataSub <- subset(trainingData, trainingData$cut %in% c("Premium", "Very Good"))
testDataSub <- subset(testData, testData$cut %in% c("Premium", "Very Good"))

In [24]:
%r
# Indicate family = "binomial" to train a logistic regression model
# The response is `cut`; predictors are price, color, clarity and depth
logrModel <- glm(cut ~ price + color + clarity + depth, data = trainingDataSub, family = "binomial")

# Print summary of Logistic Regression model
# Note: This only works in Spark 1.6+
summary(logrModel)

In [25]:
%r
# Generate predictions using the trained Logistic Regression model
predictionsLogR <- predict(logrModel, newData = testDataSub)

# View predictions against label column
display(select(predictionsLogR, "label", "prediction"))

In [26]:
%r
# Evaluate Logistic Regression model
# "error" is the absolute difference between label and prediction for each row
errorsLogR <- select(predictionsLogR, predictionsLogR$label, predictionsLogR$prediction, alias(abs(predictionsLogR$label - predictionsLogR$prediction), "error"))
display(errorsLogR)

In [27]:
%scala
// Load the station and trip tables through Spark SQL.
// NOTE(review): assumes the tables sf_201508_station_data and sf_201508_trip_data are already registered -- confirm.
val bikeStations = sqlContext.sql("SELECT * FROM sf_201508_station_data")
val tripData = sqlContext.sql("SELECT * FROM sf_201508_trip_data")

In [28]:
%scala
// Render the station rows in the notebook
display(bikeStations)

In [29]:
%scala
// Render the trip rows in the notebook
display(tripData)

In [30]:
%scala
// Print the column names and types of both DataFrames
bikeStations.printSchema()
tripData.printSchema()

In [31]:
%scala
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

In [32]:
%scala
// Distinct (station_id, name) pairs, with station_id cast to float
val justStations = bikeStations
  .selectExpr("float(station_id) as station_id", "name")
  .distinct()

// Attach a start_station_id and an end_station_id to every trip by joining the
// station names in the trip data against justStations twice; the helper "name"
// column is dropped after each join so that the second join can add it again.
// NOTE(review): the join conditions reference bikeStations("name") although the
// joined frame is justStations; presumably this resolves because justStations is
// derived from bikeStations -- confirm.
val completeTripData = tripData
  .join(justStations, tripData("Start Station") === bikeStations("name"))
  .withColumnRenamed("station_id", "start_station_id")
  .drop("name")
  .join(justStations, tripData("End Station") === bikeStations("name"))
  .withColumnRenamed("station_id", "end_station_id")
  .drop("name")

In [33]:
%scala
// Build a one-column DataFrame holding every station id that appears as the
// start or the end of at least one trip.
val stations = completeTripData
  .select("start_station_id", "end_station_id")
  .rdd
  .distinct() // helps filter out duplicate trips
  .flatMap(x => Iterable(x(0).asInstanceOf[Number].longValue, x(1).asInstanceOf[Number].longValue)) // helps us maintain types
  .distinct() // each station id only once
  .toDF() // return to a DF to make merging + joining easier

stations.take(1) // this is just a station_id at this point

In [34]:
%scala
// Vertex RDD for the graph: (station id as Long, station name).
// The ids collected in `stations` (column "value") are joined back to
// justStations to recover each station's name.
val stationVertices: RDD[(VertexId, String)] = stations
  .join(justStations, stations("value") === justStations("station_id"))
  .select("station_id", "name")
  .rdd
  .map(row => (row(0).asInstanceOf[Number].longValue, row(1).asInstanceOf[String])) // maintain type information

// Peek at one vertex
stationVertices.take(1)

In [35]:
%scala
// Edge RDD for the graph: one edge per trip from start station to end station,
// each carrying the attribute 1 (so edges can later be summed into trip counts).
val stationEdges:RDD[Edge[Long]] = completeTripData
  .select("start_station_id", "end_station_id")
  .rdd
  .map(row => Edge(row(0).asInstanceOf[Number].longValue, row(1).asInstanceOf[Number].longValue, 1))

In [36]:
%scala
// Passed to Graph() as the default vertex attribute
val defaultStation = ("Missing Station")
val stationGraph = Graph(stationVertices, stationEdges, defaultStation)
// Keep the graph cached because the following cells query it repeatedly
stationGraph.cache()

In [37]:
%scala
println("Total Number of Stations: " + stationGraph.numVertices)
println("Total Number of Trips: " + stationGraph.numEdges)
// sanity check
println("Total Number of Trips in Original Data: " + tripData.count)

In [38]:
%scala
// Run PageRank on the station graph (0.0001 is the argument passed to pageRank)
// and print the names of the ten highest-ranked stations.
val ranks = stationGraph.pageRank(0.0001).vertices
ranks
  .join(stationVertices) // (id, (rank, name))
  .sortBy(_._2._1, ascending=false) // sort by the rank
  .take(10) // get the top 10
  .foreach(x => println(x._2._2))

In [39]:
%scala
// Print the ten (source, destination) station pairs with the most trips.
// groupEdges is given a function that adds the attributes of two edges.
stationGraph
  .groupEdges((edge1, edge2) => edge1 + edge2)
  .triplets
  .sortBy(_.attr, ascending=false)
  .map(triplet =>
    "There were " + triplet.attr.toString + " trips from " + triplet.srcAttr + " to " + triplet.dstAttr + ".")
  .take(10)
  .foreach(println)

In [40]:
%scala
// Ten stations with the highest in-degree
stationGraph
  .inDegrees // computes in Degrees
  .join(stationVertices) // (id, (inDegree, name))
  .sortBy(_._2._1, ascending=false)
  .take(10)
  .foreach(x => println(x._2._2 + " has " + x._2._1 + " in degrees."))

In [41]:
%scala
// Ten stations with the highest out-degree
stationGraph
  .outDegrees // out degrees
  .join(stationVertices) // (id, (outDegree, name))
  .sortBy(_._2._1, ascending=false)
  .take(10)
  .foreach(x => println(x._2._2 + " has " + x._2._1 + " out degrees."))

In [42]:
%scala
// Five stations with the highest in/out degree ratio
stationGraph
  .inDegrees
  .join(stationGraph.outDegrees) // join with out Degrees
  .join(stationVertices) // join with our other stations
  .map(x => (x._2._1._1.toDouble/x._2._1._2.toDouble, x._2._2)) // ratio of in to out
  .sortBy(_._1, ascending=false)
  .take(5)
  .foreach(x => println(x._2 + " has a in/out degree ratio of " + x._1))

In [43]:
%scala
// Five stations with the lowest in/out degree ratio.
// The in-degrees must be joined with the OUT-degrees; joining inDegrees with
// itself (as before) makes every ratio exactly 1.0.
stationGraph
  .inDegrees
  .join(stationGraph.outDegrees) // join with out Degrees
  .join(stationVertices) // join with our other stations
  .map(x => (x._2._1._1.toDouble/x._2._1._2.toDouble, x._2._2)) // ratio of in to out
  .sortBy(_._1) // ascending: smallest ratio first
  .take(5)
  .foreach(x => println(x._2 + " has a in/out degree ratio of " + x._1))

In [44]:
%sh --packages graphframes:graphframes:0.5.0-spark2.1-s_2.11

In [45]:
%scala
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.graphframes._

In [46]:
%scala
// Vertex DataFrame: the station name becomes the "id" column
// (GraphFrames identifies vertices by a column named "id").
val stationVertices = bikeStations
  .withColumnRenamed("name", "id")
  .distinct()

// Edge DataFrame: start/end station names become "src" and "dst"
// (the column names GraphFrames uses for edge endpoints).
val tripEdges = tripData
  .withColumnRenamed("Start Station", "src")
  .withColumnRenamed("End Station", "dst")

In [47]:
%scala
// Render the vertex DataFrame
display(stationVertices)

In [48]:
%scala
// Render the edge DataFrame
display(tripEdges)

In [49]:
%scala
// Build the GraphFrame from the vertex and edge DataFrames
val stationGraph = GraphFrame(stationVertices, tripEdges)

// Cache both inputs since the queries below reuse them
tripEdges.cache()
stationVertices.cache()

In [50]:
%scala
// PageRank with reset probability 0.15, limited to 10 iterations
val ranks = stationGraph.pageRank.resetProbability(0.15).maxIter(10).run()

// Show the vertices ordered by descending "pagerank"
display(ranks.vertices.orderBy(desc("pagerank")))

In [51]:
%scala
// Five stations with the highest in-degree
val inDeg = stationGraph.inDegrees
display(inDeg.orderBy(desc("inDegree")).limit(5))

In [52]:
%scala
// Out-degree per station. This was referenced below but never defined in any
// earlier cell, so it is created here.
val outDeg = stationGraph.outDegrees

// Ratio of in-degree to out-degree per station id
val degreeRatio = inDeg.join(outDeg, inDeg.col("id") === outDeg.col("id"))
  .drop(outDeg.col("id")) // keep a single "id" column after the join
  .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")

degreeRatio.cache()

// Ten stations with the highest in/out ratio
display(degreeRatio.orderBy(desc("degreeRatio")).limit(10))

In [53]:
from graphframes import *

In [54]:
# Vertex DataFrame: one row per person with columns id, name, age
vertices = sqlContext.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
  ("d", "David", 29),
  ("e", "Esther", 32),
  ("f", "Fanny", 36),
  ("g", "Gabby", 60)], ["id", "name", "age"])

In [55]:
# Edge DataFrame: directed relationships, with src/dst holding vertex ids
edges = sqlContext.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("f", "c", "follow"),
  ("e", "f", "follow"),
  ("e", "d", "friend"),
  ("d", "a", "friend"),
  ("a", "e", "friend")
], ["src", "dst", "relationship"])

In [56]:
# Build the GraphFrame from the vertex and edge DataFrames
g = GraphFrame(vertices, edges)
# print() with a single argument behaves the same on Python 2 and Python 3
print(g)

In [57]:
# Vertex DataFrame of the graph
display(g.vertices)

In [58]:
# Edge DataFrame of the graph
display(g.edges)

In [59]:
# Incoming-edge counts
display(g.inDegrees)

In [60]:
# Outgoing-edge counts
display(g.outDegrees)

In [61]:
# Degree counts
display(g.degrees)

In [62]:
# Minimum of the "age" column over all vertices
youngest = g.vertices.groupBy().min("age")
display(youngest)

In [63]:
# Count the edges whose relationship is 'follow'
numFollows = g.edges.filter("relationship = 'follow'").count()
# Formatted into one string so the output is identical on Python 2 and Python 3
print("The number of follow edges is %d" % numFollows)

In [64]:
# Search for pairs of vertices with edges in both directions between them.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
display(motifs)

In [65]:
# Keep only the matches where at least one of the two vertices is older than 30
filtered = motifs.filter("b.age > 30 or a.age > 30")
display(filtered)

In [66]:
# Find every edge a -> b that is a 'follow' relationship from a younger to an older vertex
paths = g.find("(a)-[e]->(b)")\
  .filter("e.relationship = 'follow'")\
  .filter("a.age < b.age")
# Each row of `paths` carries the matched edge under `e`; pull its fields back out
# into a plain edge DataFrame:
e2 = paths.select("e.src", "e.dst", "e.relationship")

# In Spark 1.5+, the user may simplify the previous call to:
# e2 = paths.select("e.*")

# Construct the subgraph
g2 = GraphFrame(g.vertices, e2)

In [67]:
# Vertices of the subgraph (taken unchanged from g.vertices above)
display(g2.vertices)

In [68]:
%scala

// Append a single row to a DataFrame: wrap the new value in its own one-row
// DataFrame and union the two.
val firstDF = spark.range(3).toDF("myCol")
val newRow = Seq(20)
val appended = firstDF.union(newRow.toDF())
display(appended)

In [69]:
# Append a single row to a DataFrame: build a one-row DataFrame holding the new
# value and union it with the original.
firstDF = spark.range(3).toDF("myCol")
newRow = spark.createDataFrame([[20]])
appended = firstDF.union(newRow)
display(appended)

In [70]:
%scala

// Two small DataFrames that share a "name" column
val llist = Seq(("bob", "2015-01-13", 4), ("alice", "2015-04-23",10))
val left = llist.toDF("name","date","duration")
val right = Seq(("alice", 100),("bob", 23)).toDF("name","upload")

// Join using an explicit column-equality expression
val df = left.join(right, left.col("name") === right.col("name"))

display(df)

In [71]:
%scala

// Same join, but naming the shared join column through a Seq instead
val df = left.join(right, Seq("name"))
 display(df)

In [72]:
%scala
// Run a closure through foreachPartition over a one-element RDD; the closure
// sets the log4j root logger to DEBUG and writes one debug line.
sc.parallelize(Seq("")).foreachPartition(x => {
  import org.apache.log4j.{LogManager, Level}
  import org.apache.commons.logging.LogFactory

  LogManager.getRootLogger().setLevel(Level.DEBUG)
  val log = LogFactory.getLog("EXECUTOR-LOG:")
  log.debug("START EXECUTOR DEBUG LOG LEVEL")
})

In [73]:
%sh 
# Download the flower photos archive into the current directory and unpack it
curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
tar xzf flower_photos.tgz

In [74]:
# List the unpacked archive on the driver's local filesystem
display(dbutils.fs.ls('file:/databricks/driver/flower_photos'))

In [75]:
# The 'file:/...' directory will be cleared out upon cluster termination. That doesn't matter for this example notebook, but in most cases we'd want to store the images in a more permanent place. Let's move the files to dbfs so we can see how to work with it in the use cases below.
img_dir = '/tmp/flower_photos'
dbutils.fs.mkdirs(img_dir)
# Only the tulips and daisy folders (plus the license file) are copied
dbutils.fs.cp('file:/databricks/driver/flower_photos/tulips', img_dir + "/tulips", recurse=True)
dbutils.fs.cp('file:/databricks/driver/flower_photos/daisy', img_dir + "/daisy", recurse=True)
dbutils.fs.cp('file:/databricks/driver/flower_photos/LICENSE.txt', img_dir)
display(dbutils.fs.ls(img_dir))

In [76]:
# Let's create a small sample set of images for quick demonstrations.
sample_img_dir = img_dir + "/sample"
dbutils.fs.mkdirs(sample_img_dir)
# First listed tulip image plus the first two listed daisy images
files = dbutils.fs.ls(img_dir + "/tulips")[0:1] + dbutils.fs.ls(img_dir + "/daisy")[0:2]
for f in files:
  dbutils.fs.cp(f.path, sample_img_dir)
display(dbutils.fs.ls(sample_img_dir))

In [77]:
from sparkdl import readImages
# Load the sample images into a DataFrame
image_df = readImages(sample_img_dir)

In [78]:
# Render the image DataFrame in the notebook
display(image_df)

In [79]:
# Create training & test DataFrames for transfer learning - this piece of code is longer than transfer learning itself below!
from sparkdl import readImages
from pyspark.sql.functions import lit

# Label tulips as 1 and daisies as 0
tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))
# Split each class 60/40 separately so both classes appear in train and test
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
# union() replaces the deprecated unionAll() and matches the union() call used elsewhere in this notebook
train_df = tulips_train.union(daisy_train)
test_df = tulips_test.union(daisy_test)

In [80]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer 

# Stage 1: turn each image into a feature vector using the InceptionV3 model
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
# Stage 2: logistic regression on the "label" column
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])

# Fit both stages on the training images
p_model = p.fit(train_df)

In [81]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out images with the fitted pipeline
tested_df = p_model.transform(test_df)
# The evaluator is configured for the "accuracy" metric and is given only the prediction and label columns
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label"))))

In [82]:
from pyspark.sql.types import DoubleType
# udf is imported explicitly; the cell previously relied on it already being in scope
from pyspark.sql.functions import expr, udf
def _p1(v):
  """Return element 1 of the vector `v` (read through its `.array`) as a Python float."""
  return float(v.array[1])
p1 = udf(_p1, DoubleType())

# p_1 = second entry of the model's probability vector for each row
df = tested_df.withColumn("p_1", p1(tested_df.probability))
# Largest |p_1 - label| first, i.e. the rows the model got most wrong
wrong_df = df.orderBy(expr("abs(p_1 - label)"), ascending=False)
display(wrong_df.select("filePath", "p_1", "label").limit(10))

In [83]:
from sparkdl import readImages, DeepImagePredictor

image_df = readImages(sample_img_dir)

# Apply InceptionV3 to the sample images; decodePredictions=True with topK=10 is requested
predictor = DeepImagePredictor(inputCol="image", outputCol="predicted_labels", modelName="InceptionV3", decodePredictions=True, topK=10)
predictions_df = predictor.transform(image_df)

display(predictions_df.select("filePath", "predicted_labels"))

In [84]:
# Score the same sample images with the fitted transfer-learning pipeline.
# p1() extracts entry 1 of the probability vector (label 1 was assigned to tulips),
# so 1 - p1 is reported as the daisy probability.
df = p_model.transform(image_df)
display(df.select("filePath", (1-p1(df.probability)).alias("p_daisy")))

In [85]:
from sparkdl import readImages, TFImageTransformer
from sparkdl.transformers import utils
import tensorflow as tf

image_df = readImages(sample_img_dir)

# Build a small TensorFlow graph that resizes its input images to 299x299
g = tf.Graph()
with g.as_default():
    image_arr = utils.imageInputPlaceholder()
    resized_images = tf.image.resize_images(image_arr, (299, 299))
    # the following step is not necessary for this graph, but can be for graphs with variables, etc
    # The session is only needed while freezing, so it is opened in a `with`
    # block and closed afterwards instead of being left open.
    with tf.Session(graph=g) as sess:
        frozen_graph = utils.stripAndFreezeGraph(g.as_graph_def(add_shapes=True), sess, [resized_images])

# Apply the frozen graph to the "image" column, writing images to "transformed_img"
transformer = TFImageTransformer(inputCol="image", outputCol="transformed_img", graph=frozen_graph,
                                 inputTensor=image_arr, outputTensor=resized_images,
                                 outputMode="image")
tf_trans_df = transformer.transform(image_df)

In [86]:
from keras.applications import InceptionV3

# InceptionV3 with the "imagenet" weights
model = InceptionV3(weights="imagenet")
model.save('/tmp/model-full.h5')  # saves to the local filesystem
# move to a permanent place for future use
dbfs_model_path = 'dbfs:/models/model-full.h5'
dbutils.fs.cp('file:/tmp/model-full.h5', dbfs_model_path) 

In [87]:
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from pyspark.sql.types import StringType
from sparkdl import KerasImageFileTransformer

def loadAndPreprocessKerasInceptionV3(uri):
  """Load the image at `uri` and return it as a preprocessed batch of one.

  The image is loaded at 299x299 (the input size used for InceptionV3),
  converted to an array, given a leading batch axis and finally passed
  through `preprocess_input`.
  """
  pil_image = load_img(uri, target_size=(299, 299))
  pixels = img_to_array(pil_image)
  single_batch = np.expand_dims(pixels, axis=0)
  return preprocess_input(single_batch)

# Copy the saved model from DBFS to the driver's local disk; the transformer below is given a local path
dbutils.fs.cp(dbfs_model_path, 'file:/tmp/model-full-tmp.h5')
transformer = KerasImageFileTransformer(inputCol="uri", outputCol="predictions",
                                        modelFile='/tmp/model-full-tmp.h5',  # local file path for model
                                        imageLoader=loadAndPreprocessKerasInceptionV3,
                                        outputMode="vector")

# str(f.path)[5:] drops the first five characters of each listed path
# (presumably the "dbfs:" scheme -- confirm) before "/dbfs" is prepended.
files = ["/dbfs" + str(f.path)[5:] for f in dbutils.fs.ls(sample_img_dir)]  # make "local" file paths for images
# One-column DataFrame of image paths, named "uri" to match the transformer's inputCol
uri_df = sqlContext.createDataFrame(files, StringType()).toDF("uri")

keras_pred_df = transformer.transform(uri_df)

In [88]:
# Show each image path next to the model output for it
display(keras_pred_df.select("uri", "predictions"))

In [89]:
# Clean up: remove the image directory (recursively) and the saved model
dbutils.fs.rm(img_dir, recurse=True)
dbutils.fs.rm(dbfs_model_path)

In [90]:
import tensorflow as tf
# Show which TensorFlow version is installed
tf.__version__

In [91]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse

# Import data
from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

In [92]:
# Load MNIST using /tmp/data as the data directory, with one-hot encoded labels
mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

In [93]:
# Linear model: y = x W + b, mapping 784 input features to 10 outputs
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x, W) + b

In [94]:
# Placeholder for the true labels (10 values per example)
y_ = tf.placeholder(tf.float32, [None, 10])

In [95]:
# Loss: mean softmax cross-entropy between the logits y and the labels y_
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
# One gradient-descent step with learning rate 0.5
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

In [96]:
# Accuracy: fraction of examples whose arg-max prediction equals the arg-max label
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Scalar summary so that accuracy can be logged for TensorBoard
summary = tf.summary.scalar("accuracy", accuracy)

In [97]:
# Directory that TensorBoard is started on; training must log to the same path
log_dir = "/tmp/tensorflow_log_dir"
dbutils.tensorboard.start(log_dir)

In [98]:
sess = tf.InteractiveSession()

# Make sure to use the same log directory for both starting TensorBoard and writing summaries during training.
summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)

tf.global_variables_initializer().run()
# 1000 training steps on mini-batches of 100 examples; the accuracy summary of
# every step is logged with the step number.
for batch in range(1000):
  batch_xs, batch_ys = mnist.train.next_batch(100)
  _, batch_summary = sess.run([train_step, summary], feed_dict={x: batch_xs, y_: batch_ys})
  summary_writer.add_summary(batch_summary, batch)

# Push buffered events to disk so that the log directory is complete before it
# is read or moved by later cells.
summary_writer.flush()

In [99]:
# Accuracy on the MNIST test images and labels
print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                    y_: mnist.test.labels}))

In [100]:
# Stop the TensorBoard instance started earlier
dbutils.tensorboard.stop()

In [101]:
import shutil
# Move the log directory to /dbfs/tensorflow/logs
shutil.move(log_dir, "/dbfs/tensorflow/logs")

In [102]:
import numpy as np
import os
import urllib
import gzip
import struct
def download_data(url, force_download=True):
    """Download `url` into the current working directory and return the file name.

    Args:
        url: Location of the file; the text after the last "/" becomes the
            local file name.
        force_download: When True (the default) the file is fetched even if a
            file of that name already exists; when False an existing file is
            reused.

    Returns:
        The local file name, relative to the current working directory.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    fname = url.split("/")[-1]
    if force_download or not os.path.exists(fname):
        urlretrieve(url, fname)
    return fname

def read_data(label_url, image_url):
    """Download a gzipped label file and image file and decode them.

    Args:
        label_url: URL of the gzipped label file (8-byte header, then one
            byte per label).
        image_url: URL of the gzipped image file (16-byte header, then one
            byte per pixel).

    Returns:
        A tuple (label, image): `label` is a 1-D int8 array and `image` is a
        uint8 array of shape (len(label), rows, cols), where rows and cols are
        read from the image file header.
    """
    with gzip.open(download_data(label_url), 'rb') as flbl:
        # Header: two big-endian unsigned 32-bit integers
        magic, num = struct.unpack(">II", flbl.read(8))
        # frombuffer replaces the deprecated binary mode of np.fromstring;
        # copy() keeps the array writable, as it was before.
        label = np.frombuffer(flbl.read(), dtype=np.int8).copy()
    with gzip.open(download_data(image_url), 'rb') as fimg:
        # Header: four big-endian unsigned 32-bit integers; the last two are
        # used as the image height and width.
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        image = np.frombuffer(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols).copy()
    return (label, image)

# Base URL of the MNIST files
path='http://yann.lecun.com/exdb/mnist/'
# Training labels and images
(train_lbl, train_img) = read_data(
    path+'train-labels-idx1-ubyte.gz', path+'train-images-idx3-ubyte.gz')
# "t10k" labels and images, used as the validation set
(val_lbl, val_img) = read_data(
    path+'t10k-labels-idx1-ubyte.gz', path+'t10k-images-idx3-ubyte.gz')

In [103]:
import matplotlib.pyplot as plt
# Draw the first ten training images side by side in a 1x10 grid, without axes
for i in range(10):
    plt.subplot(1,10,i+1)
    plt.imshow(train_img[i], cmap='Greys_r')
    plt.axis('off')
plt.show()
display()

In [104]:
# Labels of the same ten images
print('label: %s' % (train_lbl[0:10],))

In [105]:
import mxnet as mx

def to4d(img):
    """Reshape a stack of 28x28 images to (N, 1, 28, 28) float32 and divide by 255."""
    count = img.shape[0]
    with_channel = img.reshape(count, 1, 28, 28)
    as_float = with_channel.astype(np.float32)
    return as_float / 255

In [106]:
# Create a place holder variable for the input data
data = mx.sym.Variable('data')
# Flatten the data from 4-D shape (batch_size, num_channel, width, height)
# into 2-D (batch_size, num_channel*width*height)
data = mx.sym.Flatten(data=data)

# The first fully-connected layer
fc1  = mx.sym.FullyConnected(data=data, name='fc1', num_hidden=128)
# Apply relu to the output of the first fully-connected layer
act1 = mx.sym.Activation(data=fc1, name='relu1', act_type="relu")

# The second fully-connected layer and the corresponding activation function
fc2  = mx.sym.FullyConnected(data=act1, name='fc2', num_hidden = 64)
act2 = mx.sym.Activation(data=fc2, name='relu2', act_type="relu")

# The third fully-connected layer, note that the hidden size should be 10, which is the number of unique digits
fc3  = mx.sym.FullyConnected(data=act2, name='fc3', num_hidden=10)
# The softmax and loss layer
mlp  = mx.sym.SoftmaxOutput(data=fc3, name='softmax')

In [107]:
# Data iterators for training and validation. They were referenced below (and
# in later cells) without ever being created; to4d() converts the raw image
# arrays into the scaled 4-D float32 layout before they are wrapped.
batch_size = 100
train_iter = mx.io.NDArrayIter(to4d(train_img), train_lbl, batch_size, shuffle=True)
val_iter = mx.io.NDArrayIter(to4d(val_img), val_lbl, batch_size)

model = mx.model.FeedForward(
    symbol = mlp,       # network structure
    num_epoch = 10,     # number of data passes for training
    learning_rate = 0.1 # learning rate of SGD
)
model.fit(
    X=train_iter,       # training data
    eval_data=val_iter, # validation data
    batch_end_callback = mx.callback.Speedometer(batch_size, 200) # output progress for each 200 data batches
)

In [108]:
# Clear the current figure and draw the first validation image without axes
plt.clf()
plt.imshow(val_img[0], cmap='Greys_r')
plt.axis('off')
plt.show()
display()

In [109]:
prob = model.predict(val_img[0:1].astype(np.float32)/255)[0]
print 'Classified as %d with probability %f' % (prob.argmax(), max(prob))

In [110]:
print 'Validation accuracy: %f%%' % (model.score(val_iter)*100,)

In [111]:
# Convolutional network: two conv/tanh/max-pool stages followed by two fully-connected layers
data = mx.symbol.Variable('data')
# first conv layer
conv1 = mx.sym.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2,2), stride=(2,2))
# second conv layer
conv2 = mx.sym.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2,2), stride=(2,2))
# first fullc layer
flatten = mx.sym.Flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
# second fullc
fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)
# softmax loss
lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')

In [112]:
# Train the convolutional network; `model` is rebound to the new FeedForward object
model = mx.model.FeedForward(
    ctx = mx.gpu(0),     # use GPU 0 for training, others are same as before
    symbol = lenet,       
    num_epoch = 10,     
    learning_rate = 0.1)
model.fit(
    X=train_iter,  
    eval_data=val_iter, 
    batch_end_callback = mx.callback.Speedometer(batch_size, 200)
) 

In [113]:
# Score of the model on the validation iterator, shown as a percentage.
# print() with one pre-formatted string works on both Python 2 and Python 3.
print('Validation accuracy: %f%%' % (model.score(val_iter)*100,))

In [114]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

In [115]:
# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [116]:
# input image dimensions
img_rows, img_cols = 28, 28
# number of classes (digits) to predict
num_classes = 10

# Add the single channel axis in the position the Keras backend expects
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

# Convert to float32 and divide by 255
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [117]:
# CNN: two 3x3 convolutions, max pooling, dropout, then a dense layer and a softmax output
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One output unit per class
model.add(Dense(num_classes, activation='softmax'))

# Categorical cross-entropy loss, Adadelta optimizer, accuracy reported as a metric
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [118]:
batch_size = 128
epochs = 12

# Train, passing the test set as validation data
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

In [119]:
# evaluate() returns the loss followed by the compiled metrics (here: accuracy)
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [120]:
# name of the cluster this script will be applied to
# (used below as a directory name under dbfs:/databricks/init/)
clusterName = "caffe-gpu"

# Caffe git hash/tag. Other hashes/tags might work, but only this specific tag was tested.
caffeGitTag = "48e73c7"

In [121]:
# Bash script, held as a Python string, that installs Caffe's build dependencies,
# clones Caffe into /usr/local/caffe at the revision given by `caffeGitTag`,
# builds it with make and symlinks the Python package into the Python 2.7
# site-packages directory. str.format() fills in {caffeGitTag}; that is the
# only pair of braces in the script text.
script = """#!/usr/bin/env bash

set -ex

echo "**** Installing Caffe dependencies ****"

apt-get update

# Requirements stated in Caffe docs.
apt-get -y install libleveldb-dev libopencv-dev libhdf5-serial-dev protobuf-compiler libatlas-base-dev
apt-get -y install --no-install-recommends libboost-all-dev

# Additional requirements on Ubuntu 16.04.
apt-get -y install libgflags-dev libprotobuf-dev libgoogle-glog-dev liblmdb-dev

# The following should be installed already. Keep them to make the script self-contained.
apt-get -y install python-dev libsnappy-dev python-pip git-all gcc

echo "**** Downloading Caffe ****"

CAFFE_HOME=/usr/local/caffe
git clone https://github.com/BVLC/caffe.git $CAFFE_HOME
cd $CAFFE_HOME
git checkout {caffeGitTag}

echo "**** Installing Caffe Python dependencies ****"

pip install -r python/requirements.txt

echo "**** Installing Caffe ****"

cp Makefile.config.example Makefile.config
echo "
INCLUDE_DIRS += /usr/include/hdf5/serial
LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial
BLAS := open
USE_CUDNN := 1
" >> Makefile.config

make all -j$(nproc)
make pycaffe

# set up symlink
ln -s $CAFFE_HOME/python/caffe /usr/local/lib/python2.7/site-packages/

""".format(caffeGitTag = caffeGitTag)

In [122]:
# Write the script to the cluster's directory under dbfs:/databricks/init/ (third argument True)
dbutils.fs.put("dbfs:/databricks/init/%s/install-caffe-gpu.sh" % clusterName, script, True)

In [123]:
# List the file names now present in that directory
print("Init scripts installed for cluster %s:" % clusterName)
print("\n".join([x.name for x in dbutils.fs.ls("dbfs:/databricks/init/%s" % clusterName)]))

In [124]:
from pylab import *

In [125]:
import caffe

In [126]:
import os
# Make the Caffe checkout the working directory of the Python process
caffe_root = "/usr/local/caffe"
os.chdir(caffe_root)

In [127]:
%sh # run scripts from caffe root

# Both scripts are given as paths relative to the Caffe root directory.

# Download data
data/mnist/get_mnist.sh

# Prepare data
examples/mnist/create_mnist.sh

In [128]:
# back to examples
# (relative to the current working directory, which was set to caffe_root earlier)
os.chdir('examples')

In [129]:
from caffe import layers as L, params as P

def lenet(lmdb, batch_size):
    """Build the LeNet network definition that reads from an LMDB source.

    Args:
        lmdb: Path passed as the `source` of the LMDB data layer.
        batch_size: Batch size of the data layer.

    Returns:
        The result of `NetSpec.to_proto()` for the assembled network.
    """
    # our version of LeNet: a series of linear and simple nonlinear transformations
    n = caffe.NetSpec()
    
    # Data layer with two outputs (images and labels); pixel values are scaled by 1/255
    n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb,
                             transform_param=dict(scale=1./255), ntop=2)
    
    # Two convolution + max-pooling stages
    n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier'))
    n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier'))
    n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
    # Fully-connected layer with in-place ReLU, then the 10-way score layer
    n.fc1 =   L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier'))
    n.relu1 = L.ReLU(n.fc1, in_place=True)
    n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier'))
    # Softmax loss of the scores against the labels
    n.loss =  L.SoftmaxWithLoss(n.score, n.label)
    
    return n.to_proto()
    
# Write the training net (batch size 64) ...
with open('mnist/lenet_auto_train.prototxt', 'w') as f:
    f.write(str(lenet('mnist/mnist_train_lmdb', 64)))
    
# ... and the test net (batch size 100)
with open('mnist/lenet_auto_test.prototxt', 'w') as f:
    f.write(str(lenet('mnist/mnist_test_lmdb', 100)))

In [130]:
%sh cat mnist/lenet_auto_train.prototxt

In [131]:
%sh cat mnist/lenet_auto_solver.prototxt

In [132]:
# Run Caffe in GPU mode on device 0
caffe.set_device(0)
caffe.set_mode_gpu()

### load the solver and create train and test nets
solver = None  # ignore this workaround for lmdb data (can't instantiate two solvers on the same data)
solver = caffe.SGDSolver('mnist/lenet_auto_solver.prototxt')

In [133]:
# each output is (batch size, feature dim, spatial dim)
# (lists every blob name of the training net together with the shape of its data)
[(k, v.data.shape) for k, v in solver.net.blobs.items()]

In [134]:
# just print the weight sizes (we'll omit the biases)
# (v[0] is the first parameter blob of each layer)
[(k, v[0].data.shape) for k, v in solver.net.params.items()]

In [135]:
# One forward pass through each net
solver.net.forward()  # train net
solver.test_nets[0].forward()  # test net (there can be more than one)

In [136]:
# we use a little trick to tile the first eight images
clf()
imshow(solver.net.blobs['data'].data[:8, 0].transpose(1, 0, 2).reshape(28, 8*28), cmap='gray'); axis('off')
display()



In [137]:
print 'train labels:', solver.net.blobs['label'].data[:8]

In [138]:
clf()
imshow(solver.test_nets[0].blobs['data'].data[:8, 0].transpose(1, 0, 2).reshape(28, 8*28), cmap='gray'); axis('off')
display()

In [139]:
print 'test labels:', solver.test_nets[0].blobs['label'].data[:8]

In [140]:
# CNTK version to install, whether to use the GPU build, and the target cluster name
cntkVersion = "2.0.beta15.0"
useGPU = True
clusterName = "cntk-gpu"

In [141]:
# Becomes the path segment of the wheel URL that selects the GPU or CPU-only build
device = "GPU" if useGPU else "CPU-only"

In [142]:
# Bash script, held as a Python string, that installs Open MPI and the CNTK
# wheel selected by `cntkVersion` and `device`.
# The literal starts directly with the shebang: "#!" must be the very first two
# bytes of the file to take effect, so there must be no newline after the
# opening quotes.
script = """#!/usr/bin/env bash
CNTK_VER="{cntkVersion}"
CNTK_URL="https://cntk.ai/PythonWheel/{device}/cntk-$CNTK_VER-cp27-cp27mu-linux_x86_64.whl"
echo "Running apt-get upgrade"
apt-get upgrade

echo "Installing Open MPI"
apt-get -y install openmpi-bin

echo "Installing CNTK via pip"
pip install --upgrade $CNTK_URL
""".format(cntkVersion=cntkVersion, device=device)

In [143]:
dbutils.fs.put("dbfs:/databricks/init/{clusterName}/install-cntk.sh".format(clusterName=clusterName), script, True)

In [144]:
print "Init scripts installed for cluster: " + clusterName
print("\n".join([x.name for x in dbutils.fs.ls("dbfs:/databricks/init/%s" % clusterName)]))

In [145]:

# Archive containing the Inception model, and the local directory used for it
MODEL_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
model_dir = '/tmp/imagenet'

# Archive containing the list of image URLs
IMAGES_INDEX_URL = 'http://image-net.org/imagenet_data/urls/imagenet_fall11_urls.tgz'
# Plain integer literal: the Python 2 long suffix (1000L) is a syntax error on Python 3
images_read_limit = 1000  # Increase this to read more images

# Number of images per batch.
# 1 batch corresponds to 1 RDD row.
image_batch_size = 3

num_top_predictions = 5

In [146]:
import numpy as np
import tensorflow as tf
import os
from tensorflow.python.platform import gfile
import os.path
import re
import sys
import tarfile
from subprocess import Popen, PIPE, STDOUT