In [1]:
%scala
// Builds a 5,000-row key/value DataFrame and exposes it to Spark SQL as the
// temporary view "largeTable", then renders the view's full contents.
case class MapEntry(key: String, value: Int)

// Keys are zero-padded to four digits ("k_0001" .. "k_5000").
val largeSeries = (1 to 5000).map(x => MapEntry("k_%04d".format(x), x))
val largeDataFrame = sc.parallelize(largeSeries).toDF()

// registerTempTable has been deprecated since Spark 2.0; createOrReplaceTempView
// is its documented replacement and registers the view under the same name.
largeDataFrame.createOrReplaceTempView("largeTable")
display(sqlContext.sql("select * from largeTable"))

In [2]:
%scala
// Click on the Plot Options button to see how this pivot table was configured.
// NOTE: pivot tables are highlighted in green to distinguish them from regular charts.
//
// Builds 5,000 rows spread over 200 keys ("k_000" .. "k_199") and 3 series
// groupings ("group_0" .. "group_2") and registers them as the temporary view
// "table_to_be_pivoted".
case class PivotEntry(key: String, series_grouping: String, value: Int)

val largePivotSeries = (1 to 5000).map { x =>
  PivotEntry("k_%03d".format(x % 200), "group_%01d".format(x % 3), x)
}
val largePivotDataFrame = sc.parallelize(largePivotSeries).toDF()

// registerTempTable has been deprecated since Spark 2.0; createOrReplaceTempView
// is its documented replacement and registers the view under the same name.
largePivotDataFrame.createOrReplaceTempView("table_to_be_pivoted")
display(sqlContext.sql("select * from table_to_be_pivoted"))

In [3]:
%sql
-- Total of value for every (key, series_grouping) pair in table_to_be_pivoted,
-- returned in key / series_grouping order.
SELECT key, series_grouping, sum(value)
FROM table_to_be_pivoted
GROUP BY key, series_grouping
ORDER BY key, series_grouping

In [4]:
%scala
// Small hand-written sales dataset (category, product, year, amount) registered
// as the temporary view "test_sales_table" and rendered in full.
case class SalesEntry(category: String, product: String, year: Int, salesAmount: Double)

val salesEntryDataFrame = sc.parallelize(Seq(
  SalesEntry("fruits_and_vegetables", "apples", 2012, 100.50),
  SalesEntry("fruits_and_vegetables", "oranges", 2012, 100.75),
  SalesEntry("fruits_and_vegetables", "apples", 2013, 200.25),
  SalesEntry("fruits_and_vegetables", "oranges", 2013, 300.65),
  SalesEntry("fruits_and_vegetables", "apples", 2014, 300.65),
  SalesEntry("fruits_and_vegetables", "oranges", 2015, 100.35),
  SalesEntry("butcher_shop", "beef", 2012, 200.50),
  SalesEntry("butcher_shop", "chicken", 2012, 200.75),
  SalesEntry("butcher_shop", "pork", 2013, 400.25),
  SalesEntry("butcher_shop", "beef", 2013, 600.65),
  SalesEntry("butcher_shop", "beef", 2014, 600.65),
  SalesEntry("butcher_shop", "chicken", 2015, 200.35),
  SalesEntry("misc", "gum", 2012, 400.50),
  SalesEntry("misc", "cleaning_supplies", 2012, 400.75),
  SalesEntry("misc", "greeting_cards", 2013, 800.25),
  SalesEntry("misc", "kitchen_utensils", 2013, 1200.65),
  SalesEntry("misc", "cleaning_supplies", 2014, 1200.65),
  SalesEntry("misc", "cleaning_supplies", 2015, 400.35)
)).toDF()

// registerTempTable has been deprecated since Spark 2.0; createOrReplaceTempView
// is its documented replacement and registers the view under the same name.
salesEntryDataFrame.createOrReplaceTempView("test_sales_table")
display(sqlContext.sql("select * from test_sales_table"))

In [5]:
%sql
-- Every row and column of the test_sales_table temporary table.
SELECT *
FROM test_sales_table

In [6]:
%sql
-- Every row and column of the test_sales_table temporary table.
-- NOTE(review): same query as the neighbouring cell; presumably it carries a
-- different chart configuration in the notebook UI - confirm.
SELECT *
FROM test_sales_table

In [7]:
%scala
// Per-state sample values registered as the temporary view "test_state_table".
// "MO" appears twice on purpose in the data (values 1 and 10).
case class StateEntry(state: String, value: Int)

// Despite its name, stateRDD holds the DataFrame returned by toDF().
val stateRDD = sc.parallelize(Seq(
  StateEntry("MO", 1),
  StateEntry("MO", 10),
  StateEntry("NH", 4),
  StateEntry("MA", 8),
  StateEntry("NY", 4),
  StateEntry("CA", 7)
)).toDF()

// registerTempTable has been deprecated since Spark 2.0; createOrReplaceTempView
// is its documented replacement and registers the view under the same name.
stateRDD.createOrReplaceTempView("test_state_table")
display(sqlContext.sql("Select * from test_state_table"))

In [8]:
%scala
// Per-country sample values. No temp table is registered here: display is
// called directly on the result of toDF() instead of going through Spark SQL.
case class WorldEntry(country: String, value: Int)

val worldRDD = sc.parallelize(Seq(
  WorldEntry("USA", 1000),
  WorldEntry("JPN", 23),
  WorldEntry("GBR", 23),
  WorldEntry("FRA", 21),
  WorldEntry("TUR", 3)
)).toDF()

display(worldRDD)

In [9]:
%scala
// Sample data for a scatter plot: a string key plus three numeric columns
// (a, b, c). Each row of the literal below holds four points for one key.
case class ScatterPlotEntry(key: String, a: Double, b: Double, c: Double)

val scatterPlotRDD = sc.parallelize(Seq(
  ScatterPlotEntry("k1", 0.2, 120, 1), ScatterPlotEntry("k1", 0.4, 140, 1), ScatterPlotEntry("k1", 0.6, 160, 1), ScatterPlotEntry("k1", 0.8, 180, 1),
  ScatterPlotEntry("k2", 0.2, 220, 1), ScatterPlotEntry("k2", 0.4, 240, 1), ScatterPlotEntry("k2", 0.6, 260, 1), ScatterPlotEntry("k2", 0.8, 280, 1),
  ScatterPlotEntry("k1", 1.2, 120, 1), ScatterPlotEntry("k1", 1.4, 140, 1), ScatterPlotEntry("k1", 1.6, 160, 1), ScatterPlotEntry("k1", 1.8, 180, 1),
  ScatterPlotEntry("k2", 1.2, 220, 2), ScatterPlotEntry("k2", 1.4, 240, 2), ScatterPlotEntry("k2", 1.6, 260, 2), ScatterPlotEntry("k2", 1.8, 280, 2),
  ScatterPlotEntry("k1", 2.2, 120, 1), ScatterPlotEntry("k1", 2.4, 140, 1), ScatterPlotEntry("k1", 2.6, 160, 1), ScatterPlotEntry("k1", 2.8, 180, 1),
  ScatterPlotEntry("k2", 2.2, 220, 3), ScatterPlotEntry("k2", 2.4, 240, 3), ScatterPlotEntry("k2", 2.6, 260, 3), ScatterPlotEntry("k2", 2.8, 280, 3)
)).toDF()

display(scatterPlotRDD)

In [10]:
%scala
// 1,000 (t, 4*sin(t) + gaussian noise) pairs for t = 0.00, 0.01, .., 9.99.
// The generator is seeded with 0, so the noise sequence is the same on every run.
val rng = new scala.util.Random(0)
val points = sc.parallelize(
  (0L until 1000L).map { i =>
    val t = i / 100.0
    (t, 4 * math.sin(t) + rng.nextGaussian())
  }
).toDF()
display(points)

In [11]:
%scala
// Sample data for a histogram: two string keys and a numeric value per row.
// Each row of the literal below holds five entries for one key1 value.
case class HistogramEntry(key1: String, key2: String, value: Double)

val HistogramRDD = sc.parallelize(Seq(
  HistogramEntry("a", "x", 0.2), HistogramEntry("a", "x", 0.4), HistogramEntry("a", "x", 0.6), HistogramEntry("a", "x", 0.8), HistogramEntry("a", "x", 1.0),
  HistogramEntry("b", "z", 0.2), HistogramEntry("b", "x", 0.4), HistogramEntry("b", "x", 0.6), HistogramEntry("b", "y", 0.8), HistogramEntry("b", "x", 1.0),
  HistogramEntry("a", "x", 0.2), HistogramEntry("a", "y", 0.4), HistogramEntry("a", "x", 0.6), HistogramEntry("a", "x", 0.8), HistogramEntry("a", "x", 1.0),
  HistogramEntry("b", "x", 0.2), HistogramEntry("b", "x", 0.4), HistogramEntry("b", "x", 0.6), HistogramEntry("b", "z", 0.8), HistogramEntry("b", "x", 1.0)
)).toDF()

display(HistogramRDD)

In [12]:
%scala
// 5,000 rows for a quantile plot: four keys ("key_0" .. "key_3"), three
// groupings ("group_0" .. "group_2"), the running index and its square.
case class QuantileEntry(key: String, grouping: String, otherField: Int, value: Int)

val quantileSeries = (1 to 5000).map { x =>
  QuantileEntry("key_%01d".format(x % 4), "group_%01d".format(x % 3), x, x * x)
}
val quantileSeriesRDD = sc.parallelize(quantileSeries).toDF()
display(quantileSeriesRDD)

In [13]:
%scala
// 5,000 rows for Q-Q plots: five keys ("k_000" .. "k_004"), three groupings
// ("group_0" .. "group_2"), the running index and its square.
// Nothing is displayed here; qqPlotRDD is rendered by separate display cells.
case class QQPlotEntry(key: String, grouping: String, value: Int, value_squared: Int)

val qqPlotSeries = (1 to 5000).map { x =>
  QQPlotEntry("k_%03d".format(x % 5), "group_%01d".format(x % 3), x, x * x)
}
val qqPlotRDD = sc.parallelize(qqPlotSeries).toDF()

In [14]:
%scala
// Renders the qqPlotRDD DataFrame built from qqPlotSeries in an earlier cell.
display(qqPlotRDD)

In [15]:
%scala
// Renders the qqPlotRDD DataFrame again.
// NOTE(review): identical to the previous display call; presumably this cell
// carries a different plot configuration in the notebook UI - confirm.
display(qqPlotRDD)

In [16]:
%scala
import java.util.Random

// 5,000 rows for a box plot: two keys ("key_0", "key_1"), three groupings
// ("group_0" .. "group_2") and a random value drawn from [0, x) for row x.
case class BoxEntry(key: String, grouping: String, value: Int)

// The generator is not seeded, so the values differ from run to run.
val randomGenerator = new Random()
val boxSeries = (1 to 5000).map { x =>
  // nextInt(x) already returns an Int, so no further conversion is needed.
  BoxEntry("key_%01d".format(x % 2), "group_%01d".format(x % 3), randomGenerator.nextInt(x))
}
val boxSeriesRDD = sc.parallelize(boxSeries).toDF()
display(boxSeriesRDD)

In [17]:
import numpy as np
import matplotlib.pyplot as plt

# Plot one period of sin(x) as a dashed black line together with a noisy
# sample of it as red dots, then hand the finished figure to display().
x = np.linspace(0, 2*np.pi, 50)
y = np.sin(x)
# Gaussian noise with standard deviation 0.1; unseeded, so it varies per run.
y2 = y + 0.1 * np.random.normal(size=x.shape)

fig, ax = plt.subplots()
ax.plot(x, y, 'k--')
ax.plot(x, y2, 'ro')

# set ticks and tick labels
ax.set_xlim((0, 2*np.pi))
ax.set_xticks([0, np.pi, 2*np.pi])
# Raw strings keep the backslash of the mathtext "\pi" literal; in a normal
# string "\p" is an invalid escape sequence that newer Python versions warn about.
ax.set_xticklabels(['0', r'$\pi$', r'2$\pi$'])
ax.set_ylim((-1.5, 1.5))
ax.set_yticks([-1, 0, 1])

# Only draw spine between the y-ticks
ax.spines['left'].set_bounds(-1, 1)
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Only show ticks on the left and bottom spines
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
display(fig)

In [18]:
from ggplot import *
# Line chart of the 'beef' column against 'date' from the `meat` dataset that
# the ggplot wildcard import brings into scope, with date breaks every 7 years
# and comma-formatted y labels, rendered through display().
p = (
    ggplot(meat, aes('date', 'beef'))
    + geom_line(color='black')
    + scale_x_date(breaks=date_breaks('7 years'), labels='%b %Y')
    + scale_y_continuous(labels='comma')
    + theme_bw()
)
display(p)

In [19]:
from plotly.offline import plot
from plotly.graph_objs import *
import numpy as np

# 2,000 standard-normal (x, y) samples drawn as a 2-D density contour with the
# individual points overlaid as small translucent white markers.
x = np.random.randn(2000)
y = np.random.randn(2000)

# Instead of simply calling plot(...), store your plot as a variable and pass it to displayHTML().
# Make sure to specify output_type='div' as a keyword argument.
# (Note that if you call displayHTML() multiple times in the same cell, only the last will take effect.)

# Nested trace properties are given as plain dicts: the Contours and Marker
# helper classes in plotly.graph_objs are deprecated, while dicts are accepted
# for these properties.
p = plot(
  [
    Histogram2dContour(x=x, y=y, contours=dict(coloring='heatmap')),
    Scatter(x=x, y=y, mode='markers', marker=dict(color='white', size=3, opacity=0.3))
  ],
  output_type='div'
)

displayHTML(p)

In [20]:
%scala
// Loads the uploaded files under /FileStore/tables/ms7vg26h1498071578743/ as a
// CSV-format DataFrame. No reader options (header, schema, ...) are set.
val sparkDF = sqlContext.read
  .format("csv")
  .load("/FileStore/tables/ms7vg26h1498071578743/")

In [21]:
# Loads the uploaded files under /FileStore/tables/ms7vg26h1498071578743/ as a
# CSV-format DataFrame. No reader options (header, schema, ...) are set.
sparkDF = (
    sqlContext.read
    .format("csv")
    .load("/FileStore/tables/ms7vg26h1498071578743/")
)


In [22]:
%r
# Loads the uploaded files under /FileStore/tables/ms7vg26h1498071578743/ as a
# CSV-source SparkR DataFrame.
sparkDF <- read.df(
  sqlContext,
  source = "csv",
  path = "/FileStore/tables/ms7vg26h1498071578743/"
)


In [23]:
%scala
// Reads the uploaded files under /FileStore/tables/ms7vg26h1498071578743/ as
// plain text through the SparkContext (no CSV parsing is applied).
val rdd = sc.textFile("/FileStore/tables/ms7vg26h1498071578743/")

In [24]:
# Reads the uploaded files under /FileStore/tables/ms7vg26h1498071578743/ as
# plain text through the SparkContext (no CSV parsing is applied).
rdd = sc.textFile("/FileStore/tables/ms7vg26h1498071578743/")

In [25]:
%r
# Reads airline.csv from the /dbfs path of the upload directory into a local R
# data frame with base R's read.csv, treating the first line as column names.
df <- read.csv(
  file = "/dbfs/FileStore/tables/ms7vg26h1498071578743/airline.csv",
  header = TRUE
)


In [26]:
# Removes the upload directory; the second argument True is passed as the
# recurse flag, so everything beneath the path is removed as well.
# WARNING: this is the same directory the load/textFile/read.csv cells in this
# notebook read from - they will have nothing to read once this has run.
dbutils.fs.rm("dbfs:/FileStore/tables/ms7vg26h1498071578743/", True)

In [27]:
%sh
# List the uploaded tables directory through the /dbfs mount.
# An absolute path is used so the listing does not depend on the shell's
# starting directory and cannot silently list some other directory.
# NOTE(review): assumes the notebook shell starts two levels below /, so that
# the relative "dbfs/FileStore/tables/" resolved to this same path - confirm.
ls /dbfs/FileStore/tables/