In [1]:
%scala
val staticDataFrame = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/FileStore/tables/1x42p66f1497639007910/*.csv")
staticDataFrame.createOrReplaceTempView("retail_data")
val staticSchema = staticDataFrame.schema

In [2]:
staticDataFrame = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/FileStore/tables/1x42p66f1497639007910/*.csv")
staticDataFrame.createOrReplaceTempView("retail_data")
staticSchema = staticDataFrame.schema

In [3]:
%scala
import org.apache.spark.sql.functions.{window, column, desc, col}
staticDataFrame
.selectExpr(
"CustomerId",
"(UnitPrice * Quantity) as total_cost",
"InvoiceDate")
.groupBy(
col("CustomerId"), window(col("InvoiceDate"), "1 day"))
.sum("total_cost")
.orderBy(desc("sum(total_cost)"))
.take(5)

In [4]:
from pyspark.sql.functions import window, column, desc, col
staticDataFrame\
.selectExpr(
"CustomerId",
"(UnitPrice * Quantity) as total_cost" ,
"InvoiceDate" )\
.groupBy(
col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
.sum("total_cost")\
.orderBy(desc("sum(total_cost)"))\
.take(5)

In [5]:
%sql
SELECT
sum(total_cost),
CustomerId,
to_date(InvoiceDate)
FROM
(SELECT
CustomerId,
(UnitPrice * Quantity) as total_cost,
InvoiceDate
FROM
retail_data)
GROUP BY
CustomerId, to_date(InvoiceDate)
ORDER BY
sum(total_cost) DESC

In [6]:
%scala
spark.conf.set("spark.sql.shuffle.partitions", "5")
val streamingDataFrame = spark.readStream
.schema(staticSchema)
.option("maxFilesPerTrigger", 1)
.format("csv")
.option("header", "true")
.load("/FileStore/tables/1x42p66f1497639007910/*.csv")

In [7]:
# Streaming counterpart of staticDataFrame: reuse the schema captured from the
# static read and pick up at most one input file per trigger.
# The path is the same one used by the static read (which produced
# staticSchema) and by the Scala streaming cell, so all three agree.
streamingDataFrame = spark.readStream\
  .schema(staticSchema)\
  .option("maxFilesPerTrigger", 1)\
  .format("csv")\
  .option("header", "true")\
  .load("/FileStore/tables/1x42p66f1497639007910/*.csv")

In [8]:
%scala
streamingDataFrame.isStreaming // returns true

In [9]:
%scala
val purchaseByCustomerPerHour = streamingDataFrame
.selectExpr(
"CustomerId",
"(UnitPrice * Quantity) as total_cost",
"InvoiceDate")
.groupBy(
$"CustomerId", window($"InvoiceDate", "1 day"))
.sum("total_cost")

In [10]:
purchaseByCustomerPerHour = streamingDataFrame\
.selectExpr(
"CustomerId",
"(UnitPrice * Quantity) as total_cost" ,
"InvoiceDate" )\
.groupBy(
col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
.sum("total_cost")

In [11]:
%scala
// Start the streaming aggregation and expose its results as an in-memory table.
purchaseByCustomerPerHour.writeStream
.format("memory") // memory = store in-memory table
.queryName("customer_purchases") // customer_purchases = name of the in-memory table
.outputMode("complete") // complete = all aggregated rows should be in the table
.start()

In [12]:
purchaseByCustomerPerHour.writeStream\
.format("memory")\
.queryName("customer_purchases")\
.outputMode("complete")\
.start()

In [13]:
%scala
spark.sql("""
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
""")
.take(5)

In [14]:
spark.sql("""
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
""")\
.take(5)

In [15]:
staticDataFrame.printSchema()

In [16]:
%scala
import org.apache.spark.sql.functions.date_format
val preppedDataFrame = staticDataFrame
.na.fill(0)
.withColumn("day_of_week", date_format($"InvoiceDate", "EEEE"))
.coalesce(5)

In [17]:
from pyspark.sql.functions import date_format, col
preppedDataFrame = staticDataFrame\
.na.fill(0)\
.withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\
.coalesce(5)

In [18]:
%scala
val trainDataFrame = preppedDataFrame
.where("InvoiceDate < '2011-07-01'")
val testDataFrame = preppedDataFrame
.where("InvoiceDate >= '2011-07-01'")

In [19]:
trainDataFrame = preppedDataFrame\
.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame\
.where("InvoiceDate >= '2011-07-01'")

In [20]:
trainDataFrame.count()
testDataFrame.count()

In [21]:
%scala
import org.apache.spark.ml.feature.StringIndexer
val indexer = new StringIndexer()
.setInputCol("day_of_week")
.setOutputCol("day_of_week_index")

In [22]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
.setInputCol("day_of_week")\
.setOutputCol("day_of_week_index")

In [23]:
%scala
import org.apache.spark.ml.feature.OneHotEncoder
val encoder = new OneHotEncoder()
.setInputCol("day_of_week_index")
.setOutputCol("day_of_week_encoded")

In [24]:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder()\
.setInputCol("day_of_week_index")\
.setOutputCol("day_of_week_encoded")

In [25]:
%scala
import org.apache.spark.ml.feature.VectorAssembler
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("UnitPrice", "Quantity", "day_of_week_encoded"))
.setOutputCol("features")

In [26]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler()\
.setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])\
.setOutputCol("features")

In [27]:
%scala
import org.apache.spark.ml.Pipeline
val transformationPipeline = new Pipeline()
.setStages(Array(indexer, encoder, vectorAssembler))

In [28]:
from pyspark.ml import Pipeline
transformationPipeline = Pipeline()\
.setStages([indexer, encoder, vectorAssembler])

In [29]:
%scala
val fittedPipeline = transformationPipeline.fit(trainDataFrame)


In [30]:
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [31]:
%scala
val transformedTraining = fittedPipeline.transform(trainDataFrame)

In [32]:
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [33]:
transformedTraining.cache()

In [34]:
%scala
import org.apache.spark.ml.clustering.KMeans
val kmeans = new KMeans()
.setK(20)
.setSeed(1L)

In [35]:
from pyspark.ml.clustering import KMeans
# k-means estimator with 20 clusters and a fixed seed.
# The seed is a plain int: the `1L` long-literal suffix is Python 2-only syntax
# and is a SyntaxError on Python 3.
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)

In [36]:
%scala
val kmModel = kmeans.fit(transformedTraining)

In [37]:
kmModel = kmeans.fit(transformedTraining)
kmModel.computeCost(transformedTraining)

In [38]:
%scala
val transformedTest = fittedPipeline.transform(testDataFrame)

In [39]:
transformedTest = fittedPipeline.transform(testDataFrame)
kmModel.computeCost(transformedTest)

In [40]:
%scala
val bikeStations = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/FileStore/tables/ltn1r4x11497653150836/201508_station_data.csv")
val bikeTrips = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/FileStore/tables/ltn1r4x11497653150836/201508_trip_data.csv")

In [41]:
bikeStations = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/FileStore/tables/ltn1r4x11497653150836/201508_station_data.csv")
bikeTrips = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/FileStore/tables/ltn1r4x11497653150836/201508_trip_data.csv")

In [42]:
%scala
val stationVertices = bikeStations
.withColumnRenamed("name", "id")
.distinct()
val tripEdges = bikeTrips
.withColumnRenamed("Start Station", "src")
.withColumnRenamed("End Station", "dst")

In [43]:
stationVertices = bikeStations\
.withColumnRenamed("name", "id")\
.distinct()
tripEdges = bikeTrips\
.withColumnRenamed("Start Station", "src")\
.withColumnRenamed("End Station", "dst")

In [44]:
%scala
import org.graphframes.GraphFrame
val stationGraph = GraphFrame(stationVertices, tripEdges)
tripEdges.cache()
stationVertices.cache()

In [45]:
from graphframes import GraphFrame
stationGraph = GraphFrame(stationVertices, tripEdges)
tripEdges.cache()
stationVertices.cache()

In [46]:
%scala
import org.apache.spark.sql.functions.{desc, col}
val ranks = stationGraph.pageRank.resetProbability(0.15).maxIter(10).run()
ranks.vertices.orderBy(desc("pagerank")).take(5)

In [47]:
from pyspark.sql.functions import desc
# Unlike the Scala builder (pageRank.resetProbability(...).maxIter(...).run()),
# the Python GraphFrame.pageRank method takes its settings as keyword
# arguments and returns the resulting GraphFrame directly.
ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
# Show the five stations with the highest PageRank.
ranks.vertices.orderBy(desc("pagerank")).take(5)

In [48]:
%scala
stationGraph
.edges
.groupBy("src", "dst")
.count()
.orderBy(desc("count"))
.limit(10)
.show()

In [49]:
stationGraph\
.edges\
.groupBy("src", "dst")\
.count()\
.orderBy(desc("count"))\
.limit(10)\
.show()

In [50]:
%scala
val df = spark.range(500).toDF("number")
df.select(df.col("number") + 10)
// org.apache.spark.sql.DataFrame = [(number + 10): bigint]

In [51]:
df = spark.range(500).toDF("number")
df.select(df["number"] + 10)
# DataFrame[(number + 10): bigint]

In [52]:
%scala
spark.range(2).toDF().collect()

In [53]:
spark.range(2).collect()

In [54]:
%scala
import org.apache.spark.sql.types._
// ByteType is a singleton object, so it is referenced without parentheses.
val b = ByteType

In [55]:
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
// DataTypes.ByteType is a static field declared as DataType, not a factory
// method, so it is read without parentheses and held as a DataType.
DataType x = DataTypes.ByteType;

In [56]:
from pyspark.sql.types import *
# In Python the Spark SQL types are classes and must be instantiated; the
# class is named ByteType (capital B).
b = ByteType()

In [57]:
%scala
val df = spark.read.format("json")
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")

In [58]:
df = spark.read.format("json")\
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")

In [59]:
df.printSchema()

In [60]:
%scala
spark.read.format("json")
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")
.schema

In [61]:
spark.read.format("json")\
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")\
.schema

In [62]:
%scala
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
val myManualSchema = new StructType(Array(
new StructField("DEST_COUNTRY_NAME", StringType, true),
new StructField("ORIGIN_COUNTRY_NAME", StringType, true),
new StructField("count", LongType, false) // just to illustrate flipping

))
val df = spark.read.format("json")
.schema(myManualSchema)
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")

In [63]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
myManualSchema = StructType([
StructField("DEST_COUNTRY_NAME", StringType(), True),
StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
StructField("count", LongType(), False)
])
df = spark.read.format("json")\
.schema(myManualSchema)\
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")

In [64]:
%scala
import org.apache.spark.sql.functions.{col, column}
col("someColumnName")
column("someColumnName")

In [65]:
from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")

In [66]:
%scala
$"myColumn"
'myColumn

In [67]:
%scala
df.col("count")

In [68]:
%scala
import org.apache.spark.sql.functions.{expr, col}
(((col("someCol") + 5) * 200) - 6) < col("otherCol")

In [69]:
from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

In [70]:
%scala
spark.read.format("json")
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")
.columns

In [71]:
%scala
df.first()

In [72]:
df.first()

In [73]:
%scala
import org.apache.spark.sql.Row
val myRow = Row("Hello", null, 1, false)

In [74]:
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [75]:
%scala
// Ways to read fields out of myRow by position.
myRow(0) // type Any
myRow(0).asInstanceOf[String] // String
myRow.getString(0) // String
myRow.getInt(2) // Int

In [76]:
myRow[0]
myRow[2]

In [77]:
%scala
val df = spark.read.format("json")
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")
df.createOrReplaceTempView("dfTable")

In [78]:
df = spark.read.format("json")\
.load("/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json")
df.createOrReplaceTempView("dfTable")

In [79]:
%scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType,
StringType, LongType}
val myManualSchema = new StructType(Array(
new StructField("some", StringType, true),
new StructField("col", StringType, true),
new StructField("names", LongType, false) // just to illustrate flipping
))
val myRows = Seq(Row("Hello", null, 1L))
val myRDD = spark.sparkContext.parallelize(myRows)
val myDf = spark.createDataFrame(myRDD, myManualSchema)
myDf.show()

In [80]:
%scala
val myDF = Seq(("Hello", 2, 1L)).toDF()

In [81]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType,\
StringType, LongType
myManualSchema = StructType([
StructField("some", StringType(), True),
StructField("col", StringType(), True),
StructField("names", LongType(), False)
])
myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()


In [82]:
%scala
df.select("DEST_COUNTRY_NAME").show(2)

In [83]:
df.select("DEST_COUNTRY_NAME").show(2)

In [84]:
%sql
SELECT DEST_COUNTRY_NAME
FROM dfTable
LIMIT 2

In [85]:
%scala
df.select(
"DEST_COUNTRY_NAME",
"ORIGIN_COUNTRY_NAME")
.show(2)

In [86]:
df.select(
"DEST_COUNTRY_NAME",
"ORIGIN_COUNTRY_NAME" )\
.show(2)

In [87]:
%sql
SELECT
DEST_COUNTRY_NAME,
ORIGIN_COUNTRY_NAME
FROM
dfTable
LIMIT 2

In [88]:
%scala
import org.apache.spark.sql.functions.{expr, col, column}
df.select(
df.col("DEST_COUNTRY_NAME"),
col("DEST_COUNTRY_NAME"),
column("DEST_COUNTRY_NAME"),
'DEST_COUNTRY_NAME,
$"DEST_COUNTRY_NAME",
expr("DEST_COUNTRY_NAME")
).show(2)

In [89]:
from pyspark.sql.functions import expr, col, column
df.select(
expr("DEST_COUNTRY_NAME"),
col("DEST_COUNTRY_NAME"),
column("DEST_COUNTRY_NAME"))\
.show(2)

In [90]:
df.select(col("DEST_COUNTRY_NAME"), "DEST_COUNTRY_NAME")

In [91]:
%scala
df.select(expr("DEST_COUNTRY_NAME AS destination"))

In [92]:
df.select(expr("DEST_COUNTRY_NAME AS destination"))

In [93]:
%sql
SELECT
DEST_COUNTRY_NAME as destination
FROM
dfTable

In [94]:
%scala
df.select(
expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
)

In [95]:
df.select(
expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
)

In [96]:
%scala
df.selectExpr(
"DEST_COUNTRY_NAME as newColumnName",
"DEST_COUNTRY_NAME"
).show(2)

In [97]:
df.selectExpr(
"DEST_COUNTRY_NAME as newColumnName",
"DEST_COUNTRY_NAME"
).show(2)

In [98]:
%scala
df.selectExpr(
"*", // all original columns
"(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry"
).show(2)

In [99]:
# Keep every original column and add a boolean column that is true when the
# destination and origin country are the same.
df.selectExpr(
  "*", # all original columns
  "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
  .show(2)

In [100]:
%sql
SELECT
*,
(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry
FROM
dfTable

In [101]:
%scala
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

In [102]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

In [103]:
%sql
SELECT
avg(count),
count(distinct(DEST_COUNTRY_NAME))
FROM
dfTable

In [104]:
%scala
import org.apache.spark.sql.functions.lit
// Select all columns plus a literal 1; the alias "One" matches the Python
// and SQL versions of this example.
df.select(
  expr("*"),
  lit(1).as("One")
).show(2)

In [105]:
from pyspark.sql.functions import lit
df.select(
expr("*"),
lit(1).alias("One")
).show(2)

In [106]:
%sql
SELECT
*,
1 as One
FROM
dfTable
LIMIT 2

In [107]:
%scala
df.withColumn("numberOne", lit(1)).show(2)

In [108]:
df.withColumn("numberOne", lit(1)).show(2)

In [109]:
%sql
-- SQL equivalent of df.withColumn("numberOne", lit(1)): withColumn keeps the
-- existing columns, so select * and append the literal.
SELECT
  *,
  1 as numberOne
FROM
  dfTable
LIMIT 2

In [110]:
%scala
df.withColumn(
"withinCountry",
expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")
).show(2)

In [111]:
df.withColumn(
"withinCountry",
expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))\
.show(2)

In [112]:
%scala
df.withColumn(
"Destination",
df.col("DEST_COUNTRY_NAME"))
.columns

In [113]:
%scala
// Rename DEST_COUNTRY_NAME to dest and list the resulting column names.
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

In [114]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

In [115]:
%scala
import org.apache.spark.sql.functions.expr
val dfWithLongColName = df
.withColumn(
"This Long Column-Name",
expr("ORIGIN_COUNTRY_NAME"))

In [116]:
dfWithLongColName = df\
.withColumn(
"This Long Column-Name",
expr("ORIGIN_COUNTRY_NAME"))

In [117]:
%scala
dfWithLongColName
.selectExpr(
"`This Long Column-Name`",
"`This Long Column-Name` as `new col`")
.show(2)

In [118]:
dfWithLongColName\
.selectExpr(
"`This Long Column-Name`",
"`This Long Column-Name` as `new col`" )\
.show(2)
dfWithLongColName.createOrReplaceTempView("dfTableLong")

In [119]:
%sql
SELECT `This Long Column-Name` FROM dfTableLong

In [120]:
%scala
dfWithLongColName.select(col("This Long Column-Name")).columns

In [121]:
dfWithLongColName.select(expr("`This Long Column-Name`")).columns

In [122]:
df.drop("ORIGIN_COUNTRY_NAME").columns

In [123]:
dfWithLongColName.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")

In [124]:
df.printSchema()
df.withColumn("count", col("count").cast("int")).printSchema()

In [125]:
%sql
SELECT
cast(count as int)
FROM
dfTable

In [126]:
%scala
val colCondition = df.filter(col("count") < 2).take(2)
val conditional = df.where("count < 2").take(2)

In [127]:
colCondition = df.filter(col("count") < 2).take(2)
conditional = df.where("count < 2").take(2)

In [128]:
%sql
SELECT
*
FROM dfTable
WHERE
count < 2

In [129]:
%scala
df.where(col("count") < 2)
.where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia")
.show(2)

In [130]:
df.where(col("count") < 2)\
.where(col("ORIGIN_COUNTRY_NAME") != "Croatia")\
.show(2)

In [131]:
%sql
SELECT
*
FROM dfTable
WHERE
count < 2 AND
ORIGIN_COUNTRY_NAME != "Croatia"

In [132]:
%scala
// Count the unique (origin, destination) pairs; distinct() is what makes this
// line up with the SQL COUNT(DISTINCT ...) version of the example.
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

In [133]:
# Count the unique (origin, destination) pairs; distinct() is what makes this
# line up with the SQL COUNT(DISTINCT ...) version of the example.
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

In [134]:
%sql
SELECT
COUNT(DISTINCT ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME)
FROM dfTable

In [135]:
%scala
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

In [136]:
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

In [137]:
%sql
SELECT
COUNT(DISTINCT ORIGIN_COUNTRY_NAME)
FROM dfTable

In [138]:
%scala
val seed = 5
val withReplacement = false
val fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

In [139]:
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

In [140]:
%scala
val dataFrames = df.randomSplit(Array(0.25, 0.75), seed)
dataFrames(0).count() > dataFrames(1).count()

In [141]:
dataFrames = df.randomSplit([0.25, 0.75], seed)
dataFrames[0].count() > dataFrames[1].count()

In [142]:
%scala
import org.apache.spark.sql.Row
val schema = df.schema
val newRows = Seq(
Row("New Country", "Other Country", 5L),
Row("New Country 2", "Other Country 3", 1L)
)
val parallelizedRows = spark.sparkContext.parallelize(newRows)
val newDF = spark.createDataFrame(parallelizedRows, schema)
df.union(newDF)
.where("count = 1")
.where($"ORIGIN_COUNTRY_NAME" =!= "United States")
.show() // get all of them and we'll see our new rows at the end

In [143]:
from pyspark.sql import Row
# Build two extra rows using df's schema so they can be unioned with df.
# The counts are plain ints: the `5L`/`1L` long suffix is Python 2-only syntax
# and is a SyntaxError on Python 3.
schema = df.schema
newRows = [
  Row("New Country", "Other Country", 5),
  Row("New Country 2", "Other Country 3", 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [144]:

df.union(newDF)\
.where("count = 1")\
.where(col("ORIGIN_COUNTRY_NAME") != "United States")\
.show()

In [145]:
%scala
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

In [146]:
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

In [147]:
%scala
import org.apache.spark.sql.functions.{desc, asc}
// Sort direction is set on the Column itself. expr("count desc") is parsed as
// the column `count` aliased to `desc`, which sorts ascending.
df.orderBy(col("count").desc).show(2)
// Same idea with the desc/asc helper functions, adding a tie-breaker column.
df.orderBy(desc("count"), asc("DEST_COUNTRY_NAME")).show(2)

In [148]:
from pyspark.sql.functions import desc, asc
# Sort direction is set on the Column itself. expr("count desc") is parsed as
# the column `count` aliased to `desc`, which sorts ascending.
df.orderBy(col("count").desc()).show(2)
# desc()/asc() are given column names, matching the Scala version; passing
# Column objects is not accepted by every PySpark release.
df.orderBy(desc("count"), asc("DEST_COUNTRY_NAME")).show(2)

In [149]:
%sql
SELECT *
FROM dfTable
ORDER BY count DESC, DEST_COUNTRY_NAME ASC

In [150]:
%scala
spark.read.format("json")
.load("/FileStore/tables/nrptmpui1497642469139/2010_summary-506d8.json")
.sortWithinPartitions("count")

In [151]:
spark.read.format("json")\
.load("/FileStore/tables/nrptmpui1497642469139/2010_summary-506d8.json")\
.sortWithinPartitions("count")

In [152]:
%scala
df.limit(5).show()

In [153]:
df.limit(5).show()

In [154]:
%scala
// Six rows with the largest count. The direction is set with Column.desc
// because expr("count desc") parses as `count AS desc` and sorts ascending.
df.orderBy(col("count").desc).limit(6).show()

In [155]:
# Six rows with the largest count. The direction is set with Column.desc()
# because expr("count desc") parses as `count AS desc` and sorts ascending.
df.orderBy(col("count").desc()).limit(6).show()

In [156]:
%sql
-- SQL equivalent of ordering by count (descending) and keeping six rows;
-- without ORDER BY the LIMIT would return six arbitrary rows.
SELECT *
FROM dfTable
ORDER BY count DESC
LIMIT 6

In [157]:
%scala
df.rdd.getNumPartitions

In [158]:
df.rdd.getNumPartitions()

In [159]:
%scala
df.repartition(5)

In [160]:
df.repartition(5)

In [161]:
%scala
df.repartition(col("DEST_COUNTRY_NAME"))

In [162]:
df.repartition(col("DEST_COUNTRY_NAME"))

In [163]:
%scala
df.repartition(5, col("DEST_COUNTRY_NAME"))

In [164]:
df.repartition(5, col("DEST_COUNTRY_NAME"))

In [165]:
%scala
df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)

In [166]:

df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)

In [167]:
%scala
val collectDF = df.limit(10)
collectDF.take(5) // take works with an Integer count
collectDF.show() // this prints it out nicely
collectDF.show(5, false)
collectDF.collect()

In [168]:
collectDF = df.limit(10)
collectDF.take(5) # take works with an Integer count
collectDF.show() # this prints it out nicely
collectDF.show(5, False)
collectDF.collect()

In [169]:
%scala
val df = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/FileStore/tables/tmdv7u711497636580369/2010_12_01-ec65d.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

In [170]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/FileStore/tables/tmdv7u711497636580369/2010_12_01-ec65d.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

In [171]:
%scala
import org.apache.spark.sql.functions.col
df.where(col("InvoiceNo").equalTo(536365))
.select("InvoiceNo", "Description")
.show(5, false)

In [172]:
%scala
import org.apache.spark.sql.functions.col
df.where(col("InvoiceNo") === 536365)
.select("InvoiceNo", "Description")
.show(5, false)

In [173]:
from pyspark.sql.functions import col
# Python version of the Scala equality filter (equalTo / ===): keep only the
# rows of invoice 536365 and show their descriptions untruncated.
df.where(col("InvoiceNo") == 536365)\
  .select("InvoiceNo", "Description")\
  .show(5, False)

In [174]:
%scala
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.where(col("StockCode").isin("DOT"))
.where(priceFilter.or(descripFilter))
.show(5)

In [175]:
from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT"))\
.where(priceFilter | descripFilter)\
.show(5)

In [176]:
%sql
SELECT
*
FROM dfTable
WHERE
StockCode in ("DOT") AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)

In [177]:
%scala
val DOTCodeFilter = col("StockCode") === "DOT"
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.withColumn("isExpensive",
DOTCodeFilter.and(priceFilter.or(descripFilter)))
.where("isExpensive")
.select("unitPrice", "isExpensive")
.show(5)

In [178]:
from pyspark.sql.functions import instr
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive",
DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice", "isExpensive")\
.show(5)

In [179]:
%sql
SELECT
UnitPrice,
(StockCode = 'DOT' AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)) as isExpensive
FROM dfTable
WHERE
(StockCode = 'DOT' AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1))

In [180]:
%scala
import org.apache.spark.sql.functions.{expr, not, col}
df.withColumn("isExpensive", not(col("UnitPrice").leq(250)))
.filter("isExpensive")
.select("Description", "UnitPrice").show(5)
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
.filter("isExpensive")
.select("Description", "UnitPrice").show(5)

In [181]:
from pyspark.sql.functions import expr, col
# Python counterpart of the preceding Scala cell (this cell was a verbatim
# duplicate of it). Flags rows whose UnitPrice is NOT <= 250, first with
# Column operators (~ is logical NOT) and then with a SQL expression string.
df.withColumn("isExpensive", ~(col("UnitPrice") <= 250))\
  .filter("isExpensive")\
  .select("Description", "UnitPrice").show(5)
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
  .filter("isExpensive")\
  .select("Description", "UnitPrice").show(5)

In [182]:
%scala
import org.apache.spark.sql.functions.{expr, pow}
val fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(
expr("CustomerId"),
fabricatedQuantity.alias("realQuantity"))
.show(2)

In [183]:
from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(
expr("CustomerId"),
fabricatedQuantity.alias("realQuantity"))\
.show(2)

In [184]:
%scala
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity")
.show(2)

In [185]:
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity" )\
.show(2)

In [186]:
%sql
SELECT
customerId,
(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
FROM dfTable

In [187]:
%scala
import org.apache.spark.sql.functions.{round, bround}
df.select(
round(col("UnitPrice"), 1).alias("rounded"),
col("UnitPrice"))
.show(5)

In [188]:
%scala
import org.apache.spark.sql.functions.lit
df.select(
round(lit("2.5")),
bround(lit("2.5")))
.show(2)

In [189]:
# Python counterpart of the preceding Scala cell (this cell was a verbatim
# duplicate of it): compare round() and bround() on the literal 2.5.
# NOTE: importing `round` here shadows Python's built-in round() in this scope.
from pyspark.sql.functions import lit, round, bround
df.select(
  round(lit("2.5")),
  bround(lit("2.5")))\
  .show(2)

In [190]:
%sql
SELECT
round(2.5),
bround(2.5)

In [191]:
%scala
import org.apache.spark.sql.functions.{corr}
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

In [192]:
from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

In [193]:
%sql
SELECT
corr(Quantity, UnitPrice)
FROM
dfTable

In [194]:
%scala
df.describe().show()

In [195]:
df.describe().show()

In [196]:
%scala
import org.apache.spark.sql.functions.{count, mean, stddev_pop, min, max}

In [197]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max

In [198]:
%scala
// Approximate median (probability 0.5) of UnitPrice with a relative error of 0.05.
val colName = "UnitPrice"
val quantileProbs = Array(0.5)
val relError = 0.05
// Use the named column instead of repeating the string literal.
df.stat.approxQuantile(colName, quantileProbs, relError)

In [199]:
# Approximate median (probability 0.5) of UnitPrice with a relative error of 0.05.
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
# Use the named column instead of repeating the string literal.
df.stat.approxQuantile(colName, quantileProbs, relError)

In [200]:
%scala
df.stat.crosstab("StockCode", "Quantity").show()

In [201]:
df.stat.crosstab("StockCode", "Quantity").show()

In [202]:
%scala
df.stat.freqItems(Seq("StockCode", "Quantity")).show()

In [203]:
df.stat.freqItems(["StockCode", "Quantity"]).show()

In [204]:
%scala
import org.apache.spark.sql.functions.{initcap}
df.select(initcap(col("Description"))).show(2, false)

In [205]:
from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()

In [206]:
%sql
SELECT
initcap(Description)
FROM
dfTable

In [207]:
%scala
import org.apache.spark.sql.functions.{lower, upper}
df.select(
col("Description"),
lower(col("Description")),
upper(lower(col("Description"))))
.show(2)

In [208]:
from pyspark.sql.functions import lower, upper
df.select(
col("Description"),
lower(col("Description")),
upper(lower(col("Description"))))\
.show(2)

In [209]:
%sql
SELECT
Description,
lower(Description),
Upper(lower(Description))
FROM
dfTable

In [210]:
%scala
import org.apache.spark.sql.functions.{lit, ltrim, rtrim, rpad, lpad, trim}
df.select(
ltrim(lit(" HELLO ")).as("ltrim"),
rtrim(lit(" HELLO ")).as("rtrim"),
trim(lit(" HELLO ")).as("trim"),
lpad(lit("HELLO"), 3, " ").as("lp"),
rpad(lit("HELLO"), 10, " ").as("rp"))
.show(2)

In [211]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLO"), 3, " ").alias("lp"),
rpad(lit("HELLO"), 10, " ").alias("rp"))\
.show(2)

In [212]:
%sql
SELECT
ltrim(' HELLLOOOO '),
rtrim(' HELLLOOOO '),
trim(' HELLLOOOO '),
lpad('HELLOOOO ', 3, ' '),
rpad('HELLOOOO ', 10, ' ')
FROM
dfTable

In [213]:
%scala
import org.apache.spark.sql.functions.regexp_replace
// Replace any of the simple colour names in Description with the word COLOR.
val simpleColors = Seq("black", "white", "red", "green", "blue")
// Joins the upper-cased names into the alternation BLACK|WHITE|RED|GREEN|BLUE.
val regexString = simpleColors.map(_.toUpperCase).mkString("|")
// the | signifies `OR` in regular expression syntax
df.select(
  regexp_replace(col("Description"), regexString, "COLOR")
    .alias("color_cleaned"),
  col("Description"))
  .show(2)

In [214]:
from pyspark.sql.functions import regexp_replace
# Replace any of the simple colour names in Description with the word COLOR.
# The | signifies OR in regular expression syntax; with spaces instead, the
# pattern would only match the literal five-word phrase.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR")
    .alias("color_cleaned"),
  col("Description"))\
  .show(2)

In [215]:
%sql
-- Replace any of the simple colour names with COLOR; | is the regex OR.
SELECT
  regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as
  color_cleaned,
  Description
FROM
  dfTable

In [216]:
%scala
import org.apache.spark.sql.functions.translate
df.select(
translate(col("Description"), "LEET", "1337"),
col("Description"))
.show(2)

In [217]:
from pyspark.sql.functions import translate
df.select(
translate(col("Description"), "LEET", "1337"),
col("Description"))\
.show(2)

In [218]:
%sql
-- Character-by-character replacement: L->1, E->3, T->7.
-- String literals use plain ASCII single quotes (typographic quotes do not
-- delimit SQL strings).
SELECT
  translate(Description, 'LEET', '1337'),
  Description
FROM
  dfTable

In [219]:
%scala
import org.apache.spark.sql.functions.regexp_extract
// Extract the first simple colour name mentioned in Description.
// Builds the capture group (BLACK|WHITE|RED|GREEN|BLUE) from simpleColors,
// which is defined in an earlier cell.
val regexString = simpleColors
  .map(_.toUpperCase)
  .mkString("(", "|", ")")
// the | signifies OR in regular expression syntax
df.select(
  regexp_extract(col("Description"), regexString, 1)
    .alias("color_cleaned"),
  col("Description"))
  .show(2)

In [220]:
from pyspark.sql.functions import regexp_extract
# Extract the first simple colour name mentioned in Description.
# The | signifies OR in regular expression syntax; group 1 is the whole
# parenthesised alternation.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
  regexp_extract(col("Description"), extract_str, 1)
    .alias("color_cleaned"),
  col("Description"))\
  .show(2)

In [221]:
%sql
-- Extract the first simple colour name from Description; | is the regex OR
-- and 1 selects the first capture group.
SELECT
  regexp_extract(Description, '(BLACK|WHITE|RED|GREEN|BLUE)', 1),
  Description
FROM
  dfTable

In [222]:
%scala
// Keep the rows whose Description mentions BLACK or WHITE.
val containsBlack = col("Description").contains("BLACK")
// Column name spelled as in the data ("Description") so the lookup does not
// rely on case-insensitive column resolution.
val containsWhite = col("Description").contains("WHITE")
df.withColumn("hasSimpleColor", containsBlack.or(containsWhite))
  .filter("hasSimpleColor")
  .select("Description")
  .show(3, false)

In [223]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.filter("hasSimpleColor")\
.select("Description")\
.show(3, False)

In [224]:
%sql
SELECT
Description
FROM
dfTable
WHERE
instr(Description, 'BLACK') >= 1 OR
instr(Description, 'WHITE') >= 1

In [225]:
%scala
val simpleColors = Seq("black", "white", "red", "green", "blue")
val selectedColumns = simpleColors.map(color => {
col("Description")
.contains(color.toUpperCase)
.alias(s"is_$color")
}):+expr("*") // could also append this value
df
.select(selectedColumns:_*)
.where(col("is_white").or(col("is_red")))
.select("Description")
.show(3, false)

In [226]:
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
  """Build a boolean Column flagging whether `column` contains a colour name.

  column:       the Column to search.
  color_string: lower-case colour name; it is upper-cased for the search and
                used as-is in the alias "is_<color_string>".

  Returns the locate() position cast to boolean, aliased "is_<color_string>".
  """
  # The alias is built from the parameter. The original read the global `c`,
  # which only exists on Python 2, where list-comprehension variables leak
  # into the enclosing scope.
  return locate(color_string.upper(), column)\
    .cast("boolean")\
    .alias("is_" + color_string)
# One is_<color> flag column per simple colour, plus every original column.
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type
df\
  .select(*selectedColumns)\
  .where(expr("is_white OR is_red"))\
  .select("Description")\
  .show(3, False)

In [227]:

df.printSchema()

In [228]:
%scala
import org.apache.spark.sql.functions.{current_date, current_timestamp}
val dateDF = spark.range(10)
.withColumn("today", current_date())
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

In [229]:
from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

In [230]:
%scala
import org.apache.spark.sql.functions.{date_add, date_sub}
dateDF
.select(
date_sub(col("today"), 5),
date_add(col("today"), 5))
.show(1)

In [231]:
from pyspark.sql.functions import date_add, date_sub
dateDF\
.select(
date_sub(col("today"), 5),
date_add(col("today"), 5))\
.show(1)

In [232]:
%sql
SELECT
date_sub(today, 5),
date_add(today, 5)
FROM
dateTable

In [233]:
%scala
import org.apache.spark.sql.functions.{datediff, months_between, to_date}
dateDF
.withColumn("week_ago", date_sub(col("today"), 7))
.select(datediff(col("week_ago"), col("today")))
.show(1)
dateDF
.select(
  to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))
.select(months_between(col("start"), col("end")))
.show(1)

In [234]:
# datediff(end, start) counts days from start to end, so passing week_ago first
# gives a negative number. months_between(date1, date2) is likewise negative here
# because "start" is earlier than "end".
from pyspark.sql.functions import datediff, months_between, to_date
dateDF\
.withColumn("week_ago", date_sub(col("today"), 7))\
.select(datediff(col("week_ago"), col("today")))\
.show(1)
dateDF\
.select(
to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))\
.select(months_between(col("start"), col("end")))\
.show(1)

In [235]:
%sql
-- to_date, months_between and datediff applied to string literals;
-- the same values are produced once per row of dateTable.
SELECT
to_date('2016-01-01'),
months_between('2016-01-01', '2017-01-01'),
datediff('2016-01-01', '2017-01-01')
FROM
dateTable

In [236]:
%scala
// Add a string column holding "2017-01-01" and convert it to a date with to_date.
import org.apache.spark.sql.functions.{to_date, lit}
spark.range(5).withColumn("date", lit("2017-01-01"))
.select(to_date(col("date")))
.show(1)

In [237]:
%python
# Add a string column holding "2017-01-01" and convert it to a date with to_date.
from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-01-01"))\
.select(to_date(col("date")))\
.show(1)

In [238]:
# to_date without a format reads the string as year-month-day; "2016-20-12" has no
# month 20, so it cannot be parsed that way (expected to come back as null - verify
# on your Spark version), while "2017-12-11" is read as December 11th.
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

In [239]:
%scala
// Parse strings written as year-day-month: unix_timestamp applies the explicit
// format, the result is cast to a timestamp and then truncated to a date.
// The one-row result is registered as the temp view "dateTable2".
import org.apache.spark.sql.functions.{unix_timestamp, from_unixtime}
val dateFormat = "yyyy-dd-MM"
val cleanDateDF = spark.range(1)
.select(
to_date(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp"))
.alias("date"),
to_date(unix_timestamp(lit("2017-20-12"), dateFormat).cast("timestamp"))
.alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [240]:
# Parse strings written as year-day-month: unix_timestamp applies the explicit
# format, the result is cast to a timestamp and then truncated to a date.
# The one-row result is registered as the temp view "dateTable2".
from pyspark.sql.functions import unix_timestamp, from_unixtime
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1)\
.select(
to_date(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp"))\
.alias("date"),
to_date(unix_timestamp(lit("2017-20-12"), dateFormat).cast("timestamp"))\
.alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [241]:
%sql
-- Applies the unix_timestamp -> timestamp -> to_date conversion with the
-- 'yyyy-dd-MM' format to both columns of dateTable2, next to a plain to_date(date).
-- NOTE(review): if date/date2 are already date-typed in dateTable2, re-parsing them
-- with a day-first pattern may swap day and month - confirm this is intended.
SELECT
to_date(cast(unix_timestamp(date, 'yyyy-dd-MM') as timestamp)),
to_date(cast(unix_timestamp(date2, 'yyyy-dd-MM') as timestamp)),
to_date(date)
FROM
dateTable2

In [242]:
%scala
// Turn the "date" column into a timestamp by way of unix_timestamp and dateFormat.
cleanDateDF
.select(
unix_timestamp(col("date"), dateFormat).cast("timestamp"))
.show()

In [243]:
# Turn the "date" column into a timestamp by way of unix_timestamp and dateFormat.
cleanDateDF\
.select(
unix_timestamp(col("date"), dateFormat).cast("timestamp"))\
.show()

In [244]:
# Keep only rows whose date2 compares greater than the string literal "2017-12-12".
cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()

In [245]:
# Both calls drop every row that has a null in at least one column ("any" is the
# default mode). Each returns a new DataFrame; df itself is not modified.
df.na.drop()
df.na.drop("any")

In [246]:
%sql
-- Rows of dfTable whose Description is not null.
SELECT
*
FROM
dfTable
WHERE
Description IS NOT NULL

In [247]:
# "all" drops a row only when every one of its columns is null.
df.na.drop("all")

In [248]:
%scala
// Drop a row only when both StockCode and InvoiceNo are null.
df.na.drop("all", Seq("StockCode", "InvoiceNo"))

In [249]:
# Drop a row only when both StockCode and InvoiceNo are null.
df.na.drop("all", subset=["StockCode", "InvoiceNo"])

In [250]:
# Replace nulls with the given string; a string fill value only affects string columns.
df.na.fill("All Null values become this string")

In [251]:
%scala
// Replace nulls with 5 in the listed columns; a numeric fill value is ignored
// for any listed column that is not numeric.
df.na.fill(5, Seq("StockCode", "InvoiceNo"))

In [252]:
# Replace nulls in StockCode and InvoiceNo with the string "all".
# NOTE(review): the Scala version of this cell fills with 5 - confirm that the
# literal string "all" (rather than a drop mode or a number) is what is intended.
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

In [253]:
%scala
// Per-column fill values: each key is a column name, each value replaces the
// nulls of that column.
val fillColValues = Map(
"StockCode" -> 5,
"Description" -> "No Value"
)
df.na.fill(fillColValues)

In [254]:
# Per-column fill values: each key is a column name, each value replaces the
# nulls of that column.
fill_cols_vals = {
"StockCode": 5,
"Description" : "No Value"
}
df.na.fill(fill_cols_vals)

In [255]:
%scala
// Replace empty-string values of Description with "UNKNOWN".
df.na.replace("Description", Map("" -> "UNKNOWN"))

In [256]:
# Replace empty-string values of Description with "UNKNOWN".
df.na.replace([""], ["UNKNOWN"], "Description")

In [257]:
# Two spellings that pack Description and InvoiceNo into one struct column named
# "complex" (parenthesised list vs. the explicit struct function), keeping all columns.
df.selectExpr("(Description, InvoiceNo) as complex", "*")
df.selectExpr("struct(Description, InvoiceNo) as complex", "*")

In [258]:
%scala
// Build a single-column DataFrame whose "complex" struct holds Description and
// InvoiceNo, and register it as the temp view "complexDF".
import org.apache.spark.sql.functions.struct
val complexDF = df
.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [259]:
# Build a single-column DataFrame whose "complex" struct holds Description and
# InvoiceNo, and register it as the temp view "complexDF".
from pyspark.sql.functions import struct
complexDF = df\
.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [260]:
# Dot syntax pulls a single field out of the struct column.
complexDF.select("complex.Description")

In [261]:
# ".*" expands every field of the struct into its own top-level column.
complexDF.select("complex.*")

In [262]:
%sql
-- Expand every field of the `complex` struct into its own column.
SELECT
complex.*
FROM
complexDF

In [263]:
%scala
// Split Description on single spaces into an array of words; show two rows.
import org.apache.spark.sql.functions.split
df.select(split(col("Description"), " ")).show(2)

In [264]:
# Split Description on single spaces into an array of words; show two rows.
from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)

In [265]:
%sql
-- Split Description on single spaces into an array of words.
SELECT
split(Description, ' ')
FROM
dfTable

In [266]:
%scala
// Name the split result "array_col" and take its first element with [0].
df.select(split(col("Description"), " ").alias("array_col"))
.selectExpr("array_col[0]")
.show(2)

In [267]:
# Name the split result "array_col" and take its first element with [0].
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]")\
.show(2)

In [268]:
%sql
-- First word of Description: index 0 of the split array.
SELECT
split(Description, ' ')[0]
FROM
dfTable

In [269]:
%scala
// True when the word array built from Description contains the exact element "WHITE".
import org.apache.spark.sql.functions.array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [270]:
# True when the word array built from Description contains the exact element "WHITE".
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [271]:
%sql
-- True when the word array built from Description contains the exact element 'WHITE'.
SELECT
array_contains(split(Description, ' '), 'WHITE')
FROM
dfTable

In [272]:
%scala
// Split Description into words, then explode the array so each word gets its own
// output row; Description and InvoiceNo are repeated on every such row.
import org.apache.spark.sql.functions.{split, explode}
df.withColumn("splitted", split(col("Description"), " "))
.withColumn("exploded", explode(col("splitted")))
.select("Description", "InvoiceNo", "exploded")

In [273]:
# Split Description into words, then explode the array so each word gets its own
# output row; Description and InvoiceNo are repeated on every such row.
from pyspark.sql.functions import split, explode
df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "exploded")

In [274]:
%scala
// Build a one-entry map per row (key = Description value, value = InvoiceNo) and
// look a key up with bracket syntax.
// NOTE(review): the lookup key is the literal string 'Description', not a column
// reference; unless some row's description is literally "Description" this selects
// null everywhere - confirm the intended key.
import org.apache.spark.sql.functions.map
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))
.selectExpr("complex_map['Description']")

In [275]:
%sql
-- One-entry map per row (Description -> InvoiceNo); rows with a null Description
-- are filtered out before the map is built.
SELECT
map(Description, InvoiceNo) as complex_map
FROM
dfTable
WHERE
Description IS NOT NULL

In [276]:
%scala
// Exploding a map column yields one row per entry with separate key and value
// columns; fetch the first five rows.
import org.apache.spark.sql.functions.map
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))
.selectExpr("explode(complex_map)")
.take(5)

In [277]:
%scala
// One-row DataFrame whose "jsonString" column holds a small nested JSON document
// as plain text.
val jsonDF = spark.range(1)
.selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")

In [278]:
# One-row DataFrame whose "jsonString" column holds a small nested JSON document
# as plain text.
jsonDF = spark.range(1)\
.selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")

In [279]:
%scala
// get_json_object evaluates a JSONPath expression (here the second array element);
// json_tuple extracts the named top-level field "myJSONKey".
import org.apache.spark.sql.functions.{get_json_object, json_tuple}
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]"),
json_tuple(col("jsonString"), "myJSONKey"))
.show()

In [280]:
# get_json_object evaluates a JSONPath expression (here the second array element);
# json_tuple extracts the named top-level field "myJSONKey".
from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]"),
json_tuple(col("jsonString"), "myJSONKey"))\
.show()

In [281]:
# Expression-string form of the JSONPath lookup. get_json_object is the function
# that understands paths such as '$.a.b[1]'; json_tuple only extracts top-level
# fields by name, so handing it a path would just produce null.
jsonDF.selectExpr("get_json_object(jsonString, '$.myJSONKey.myJSONValue[1]') as res")

In [282]:
%scala
// Pack InvoiceNo and Description into a struct and serialise it to a JSON string.
import org.apache.spark.sql.functions.to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")
.select(to_json(col("myStruct")))

In [283]:
# Pack InvoiceNo and Description into a struct and serialise it to a JSON string.
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")))

In [284]:
%scala
// Round trip: struct -> JSON string (to_json) -> struct again (from_json).
// parseSchema describes the two nullable string fields from_json should produce.
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types._
val parseSchema = new StructType(Array(
new StructField("InvoiceNo",StringType,true),
new StructField("Description",StringType,true)))
df.selectExpr("(InvoiceNo, Description) as myStruct")
.select(to_json(col("myStruct")).alias("newJSON"))
.select(from_json(col("newJSON"), parseSchema), col("newJSON"))

In [285]:
# Round trip: struct -> JSON string (to_json) -> struct again (from_json).
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
# Schema from_json should produce: two nullable string fields, passed as a list
# (the documented argument type for StructType).
parseSchema = StructType([
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)])
# The last line of the chain must not end with a backslash: a continuation with
# nothing after it is a syntax error at the end of the cell.
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON"))

In [286]:
%scala
val udfExampleDF = spark.range(5).toDF("num")
/** Cubes a number.
  *
  * @param number the value to raise to the third power
  * @return number * number * number
  */
def power3(number:Double):Double = {
val squared = number * number
squared * number
}
power3(2.0)

In [287]:
udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
  """Return `double_value` raised to the third power."""
  cubed = pow(double_value, 3)
  return cubed
power3(2.0)

In [288]:
%scala
// Wrap the plain Scala function power3 as a Spark UDF usable in DataFrame expressions.
import org.apache.spark.sql.functions.udf
val power3udf = udf(power3(_:Double):Double)


In [289]:
%scala
// Apply the UDF to the "num" column and print the result.
udfExampleDF.select(power3udf(col("num"))).show()

In [290]:
# Wrap the Python function power3 as a Spark UDF. No return type is given here,
# so PySpark's default UDF return type (string) applies.
from pyspark.sql.functions import udf
power3udf = udf(power3)

In [291]:
# Apply the UDF to the "num" column and print the result.
from pyspark.sql.functions import col
udfExampleDF.select(power3udf(col("num"))).show()

In [292]:
%scala
// Register power3 under the SQL name "power3" so it can be called from expression
// strings and SQL, then call it through selectExpr.
spark.udf.register("power3", power3(_:Double):Double)
udfExampleDF.selectExpr("power3(num)").show()

In [293]:
# Calls the SQL function named "power3" from Python; this relies on a function
# having been registered under that name in the same Spark session.
udfExampleDF.selectExpr("power3(num)").show()

In [294]:
%python
# Register the Python power3 as the SQL function "power3py" with a declared
# DoubleType return type, then call it through selectExpr.
# NOTE(review): power3 returns a Python int for integer input, which does not match
# the declared DoubleType; the following SQL cell remarks that power3py "doesn't
# work because of return type" - confirm whether this mismatch is a deliberate demo.
from pyspark.sql.types import IntegerType, DoubleType
spark.udf.register("power3py", power3, DoubleType())
udfExampleDF.selectExpr("power3py(num)").show()

In [295]:
%sql
SELECT
power3py(12), -- doesn’t work because of return type
power3(12)