In [1]:
%sh cd ..
# Go up one more level, enter the uploaded-tables directory, and list its files.
cd ..
cd dbfs/FileStore/tables/tmdv7u711497636580369
ls

In [2]:
%scala
// Load the CSV (first row used as header, column types inferred), print the schema,
// and register the DataFrame as the temp view `dfTable` for the SQL cells.
val df = spark.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/FileStore/tables/tmdv7u711497636580369/2010_12_01-ec65d.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

In [3]:
# Python counterpart: load the same CSV with header and schema inference, print the schema,
# and register the DataFrame as the temp view `dfTable`.
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/FileStore/tables/tmdv7u711497636580369/2010_12_01-ec65d.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

In [4]:
%scala
import org.apache.spark.sql.functions.col
// Equality filter written with the `equalTo` method; prints two columns for up to 5 rows
// with truncation switched off (second argument of show is false).
df.where(col("InvoiceNo").equalTo(536365))
.select("InvoiceNo", "Description")
.show(5, false)

In [5]:
%scala
import org.apache.spark.sql.functions.col
// The same equality filter written with the `===` operator instead of `equalTo`.
df.where(col("InvoiceNo") === 536365)
.select("InvoiceNo", "Description")
.show(5, false)

In [6]:
from pyspark.sql.functions import col
# Inequality filter in Python: rows whose InvoiceNo is not 536365, two columns, up to 5 rows.
df.where(col("InvoiceNo") != 536365)\
.select("InvoiceNo", "Description")\
.show(5, False)

In [7]:
%scala
// Two boolean Column expressions are built first and then OR-ed together inside a second
// `where`, applied after the StockCode isin("DOT") filter.
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.where(col("StockCode").isin("DOT"))
.where(priceFilter.or(descripFilter))
.show(5)

In [8]:
from pyspark.sql.functions import instr
# Python form of the same filter; the substring test is written as instr(...) >= 1
# and the two conditions are combined with `|`.
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT"))\
.where(priceFilter | descripFilter)\
.show(5)

In [9]:
%sql
-- SQL form of the filter: StockCode in ("DOT") AND (price above 600 OR description has POSTAGE),
-- run against the dfTable temp view.
SELECT
*
FROM dfTable
WHERE
StockCode in ("DOT") AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)

In [10]:
%scala
// Materialise the combined boolean expression as a column named isExpensive, keep the rows
// where it is true, and show it beside the unit price.
val DOTCodeFilter = col("StockCode") === "DOT"
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.withColumn("isExpensive",
DOTCodeFilter.and(priceFilter.or(descripFilter)))
.where("isExpensive")
.select("unitPrice", "isExpensive")
.show(5)

In [11]:
from pyspark.sql.functions import instr
# Python form: the boolean expression is combined with & and |, stored as isExpensive,
# used as the filter, and shown beside the unit price.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive",
DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice", "isExpensive")\
.show(5)

In [12]:
%sql
-- SQL form: the same boolean expression is written twice, once as the projected
-- isExpensive column and once as the WHERE condition.
SELECT
UnitPrice,
(StockCode = 'DOT' AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1)) as isExpensive
FROM dfTable
WHERE
(StockCode = 'DOT' AND
(UnitPrice > 600 OR
instr(Description, "POSTAGE") >= 1))

In [13]:
%scala
import org.apache.spark.sql.functions.{expr, not, col}
// Two spellings of the same flag: first with the Column API (`not` around `leq(250)`),
// then with the SQL expression string "NOT UnitPrice <= 250".
df.withColumn("isExpensive", not(col("UnitPrice").leq(250)))
.filter("isExpensive")
.select("Description", "UnitPrice").show(5)
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
.filter("isExpensive")
.select("Description", "UnitPrice").show(5)


In [14]:
%python
from pyspark.sql.functions import expr
# Python form using the SQL expression string for the isExpensive flag.
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
.where("isExpensive")\
.select("Description", "UnitPrice").show(5)

In [15]:
%scala
import org.apache.spark.sql.functions.{expr, pow}
// fabricatedQuantity is (Quantity * UnitPrice) squared plus 5; it is selected next to
// CustomerId under the alias realQuantity.
val fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(
expr("CustomerId"),
fabricatedQuantity.alias("realQuantity"))
.show(2)

In [16]:
from pyspark.sql.functions import expr, pow
# Python form: (Quantity * UnitPrice) squared plus 5, aliased realQuantity.
# `pow` here is the Spark column function imported in this cell, not Python's builtin.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(
expr("CustomerId"),
fabricatedQuantity.alias("realQuantity"))\
.show(2)

In [17]:
%scala
// The same computation expressed as a SQL expression string through selectExpr.
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity")
.show(2)

In [18]:
# Python form of the selectExpr version of realQuantity.
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity")\
.show(2)

In [19]:
%sql
-- SQL form of realQuantity: (Quantity * UnitPrice) to the power 2.0, plus 5.
SELECT
customerId,
(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
FROM dfTable

In [20]:
%scala
import org.apache.spark.sql.functions.{round, bround}
// Show UnitPrice rounded with scale 1 (aliased "rounded") beside the original value.
df.select(
round(col("UnitPrice"), 1).alias("rounded"),
col("UnitPrice"))
.show(5)

In [21]:
%scala
import org.apache.spark.sql.functions.lit
// Apply round and bround to the same literal so their results can be compared side by side.
// Note the literal is supplied as the string "2.5", not as a number.
df.select(
round(lit("2.5")),
bround(lit("2.5")))
.show(2)

In [22]:
from pyspark.sql.functions import lit, round, bround
# Python form of the round / bround comparison on the string literal "2.5".
# `round` here is the Spark column function imported in this cell, not Python's builtin.
df.select(
round(lit("2.5")),
bround(lit("2.5")))\
.show(2)

In [23]:
%sql
-- SQL form of the round / bround comparison; here 2.5 is a numeric literal.
SELECT
round(2.5),
bround(2.5)

In [24]:
%scala
import org.apache.spark.sql.functions.{corr}
// Correlation of Quantity and UnitPrice requested two ways: through df.stat and through
// the `corr` column function inside a select.
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

In [25]:
from pyspark.sql.functions import corr
# Python form: correlation via df.stat and via the corr column function.
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

In [26]:
%sql
-- SQL form of the Quantity / UnitPrice correlation over dfTable.
SELECT
corr(Quantity, UnitPrice)
FROM
dfTable

In [27]:
%scala
// Print the summary DataFrame that describe() produces for df.
df.describe().show()

In [28]:
# Python form: print the summary DataFrame that describe() produces for df.
df.describe().show()

In [29]:
%scala
import org.apache.spark.sql.functions.{count, mean, stddev_pop, min, max}

In [30]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max

In [31]:
%scala
// Approximate quantile of UnitPrice at probability 0.5 with relative error 0.05.
// The column is passed through colName so the name is written in exactly one place.
val colName = "UnitPrice"
val quantileProbs = Array(0.5)
val relError = 0.05
df.stat.approxQuantile(colName, quantileProbs, relError)

In [32]:
# Approximate quantile of UnitPrice at probability 0.5 with relative error 0.05.
# The column is passed through colName so the name is written in exactly one place.
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile(colName, quantileProbs, relError)

In [33]:
%scala
// Print the cross-tabulation of StockCode against Quantity.
df.stat.crosstab("StockCode", "Quantity").show()

In [34]:
# Python form: print the cross-tabulation of StockCode against Quantity.
df.stat.crosstab("StockCode", "Quantity").show()

In [35]:
%scala
// Print the frequent items found for the StockCode and Quantity columns.
df.stat.freqItems(Seq("StockCode", "Quantity")).show()

In [36]:
# Python form: print the frequent items found for the StockCode and Quantity columns.
df.stat.freqItems(["StockCode", "Quantity"]).show()

In [37]:
%scala
import org.apache.spark.sql.functions.{initcap}
// Show Description passed through initcap for 2 rows, untruncated.
df.select(initcap(col("Description"))).show(2, false)

In [38]:
from pyspark.sql.functions import initcap
# Python form; show() is called without arguments here, unlike the Scala cell.
df.select(initcap(col("Description"))).show()

In [39]:
%sql
-- SQL form: initcap applied to Description for every row of dfTable.
SELECT
initcap(Description)
FROM
dfTable

In [40]:
%scala
import org.apache.spark.sql.functions.{lower, upper}
// Show Description as-is, lower-cased, and upper-cased after lower-casing.
df.select(
col("Description"),
lower(col("Description")),
upper(lower(col("Description"))))
.show(2)

In [41]:
from pyspark.sql.functions import lower, upper
# Python form: Description as-is, lower-cased, and upper-cased after lower-casing.
df.select(
col("Description"),
lower(col("Description")),
upper(lower(col("Description"))))\
.show(2)

In [42]:
%sql
-- SQL form: Description as-is, lower-cased, and upper-cased after lower-casing.
SELECT
Description,
lower(Description),
Upper(lower(Description))
FROM
dfTable

In [43]:
%scala
import org.apache.spark.sql.functions.{lit, ltrim, rtrim, rpad, lpad, trim}
// Trim and pad functions applied to fixed literals. The lpad target length (3) is shorter
// than "HELLO", while the rpad target length (10) is longer.
df.select(
ltrim(lit(" HELLO ")).as("ltrim"),
rtrim(lit(" HELLO ")).as("rtrim"),
trim(lit(" HELLO ")).as("trim"),
lpad(lit("HELLO"), 3, " ").as("lp"),
rpad(lit("HELLO"), 10, " ").as("rp"))
.show(2)

In [44]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
# Python form of the trim / pad demonstration; lpad targets length 3 (shorter than "HELLO"),
# rpad targets length 10 (longer).
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLO"), 3, " ").alias("lp"),
rpad(lit("HELLO"), 10, " ").alias("rp"))\
.show(2)

In [45]:
%sql
-- SQL form of the trim / pad demonstration. The literals differ from the DataFrame cells,
-- and because the query reads FROM dfTable the constants are returned once per table row.
SELECT
ltrim(' HELLLOOOO '),
rtrim(' HELLLOOOO '),
trim(' HELLLOOOO '),
lpad('HELLOOOO ', 3, ' '),
rpad('HELLOOOO ', 10, ' ')
FROM
dfTable

In [46]:
%scala
import org.apache.spark.sql.functions.regexp_replace
// regexString is the upper-cased colour names joined with "|"; every match in Description
// is replaced by the word COLOR and shown beside the original text.
val simpleColors = Seq("black", "white", "red", "green", "blue")
val regexString = simpleColors.map(_.toUpperCase).mkString("|")
// the | signifies `OR` in regular expression syntax
df.select(
regexp_replace(col("Description"), regexString, "COLOR")
.alias("color_cleaned"),
col("Description"))
.show(2)

In [47]:
from pyspark.sql.functions import regexp_replace
# Python form: the alternation pattern is written out literally and every match in
# Description is replaced by the word COLOR.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
regexp_replace(col("Description"), regex_string, "COLOR")
.alias("color_cleaned"),
col("Description"))\
.show(2)

In [48]:
%sql
-- SQL form of the colour replacement, shown beside the original Description.
SELECT
regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as color_cleaned,
Description
FROM
dfTable

In [49]:
%scala
import org.apache.spark.sql.functions.translate
// translate is called with the source characters "LEET" and the replacement characters "1337";
// the result is shown beside the original Description.
df.select(
translate(col("Description"), "LEET", "1337"),
col("Description"))
.show(2)

In [50]:
from pyspark.sql.functions import translate
# Python form of the translate call ("LEET" -> "1337") shown beside the original Description.
df.select(
translate(col("Description"), "LEET", "1337"),
col("Description"))\
.show(2)

In [51]:
%sql
-- SQL form of the translate call ('LEET' -> '1337') shown beside the original Description.
SELECT
translate(Description, 'LEET', '1337'),
Description
FROM
dfTable

In [52]:
%scala
import org.apache.spark.sql.functions.regexp_extract
// Builds "(BLACK|WHITE|RED|GREEN|BLUE)" so that capture group 1 can be extracted.
// Depends on simpleColors having been defined by an earlier cell.
val regexString = simpleColors
.map(_.toUpperCase)
.mkString("(", "|", ")")
// the | signifies OR in regular expression syntax
df.select(
regexp_extract(col("Description"), regexString, 1)
.alias("color_cleaned"),
col("Description"))
.show(2)

In [53]:
from pyspark.sql.functions import regexp_extract
# Python form: the parenthesised alternation is written literally and group 1 is extracted.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
regexp_extract(col("Description"), extract_str, 1)
.alias("color_cleaned"),
col("Description"))\
.show(2)

In [54]:
%sql
-- SQL form: extract capture group 1 of the colour alternation from Description.
SELECT
regexp_extract(Description, '(BLACK|WHITE|RED|GREEN|BLUE)', 1),
Description
FROM
dfTable

In [55]:
%scala
// Flag rows whose Description contains BLACK or WHITE, keep only the flagged rows, and
// print up to 3 descriptions untruncated. Both filters spell the column name with the
// same casing, so they also resolve when spark.sql.caseSensitive is enabled.
val containsBlack = col("Description").contains("BLACK")
val containsWhite = col("Description").contains("WHITE")
df.withColumn("hasSimpleColor", containsBlack.or(containsWhite))
.filter("hasSimpleColor")
.select("Description")
.show(3, false)

In [56]:
%python
from pyspark.sql.functions import instr
# Python form: substring tests written as instr(...) >= 1, OR-ed into hasSimpleColor,
# which is then used as the filter.
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.filter("hasSimpleColor")\
.select("Description")\
.show(3, False)

In [57]:
%sql
-- SQL form: descriptions that contain BLACK or WHITE.
SELECT
Description
FROM
dfTable
WHERE
instr(Description, 'BLACK') >= 1 OR
instr(Description, 'WHITE') >= 1

In [58]:
%scala
// Generate one boolean column per colour (is_black, is_white, ...) from the colour list,
// append expr("*") so the original columns are kept, then filter on two of the generated
// columns and print matching descriptions.
val simpleColors = Seq("black", "white", "red", "green", "blue")
val selectedColumns = simpleColors.map(color => {
col("Description")
.contains(color.toUpperCase)
.alias(s"is_$color")
}):+expr("*") // could also append this value
df
.select(selectedColumns:_*)
.where(col("is_white").or(col("is_red")))
.select("Description")
.show(3, false)

In [59]:
from pyspark.sql.functions import expr, locate
# Generate one boolean column per colour (is_black, is_white, ...), append expr("*") so the
# original columns are kept, then filter on two of the generated columns.
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
  """Return a boolean Column aliased "is_<color_string>".

  The column is the result of locate(<upper-cased color_string>, column) cast to boolean.
  """
  # The alias is derived from the function's own argument; it must not depend on a
  # loop variable that happens to exist at the call site.
  return locate(color_string.upper(), column)\
.cast("boolean")\
.alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type
df\
.select(*selectedColumns)\
.where(expr("is_white OR is_red"))\
.select("Description")\
.show(3, False)

In [60]:
# Print the schema of df again.
df.printSchema()

In [61]:
%scala
import org.apache.spark.sql.functions.{current_date, current_timestamp}
// Ten-row DataFrame with a `today` column (current_date) and a `now` column
// (current_timestamp), registered as the temp view dateTable.
val dateDF = spark.range(10)
.withColumn("today", current_date())
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

In [62]:
from pyspark.sql.functions import current_date, current_timestamp
# Python form: ten rows with `today` and `now` columns, registered as dateTable;
# the schema is printed as well.
dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

In [63]:
%scala
import org.apache.spark.sql.functions.{date_add, date_sub}
// Show `today` shifted 5 days back (date_sub) and 5 days forward (date_add).
dateDF
.select(
date_sub(col("today"), 5),
date_add(col("today"), 5))
.show(1)

In [64]:
from pyspark.sql.functions import date_add, date_sub
# Python form: `today` shifted 5 days back and 5 days forward.
dateDF\
.select(
date_sub(col("today"), 5),
date_add(col("today"), 5))\
.show(1)

In [65]:
%sql
-- SQL form: `today` shifted 5 days back and 5 days forward, read from dateTable.
SELECT
date_sub(today, 5),
date_add(today, 5)
FROM
dateTable

In [66]:
%scala
import org.apache.spark.sql.functions.{datediff, months_between, to_date}
// First statement: datediff between a date 7 days before `today` and `today`.
// Second statement: months_between for two fixed dates parsed with to_date.
dateDF
.withColumn("week_ago", date_sub(col("today"), 7))
.select(datediff(col("week_ago"), col("today")))
.show(1)
dateDF
.select(
to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))
.select(months_between(col("start"), col("end")))
.show(1)

In [67]:
from pyspark.sql.functions import datediff, months_between, to_date
# Python form: datediff between (today - 7 days) and today, then months_between for two
# fixed dates parsed with to_date.
dateDF\
.withColumn("week_ago", date_sub(col("today"), 7))\
.select(datediff(col("week_ago"), col("today")))\
.show(1)
dateDF\
.select(
to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))\
.select(months_between(col("start"), col("end")))\
.show(1)

In [68]:
%sql
-- SQL form of to_date / months_between / datediff on string literals.
-- The end date used here (2017-01-01) differs from the one in the DataFrame cells.
SELECT
to_date('2016-01-01'),
months_between('2016-01-01', '2017-01-01'),
datediff('2016-01-01', '2017-01-01')
FROM
dateTable

In [69]:
%scala
import org.apache.spark.sql.functions.{to_date, lit}
// Add a string column holding "2017-01-01" and convert it with to_date.
spark.range(5).withColumn("date", lit("2017-01-01"))
.select(to_date(col("date")))
.show(1)

In [70]:
%python
from pyspark.sql.functions import to_date, lit
# Python form: add a string column holding "2017-01-01" and convert it with to_date.
spark.range(5).withColumn("date", lit("2017-01-01"))\
.select(to_date(col("date")))\
.show(1)

In [71]:
# to_date applied to two string literals with no explicit format.
# NOTE(review): the first literal has 20 in its middle field, so it presumably demonstrates
# a value that the default parsing cannot read as a date — confirm against the output.
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

In [72]:
%scala
import org.apache.spark.sql.functions.{unix_timestamp, from_unixtime}
// Parse both literals with the explicit pattern "yyyy-dd-MM" via unix_timestamp, cast the
// result to timestamp, convert to date, and register the one-row result as dateTable2.
val dateFormat = "yyyy-dd-MM"
val cleanDateDF = spark.range(1)
.select(
to_date(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp"))
.alias("date"),
to_date(unix_timestamp(lit("2017-20-12"), dateFormat).cast("timestamp"))
.alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [73]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
# Python form: parse both literals with the explicit pattern "yyyy-dd-MM", convert to date,
# and register the one-row result as dateTable2.
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1)\
.select(
to_date(unix_timestamp(lit("2017-12-11"), dateFormat).cast("timestamp"))\
.alias("date"),
to_date(unix_timestamp(lit("2017-20-12"), dateFormat).cast("timestamp"))\
.alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [74]:
%sql
-- SQL form: re-parse the date and date2 columns of dateTable2 with the 'yyyy-dd-MM' pattern,
-- and also apply plain to_date to the date column.
SELECT
to_date(cast(unix_timestamp(date, 'yyyy-dd-MM') as timestamp)),
to_date(cast(unix_timestamp(date2, 'yyyy-dd-MM') as timestamp)),
to_date(date)
FROM
dateTable2

In [75]:
%scala
// Apply unix_timestamp with dateFormat to the already-parsed `date` column and cast the
// result to timestamp.
cleanDateDF
.select(
unix_timestamp(col("date"), dateFormat).cast("timestamp"))
.show()

In [76]:
# Python form: unix_timestamp with dateFormat on the `date` column, cast to timestamp.
cleanDateDF\
.select(
unix_timestamp(col("date"), dateFormat).cast("timestamp"))\
.show()

In [77]:
%scala
// Keep rows whose date2 is greater than the literal "2017-12-12".
cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()

In [78]:
# Keep rows whose date2 is greater than 2017-12-12. The comparison value is wrapped in lit()
# and contains no quote characters of its own, so it can be read as a date.
cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()

In [79]:
# Drop rows containing nulls: once with the default arguments, once passing "any" explicitly.
df.na.drop()
df.na.drop("any")

In [80]:
%sql
-- SQL form: keep only rows whose Description is not null.
SELECT
*
FROM
dfTable
WHERE
Description IS NOT NULL

In [81]:
# Drop with the "all" mode instead of "any".
df.na.drop("all")

In [82]:
%scala
// Drop with the "all" mode, considering only the StockCode and InvoiceNo columns.
df.na.drop("all", Seq("StockCode", "InvoiceNo"))

In [83]:
# Python form: "all" mode restricted to the StockCode and InvoiceNo columns.
df.na.drop("all", subset=["StockCode", "InvoiceNo"])

In [84]:
# Fill nulls with a single string value.
df.na.fill("All Null values become this string")

In [85]:
%scala
// Fill nulls with the integer 5, restricted to the StockCode and InvoiceNo columns.
df.na.fill(5, Seq("StockCode", "InvoiceNo"))

In [86]:
# Python form; note the fill value here is the string "all", not the integer used in Scala.
df.na.fill("all", subset=["StockCode", "InvoiceNo"])

In [87]:
%scala
// Per-column fill values supplied as a Map from column name to replacement value.
val fillColValues = Map(
"StockCode" -> 5,
"Description" -> "No Value"
)
df.na.fill(fillColValues)

In [88]:
# Python form: per-column fill values supplied as a dict from column name to replacement.
fill_cols_vals = {
"StockCode": 5,
"Description" : "No Value"
}
df.na.fill(fill_cols_vals)

In [89]:
%scala
// In the Description column, replace the empty string with "UNKNOWN".
df.na.replace("Description", Map("" -> "UNKNOWN"))

In [90]:
# Python form: replace the empty string with "UNKNOWN" in the Description column.
df.na.replace([""], ["UNKNOWN"], "Description")

In [91]:
# Two spellings of the same struct column: parenthesised column list and the struct() function.
df.selectExpr("(Description, InvoiceNo) as complex", "*")
df.selectExpr("struct(Description, InvoiceNo) as complex", "*")

In [92]:
%scala
import org.apache.spark.sql.functions.struct
// complexDF has a single struct column `complex` built from Description and InvoiceNo;
// it is registered as the temp view complexDF.
val complexDF = df
.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [93]:
from pyspark.sql.functions import struct
# Python form: single struct column `complex`, registered as the temp view complexDF.
complexDF = df\
.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [94]:
# Select one field of the struct using dot notation.
complexDF.select("complex.Description")

In [95]:
# Select every field of the struct with the .* form.
complexDF.select("complex.*")

In [96]:
%sql
-- SQL form: select every field of the `complex` struct.
SELECT
complex.*
FROM
complexDF

In [97]:
%scala
import org.apache.spark.sql.functions.split
// Split Description on a single space.
df.select(split(col("Description"), " ")).show(2)

In [98]:
from pyspark.sql.functions import split
# Python form: split Description on a single space.
df.select(split(col("Description"), " ")).show(2)

In [99]:
%sql
-- SQL form: split Description on a single space.
SELECT
split(Description, ' ')
FROM
dfTable

In [100]:
%scala
// Name the split result array_col, then index its first element with array_col[0].
df.select(split(col("Description"), " ").alias("array_col"))
.selectExpr("array_col[0]")
.show(2)

In [101]:
# Python form: name the split result array_col, then index its first element.
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]")\
.show(2)

In [102]:
%sql
-- SQL form: first element of the split Description.
SELECT
split(Description, ' ')[0]
FROM
dfTable

In [103]:
%scala
import org.apache.spark.sql.functions.array_contains
// Test whether the split Description array contains the element "WHITE".
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [104]:
from pyspark.sql.functions import array_contains
# Python form: test whether the split Description array contains the element "WHITE".
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [105]:
%sql
-- SQL form: test whether the split Description array contains 'WHITE'.
SELECT
array_contains(split(Description, ' '), 'WHITE')
FROM
dfTable

In [106]:
%scala
import org.apache.spark.sql.functions.{split, explode}
// Split Description into the array column `splitted`, explode it into the column `exploded`,
// and keep Description, InvoiceNo and the exploded value. No action (show/take) is called.
df.withColumn("splitted", split(col("Description"), " "))
.withColumn("exploded", explode(col("splitted")))
.select("Description", "InvoiceNo", "exploded")

In [107]:
from pyspark.sql.functions import split, explode
# Split Description into the array column `splitted`, explode it into the column `exploded`,
# and keep Description, InvoiceNo and the exploded value. The chain ends on the select, so
# no line-continuation follows it.
df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "exploded")


In [108]:
%scala
import org.apache.spark.sql.functions.map
// Build a map column keyed by Description with InvoiceNo as the value, then look up a key.
// NOTE(review): the lookup key is the literal string 'Description', while the map's keys are
// the Description *values*; this presumably finds nothing unless a row has that exact text —
// confirm the intended key.
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))
.selectExpr("complex_map['Description']")

In [109]:
%sql
-- SQL form: map from Description to InvoiceNo, restricted to rows with a non-null Description.
SELECT
map(Description, InvoiceNo) as complex_map
FROM
dfTable
WHERE
Description IS NOT NULL

In [110]:
%scala
import org.apache.spark.sql.functions.map
// Explode the Description -> InvoiceNo map column and take the first 5 resulting rows.
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map"))
.selectExpr("explode(complex_map)")
.take(5)

In [111]:
%scala
// One-row DataFrame whose single column jsonString holds a JSON document as text.
val jsonDF = spark.range(1)
.selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")

In [112]:
# Python form: one-row DataFrame whose column jsonString holds a JSON document as text.
jsonDF = spark.range(1)\
.selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
""")

In [113]:
%scala
import org.apache.spark.sql.functions.{get_json_object, json_tuple}
// get_json_object is given a path into the nested array; json_tuple is given the
// top-level field name myJSONKey.
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]"),
json_tuple(col("jsonString"), "myJSONKey"))
.show()

In [114]:
from pyspark.sql.functions import get_json_object, json_tuple
# Python form: get_json_object with a path, json_tuple with the top-level field name.
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]"),
json_tuple(col("jsonString"), "myJSONKey"))\
.show()

In [115]:
%scala
// SQL-expression form of the path lookup. get_json_object is the function that accepts a
// JSONPath such as $.myJSONKey.myJSONValue[1]; json_tuple only takes top-level field names.
jsonDF.selectExpr("get_json_object(jsonString, '$.myJSONKey.myJSONValue[1]') as res")

In [116]:
%scala
import org.apache.spark.sql.functions.to_json
// Build a struct from InvoiceNo and Description and serialise it with to_json.
df.selectExpr("(InvoiceNo, Description) as myStruct")
.select(to_json(col("myStruct")))

In [117]:
from pyspark.sql.functions import to_json
# Python form: build a struct from InvoiceNo and Description and serialise it with to_json.
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")))

In [118]:
%scala
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types._
// Round trip: serialise the struct to JSON (newJSON), then parse it back with from_json
// using a schema of two nullable string fields.
val parseSchema = new StructType(Array(
new StructField("InvoiceNo",StringType,true),
new StructField("Description",StringType,true)))
df.selectExpr("(InvoiceNo, Description) as myStruct")
.select(to_json(col("myStruct")).alias("newJSON"))
.select(from_json(col("newJSON"), parseSchema), col("newJSON"))

In [119]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
# Round trip: serialise the struct to JSON (newJSON), then parse it back with from_json
# using a schema of two nullable string fields. The fields are passed as a list, and the
# chain ends on the last select with no trailing line-continuation.
parseSchema = StructType([
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)])
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON"))

In [120]:
%scala
// Five-row DataFrame with a single column renamed to "num", used as input for the UDF cells.
val udfExampleDF = spark.range(5).toDF("num")
/** Returns `number` multiplied by itself twice, i.e. its cube. */
def power3(number: Double): Double = {
  val squared = number * number
  squared * number
}
// Quick local check of the plain function before it is wrapped as a UDF.
power3(2.0)

In [121]:
# Five-row DataFrame with a single column renamed to "num", used as input for the UDF cells.
udfExampleDF = spark.range(5).toDF("num")
def power3(double_value):
  """Return double_value raised to the third power."""
  cubed = double_value ** 3
  return cubed
# Quick local check of the plain function before it is wrapped as a UDF.
power3(2.0)

In [122]:
%scala
import org.apache.spark.sql.functions.udf
// Wrap the Scala power3 function as a UDF usable in DataFrame expressions.
val power3udf = udf(power3(_:Double):Double)

In [123]:
%scala
// Apply the UDF to the num column and print the result.
udfExampleDF.select(power3udf(col("num"))).show()

In [124]:
from pyspark.sql.functions import udf
# Wrap the Python power3 function as a UDF; no return type is passed to udf().
power3udf = udf(power3)

In [125]:
from pyspark.sql.functions import col
# Apply the Python UDF to the num column and print the result.
udfExampleDF.select(power3udf(col("num"))).show()

In [126]:
%scala
// Register the Scala function under the SQL name "power3", then call it by that name
// from an expression string.
spark.udf.register("power3", power3(_:Double):Double)
udfExampleDF.selectExpr("power3(num)").show()

In [127]:
# Call the SQL function named power3 from a Python cell through an expression string.
# NOTE(review): presumably this resolves to the function registered by the Scala cell — confirm.
udfExampleDF.selectExpr("power3(num)").show()

In [128]:
from pyspark.sql.types import IntegerType, DoubleType
# Register the Python power3 under the SQL name "power3py" with a declared return type of
# DoubleType, then call it on the num column.
spark.udf.register("power3py", power3, DoubleType())
udfExampleDF.selectExpr("power3py(num)").show()

In [129]:
%sql
-- Call both registered functions, power3py and power3, with the integer literal 12.
SELECT
power3py(12), -- doesn't work because of return type
power3(12)