In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the Spark session that every later cell uses via `spark`.
spark = (
    SparkSession.builder
    .appName("chapter-05-basic-operation")
    .getOrCreate()
)

import os

# Root directory of the sample data, read from the environment.
# A missing variable raises KeyError here rather than further down.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [2]:
# Location of the 2015 flight summary JSON file under the data root.
file_path = SPARK_BOOK_DATA_PATH + "/data/flight-data/json/2015-summary.json"

# No schema is supplied to the reader here.
df = (
    spark.read
    .format("json")
    .load(file_path)
)

In [3]:
# Number of rows loaded from the flight summary file.
df.count()

256

In [4]:
# Preview the first five rows.
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [5]:
# Inspect the schema object of df (the reader above was not given a schema).

df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [6]:
# Print the same schema as an indented tree.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [7]:
# Declare the flight-data schema by hand instead of letting the reader decide.

from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
        # The count field is declared non-nullable and carries free-form metadata.
        StructField(
            "count",
            LongType(),
            nullable=False,
            metadata={"hello":"world"},
        ),
    ]
)

In [8]:
# Read the same JSON file again, this time applying the hand-written schema.
df2 = (
    spark.read
    .format("json")
    .schema(myManualSchema)
    .load(file_path)
)

In [9]:
# Print the schema of the frame read with myManualSchema.
# Note: `count` was declared with nullable=False in myManualSchema, yet the
# tree printed below reports nullable = true for it.
df2.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [10]:
# Preview the first five rows of the frame read with the manual schema.
df2.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [12]:
# Build a column reference by name; no DataFrame is involved in this cell.

from pyspark.sql.functions import col, column
col("someColumnName")
# Alternative spelling (left disabled): column("someColumnName")

Column<b'someColumnName'>

In [18]:
# Build a column expression from a SQL-style string; the output shows the parsed form.

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

Column<b'((((someCol + 5) * 200) - 6) < otherCol)'>

In [13]:
# Derive count_new = count * 2 + 10 and rebind df2 to the widened frame.
df2 = df2.withColumn("count_new", col("count")*2+10)

In [15]:
# Preview df2 including the new count_new column.
df2.show(5)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count_new|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|       40|
|    United States|            Croatia|    1|       12|
|    United States|            Ireland|  344|      698|
|            Egypt|      United States|   15|       40|
|    United States|              India|   62|      134|
+-----------------+-------------------+-----+---------+
only showing top 5 rows



In [17]:
# Create a standalone Row from positional values (no field names are given).

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [18]:
# Display the Row's repr.
myRow

<Row('Hello', None, 1, False)>

In [20]:
# Read individual values from the Row by position.

myRow[0], myRow[2]

('Hello', 1)

In [19]:
# Assemble a two-row DataFrame from Row objects and an explicit schema.
# NOTE: this rebinds `myManualSchema` and `myRow`, which earlier cells also define.

from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType(
    [
        StructField("some", StringType(), nullable=True),
        StructField("col", StringType(), nullable=True),
        StructField("names", LongType(), nullable=False),
    ]
)

# Row values are positional; they are listed in the same order as the schema fields.
myRow = Row("Hello", None, 1)
myRow2 = Row("World", 'Spark is cool', 1000)

myDf = spark.createDataFrame(data=[myRow, myRow2], schema=myManualSchema)
myDf.show()

+-----+-------------+-----+
| some|          col|names|
+-----+-------------+-----+
|Hello|         null|    1|
|World|Spark is cool| 1000|
+-----+-------------+-----+



In [21]:
# Register df2 as a temporary view named "df2Table" so it can be referenced from spark.sql.

## df = spark.read.format("json").load("../data/flight-data/json/2015-summary.json")
df2.createOrReplaceTempView("df2Table")

In [22]:
# Select a single column by name and show two rows.

df2.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [24]:
# Query the temp view registered from df2 with SQL, capped at five rows.
df2tbl = spark.sql("select * from df2Table limit 5")
df2tbl.show()

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count_new|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|       40|
|    United States|            Croatia|    1|       12|
|    United States|            Ireland|  344|      698|
|            Egypt|      United States|   15|       40|
|    United States|              India|   62|      134|
+-----------------+-------------------+-----+---------+



In [25]:
# Select two columns by name.

df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



How to create a new column and rename a column

In [26]:
# Four ways to refer to a column in one select: plain name, SQL alias via
# expr, Column.alias, and an unaliased column() reference.

from pyspark.sql.functions import expr, col, column

(
    df.select(
        "ORIGIN_COUNTRY_NAME",
        expr("DEST_COUNTRY_NAME as dest"),
        col("DEST_COUNTRY_NAME").alias("dest_country"),
        column("DEST_COUNTRY_NAME"),
    )
    .show(5)
)

+-------------------+-------------+-------------+-----------------+
|ORIGIN_COUNTRY_NAME|         dest| dest_country|DEST_COUNTRY_NAME|
+-------------------+-------------+-------------+-----------------+
|            Romania|United States|United States|    United States|
|            Croatia|United States|United States|    United States|
|            Ireland|United States|United States|    United States|
|      United States|        Egypt|        Egypt|            Egypt|
|              India|United States|United States|    United States|
+-------------------+-------------+-------------+-----------------+
only showing top 5 rows



In [31]:
# Alias a column inside a SQL expression string.

df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [32]:
# Alias in the SQL string, then alias again on the resulting Column; the
# output header shows that the outer .alias() name is the one that sticks.

(
    df.select(
        expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
    )
    .show(2)
)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [33]:
# selectExpr takes the SQL expression strings directly, without wrapping them in expr().

df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [34]:
# Keep every column and append a boolean flag comparing destination and origin.

(
    df.selectExpr(
        "*",  # all original columns
        "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
    )
    .show(2)
)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [35]:
# Aggregate expressions in selectExpr: the output below is a single summary row.

df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



In [36]:
# Append a constant column: lit(1) aliased to "One", next to all existing columns.

from pyspark.sql.functions import lit
df.select(expr("*"), lit(1).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [44]:
# Keep all columns and add count*2 as "doubled" through a SQL expression.
df.selectExpr("*", "count*2 as doubled").show(2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|doubled|
+-----------------+-------------------+-----+-------+
|    United States|            Romania|   15|     30|
|    United States|            Croatia|    1|      2|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [46]:
# Add a "doubled" column (count * 2) with withColumn; df is not reassigned.

df.withColumn("doubled", col("count")*2).show(2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|doubled|
+-----------------+-------------------+-----+-------+
|    United States|            Romania|   15|     30|
|    United States|            Croatia|    1|      2|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [47]:
# Flag rows whose origin and destination match, using a SQL expression string.

(
    df.withColumn(
        "withinCountry",
        expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"),
    )
    .show(2)
)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [49]:
# Equality check between two columns written with Column objects rather than a SQL string.

df.withColumn("domestic", col("ORIGIN_COUNTRY_NAME") == col("DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+--------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|domestic|
+-----------------+-------------------+-----+--------+
|    United States|            Romania|   15|   false|
|    United States|            Croatia|    1|   false|
+-----------------+-------------------+-----+--------+
only showing top 2 rows



In [51]:
# Rename a column and list the resulting column names; df itself is not reassigned here.

df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns


# COMMAND ----------

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [52]:
# Add a column whose name contains spaces and a hyphen; its values come from
# the ORIGIN_COUNTRY_NAME expression.
dfWithLongColName = df.withColumn(
    "This Long Column-Name",
    expr("ORIGIN_COUNTRY_NAME"))


# COMMAND ----------

In [53]:
# Inside SQL expression strings the awkward column names are wrapped in backticks.
(
    dfWithLongColName.selectExpr(
        "`This Long Column-Name`",
        "`This Long Column-Name` as `new col`",
    )
    .show(2)
)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [56]:
# Select the backtick-quoted column through expr() and list the resulting column names.

dfWithLongColName.select(expr("`This Long Column-Name`")).columns

['This Long Column-Name']

In [55]:
# Chain two where() filters: count below 2, and origin other than Croatia.

df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [57]:
# Number of distinct (origin, destination) pairs.

df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

256

In [58]:
# Number of distinct origin countries.

df.select("ORIGIN_COUNTRY_NAME").distinct().count()

125

In [59]:
# Draw a seeded sample of rows without replacement and count it.
# `seed` is reused by the randomSplit cell that follows.

seed = 5
withReplacement = False
fraction = 0.5

df.sample(
    withReplacement=withReplacement,
    fraction=fraction,
    seed=seed,
).count()

138

In [60]:
# Split df into two frames with weights 0.25 / 0.75, reusing `seed` from the
# sampling cell, then compare the sizes of the two parts.

dataFrames = df.randomSplit([0.25, 0.75], seed)
dataFrames[0].count() > dataFrames[1].count() # False

False

In [61]:
# Build a small DataFrame of extra rows that shares df's schema, going
# through an RDD of Row objects.

from pyspark.sql import Row

schema = df.schema
newRows = [
    Row(*values)
    for values in (
        ("New Country", "Other Country", 5),
        ("New Country 2", "Other Country 3", 1),
    )
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [63]:
# Append the extra rows to df, then keep rows with count = 1 whose origin is
# not the United States. The first filter is a SQL string, the second a
# Column expression.

(
    df.union(newDF)
    .where("count = 1")
    .where(col("ORIGIN_COUNTRY_NAME") != "United States")
    .show()
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [64]:
# Three sorts shown back to back: by count; by count then destination using
# column names; and the same two keys passed as Column objects.

df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--

In [None]:
# Copy `count` into a `count_bin` column on a separately named frame.
# `df` is deliberately NOT rebound: this cell has no execution count, and the
# schema printed for `df` further down lists only the three original columns,
# so overwriting `df` here would change every later cell on a top-to-bottom run.
df_count_bin = df.withColumn("count_bin", col("count"))

In [None]:
# Repeats the count_new derivation already applied to df2 in an earlier cell;
# this copy has no execution count (In [None]).
df2 = df2.withColumn("count_new", col("count")*2+10)

In [86]:
# Sort by count, largest first. `asc` is imported alongside `desc` but not used in this cell.

from pyspark.sql.functions import desc, asc
df.orderBy(desc("count")).show()

+------------------+-------------------+------+
| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+------------------+-------------------+------+
|     United States|      United States|370002|
|     United States|             Canada|  8483|
|            Canada|      United States|  8399|
|     United States|             Mexico|  7187|
|            Mexico|      United States|  7140|
|    United Kingdom|      United States|  2025|
|     United States|     United Kingdom|  1970|
|             Japan|      United States|  1548|
|     United States|              Japan|  1496|
|           Germany|      United States|  1468|
|     United States| Dominican Republic|  1420|
|Dominican Republic|      United States|  1353|
|     United States|            Germany|  1336|
|       South Korea|      United States|  1048|
|     United States|        The Bahamas|   986|
|       The Bahamas|      United States|   955|
|     United States|             France|   952|
|            France|      United States|

In [90]:
# Check df's current columns before deriving the helper frames below.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [96]:
# Project only the count column; toDF("count") names the single column "count" again.
countDF = df.select("count").toDF("count")

In [98]:
# Preview two rows of the single-column frame.
countDF.show(2)

+-----+
|count|
+-----+
|   15|
|    1|
+-----+
only showing top 2 rows



In [100]:
# Confirm the single column's name and type.
countDF.printSchema()

root
 |-- count: long (nullable = true)



In [105]:
# Synthetic single-column frame from spark.range(10002), with the column named numflights.
# Note the lowercase name: `countdf` is a different variable from `countDF` above.
countdf = spark.range(10002).toDF("numflights")

In [120]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def _flights_bin(x):
    """Map a flight count to its bin label.

    Labels (unchanged from the original lambda):
        x < 1000            -> '01000<'
        1000 <= x < 5000    -> '01000+'
        5000 <= x < 8000    -> '05000+'
        8000 <= x < 10000   -> '08000+'
        x >= 10000          -> '10000>'

    Returns None when x is None, so a NULL count stays NULL instead of
    raising TypeError on the `<` comparison. Returns '' for a value that
    satisfies none of the comparisons (e.g. NaN), as the original did.
    """
    if x is None:
        return None
    # Each test only runs when the previous ones failed, so the lower bound
    # of every range is already implied.
    if x < 1000:
        return '01000<'
    if x < 5000:
        return '01000+'
    if x < 8000:
        return '05000+'
    if x < 10000:
        return '08000+'
    if x >= 10000:
        return '10000>'
    return ''


# The return type is spelled out so the resulting column type is explicit.
# NOTE(review): the same binning could likely be expressed with built-in
# when/otherwise column expressions and avoid a Python UDF; kept as a UDF so
# `count_range(<column>)` keeps working for the cells that call it.
count_range = udf(_flights_bin, StringType())

countdf = countdf.withColumn('flights_bin', count_range(countdf.numflights))

In [121]:
# Show the synthetic counts next to their bin labels.
countdf.show()

+----------+-----------+
|numflights|flights_bin|
+----------+-----------+
|         0|     01000<|
|         1|     01000<|
|         2|     01000<|
|         3|     01000<|
|         4|     01000<|
|         5|     01000<|
|         6|     01000<|
|         7|     01000<|
|         8|     01000<|
|         9|     01000<|
|        10|     01000<|
|        11|     01000<|
|        12|     01000<|
|        13|     01000<|
|        14|     01000<|
|        15|     01000<|
|        16|     01000<|
|        17|     01000<|
|        18|     01000<|
|        19|     01000<|
+----------+-----------+
only showing top 20 rows



In [112]:
# Look at df again before renaming its count column.
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [113]:
# Rename `count` to `numflights`, the attribute the binning cell below reads (df.numflights).
# NOTE(review): this rebinds `df`. Cells further down the file still refer to
# the "count" column; they carry lower execution counts than this cell, i.e.
# they were recorded before the rename, and would presumably fail to resolve
# "count" on a top-to-bottom run.
df = df.withColumnRenamed("count", "numflights")

In [114]:
# Confirm the column now appears as numflights.
df.show(5)

+-----------------+-------------------+----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|
+-----------------+-------------------+----------+
|    United States|            Romania|        15|
|    United States|            Croatia|         1|
|    United States|            Ireland|       344|
|            Egypt|      United States|        15|
|    United States|              India|        62|
+-----------------+-------------------+----------+
only showing top 5 rows



In [122]:
# Apply the count_range UDF to the real flight counts and rebind df with the new flights_bin column.
df = df.withColumn('flights_bin', count_range(df.numflights))

In [123]:
# Preview df with the flights_bin labels.
df.show(5)

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|    United States|            Romania|        15|     01000<|
|    United States|            Croatia|         1|     01000<|
|    United States|            Ireland|       344|     01000<|
|            Egypt|      United States|        15|     01000<|
|    United States|              India|        62|     01000<|
+-----------------+-------------------+----------+-----------+
only showing top 5 rows



In [124]:
# Order by bin label (descending), breaking ties by destination name (ascending).
df.orderBy(col("flights_bin").desc(), col("DEST_COUNTRY_NAME").asc()).show()

+-------------------+-------------------+----------+-----------+
|  DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-------------------+-------------------+----------+-----------+
|      United States|      United States|    370002|     10000>|
|             Canada|      United States|      8399|     08000+|
|      United States|             Canada|      8483|     08000+|
|             Mexico|      United States|      7140|     05000+|
|      United States|             Mexico|      7187|     05000+|
|            Algeria|      United States|         4|     01000<|
|             Angola|      United States|        15|     01000<|
|           Anguilla|      United States|        41|     01000<|
|Antigua and Barbuda|      United States|       126|     01000<|
|          Argentina|      United States|       180|     01000<|
|              Aruba|      United States|       346|     01000<|
|          Australia|      United States|       329|     01000<|
|            Austria|    

In [80]:
# Sort rows within each partition by count; the result is displayed, not assigned.
# NOTE(review): uses the "count" column, which a cell earlier in file order
# renames to "numflights"; this cell's lower execution count shows it was
# recorded before that rename.


df.sortWithinPartitions("count")

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [81]:
# Show df with show()'s default arguments.
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [67]:
# Restrict the frame to five rows, then display them.

df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



In [78]:
# Six rows with the largest counts: sort descending, then limit.
# NOTE(review): uses the "count" column, which a cell earlier in file order
# renames to "numflights"; recorded with a lower execution count than that rename.

df.orderBy(F.desc("count")).limit(6).show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
+-----------------+-------------------+------+



In [72]:
# Keep a ten-row slice of df, then fetch its first five rows; the output is a
# Python list of Row objects.

collectDF = df.limit(10)
collectDF.take(5) # take works with an Integer count

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [73]:
# Display all ten rows of the slice.
collectDF.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



In [79]:
# Sort the ten-row slice by count, ascending.
# NOTE(review): relies on a "count" column; collectDF derives from df, whose
# "count" is renamed to "numflights" by a cell earlier in file order.
collectDF.orderBy("count").show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|          Singapore|    1|
|    United States|            Croatia|    1|
|    United States|            Romania|   15|
|            Egypt|      United States|   15|
|          Senegal|      United States|   40|
|    United States|            Grenada|   62|
|    United States|              India|   62|
|    United States|            Ireland|  344|
|       Costa Rica|      United States|  588|
+-----------------+-------------------+-----+



In [77]:
# Sort the ten-row slice by count, descending.
# NOTE(review): relies on a "count" column; collectDF derives from df, whose
# "count" is renamed to "numflights" by a cell earlier in file order.
collectDF.orderBy(F.desc("count")).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|       Costa Rica|      United States|  588|
|    United States|            Ireland|  344|
|    United States|              India|   62|
|    United States|            Grenada|   62|
|          Senegal|      United States|   40|
|            Egypt|      United States|   15|
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



In [59]:
# Three ways to look at the slice: the default table, a five-row table with
# False as the second argument (presumably truncate=False — the second table
# below is left-aligned), and collect(), which yields a list of Row objects.
collectDF.show() # this prints it out nicely
collectDF.show(5, False)
collectDF.collect()


# COMMAND ----------

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India         

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

##### Partitioning: repartition and coalesce

In [82]:
# Partitioning calls shown back to back. None of the results is assigned, so
# `df` is left as it was, and only the value of the last expression is
# displayed as this cell's output.

# Current number of partitions of the underlying RDD.
df.rdd.getNumPartitions() # 1

# Repartition into a fixed number of partitions.

df.repartition(5)

# Repartition by the values of a column.

df.repartition(col("DEST_COUNTRY_NAME"))


# Repartition by column with an explicit partition count.

df.repartition(5, col("DEST_COUNTRY_NAME"))

# Repartition as above, then coalesce the result down to 2 partitions.

df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]