In [1]:
from IPython.display import display, clear_output

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Build the Spark session used by every cell below; getOrCreate() hands back
# an existing session when one is already running instead of creating another.
spark = (SparkSession
    .builder
    .appName("chapter-05-basic-operation")
    .getOrCreate())

import os
# Base path that the data file location in this notebook is built from.
# Indexing os.environ raises KeyError when SPARK_BOOK_DATA_PATH is not set.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [3]:
# 2015 flight summary in JSON form, located under the data root.
file_path = f"{SPARK_BOOK_DATA_PATH}/data/flight-data/json/2015-summary.json"
# No schema is supplied here, so Spark infers one from the file.
df = spark.read.load(file_path, format="json")

In [4]:
df.count()

256

In [5]:
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [6]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [6]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [7]:
df.schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

In [8]:
# Hand-written schema for the flight summary: two string columns and one long.
# The third StructField argument is the nullable flag; the last field also
# carries a metadata dict.
# "count" is declared non-nullable here, but the printSchema() output of df2
# further down (read from JSON with this schema) still shows nullable = true.
myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False, metadata={"hello":"world"})
])

In [9]:
# Read the same file again, this time with the hand-written schema instead of inference.
df2 = (
    spark.read
    .schema(myManualSchema)
    .format("json")
    .load(file_path)
)

In [10]:
df2.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [11]:
df2.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [12]:
# Build a Column reference by name; no DataFrame is involved at this point.
F.col("someColumnName")
# F.column("someColumnName") is the other spelling, used in a select() later in this notebook.

Column<b'someColumnName'>

In [13]:
F.expr("(((someCol + 5) * 200) - 6) < otherCol")

Column<b'((((someCol + 5) * 200) - 6) < otherCol)'>

In [14]:
# Append count_new = count * 2 + 10 to df2 (df2 is rebound to the widened frame).
df2 = df2.withColumn(
    "count_new",
    (F.col("count") * 2) + 10,
)

In [15]:
df2.show(5)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count_new|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|       40|
|    United States|            Croatia|    1|       12|
|    United States|            Ireland|  344|      698|
|            Egypt|      United States|   15|       40|
|    United States|              India|   62|      134|
+-----------------+-------------------+-----+---------+
only showing top 5 rows



In [7]:
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)

In [8]:
myRow

<Row('Hello', None, 1, False)>

In [9]:
# Values of a positional Row are read by index.

myRow[0], myRow[2]

('Hello', 1)

Create a `DataFrame` from `Row` objects

In [10]:
# NOTE: this rebinds myManualSchema (earlier it held the flight-data schema)
# and myRow (earlier a four-value Row).
myManualSchema = StructType([
  StructField("some", StringType(), True),
  StructField("col", StringType(), True),
  StructField("names", LongType(), False)
])
# Positional Rows: the values line up with the schema fields in order.
myRow = Row("Hello", None, 1)
myRow2 = Row("World", 'Spark is cool', 1000)
# Build the DataFrame from the list of Rows plus the explicit schema.
myDf = spark.createDataFrame([myRow, myRow2], myManualSchema)
myDf.show()

+-----+-------------+-----+
| some|          col|names|
+-----+-------------+-----+
|Hello|         null|    1|
|World|Spark is cool| 1000|
+-----+-------------+-----+



In [11]:
# The same two records as plain tuples; no Row objects are needed
# when an explicit schema is passed to createDataFrame.
data = [("Hello", None, 1), ("World", 'Spark is cool', 1000)]
mydf2 = spark.createDataFrame(data, myManualSchema)
mydf2.show()

+-----+-------------+-----+
| some|          col|names|
+-----+-------------+-----+
|Hello|         null|    1|
|World|Spark is cool| 1000|
+-----+-------------+-----+



In [20]:
type(myDf)

pyspark.sql.dataframe.DataFrame

In [21]:
# Register df2 as a temporary view named df2Table so it can be queried through spark.sql(...)
df2.createOrReplaceTempView("df2Table")

In [22]:
df2.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [23]:
df2tbl = spark.sql("select * from df2Table limit 5")
df2tbl.show()

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count_new|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|       40|
|    United States|            Croatia|    1|       12|
|    United States|            Ireland|  344|      698|
|            Egypt|      United States|   15|       40|
|    United States|              India|   62|      134|
+-----------------+-------------------+-----+---------+



In [24]:
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



How to create a new column and rename a column

In [25]:
# One select() mixing four ways to name a column: a plain string,
# a SQL expression with "as", Column.alias(), and F.column().
df.select(
    "ORIGIN_COUNTRY_NAME",
    F.expr("DEST_COUNTRY_NAME as dest"),
    F.col("DEST_COUNTRY_NAME").alias("dest_country"),
    F.column("DEST_COUNTRY_NAME"),
).show(5)

+-------------------+-------------+-------------+-----------------+
|ORIGIN_COUNTRY_NAME|         dest| dest_country|DEST_COUNTRY_NAME|
+-------------------+-------------+-------------+-----------------+
|            Romania|United States|United States|    United States|
|            Croatia|United States|United States|    United States|
|            Ireland|United States|United States|    United States|
|      United States|        Egypt|        Egypt|            Egypt|
|              India|United States|United States|    United States|
+-------------------+-------------+-------------+-----------------+
only showing top 5 rows



In [26]:
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [29]:
# Keep every original column and add a boolean flag for same-country routes.
df.selectExpr(
    "*",  # all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
).show(n=5, truncate=False)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|United States    |Romania            |15   |false        |
|United States    |Croatia            |1    |false        |
|United States    |Ireland            |344  |false        |
|Egypt            |United States      |15   |false        |
|United States    |India              |62   |false        |
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



In [30]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



`lit` - add a literal (constant) value as a column

In [31]:
df.select(F.expr("*"), F.lit(1).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [32]:
df.selectExpr("*", "count*2 as doubled").show(2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|doubled|
+-----------------+-------------------+-----+-------+
|    United States|            Romania|   15|     30|
|    United States|            Croatia|    1|      2|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [34]:
# Create a new column

df.withColumn("doubled", F.col("count")*2).show(2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|doubled|
+-----------------+-------------------+-----+-------+
|    United States|            Romania|   15|     30|
|    United States|            Croatia|    1|      2|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [35]:
df.withColumn("withinCountry", F.expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [37]:
df.withColumn("foreign", F.col("ORIGIN_COUNTRY_NAME") != F.col("DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|foreign|
+-----------------+-------------------+-----+-------+
|    United States|            Romania|   15|   true|
|    United States|            Croatia|    1|   true|
+-----------------+-------------------+-----+-------+
only showing top 2 rows



In [38]:
# rename column
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

Backticks - how to escape a column name that contains spaces or special characters

In [40]:
# Add two columns whose names contain spaces and special characters.
dfWithSpecialColNames = (
    df
    .withColumn("This Long Column-Name", F.expr("ORIGIN_COUNTRY_NAME"))
    .withColumn("Business & Pleasure", F.lit("BUSINESS"))
)

In [41]:
dfWithSpecialColNames.selectExpr("*").show(2)

+-----------------+-------------------+-----+---------------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|This Long Column-Name|Business & Pleasure|
+-----------------+-------------------+-----+---------------------+-------------------+
|    United States|            Romania|   15|              Romania|           BUSINESS|
|    United States|            Croatia|    1|              Croatia|           BUSINESS|
+-----------------+-------------------+-----+---------------------+-------------------+
only showing top 2 rows



In [43]:
dfWithSpecialColNames.select(F.expr("`This Long Column-Name`")).columns

['This Long Column-Name']

In [44]:
# Two chained filters: count below 2, and origin other than Croatia.
(
    df
    .filter(F.col("count") < 2)
    .filter(F.col("ORIGIN_COUNTRY_NAME") != "Croatia")
    .show(2)
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [45]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

256

In [46]:
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

125

In [49]:
spark.sql("select count(distinct ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME) as unique_routes from df2Table").show()

+-------------+
|unique_routes|
+-------------+
|          256|
+-------------+



In [48]:
spark.sql("select count(distinct ORIGIN_COUNTRY_NAME) as unique_origin from df2Table").show()

+-------------+
|unique_origin|
+-------------+
|          125|
+-------------+



In [12]:
# Sample about half of the rows without replacement, with a fixed seed.
withReplacement = False
fraction = 0.5
seed = 5
df.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).count()

138

In [13]:
# split data
# Weights 0.25 / 0.75; `seed` comes from the sampling cell above so the split is repeatable.
dataFrames = df.randomSplit([0.25, 0.75], seed)

# Count each split once and reuse the numbers: every count() call is a separate Spark job.
split_counts = [part.count() for part in dataFrames]
print(f"{split_counts[0]}, {split_counts[1]}")
split_counts[0] > split_counts[1] # False

71, 185


False

In [53]:
from pyspark.sql import Row
# Reuse df's schema so the new rows have the same columns as df.
schema = df.schema
# Positional Rows: values must follow the order of the schema fields.
newRows = [
  Row("New Country", "Other Country", 5),
  Row("New Country 2", "Other Country 3", 1)
]
# Distribute the local list as an RDD, then turn that RDD into a DataFrame.
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [54]:
newDF.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|      New Country|      Other Country|    5|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



`union` - merge two DataFrames that have the same schema

In [56]:
# Take up to 6 United States-bound rows with count < 3, append newDF with
# union(), then filter the combined rows (count = 1, origin not United States).
# The last two where() calls apply to the newDF rows as well as the df rows.
(
df.where(F.col("DEST_COUNTRY_NAME") == "United States")
  .where("count < 3")
  .limit(6)
  .union(newDF)
  .where("count = 1")
  .where(F.col("ORIGIN_COUNTRY_NAME") != "United States")
  .show()
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [58]:
# orderBy / sort: three ways of specifying the ordering

# Largest count first.
df.sort(F.desc("count")).show(5)
# A plain column name sorts ascending; F.desc() reverses only DEST_COUNTRY_NAME.
df.orderBy("count", F.desc("DEST_COUNTRY_NAME")).show(5)
# Column objects with no explicit direction: ascending on both keys.
df.orderBy(F.col("count"), F.col("DEST_COUNTRY_NAME")).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|           Zambia|      United States|    1|
|    United States|          Lithuania|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+--

In [59]:
df.select("count").toDF("cnt").show(5)

+---+
|cnt|
+---+
| 15|
|  1|
|344|
| 15|
| 62|
+---+
only showing top 5 rows



Use a `udf` to create a categorical column

In [60]:
countdf = spark.range(10002).toDF("numflights")

In [68]:
def _flights_bin(numflights):
    """Map a flight count to its bucket label.

    Buckets (label -> range):
        '01000'  : numflights < 1000
        '01000+' : 1000 <= numflights < 5000
        '05000+' : 5000 <= numflights < 8000
        '08000+' : 8000 <= numflights < 10000
        '10000>' : numflights >= 10000

    Returns None for a null input, so the resulting column is null there.
    Without this guard, None < 1000 raises TypeError inside the Python worker.
    Returns '' for a value that matches no bucket (e.g. NaN).
    """
    if numflights is None:
        return None
    # Thresholds are checked in ascending order, so each test only needs the upper bound.
    if numflights < 1000:
        return '01000'
    if numflights < 5000:
        return '01000+'
    if numflights < 8000:
        return '05000+'
    if numflights < 10000:
        return '08000+'
    if numflights >= 10000:
        return '10000>'
    return ''


# The return type is spelled out; StringType is also what F.udf uses by default.
count_range = F.udf(_flights_bin, StringType())

countdf = countdf.withColumn('flights_bin', count_range(countdf.numflights))

In [69]:
# toPandas() collects every row of countdf to the driver as a pandas DataFrame;
# display() then renders it.
display(countdf.toPandas())

Unnamed: 0,numflights,flights_bin
0,0,01000
1,1,01000
2,2,01000
3,3,01000
4,4,01000
...,...,...
9997,9997,08000+
9998,9998,08000+
9999,9999,08000+
10000,10000,10000>


In [70]:
countdf.groupBy('flights_bin').count().orderBy("flights_bin").show()

+-----------+-----+
|flights_bin|count|
+-----------+-----+
|      01000| 1000|
|     01000+| 4000|
|     05000+| 3000|
|     08000+| 2000|
|     10000>|    2|
+-----------+-----+



In [71]:
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [72]:
# Rename count -> numflights and rebind df, so every later cell sees the new name.
# Earlier cells that reference "count" on df depend on the old column name.
df = df.withColumnRenamed("count", "numflights")

In [73]:
df.show(5)

+-----------------+-------------------+----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|
+-----------------+-------------------+----------+
|    United States|            Romania|        15|
|    United States|            Croatia|         1|
|    United States|            Ireland|       344|
|            Egypt|      United States|        15|
|    United States|              India|        62|
+-----------------+-------------------+----------+
only showing top 5 rows



In [74]:
# Bucket the real flight counts with the same UDF that was applied to countdf.
df = df.withColumn('flights_bin', count_range(df['numflights']))

In [75]:
df.show(5)

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|    United States|            Romania|        15|      01000|
|    United States|            Croatia|         1|      01000|
|    United States|            Ireland|       344|      01000|
|            Egypt|      United States|        15|      01000|
|    United States|              India|        62|      01000|
+-----------------+-------------------+----------+-----------+
only showing top 5 rows



In [76]:
# Highest bucket first; within a bucket, destinations in alphabetical order.
(
    df
    .orderBy(F.desc("flights_bin"), F.asc("DEST_COUNTRY_NAME"))
    .show()
)

+-------------------+-------------------+----------+-----------+
|  DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-------------------+-------------------+----------+-----------+
|      United States|      United States|    370002|     10000>|
|             Canada|      United States|      8399|     08000+|
|      United States|             Canada|      8483|     08000+|
|             Mexico|      United States|      7140|     05000+|
|      United States|             Mexico|      7187|     05000+|
| Dominican Republic|      United States|      1353|     01000+|
|            Germany|      United States|      1468|     01000+|
|              Japan|      United States|      1548|     01000+|
|        South Korea|      United States|      1048|     01000+|
|     United Kingdom|      United States|      2025|     01000+|
|      United States|            Germany|      1336|     01000+|
|      United States|              Japan|      1496|     01000+|
|      United States| Dom

In [78]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- numflights: long (nullable = true)
 |-- flights_bin: string (nullable = true)



In [82]:
df.sortWithinPartitions("numflights").show()

+--------------------+-------------------+----------+-----------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+--------------------+-------------------+----------+-----------+
|       United States|            Croatia|         1|      01000|
|       United States|          Singapore|         1|      01000|
|             Moldova|      United States|         1|      01000|
|               Malta|      United States|         1|      01000|
|       United States|          Gibraltar|         1|      01000|
|Saint Vincent and...|      United States|         1|      01000|
|            Suriname|      United States|         1|      01000|
|       United States|             Cyprus|         1|      01000|
|        Burkina Faso|      United States|         1|      01000|
|            Djibouti|      United States|         1|      01000|
|       United States|            Estonia|         1|      01000|
|              Zambia|      United States|         1|      01000|
|         

In [83]:
df.show()

+--------------------+-------------------+----------+-----------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+--------------------+-------------------+----------+-----------+
|       United States|            Romania|        15|      01000|
|       United States|            Croatia|         1|      01000|
|       United States|            Ireland|       344|      01000|
|               Egypt|      United States|        15|      01000|
|       United States|              India|        62|      01000|
|       United States|          Singapore|         1|      01000|
|       United States|            Grenada|        62|      01000|
|          Costa Rica|      United States|       588|      01000|
|             Senegal|      United States|        40|      01000|
|             Moldova|      United States|         1|      01000|
|       United States|       Sint Maarten|       325|      01000|
|       United States|   Marshall Islands|        39|      01000|
|         

In [84]:
# The 6 rows with the largest numflights: sort descending, then limit.

df.orderBy(F.desc("numflights")).limit(6).show()

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|    United States|      United States|    370002|     10000>|
|    United States|             Canada|      8483|     08000+|
|           Canada|      United States|      8399|     08000+|
|    United States|             Mexico|      7187|     05000+|
|           Mexico|      United States|      7140|     05000+|
|   United Kingdom|      United States|      2025|     01000+|
+-----------------+-------------------+----------+-----------+



In [85]:
# Work with just the first 10 rows of df.
collectDF = df.limit(10)
collectDF.take(5) # take(n) expects an integer and returns that many rows as a list of Row objects

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', numflights=15, flights_bin='01000'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', numflights=1, flights_bin='01000'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', numflights=344, flights_bin='01000'),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', numflights=15, flights_bin='01000'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', numflights=62, flights_bin='01000')]

In [86]:
collectDF.show()

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|    United States|            Romania|        15|      01000|
|    United States|            Croatia|         1|      01000|
|    United States|            Ireland|       344|      01000|
|            Egypt|      United States|        15|      01000|
|    United States|              India|        62|      01000|
|    United States|          Singapore|         1|      01000|
|    United States|            Grenada|        62|      01000|
|       Costa Rica|      United States|       588|      01000|
|          Senegal|      United States|        40|      01000|
|          Moldova|      United States|         1|      01000|
+-----------------+-------------------+----------+-----------+



In [87]:
collectDF.orderBy("numflights").show()

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|          Moldova|      United States|         1|      01000|
|    United States|          Singapore|         1|      01000|
|    United States|            Croatia|         1|      01000|
|    United States|            Romania|        15|      01000|
|            Egypt|      United States|        15|      01000|
|          Senegal|      United States|        40|      01000|
|    United States|            Grenada|        62|      01000|
|    United States|              India|        62|      01000|
|    United States|            Ireland|       344|      01000|
|       Costa Rica|      United States|       588|      01000|
+-----------------+-------------------+----------+-----------+



In [88]:
collectDF.orderBy(F.desc("numflights")).show()

+-----------------+-------------------+----------+-----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|numflights|flights_bin|
+-----------------+-------------------+----------+-----------+
|       Costa Rica|      United States|       588|      01000|
|    United States|            Ireland|       344|      01000|
|    United States|              India|        62|      01000|
|    United States|            Grenada|        62|      01000|
|          Senegal|      United States|        40|      01000|
|            Egypt|      United States|        15|      01000|
|    United States|            Romania|        15|      01000|
|    United States|            Croatia|         1|      01000|
|    United States|          Singapore|         1|      01000|
|          Moldova|      United States|         1|      01000|
+-----------------+-------------------+----------+-----------+



##### Partitioning - `repartition` and `coalesce`


- `coalesce()` reduces the number of partitions without a full shuffle
- `repartition()` always performs a full shuffle

In [89]:
df.rdd.getNumPartitions() # 1

1

In [92]:
df = df.repartition(5)

In [93]:
df.rdd.getNumPartitions() # 5 after repartition(5)

5

In [95]:
# Repartition into 5 partitions keyed on DEST_COUNTRY_NAME. The result is not
# assigned, so df is unchanged; the cell only echoes the returned DataFrame.
df.repartition(5, F.col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, numflights: bigint, flights_bin: string]

In [97]:
# Repartition by destination into 5 partitions, then coalesce down to 2.
df_ = (
    df
    .repartition(5, F.col("DEST_COUNTRY_NAME"))
    .coalesce(2)
)

In [98]:
df_.rdd.getNumPartitions() 

2

In [99]:
spark.stop()