# Spark DF Functions

In [1]:
import sys
import os

In [2]:
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark

findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np

In [5]:
spark = SparkSession.builder.config("spark.sql.warehouse.dir",
                                    "temp").appName("SparkSQL").getOrCreate()

In [7]:
account_df = spark.read.format("csv") \
               .option("header", "true") \
               .option("delimiter", ";") \
               .load("data/financial/account.asc")

In [8]:
account_df.show(5)

+----------+-----------+----------------+------+
|account_id|district_id|       frequency|  date|
+----------+-----------+----------------+------+
|       576|         55|POPLATEK MESICNE|930101|
|      3818|         74|POPLATEK MESICNE|930101|
|       704|         55|POPLATEK MESICNE|930101|
|      2378|         16|POPLATEK MESICNE|930101|
|      2632|         24|POPLATEK MESICNE|930102|
+----------+-----------+----------------+------+
only showing top 5 rows



In [9]:
trans_df = spark.read \
             .option("header", "true") \
             .option("delimiter", ";") \
             .csv("data/financial/trans.asc")

In [10]:
trans_df.show(5)

+--------+----------+------+------+---------+-------+-------+--------+----+-------+
|trans_id|account_id|  date|  type|operation| amount|balance|k_symbol|bank|account|
+--------+----------+------+------+---------+-------+-------+--------+----+-------+
|  695247|      2378|930101|PRIJEM|    VKLAD| 700.00| 700.00|    null|null|   null|
|  171812|       576|930101|PRIJEM|    VKLAD| 900.00| 900.00|    null|null|   null|
|  207264|       704|930101|PRIJEM|    VKLAD|1000.00|1000.00|    null|null|   null|
| 1117247|      3818|930101|PRIJEM|    VKLAD| 600.00| 600.00|    null|null|   null|
|  579373|      1972|930102|PRIJEM|    VKLAD| 400.00| 400.00|    null|null|   null|
+--------+----------+------+------+---------+-------+-------+--------+----+-------+
only showing top 5 rows



# Select Columns

In [19]:
from pyspark.sql.functions import col

account_df.select(
    account_df.frequency,
    account_df["district_id"],
    col("account_id").alias("acc_no")
).show(2)

+----------------+-----------+------+
|       frequency|district_id|acc_no|
+----------------+-----------+------+
|POPLATEK MESICNE|         55|   576|
|POPLATEK MESICNE|         74|  3818|
+----------------+-----------+------+
only showing top 2 rows



# Sort

In [21]:
trans_df.sort(trans_df.balance.asc()).show(5)

+--------+----------+------+------+---------+--------+---------+--------+----+-------+
|trans_id|account_id|  date|  type|operation|  amount|  balance|k_symbol|bank|account|
+--------+----------+------+------+---------+--------+---------+--------+----+-------+
|  980497|      3345|980520| VYDAJ|    VYBER| 1360.00|    -1.60|    null|null|   null|
| 2777508|      9192|981024| VYDAJ|    VYBER|22700.00|  -100.60|    null|null|   null|
|  417609|      1416|950531| VYDAJ|    VYBER|   30.00| -1000.10|  SLUZBY|null|   null|
|  684290|      2335|980831| VYDAJ|    VYBER|   14.60| -1000.60|  SLUZBY|null|   null|
| 1599380|      5429|980608|PRIJEM|    VKLAD|13318.00|-10010.40|    null|null|   null|
+--------+----------+------+------+---------+--------+---------+--------+----+-------+
only showing top 5 rows



In [22]:
trans_df.sort(trans_df.balance.desc()).show(5)

+--------+----------+------+------+---------+--------+--------+--------+----+-------+
|trans_id|account_id|  date|  type|operation|  amount| balance|k_symbol|bank|account|
+--------+----------+------+------+---------+--------+--------+--------+----+-------+
|  289894|       993|980607|PRIJEM|    VKLAD|28918.00|99999.60|    null|null|   null|
|  529703|      1805|980805| VYDAJ|    VYBER| 9500.00|99999.40|    null|null|   null|
|  810680|      2762|940427| VYDAJ|    VYBER|29100.00|99998.80|    null|null|   null|
| 3459917|       200|980331|PRIJEM|     null|  413.30|99998.60|    UROK|null|   null|
| 3459920|       200|980630|PRIJEM|     null|  428.50|99998.40|    UROK|null|   null|
+--------+----------+------+------+---------+--------+--------+--------+----+-------+
only showing top 5 rows



# When-Otherwise

In [37]:
from pyspark.sql.functions import when

trans_df.select(
    trans_df.trans_id,
    when(trans_df.k_symbol=="SLUZBY", "S") \
        .when(trans_df.k_symbol==None ,"A") \
        .otherwise("O").alias("symbol")
).show(10)

+--------+------+
|trans_id|symbol|
+--------+------+
|  695247|     O|
|  171812|     O|
|  207264|     O|
| 1117247|     O|
|  579373|     O|
|  771035|     O|
|  452728|     O|
|  725751|     O|
|  497211|     O|
|  232960|     O|
+--------+------+
only showing top 10 rows



# Expr

In [45]:
from pyspark.sql.functions import expr

trans_df.filter(expr("account is not null and k_symbol is not null")).show()

+--------+----------+------+------+--------------+-------+--------+--------+----+--------+
|trans_id|account_id|  date|  type|     operation| amount| balance|k_symbol|bank| account|
+--------+----------+------+------+--------------+-------+--------+--------+----+--------+
|  637742|      2177|930105|PRIJEM| PREVOD Z UCTU|5123.00| 5923.00|  DUCHOD|  YZ|62457513|
|  579374|      1972|930107|PRIJEM| PREVOD Z UCTU|5298.00| 5698.00|  DUCHOD|  UV|14132887|
| 1049882|      3592|930110|PRIJEM| PREVOD Z UCTU|6007.00| 6607.00|  DUCHOD|  MN|73166322|
|  171813|       576|930111|PRIJEM| PREVOD Z UCTU|6207.00| 7107.00|  DUCHOD|  YZ|30300313|
|  689828|      2357|930112|PRIJEM| PREVOD Z UCTU|6434.00| 7234.00|  DUCHOD|  OP|34144538|
|  477639|      1628|930112|PRIJEM| PREVOD Z UCTU|4276.00| 4976.00|  DUCHOD|  UV|15916598|
|  439036|      1493|930113|PRIJEM| PREVOD Z UCTU|5009.00| 5209.00|  DUCHOD|  AB|54522466|
|  480215|      1637|930113|PRIJEM| PREVOD Z UCTU|5718.00| 6218.00|  DUCHOD|  UV|45134812|

# Lit

In [47]:
from pyspark.sql.functions import col, lit

account_df.select(
    account_df.account_id,
    lit("AB").alias("lit_constant")
).show(5)

+----------+------------+
|account_id|lit_constant|
+----------+------------+
|       576|          AB|
|      3818|          AB|
|       704|          AB|
|      2378|          AB|
|      2632|          AB|
+----------+------------+
only showing top 5 rows



In [48]:
arr_data = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]])
]
df = spark.createDataFrame(data=arr_data, schema=['name', 'subjects'])

In [53]:
df.show(truncate=False)

+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



# Explode

In [54]:
from pyspark.sql.functions import explode

df.select(df.name, explode(df.subjects)).show(truncate=False)

+-------+------------------+
|name   |col               |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



# Flatten

In [55]:
from pyspark.sql.functions import flatten

df.select(df.name, flatten(df.subjects)).show(truncate=False)

+-------+-------------------------------+
|name   |flatten(subjects)              |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+



In [57]:
data = [
    ("James", "Sales", 3000), 
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100), 
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000), 
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900), 
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000), 
    ("Saif", "Sales", 4100)
]
columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema=columns)

In [58]:
df.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



# Window Function - row_number
row_number() window function is used to give the sequential row number starting from 1 to the result of each window partition.

In [59]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("row_number", row_number().over(windowSpec)).show(truncate=False)

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
+-------------+----------+------+----------+



# Window Function - rank
rank() window function is used to provide a rank to the result within a window partition. This function leaves gaps in rank when there are ties.

In [60]:
from pyspark.sql.functions import rank

windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("rank", rank().over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
+-------------+----------+------+----+



# Window Function - dense_rank
* dense_rank() window function is used to get the result with rank of rows within a window partition without any gaps.
* This is similar to rank() function difference being rank function leaves gaps in rank when there are ties.

In [62]:
from pyspark.sql.functions import dense_rank

windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("dense_rank", dense_rank().over(windowSpec)).show()

+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|         Saif|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
+-------------+----------+------+----------+



# Window Function - lag

In [63]:
from pyspark.sql.functions import lag

windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("lag", lag("salary", 2).over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



# Window Function - lead

In [64]:
from pyspark.sql.functions import lead

windowSpec = Window.partitionBy("department").orderBy("salary")
df.withColumn("lead", lead("salary", 2).over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+

