In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from datetime import datetime

In [3]:
# Create (or reuse) the SparkSession that the rest of the notebook runs against.
spark = (
    SparkSession.builder
    .appName("PySparkSqlFunctions")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [5]:
# Sample rows used by every example below.
# Each tuple is (id, name, dob, age, salary); dob is kept as a "yyyy-MM-dd" string.
# NOTE: the dob values are illustrative only and do not correspond to the ages.
data = [
    (1, "Anuj Yadav", "2024-08-01", 23, 1000.50),
    (2, "Alina Pradhan", "2024-08-02", 34, 2000.75),
    (3, "Jake White", "2024-08-03", 18, 3000.10),
    (4, "Jill Black", "2024-08-04", 45, 4000.25),
    (5, "James Brown", "2024-08-05", 29, 1500.30),
    (6, "Madhav Mishra", "2024-08-06", 31, 2500.45),
    (7, "John Doe", "2024-08-07", 20, 1100.50),
    (8, "Jane Smith", "2024-08-08", 30, 2200.75),
]

In [6]:
# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["id", "name", "dob", "age", "salary"]
# Only names are supplied, so Spark infers each column's type from the Python values.
df = spark.createDataFrame(data, schema=columns)

In [7]:
# Print the base DataFrame as a text table (all 8 sample rows fit in the default output).
df.show()

+---+-------------+----------+---+-------+
| id|         name|       dob|age| salary|
+---+-------------+----------+---+-------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|
|  3|   Jake White|2024-08-03| 18| 3000.1|
|  4|   Jill Black|2024-08-04| 45|4000.25|
|  5|  James Brown|2024-08-05| 29| 1500.3|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|
|  7|     John Doe|2024-08-07| 20| 1100.5|
|  8|   Jane Smith|2024-08-08| 30|2200.75|
+---+-------------+----------+---+-------+



In [8]:
# col
# Project only the "name" column, referenced through a Column object.
from pyspark.sql.functions import col
df.select(
    col("name"),
).show()

+-------------+
|         name|
+-------------+
|   Anuj Yadav|
|Alina Pradhan|
|   Jake White|
|   Jill Black|
|  James Brown|
|Madhav Mishra|
|     John Doe|
|   Jane Smith|
+-------------+



In [9]:
# lit
# Add a constant "country" column: lit() wraps the Python string "USA" as a
# literal Column, so every row receives the same value.
# withColumn returns a new DataFrame; `df` itself is left untouched.
df_country = df.withColumn(
    "country",
    lit("USA"),
)
df_country.show()

+---+-------------+----------+---+-------+-------+
| id|         name|       dob|age| salary|country|
+---+-------------+----------+---+-------+-------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|    USA|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|    USA|
|  3|   Jake White|2024-08-03| 18| 3000.1|    USA|
|  4|   Jill Black|2024-08-04| 45|4000.25|    USA|
|  5|  James Brown|2024-08-05| 29| 1500.3|    USA|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|    USA|
|  7|     John Doe|2024-08-07| 20| 1100.5|    USA|
|  8|   Jane Smith|2024-08-08| 30|2200.75|    USA|
+---+-------------+----------+---+-------+-------+



In [10]:
# Re-display df: it still has only its original five columns, because
# withColumn produced a separate DataFrame (df_country) instead of modifying df.
df.show()

+---+-------------+----------+---+-------+
| id|         name|       dob|age| salary|
+---+-------------+----------+---+-------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|
|  3|   Jake White|2024-08-03| 18| 3000.1|
|  4|   Jill Black|2024-08-04| 45|4000.25|
|  5|  James Brown|2024-08-05| 29| 1500.3|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|
|  7|     John Doe|2024-08-07| 20| 1100.5|
|  8|   Jane Smith|2024-08-08| 30|2200.75|
+---+-------------+----------+---+-------+



In [11]:
# expr
# Derive "age_plus_5" by evaluating the SQL expression string "age + 5".
from pyspark.sql.functions import expr
df_age = df.withColumn(
    "age_plus_5",
    expr("age + 5"),
)
df_age.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|age_plus_5|
+---+-------------+----------+---+-------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|        28|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        39|
|  3|   Jake White|2024-08-03| 18| 3000.1|        23|
|  4|   Jill Black|2024-08-04| 45|4000.25|        50|
|  5|  James Brown|2024-08-05| 29| 1500.3|        34|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        36|
|  7|     John Doe|2024-08-07| 20| 1100.5|        25|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        35|
+---+-------------+----------+---+-------+----------+



In [12]:
# when
# Label each row "Adult" when age >= 18 and "Minor" otherwise.
# Every sample row has age >= 18, so only "Adult" shows up in the output.
from pyspark.sql.functions import when

df_classification = df.withColumn(
    "classification",
    when(col("age") >= 18, "Adult").otherwise("Minor"),
)
df_classification.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|classification|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         Adult|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         Adult|
|  3|   Jake White|2024-08-03| 18| 3000.1|         Adult|
|  4|   Jill Black|2024-08-04| 45|4000.25|         Adult|
|  5|  James Brown|2024-08-05| 29| 1500.3|         Adult|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         Adult|
|  7|     John Doe|2024-08-07| 20| 1100.5|         Adult|
|  8|   Jane Smith|2024-08-08| 30|2200.75|         Adult|
+---+-------------+----------+---+-------+--------------+



In [13]:
# when / otherwise
# Classify people as "Adult" (age >= 18) or "Minor" (everything else).
# when() supplies the value for matching rows; otherwise() supplies the fallback.
from pyspark.sql.functions import when

df_classification = df.withColumn(
    "classification",
    when(col("age") >= 18, "Adult").otherwise("Minor"),
)
df_classification.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|classification|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         Adult|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         Adult|
|  3|   Jake White|2024-08-03| 18| 3000.1|         Adult|
|  4|   Jill Black|2024-08-04| 45|4000.25|         Adult|
|  5|  James Brown|2024-08-05| 29| 1500.3|         Adult|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         Adult|
|  7|     John Doe|2024-08-07| 20| 1100.5|         Adult|
|  8|   Jane Smith|2024-08-08| 30|2200.75|         Adult|
+---+-------------+----------+---+-------+--------------+



In [14]:
# concat
# Build "full_name" by concatenating the "name" column with a single-space literal.
# NOTE(review): df has no separate first/last name columns, so the result is just
# the existing name followed by a trailing space (visible in the output below).
from pyspark.sql.functions import concat
df_concat = df.withColumn(
    "full_name",
    concat(col("name"), lit(" ")),
)
df_concat.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|     full_name|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   Anuj Yadav |
|  2|Alina Pradhan|2024-08-02| 34|2000.75|Alina Pradhan |
|  3|   Jake White|2024-08-03| 18| 3000.1|   Jake White |
|  4|   Jill Black|2024-08-04| 45|4000.25|   Jill Black |
|  5|  James Brown|2024-08-05| 29| 1500.3|  James Brown |
|  6|Madhav Mishra|2024-08-06| 31|2500.45|Madhav Mishra |
|  7|     John Doe|2024-08-07| 20| 1100.5|     John Doe |
|  8|   Jane Smith|2024-08-08| 30|2200.75|   Jane Smith |
+---+-------------+----------+---+-------+--------------+



In [15]:
# substring
# Extract the first three characters of the "name" column.
# Arguments are (column, start position, length); the start position is 1-based.
from pyspark.sql.functions import substring
df_substring = df.withColumn(
    "first_three_chars",
    substring(col("name"), 1, 3),
)
df_substring.show()

+---+-------------+----------+---+-------+-----------------+
| id|         name|       dob|age| salary|first_three_chars|
+---+-------------+----------+---+-------+-----------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|              Anu|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|              Ali|
|  3|   Jake White|2024-08-03| 18| 3000.1|              Jak|
|  4|   Jill Black|2024-08-04| 45|4000.25|              Jil|
|  5|  James Brown|2024-08-05| 29| 1500.3|              Jam|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|              Mad|
|  7|     John Doe|2024-08-07| 20| 1100.5|              Joh|
|  8|   Jane Smith|2024-08-08| 30|2200.75|              Jan|
+---+-------------+----------+---+-------+-----------------+



In [16]:
# split
# Split the "name" column on a single space, producing an array of words.
from pyspark.sql.functions import split
df_split = df.withColumn(
    "name_array",
    split(col("name"), " "),
)
df_split.show()

+---+-------------+----------+---+-------+----------------+
| id|         name|       dob|age| salary|      name_array|
+---+-------------+----------+---+-------+----------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   [Anuj, Yadav]|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|[Alina, Pradhan]|
|  3|   Jake White|2024-08-03| 18| 3000.1|   [Jake, White]|
|  4|   Jill Black|2024-08-04| 45|4000.25|   [Jill, Black]|
|  5|  James Brown|2024-08-05| 29| 1500.3|  [James, Brown]|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|[Madhav, Mishra]|
|  7|     John Doe|2024-08-07| 20| 1100.5|     [John, Doe]|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   [Jane, Smith]|
+---+-------------+----------+---+-------+----------------+



In [17]:
# regexp_replace
# Replace every match of the pattern "John" with "Jon" in the "name" column.
# In the sample data only "John Doe" matches; all other names pass through unchanged.
from pyspark.sql.functions import regexp_replace
df_replace = df.withColumn(
    "replaced_name",
    regexp_replace(col("name"), "John", "Jon"),
)
df_replace.show()


+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|replaced_name|
+---+-------------+----------+---+-------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   Anuj Yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|Alina Pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|   Jake White|
|  4|   Jill Black|2024-08-04| 45|4000.25|   Jill Black|
|  5|  James Brown|2024-08-05| 29| 1500.3|  James Brown|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|Madhav Mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|      Jon Doe|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   Jane Smith|
+---+-------------+----------+---+-------+-------------+



In [18]:
# count
# Count the rows of the DataFrame; the single-row result is exposed as "total_count".
from pyspark.sql.functions import count
df_count = df.agg(
    count("*").alias("total_count"),
)
df_count.show()

+-----------+
|total_count|
+-----------+
|          8|
+-----------+



In [19]:
# sum
# Total of the "salary" column across all rows, exposed as "total_salary".
# The Spark function is imported under an alias so that Python's built-in sum()
# is not shadowed for the rest of the notebook session.
from pyspark.sql.functions import sum as spark_sum
df_sum = df.agg(spark_sum("salary").alias("total_salary"))
df_sum.show()

+------------+
|total_salary|
+------------+
|     17303.6|
+------------+



In [20]:
# avg
# Mean of the "age" column across all rows, exposed as "average_age".
from pyspark.sql.functions import avg
df_avg = df.agg(
    avg("age").alias("average_age"),
)
df_avg.show()

+-----------+
|average_age|
+-----------+
|      28.75|
+-----------+



In [21]:
# max
# Largest value in the "salary" column, exposed as "max_salary".
# The Spark function is imported under an alias so that Python's built-in max()
# is not shadowed for the rest of the notebook session.
from pyspark.sql.functions import max as spark_max
df_max = df.agg(spark_max("salary").alias("max_salary"))
df_max.show()

+----------+
|max_salary|
+----------+
|   4000.25|
+----------+



In [22]:
# min
# Smallest value in the "age" column, exposed as "min_age".
# The Spark function is imported under an alias so that Python's built-in min()
# is not shadowed for the rest of the notebook session.
from pyspark.sql.functions import min as spark_min
df_min = df.agg(spark_min("age").alias("min_age"))
df_min.show()

+-------+
|min_age|
+-------+
|     18|
+-------+



In [23]:
# round
# Round "salary" to 0 decimal places into "rounded_salary".
# Spark's round uses HALF_UP, so 1000.5 becomes 1001.0 (see output).
# The Spark function is imported under an alias so that Python's built-in round()
# is not shadowed for the rest of the notebook session.
from pyspark.sql.functions import round as spark_round
df_rounded = df.withColumn("rounded_salary", spark_round(col("salary"), 0))
df_rounded.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|rounded_salary|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|        1001.0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        2001.0|
|  3|   Jake White|2024-08-03| 18| 3000.1|        3000.0|
|  4|   Jill Black|2024-08-04| 45|4000.25|        4000.0|
|  5|  James Brown|2024-08-05| 29| 1500.3|        1500.0|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        2500.0|
|  7|     John Doe|2024-08-07| 20| 1100.5|        1101.0|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        2201.0|
+---+-------------+----------+---+-------+--------------+



In [24]:
# date_format
# Render "dob" using the pattern "MM/dd/yyyy" into a new "formatted_dob" column.
# "dob" is stored as a "yyyy-MM-dd" string in the sample data.
from pyspark.sql.functions import date_format

df_date_format = df.withColumn(
    "formatted_dob",
    date_format(col("dob"), "MM/dd/yyyy"),
)
df_date_format.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|formatted_dob|
+---+-------------+----------+---+-------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   08/01/2024|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|   08/02/2024|
|  3|   Jake White|2024-08-03| 18| 3000.1|   08/03/2024|
|  4|   Jill Black|2024-08-04| 45|4000.25|   08/04/2024|
|  5|  James Brown|2024-08-05| 29| 1500.3|   08/05/2024|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|   08/06/2024|
|  7|     John Doe|2024-08-07| 20| 1100.5|   08/07/2024|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   08/08/2024|
+---+-------------+----------+---+-------+-------------+



In [25]:
# current_date
# Add a "current_date" column holding today's date; every row gets the same value.

from pyspark.sql.functions import current_date
df_current_date = df.withColumn(
    "current_date",
    current_date(),
)
df_current_date.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|current_date|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|  2025-08-10|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|  2025-08-10|
|  3|   Jake White|2024-08-03| 18| 3000.1|  2025-08-10|
|  4|   Jill Black|2024-08-04| 45|4000.25|  2025-08-10|
|  5|  James Brown|2024-08-05| 29| 1500.3|  2025-08-10|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|  2025-08-10|
|  7|     John Doe|2024-08-07| 20| 1100.5|  2025-08-10|
|  8|   Jane Smith|2024-08-08| 30|2200.75|  2025-08-10|
+---+-------------+----------+---+-------+------------+



In [26]:
# current_timestamp
# Add a "current_timestamp" column; every row gets the same timestamp value.
# show(truncate=False) prints the timestamp in full instead of cutting long cells.

from pyspark.sql.functions import current_timestamp

df_current_timestamp = df.withColumn(
    "current_timestamp",
    current_timestamp(),
)
df_current_timestamp.show(truncate=False)

+---+-------------+----------+---+-------+--------------------------+
|id |name         |dob       |age|salary |current_timestamp         |
+---+-------------+----------+---+-------+--------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |2025-08-10 09:01:55.556264|
|2  |Alina Pradhan|2024-08-02|34 |2000.75|2025-08-10 09:01:55.556264|
|3  |Jake White   |2024-08-03|18 |3000.1 |2025-08-10 09:01:55.556264|
|4  |Jill Black   |2024-08-04|45 |4000.25|2025-08-10 09:01:55.556264|
|5  |James Brown  |2024-08-05|29 |1500.3 |2025-08-10 09:01:55.556264|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|2025-08-10 09:01:55.556264|
|7  |John Doe     |2024-08-07|20 |1100.5 |2025-08-10 09:01:55.556264|
|8  |Jane Smith   |2024-08-08|30 |2200.75|2025-08-10 09:01:55.556264|
+---+-------------+----------+---+-------+--------------------------+



In [27]:
# year, month, dayofmonth
# Extract the calendar year of "dob" into a new "year" column.
# month and dayofmonth are imported but not used by the live code; the chained
# variant below would add all three date parts:
#   df.withColumn("year", year(col("dob")))
#     .withColumn("month", month(col("dob")))
#     .withColumn("day", dayofmonth(col("dob"))).show()
from pyspark.sql.functions import year, month, dayofmonth
df_year = df.withColumn(
    "year",
    year(col("dob")),
)
df_year.show()


+---+-------------+----------+---+-------+----+
| id|         name|       dob|age| salary|year|
+---+-------------+----------+---+-------+----+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|2024|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|2024|
|  3|   Jake White|2024-08-03| 18| 3000.1|2024|
|  4|   Jill Black|2024-08-04| 45|4000.25|2024|
|  5|  James Brown|2024-08-05| 29| 1500.3|2024|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|2024|
|  7|     John Doe|2024-08-07| 20| 1100.5|2024|
|  8|   Jane Smith|2024-08-08| 30|2200.75|2024|
+---+-------------+----------+---+-------+----+



In [28]:
# date_add
# Shift "dob" forward by 10 days into "dob_after_10_days".
from pyspark.sql.functions import date_add
df_date_add = df.withColumn(
    "dob_after_10_days",
    date_add(col("dob"), 10),
)
df_date_add.show()

+---+-------------+----------+---+-------+-----------------+
| id|         name|       dob|age| salary|dob_after_10_days|
+---+-------------+----------+---+-------+-----------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|       2024-08-11|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|       2024-08-12|
|  3|   Jake White|2024-08-03| 18| 3000.1|       2024-08-13|
|  4|   Jill Black|2024-08-04| 45|4000.25|       2024-08-14|
|  5|  James Brown|2024-08-05| 29| 1500.3|       2024-08-15|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|       2024-08-16|
|  7|     John Doe|2024-08-07| 20| 1100.5|       2024-08-17|
|  8|   Jane Smith|2024-08-08| 30|2200.75|       2024-08-18|
+---+-------------+----------+---+-------+-----------------+



In [29]:
# date_sub
# Shift "dob" back by 10 days into "dob_before_10_days".
from pyspark.sql.functions import date_sub
df_date_sub = df.withColumn(
    "dob_before_10_days",
    date_sub(col("dob"), 10),
)
df_date_sub.show()

+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|dob_before_10_days|
+---+-------------+----------+---+-------+------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|        2024-07-22|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        2024-07-23|
|  3|   Jake White|2024-08-03| 18| 3000.1|        2024-07-24|
|  4|   Jill Black|2024-08-04| 45|4000.25|        2024-07-25|
|  5|  James Brown|2024-08-05| 29| 1500.3|        2024-07-26|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        2024-07-27|
|  7|     John Doe|2024-08-07| 20| 1100.5|        2024-07-28|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        2024-07-29|
+---+-------------+----------+---+-------+------------------+



In [32]:
# datediff
# Number of days from "dob" to the current date, stored in "days_since_dob".
# datediff takes (end, start). current_date is not imported in this cell; it
# relies on the import made in the current_date cell earlier in the notebook.

from pyspark.sql.functions import datediff

df_date_diff = df.withColumn(
    "days_since_dob",
    datediff(current_date(), col("dob")),
)
df_date_diff.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|days_since_dob|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|           374|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|           373|
|  3|   Jake White|2024-08-03| 18| 3000.1|           372|
|  4|   Jill Black|2024-08-04| 45|4000.25|           371|
|  5|  James Brown|2024-08-05| 29| 1500.3|           370|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|           369|
|  7|     John Doe|2024-08-07| 20| 1100.5|           368|
|  8|   Jane Smith|2024-08-08| 30|2200.75|           367|
+---+-------------+----------+---+-------+--------------+



In [33]:
# to_date
# Parse the string "dob" column with the pattern "yyyy-MM-dd" into a date
# column named "dob_as_date".
from pyspark.sql.functions import to_date
df_to_date = df.withColumn(
    "dob_as_date",
    to_date(col("dob"), "yyyy-MM-dd"),
)
df_to_date.show()

+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|dob_as_date|
+---+-------------+----------+---+-------+-----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5| 2024-08-01|
|  2|Alina Pradhan|2024-08-02| 34|2000.75| 2024-08-02|
|  3|   Jake White|2024-08-03| 18| 3000.1| 2024-08-03|
|  4|   Jill Black|2024-08-04| 45|4000.25| 2024-08-04|
|  5|  James Brown|2024-08-05| 29| 1500.3| 2024-08-05|
|  6|Madhav Mishra|2024-08-06| 31|2500.45| 2024-08-06|
|  7|     John Doe|2024-08-07| 20| 1100.5| 2024-08-07|
|  8|   Jane Smith|2024-08-08| 30|2200.75| 2024-08-08|
+---+-------------+----------+---+-------+-----------+



In [34]:
# to_timestamp
# Parse the string "dob" column with the pattern "yyyy-MM-dd" into a timestamp
# column named "dob_as_timestamp"; the time part comes out as 00:00:00.

from pyspark.sql.functions import to_timestamp
df_to_timestamp = df.withColumn(
    "dob_as_timestamp",
    to_timestamp(col("dob"), "yyyy-MM-dd"),
)
df_to_timestamp.show()

+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|   dob_as_timestamp|
+---+-------------+----------+---+-------+-------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|2024-08-01 00:00:00|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|2024-08-02 00:00:00|
|  3|   Jake White|2024-08-03| 18| 3000.1|2024-08-03 00:00:00|
|  4|   Jill Black|2024-08-04| 45|4000.25|2024-08-04 00:00:00|
|  5|  James Brown|2024-08-05| 29| 1500.3|2024-08-05 00:00:00|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|2024-08-06 00:00:00|
|  7|     John Doe|2024-08-07| 20| 1100.5|2024-08-07 00:00:00|
|  8|   Jane Smith|2024-08-08| 30|2200.75|2024-08-08 00:00:00|
+---+-------------+----------+---+-------+-------------------+



In [36]:
# window
# Attach to each row the 1-day time window that its "dob" falls into.
# Only a window duration is given, so the windows are tumbling (non-overlapping),
# not sliding.
# NOTE: this cell does not aggregate anything; it only adds the window struct.
# To total the salary per window instead, use:
#   df.groupBy(window(col("dob"), "1 day")).sum("salary").show(truncate=False)
# NOTE(review): "dob" is a string column; the output shows Spark interpreting it
# as a midnight timestamp, presumably through an implicit cast.

from pyspark.sql.functions import window
df_window = df.withColumn("window", window(col("dob"), "1 day"))
df_window.show(truncate=False)

+---+-------------+----------+---+-------+------------------------------------------+
|id |name         |dob       |age|salary |window                                    |
+---+-------------+----------+---+-------+------------------------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |{2024-08-01 00:00:00, 2024-08-02 00:00:00}|
|2  |Alina Pradhan|2024-08-02|34 |2000.75|{2024-08-02 00:00:00, 2024-08-03 00:00:00}|
|3  |Jake White   |2024-08-03|18 |3000.1 |{2024-08-03 00:00:00, 2024-08-04 00:00:00}|
|4  |Jill Black   |2024-08-04|45 |4000.25|{2024-08-04 00:00:00, 2024-08-05 00:00:00}|
|5  |James Brown  |2024-08-05|29 |1500.3 |{2024-08-05 00:00:00, 2024-08-06 00:00:00}|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|{2024-08-06 00:00:00, 2024-08-07 00:00:00}|
|7  |John Doe     |2024-08-07|20 |1100.5 |{2024-08-07 00:00:00, 2024-08-08 00:00:00}|
|8  |Jane Smith   |2024-08-08|30 |2200.75|{2024-08-08 00:00:00, 2024-08-09 00:00:00}|
+---+-------------+----------+---+-------+------------------------------------------+

In [37]:
# rank, dense_rank, row_number
# Apply three ranking functions to rows ordered by ascending "salary".
# The window spec has no partitionBy, so all rows are ranked together.
# Salaries are unique in the sample data, so the three columns agree here.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number
windowSpec = Window.orderBy(col("salary"))
(
    df
    .withColumn("rank", rank().over(windowSpec))
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .withColumn("row_number", row_number().over(windowSpec))
    .show()
)


+---+-------------+----------+---+-------+----+----------+----------+
| id|         name|       dob|age| salary|rank|dense_rank|row_number|
+---+-------------+----------+---+-------+----+----------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   1|         1|         1|
|  7|     John Doe|2024-08-07| 20| 1100.5|   2|         2|         2|
|  5|  James Brown|2024-08-05| 29| 1500.3|   3|         3|         3|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|   4|         4|         4|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   5|         5|         5|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|   6|         6|         6|
|  3|   Jake White|2024-08-03| 18| 3000.1|   7|         7|         7|
|  4|   Jill Black|2024-08-04| 45|4000.25|   8|         8|         8|
+---+-------------+----------+---+-------+----+----------+----------+



In [38]:
# array
# Combine the "id" and "age" columns into one array column, "array_column".
from pyspark.sql.functions import array
df_array = df.withColumn(
    "array_column",
    array(col("id"), col("age")),
)
df_array.show()


+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|array_column|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     [1, 23]|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|     [2, 34]|
|  3|   Jake White|2024-08-03| 18| 3000.1|     [3, 18]|
|  4|   Jill Black|2024-08-04| 45|4000.25|     [4, 45]|
|  5|  James Brown|2024-08-05| 29| 1500.3|     [5, 29]|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|     [6, 31]|
|  7|     John Doe|2024-08-07| 20| 1100.5|     [7, 20]|
|  8|   Jane Smith|2024-08-08| 30|2200.75|     [8, 30]|
+---+-------------+----------+---+-------+------------+



In [39]:
# array_contains
# Check whether an array holds a given element: wrap "id" in a one-element
# array and test it for the value 1, so only the row with id 1 yields true.
# `array` is not imported in this cell; it relies on the import in the array cell above.

from pyspark.sql.functions import array_contains

df_array_contains = df.withColumn(
    "contains_id_1",
    array_contains(array(col("id")), 1),
)

df_array_contains.show()


+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|contains_id_1|
+---+-------------+----------+---+-------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         true|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        false|
|  3|   Jake White|2024-08-03| 18| 3000.1|        false|
|  4|   Jill Black|2024-08-04| 45|4000.25|        false|
|  5|  James Brown|2024-08-05| 29| 1500.3|        false|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        false|
|  7|     John Doe|2024-08-07| 20| 1100.5|        false|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        false|
+---+-------------+----------+---+-------+-------------+



In [40]:
# explode
# Emit one output row per element of the array [id, age], so each input row
# appears twice: once with its id and once with its age in "exploded_array".

from pyspark.sql.functions import explode

df_explode = df.withColumn(
    "exploded_array",
    explode(array(col("id"), col("age"))),
)
df_explode.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|exploded_array|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|             1|
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|            23|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|             2|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|            34|
|  3|   Jake White|2024-08-03| 18| 3000.1|             3|
|  3|   Jake White|2024-08-03| 18| 3000.1|            18|
|  4|   Jill Black|2024-08-04| 45|4000.25|             4|
|  4|   Jill Black|2024-08-04| 45|4000.25|            45|
|  5|  James Brown|2024-08-05| 29| 1500.3|             5|
|  5|  James Brown|2024-08-05| 29| 1500.3|            29|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|             6|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|            31|
|  7|     John Doe|2024-08-07| 20| 1100.5|             7|
|  7|     John Doe|2024-08-07| 20| 1100.5|            20|
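|  8|   Jane Smith|2024-08-08| 30|2200.75|             8|
|  8|   Jane Smith|2024-08-08| 30|2200.75|            30|
+---+-------------+----------+---+-------+--------------+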

In [41]:
# create_map
# Build a map column from alternating key/value arguments:
# the literal key "name" maps to the name column, the literal key "age" to the age column.

from pyspark.sql.functions import create_map

df_map = df.withColumn(
    "map_column",
    create_map(
        lit("name"), col("name"),
        lit("age"), col("age"),
    ),
)

df_map.show(truncate=False)

+---+-------------+----------+---+-------+----------------------------------+
|id |name         |dob       |age|salary |map_column                        |
+---+-------------+----------+---+-------+----------------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |{name -> Anuj Yadav, age -> 23}   |
|2  |Alina Pradhan|2024-08-02|34 |2000.75|{name -> Alina Pradhan, age -> 34}|
|3  |Jake White   |2024-08-03|18 |3000.1 |{name -> Jake White, age -> 18}   |
|4  |Jill Black   |2024-08-04|45 |4000.25|{name -> Jill Black, age -> 45}   |
|5  |James Brown  |2024-08-05|29 |1500.3 |{name -> James Brown, age -> 29}  |
|6  |Madhav Mishra|2024-08-06|31 |2500.45|{name -> Madhav Mishra, age -> 31}|
|7  |John Doe     |2024-08-07|20 |1100.5 |{name -> John Doe, age -> 20}     |
|8  |Jane Smith   |2024-08-08|30 |2200.75|{name -> Jane Smith, age -> 30}   |
+---+-------------+----------+---+-------+----------------------------------+



In [42]:
# coalesce
# Take the first non-null value among "name" and "age", in that order.
# "name" is never null in the sample data, so the result always equals the name.

from pyspark.sql.functions import coalesce

df_coalesce = df.withColumn(
    "coalesced_value",
    coalesce(col("name"), col("age")),
)
df_coalesce.show()

+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|coalesced_value|
+---+-------------+----------+---+-------+---------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     Anuj Yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|  Alina Pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|     Jake White|
|  4|   Jill Black|2024-08-04| 45|4000.25|     Jill Black|
|  5|  James Brown|2024-08-05| 29| 1500.3|    James Brown|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|  Madhav Mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|       John Doe|
|  8|   Jane Smith|2024-08-08| 30|2200.75|     Jane Smith|
+---+-------------+----------+---+-------+---------------+



In [43]:
# isnull
# Flag rows whose "name" is null; the sample data has none, so every row is false.

from pyspark.sql.functions import isnull

df_isnull = df.withColumn(
    "is_name_null",
    isnull(col("name")),
)
df_isnull.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|is_name_null|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|       false|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|       false|
|  3|   Jake White|2024-08-03| 18| 3000.1|       false|
|  4|   Jill Black|2024-08-04| 45|4000.25|       false|
|  5|  James Brown|2024-08-05| 29| 1500.3|       false|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|       false|
|  7|     John Doe|2024-08-07| 20| 1100.5|       false|
|  8|   Jane Smith|2024-08-08| 30|2200.75|       false|
+---+-------------+----------+---+-------+------------+



In [44]:
# isnan
# Flag rows whose "age" is NaN. The sample ages are all plain integers, so
# every row is false.

from pyspark.sql.functions import isnan

df_isnan = df.withColumn(
    "is_age_nan",
    isnan(col("age")),
)
df_isnan.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|is_age_nan|
+---+-------------+----------+---+-------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     false|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|     false|
|  3|   Jake White|2024-08-03| 18| 3000.1|     false|
|  4|   Jill Black|2024-08-04| 45|4000.25|     false|
|  5|  James Brown|2024-08-05| 29| 1500.3|     false|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|     false|
|  7|     John Doe|2024-08-07| 20| 1100.5|     false|
|  8|   Jane Smith|2024-08-08| 30|2200.75|     false|
+---+-------------+----------+---+-------+----------+



In [45]:
# sha2
# Hash the "name" column with SHA-2 at a 256-bit length; the result is shown as
# a hex string, printed in full thanks to truncate=False.

from pyspark.sql.functions import sha2

df_sha2 = df.withColumn(
    "sha2_hash",
    sha2(col("name"), 256),
)
df_sha2.show(truncate=False)


+---+-------------+----------+---+-------+----------------------------------------------------------------+
|id |name         |dob       |age|salary |sha2_hash                                                       |
+---+-------------+----------+---+-------+----------------------------------------------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |803ce8929027e0dc4527094db0c7942815b2ba7e525ace272edd0cdd4d61c50c|
|2  |Alina Pradhan|2024-08-02|34 |2000.75|1c0ebcf1a61f95076a6f2c07f41c817ab239d4ddfbc719f64281917732a318ee|
|3  |Jake White   |2024-08-03|18 |3000.1 |46dae60ac51d1dfbb195b62ca2a6c4fd76f0d0ef5e32ee32a3b9987bc30251ef|
|4  |Jill Black   |2024-08-04|45 |4000.25|7db233fa461a23bfe24e0c1b5f0cd82e8750969c1e9c4450761cf800425caba7|
|5  |James Brown  |2024-08-05|29 |1500.3 |b3d60509901a9912f651bac21d52c7ddffc9bc8620f5052fbddd39e3c59f84b0|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|4cdec17dacd6ffe870947ac1cb6d10901adf819458055ab02df4a3540dd3f477|
|7  |John Doe     |2024-08-0

In [46]:
# md5
# Compute the MD5 digest of the "name" column, shown as a 32-character hex string.

from pyspark.sql.functions import md5

df_md5 = df.withColumn(
    "md5_hash",
    md5(col("name")),
)
df_md5.show(truncate=False)


+---+-------------+----------+---+-------+--------------------------------+
|id |name         |dob       |age|salary |md5_hash                        |
+---+-------------+----------+---+-------+--------------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |ca153ed55cbdacb54da3736335da7cb3|
|2  |Alina Pradhan|2024-08-02|34 |2000.75|057b5e6c8de5860bb4920ddcf2756182|
|3  |Jake White   |2024-08-03|18 |3000.1 |3ee9984296bc94702c3fa0b750b928fb|
|4  |Jill Black   |2024-08-04|45 |4000.25|78b82745b366583ae84dde2d90114901|
|5  |James Brown  |2024-08-05|29 |1500.3 |8495e8e406d3d625719ae2a9fb8d2f9b|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|80d7e488885a319a5fb5ea29fe6b526b|
|7  |John Doe     |2024-08-07|20 |1100.5 |4c2a904bafba06591225113ad17b5cec|
|8  |Jane Smith   |2024-08-08|30 |2200.75|71768b5e2a0b3697eb3c0c6d4ebbbaf8|
+---+-------------+----------+---+-------+--------------------------------+



In [47]:
# monotonically_increasing_id
# Tag each row with a unique, increasing 64-bit integer.
# The ids are NOT consecutive: the output jumps from 3 to 8589934592.

from pyspark.sql.functions import monotonically_increasing_id

df_monotonically_increasing_id = df.withColumn(
    "monotonically_increasing_id",
    monotonically_increasing_id(),
)
df_monotonically_increasing_id.show()

+---+-------------+----------+---+-------+---------------------------+
| id|         name|       dob|age| salary|monotonically_increasing_id|
+---+-------------+----------+---+-------+---------------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|                          0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|                          1|
|  3|   Jake White|2024-08-03| 18| 3000.1|                          2|
|  4|   Jill Black|2024-08-04| 45|4000.25|                          3|
|  5|  James Brown|2024-08-05| 29| 1500.3|                 8589934592|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|                 8589934593|
|  7|     John Doe|2024-08-07| 20| 1100.5|                 8589934594|
|  8|   Jane Smith|2024-08-08| 30|2200.75|                 8589934595|
+---+-------------+----------+---+-------+---------------------------+



In [48]:
# length
# Counts the characters held in a string column.

from pyspark.sql.functions import length

df_length = df.select("*", length(col("name")).alias("name_length"))
df_length.show()

+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|name_length|
+---+-------------+----------+---+-------+-----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         10|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         13|
|  3|   Jake White|2024-08-03| 18| 3000.1|         10|
|  4|   Jill Black|2024-08-04| 45|4000.25|         10|
|  5|  James Brown|2024-08-05| 29| 1500.3|         11|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         13|
|  7|     John Doe|2024-08-07| 20| 1100.5|          8|
|  8|   Jane Smith|2024-08-08| 30|2200.75|         10|
+---+-------------+----------+---+-------+-----------+



In [49]:
# upper and lower
# Converts all characters of a string column to upper or lower case.

from pyspark.sql.functions import upper, lower

# Both case conversions are derived from the same "name" column in one chain.
df_upper_lower = (
    df.withColumn("name_upper", upper(col("name")))
      .withColumn("name_lower", lower(col("name")))
)
df_upper_lower.show()


+---+-------------+----------+---+-------+-------------+-------------+
| id|         name|       dob|age| salary|   name_upper|   name_lower|
+---+-------------+----------+---+-------+-------------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   ANUJ YADAV|   anuj yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|ALINA PRADHAN|alina pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|   JAKE WHITE|   jake white|
|  4|   Jill Black|2024-08-04| 45|4000.25|   JILL BLACK|   jill black|
|  5|  James Brown|2024-08-05| 29| 1500.3|  JAMES BROWN|  james brown|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|MADHAV MISHRA|madhav mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|     JOHN DOE|     john doe|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   JANE SMITH|   jane smith|
+---+-------------+----------+---+-------+-------------+-------------+



In [50]:
# trim, ltrim, rtrim
# Trims spaces from both sides, the left side, or the right side of a string column.

from pyspark.sql.functions import trim, ltrim, rtrim

# The sample names carry no leading or trailing spaces, so all three derived
# columns come out identical to "name".
df_all = (
    df.withColumn("name_trimmed", trim(col("name")))
      .withColumn("name_ltrimmed", ltrim(col("name")))
      .withColumn("name_rtrimmed", rtrim(col("name")))
)
df_all.show()

+---+-------------+----------+---+-------+-------------+-------------+-------------+
| id|         name|       dob|age| salary| name_trimmed|name_ltrimmed|name_rtrimmed|
+---+-------------+----------+---+-------+-------------+-------------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   Anuj Yadav|   Anuj Yadav|   Anuj Yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|Alina Pradhan|Alina Pradhan|Alina Pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|   Jake White|   Jake White|   Jake White|
|  4|   Jill Black|2024-08-04| 45|4000.25|   Jill Black|   Jill Black|   Jill Black|
|  5|  James Brown|2024-08-05| 29| 1500.3|  James Brown|  James Brown|  James Brown|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|Madhav Mishra|Madhav Mishra|Madhav Mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|     John Doe|     John Doe|     John Doe|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   Jane Smith|   Jane Smith|   Jane Smith|
+---+-------------+----------+---+-------+-------------+---------

In [51]:

# abs
# Returns the absolute value of a numeric column.

# NOTE: this import shadows Python's built-in abs() for the rest of the notebook.
from pyspark.sql.functions import abs

# Absolute distance of each salary from 3000.
df_abs = df.withColumn("abs_salary", abs(col("salary") - 3000))
df_abs.show()

+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|         abs_salary|
+---+-------------+----------+---+-------+-------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|             1999.5|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|             999.25|
|  3|   Jake White|2024-08-03| 18| 3000.1|0.09999999999990905|
|  4|   Jill Black|2024-08-04| 45|4000.25|            1000.25|
|  5|  James Brown|2024-08-05| 29| 1500.3|             1499.7|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|  499.5500000000002|
|  7|     John Doe|2024-08-07| 20| 1100.5|             1899.5|
|  8|   Jane Smith|2024-08-08| 30|2200.75|             799.25|
+---+-------------+----------+---+-------+-------------------+



In [52]:
# sqrt
# Computes the square root of a numeric column.

from pyspark.sql.functions import sqrt

df_sqrt = df.select("*", sqrt(col("age")).alias("sqrt_age"))
df_sqrt.show()



+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|          sqrt_age|
+---+-------------+----------+---+-------+------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5| 4.795831523312719|
|  2|Alina Pradhan|2024-08-02| 34|2000.75| 5.830951894845301|
|  3|   Jake White|2024-08-03| 18| 3000.1| 4.242640687119285|
|  4|   Jill Black|2024-08-04| 45|4000.25| 6.708203932499369|
|  5|  James Brown|2024-08-05| 29| 1500.3| 5.385164807134504|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|5.5677643628300215|
|  7|     John Doe|2024-08-07| 20| 1100.5|  4.47213595499958|
|  8|   Jane Smith|2024-08-08| 30|2200.75| 5.477225575051661|
+---+-------------+----------+---+-------+------------------+



In [53]:
# exp
# Computes the exponential (e raised to the value) of the given column.

from pyspark.sql.functions import exp

# e ** age grows quickly, so show() abbreviates the widest values with "...".
df_exp = df.withColumn("exp_age", exp(col("age")))
df_exp.show()

+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|             exp_age|
+---+-------------+----------+---+-------+--------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5| 9.744803446248903E9|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|5.834617425274549E14|
|  3|   Jake White|2024-08-03| 18| 3000.1| 6.565996913733051E7|
|  4|   Jill Black|2024-08-04| 45|4000.25|3.493427105748509...|
|  5|  James Brown|2024-08-05| 29| 1500.3|3.931334297144042E12|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|2.904884966524742...|
|  7|     John Doe|2024-08-07| 20| 1100.5| 4.851651954097903E8|
|  8|   Jane Smith|2024-08-08| 30|2200.75|1.068647458152446...|
+---+-------------+----------+---+-------+--------------------+



In [54]:
# log, log10, log2
# Computes the logarithm of a column using base e, 10, or 2.

from pyspark.sql.functions import log, log10, log2

df_all = (
    df.withColumn("log_age", log(col("age")))        # natural logarithm (base e)
      .withColumn("log10_age", log10(col("age")))    # base 10
      .withColumn("log2_age", log2(col("age")))      # base 2
)
df_all.show()

+---+-------------+----------+---+-------+------------------+------------------+-----------------+
| id|         name|       dob|age| salary|           log_age|         log10_age|         log2_age|
+---+-------------+----------+---+-------+------------------+------------------+-----------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|3.1354942159291497|1.3617278360175928|4.523561956057013|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|3.5263605246161616|1.5314789170422551| 5.08746284125034|
|  3|   Jake White|2024-08-03| 18| 3000.1|2.8903717578961645| 1.255272505103306|4.169925001442312|
|  4|   Jill Black|2024-08-04| 45|4000.25|3.8066624897703196|1.6532125137753437|5.491853096329675|
|  5|  James Brown|2024-08-05| 29| 1500.3| 3.367295829986474| 1.462397997898956|4.857980995127573|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|3.4339872044851463|1.4913616938342726|4.954196310386876|
|  7|     John Doe|2024-08-07| 20| 1100.5| 2.995732273553991|1.3010299956639813|4.321928094887363|
|  8|   Ja

In [55]:

# greatest and least
# Returns the greatest or least value across a list of columns, evaluated row by row.

from pyspark.sql.functions import greatest, least

# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after each table.
df_greatest = df.withColumn("greatest_value", greatest(col("id"), col("age")))
df_greatest.show()

df_least = df.withColumn("least_value", least(col("id"), col("age")))
df_least.show()


+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|greatest_value|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|            23|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|            34|
|  3|   Jake White|2024-08-03| 18| 3000.1|            18|
|  4|   Jill Black|2024-08-04| 45|4000.25|            45|
|  5|  James Brown|2024-08-05| 29| 1500.3|            29|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|            31|
|  7|     John Doe|2024-08-07| 20| 1100.5|            20|
|  8|   Jane Smith|2024-08-08| 30|2200.75|            30|
+---+-------------+----------+---+-------+--------------+

None
+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|least_value|
+---+-------------+----------+---+-------+-----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|          1|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|          2|
|  3|   Jake White|2024

In [56]:
# pow
# Raises one column to the power held in another column (here: id to the power of age).

from pyspark.sql.functions import pow

df_pow = df.select("*", pow(col("id"), col("age")).alias("pow_result"))
df_pow.show(truncate=False)

+---+-------------+----------+---+-------+---------------------+
|id |name         |dob       |age|salary |pow_result           |
+---+-------------+----------+---+-------+---------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |1.0                  |
|2  |Alina Pradhan|2024-08-02|34 |2000.75|1.7179869184E10      |
|3  |Jake White   |2024-08-03|18 |3000.1 |3.87420489E8         |
|4  |Jill Black   |2024-08-04|45 |4000.25|1.2379400392853803E27|
|5  |James Brown  |2024-08-05|29 |1500.3 |1.8626451492309572E20|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|1.3264435183244001E24|
|7  |John Doe     |2024-08-07|20 |1100.5 |7.9792266297612E16   |
|8  |Jane Smith   |2024-08-08|30 |2200.75|1.2379400392853803E27|
+---+-------------+----------+---+-------+---------------------+



In [57]:


# round, bround
# round(col, scale):  rounds to `scale` decimal places using HALF_UP
#                     (ties go away from zero, e.g. 1000.5 -> 1001.0).
# bround(col, scale): rounds to `scale` decimal places using HALF_EVEN
#                     ("banker's rounding": ties go to the nearest even digit, e.g. 1000.5 -> 1000.0).

# NOTE: this import shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import round, bround

# show() prints the table and returns None, so it is not wrapped in print().
df_round = df.withColumn("rounded_salary", round(col("salary"), 0))
df_round.show()

df_bround = df.withColumn("brounded_salary", bround(col("salary"), 0))
df_bround.show()


+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|rounded_salary|
+---+-------------+----------+---+-------+--------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|        1001.0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        2001.0|
|  3|   Jake White|2024-08-03| 18| 3000.1|        3000.0|
|  4|   Jill Black|2024-08-04| 45|4000.25|        4000.0|
|  5|  James Brown|2024-08-05| 29| 1500.3|        1500.0|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        2500.0|
|  7|     John Doe|2024-08-07| 20| 1100.5|        1101.0|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        2201.0|
+---+-------------+----------+---+-------+--------------+

None
+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|brounded_salary|
+---+-------------+----------+---+-------+---------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         1000.0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         2001.0|
|  

In [58]:
# degrees, radians
# Converts an angle from radians to degrees, or from degrees to radians.

from pyspark.sql.functions import degrees, radians

# Here "age" is simply reused as a numeric input: degrees() treats it as radians,
# radians() treats it as degrees.
# show() prints the table and returns None, so it is not wrapped in print().
df_degrees = df.withColumn("degrees_value", degrees(col("age")))
df_degrees.show()

df_radians = df.withColumn("radians_value", radians(col("age")))
df_radians.show()


+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|     degrees_value|
+---+-------------+----------+---+-------+------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|1317.8029288008934|
|  2|Alina Pradhan|2024-08-02| 34|2000.75| 1948.056503444799|
|  3|   Jake White|2024-08-03| 18| 3000.1| 1031.324031235482|
|  4|   Jill Black|2024-08-04| 45|4000.25|2578.3100780887044|
|  5|  James Brown|2024-08-05| 29| 1500.3|1661.5776058793874|
|  6|Madhav Mishra|2024-08-06| 31|2500.45| 1776.169164905552|
|  7|     John Doe|2024-08-07| 20| 1100.5|1145.9155902616465|
|  8|   Jane Smith|2024-08-08| 30|2200.75|1718.8733853924696|
+---+-------------+----------+---+-------+------------------+

None
+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|     radians_value|
+---+-------------+----------+---+-------+------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|0.4014257279586958|
| 

In [59]:
# signum
# Sign of a number: -1 for negative values, 0 for zero, 1 for positive values.

from pyspark.sql.functions import signum

df_signum = df.select("*", signum(col("age")).alias("signum_value"))
df_signum.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|signum_value|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         1.0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         1.0|
|  3|   Jake White|2024-08-03| 18| 3000.1|         1.0|
|  4|   Jill Black|2024-08-04| 45|4000.25|         1.0|
|  5|  James Brown|2024-08-05| 29| 1500.3|         1.0|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         1.0|
|  7|     John Doe|2024-08-07| 20| 1100.5|         1.0|
|  8|   Jane Smith|2024-08-08| 30|2200.75|         1.0|
+---+-------------+----------+---+-------+------------+



In [60]:

# hex, unhex
# Converts a column to its hexadecimal representation and back.

# NOTE: this import shadows Python's built-in hex() for the rest of the notebook.
from pyspark.sql.functions import hex, unhex

# Despite the "_name" suffix, both derived columns are computed from "id".
# unhex() yields binary data, which show() renders as bytes such as [01].
df_all = (
    df.withColumn("hex_name", hex(col("id")))
      .withColumn("unhex_name", unhex(hex(col("id"))))
)
df_all.show()

+---+-------------+----------+---+-------+--------+----------+
| id|         name|       dob|age| salary|hex_name|unhex_name|
+---+-------------+----------+---+-------+--------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|       1|      [01]|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|       2|      [02]|
|  3|   Jake White|2024-08-03| 18| 3000.1|       3|      [03]|
|  4|   Jill Black|2024-08-04| 45|4000.25|       4|      [04]|
|  5|  James Brown|2024-08-05| 29| 1500.3|       5|      [05]|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|       6|      [06]|
|  7|     John Doe|2024-08-07| 20| 1100.5|       7|      [07]|
|  8|   Jane Smith|2024-08-08| 30|2200.75|       8|      [08]|
+---+-------------+----------+---+-------+--------+----------+



In [61]:

# nvl, nvl2
# nvl(a, b):     returns b when a is null, otherwise a.
# nvl2(a, b, c): returns b when a is NOT null, otherwise c.

from pyspark.sql.functions import nvl, nvl2, col

# "age" is never null in the sample data, so nvl_age simply repeats age.
# show() prints the table and returns None, so it is not wrapped in print().
df_nvl = df.withColumn("nvl_age", nvl(col("age"), col("id")))
df_nvl.show()

# "age" is never null, so nvl2_age always takes the "id" branch; the recorded
# output shows it as a double (1.0, 2.0, ...) because the fallback "salary" is a double.
df_nvl2 = df.withColumn("nvl2_age", nvl2(col("age"), col("id"), col("salary")))
df_nvl2.show()


+---+-------------+----------+---+-------+-------+
| id|         name|       dob|age| salary|nvl_age|
+---+-------------+----------+---+-------+-------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     23|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|     34|
|  3|   Jake White|2024-08-03| 18| 3000.1|     18|
|  4|   Jill Black|2024-08-04| 45|4000.25|     45|
|  5|  James Brown|2024-08-05| 29| 1500.3|     29|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|     31|
|  7|     John Doe|2024-08-07| 20| 1100.5|     20|
|  8|   Jane Smith|2024-08-08| 30|2200.75|     30|
+---+-------------+----------+---+-------+-------+

None
+---+-------------+----------+---+-------+--------+
| id|         name|       dob|age| salary|nvl2_age|
+---+-------------+----------+---+-------+--------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     1.0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|     2.0|
|  3|   Jake White|2024-08-03| 18| 3000.1|     3.0|
|  4|   Jill Black|2024-08-04| 45|4000.25|     4.0|
|  5|  James Brown

In [62]:
# reverse
# Produces the characters of a string column in reverse order.

from pyspark.sql.functions import reverse

df_reverse = df.select("*", reverse(col("name")).alias("reversed_name"))
df_reverse.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|reversed_name|
+---+-------------+----------+---+-------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|   vadaY junA|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|nahdarP anilA|
|  3|   Jake White|2024-08-03| 18| 3000.1|   etihW ekaJ|
|  4|   Jill Black|2024-08-04| 45|4000.25|   kcalB lliJ|
|  5|  James Brown|2024-08-05| 29| 1500.3|  nworB semaJ|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|arhsiM vahdaM|
|  7|     John Doe|2024-08-07| 20| 1100.5|     eoD nhoJ|
|  8|   Jane Smith|2024-08-08| 30|2200.75|   htimS enaJ|
+---+-------------+----------+---+-------+-------------+



In [63]:
# initcap
# Upper-cases the first letter of every word in a string column.

from pyspark.sql.functions import initcap

df_initcap = df.select("*", initcap(col("name")).alias("capitalized_name"))
df_initcap.show()

+---+-------------+----------+---+-------+----------------+
| id|         name|       dob|age| salary|capitalized_name|
+---+-------------+----------+---+-------+----------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|      Anuj Yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|   Alina Pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|      Jake White|
|  4|   Jill Black|2024-08-04| 45|4000.25|      Jill Black|
|  5|  James Brown|2024-08-05| 29| 1500.3|     James Brown|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|   Madhav Mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|        John Doe|
|  8|   Jane Smith|2024-08-08| 30|2200.75|      Jane Smith|
+---+-------------+----------+---+-------+----------------+



In [64]:
# instr
# Returns the 1-based position of the first occurrence of a substring,
# or 0 when the substring does not occur (the search is case-sensitive).

from pyspark.sql.functions import instr

df_instr = df.withColumn("position_of_a", instr(col("name"), "a"))

# show() and printSchema() print their output themselves and return None,
# so they are called directly instead of being wrapped in print().
df_instr.show()
df_instr.printSchema()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|position_of_a|
+---+-------------+----------+---+-------+-------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|            7|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|            5|
|  3|   Jake White|2024-08-03| 18| 3000.1|            2|
|  4|   Jill Black|2024-08-04| 45|4000.25|            8|
|  5|  James Brown|2024-08-05| 29| 1500.3|            2|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|            2|
|  7|     John Doe|2024-08-07| 20| 1100.5|            0|
|  8|   Jane Smith|2024-08-08| 30|2200.75|            2|
+---+-------------+----------+---+-------+-------------+

None
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- position_of_a: integer (nullable = true)

None


In [65]:
# locate
# Like instr, but the substring comes first and an optional start position may be given.
from pyspark.sql.functions import locate

df_locate = df.select("*", locate("Doe", col("name")).alias("locate_doe"))
df_locate.show()


+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|locate_doe|
+---+-------------+----------+---+-------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|         0|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|         0|
|  3|   Jake White|2024-08-03| 18| 3000.1|         0|
|  4|   Jill Black|2024-08-04| 45|4000.25|         0|
|  5|  James Brown|2024-08-05| 29| 1500.3|         0|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         0|
|  7|     John Doe|2024-08-07| 20| 1100.5|         6|
|  8|   Jane Smith|2024-08-08| 30|2200.75|         0|
+---+-------------+----------+---+-------+----------+



In [66]:
# soundex
# Encodes a string as its Soundex code, which is handy for phonetic matching.

from pyspark.sql.functions import soundex

df_soundex = df.select("*", soundex(col("name")).alias("soundex_name"))
df_soundex.show()


+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|soundex_name|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|        A523|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|        A451|
|  3|   Jake White|2024-08-03| 18| 3000.1|        J230|
|  4|   Jill Black|2024-08-04| 45|4000.25|        J414|
|  5|  James Brown|2024-08-05| 29| 1500.3|        J521|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|        M315|
|  7|     John Doe|2024-08-07| 20| 1100.5|        J530|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        J525|
+---+-------------+----------+---+-------+------------+



In [67]:
# levenshtein
# Edit distance between two strings: here each name versus the literal "Jon Doe".

from pyspark.sql.functions import levenshtein

df_levenshtein = df.select(
    "*",
    levenshtein(col("name"), lit("Jon Doe")).alias("levenshtein_distance"),
)
df_levenshtein.show()

+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|levenshtein_distance|
+---+-------------+----------+---+-------+--------------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|                   9|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|                  11|
|  3|   Jake White|2024-08-03| 18| 3000.1|                   7|
|  4|   Jill Black|2024-08-04| 45|4000.25|                   8|
|  5|  James Brown|2024-08-05| 29| 1500.3|                   8|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|                  12|
|  7|     John Doe|2024-08-07| 20| 1100.5|                   1|
|  8|   Jane Smith|2024-08-08| 30|2200.75|                   7|
+---+-------------+----------+---+-------+--------------------+



In [68]:
# conv
# Re-expresses a number in another base (here: base 10 -> base 2).

from pyspark.sql.functions import conv

df_conv = df.select("*", conv(col("id"), 10, 2).alias("binary_value"))
df_conv.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|binary_value|
+---+-------------+----------+---+-------+------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|           1|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|          10|
|  3|   Jake White|2024-08-03| 18| 3000.1|          11|
|  4|   Jill Black|2024-08-04| 45|4000.25|         100|
|  5|  James Brown|2024-08-05| 29| 1500.3|         101|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|         110|
|  7|     John Doe|2024-08-07| 20| 1100.5|         111|
|  8|   Jane Smith|2024-08-08| 30|2200.75|        1000|
+---+-------------+----------+---+-------+------------+



In [69]:
# translate
# Substitutes characters one-for-one inside a string (here every "o" becomes "a").

from pyspark.sql.functions import translate

df_translate = df.select(
    "*",
    translate(col("name"), "o", "a").alias("translated_name"),
)
df_translate.show()


+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|translated_name|
+---+-------------+----------+---+-------+---------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|     Anuj Yadav|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|  Alina Pradhan|
|  3|   Jake White|2024-08-03| 18| 3000.1|     Jake White|
|  4|   Jill Black|2024-08-04| 45|4000.25|     Jill Black|
|  5|  James Brown|2024-08-05| 29| 1500.3|    James Brawn|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|  Madhav Mishra|
|  7|     John Doe|2024-08-07| 20| 1100.5|       Jahn Dae|
|  8|   Jane Smith|2024-08-08| 30|2200.75|     Jane Smith|
+---+-------------+----------+---+-------+---------------+



In [70]:
# crc32
# Calculates the CRC32 (cyclic redundancy check) value of a column.

from pyspark.sql.functions import crc32

df_crc32 = df.select("*", crc32(col("name")).alias("crc32_name"))
df_crc32.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|crc32_name|
+---+-------------+----------+---+-------+----------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|4132308856|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|4059018015|
|  3|   Jake White|2024-08-03| 18| 3000.1| 931505628|
|  4|   Jill Black|2024-08-04| 45|4000.25|3628743810|
|  5|  James Brown|2024-08-05| 29| 1500.3|3837056040|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|3046363580|
|  7|     John Doe|2024-08-07| 20| 1100.5|1782059462|
|  8|   Jane Smith|2024-08-08| 30|2200.75|3280634359|
+---+-------------+----------+---+-------+----------+



In [71]:
# uuid
# Adds a column of UUIDs by calling the SQL function uuid() through expr().

from pyspark.sql.functions import expr

df_uuid = df.select("*", expr("uuid()").alias("uuid"))

# truncate=False keeps the full 36-character UUID visible.
df_uuid.show(truncate=False)


+---+-------------+----------+---+-------+------------------------------------+
|id |name         |dob       |age|salary |uuid                                |
+---+-------------+----------+---+-------+------------------------------------+
|1  |Anuj Yadav   |2024-08-01|23 |1000.5 |0e018530-b1e9-40c1-bbde-cdae8738210a|
|2  |Alina Pradhan|2024-08-02|34 |2000.75|90abc624-cc10-4a58-919b-1865738c2347|
|3  |Jake White   |2024-08-03|18 |3000.1 |713abcf8-8470-4fa6-900a-5322093e3b0a|
|4  |Jill Black   |2024-08-04|45 |4000.25|59d534c8-3178-4f6a-b267-37f66192f3a4|
|5  |James Brown  |2024-08-05|29 |1500.3 |be6b7f4c-09e0-42eb-af58-04ae1cb4e668|
|6  |Madhav Mishra|2024-08-06|31 |2500.45|5e7b94d8-1731-4f96-b4e0-41fe508d6040|
|7  |John Doe     |2024-08-07|20 |1100.5 |508ef1ab-cf97-4e78-a0c1-43ac47665205|
|8  |Jane Smith   |2024-08-08|30 |2200.75|936cc725-a525-4948-ac98-dd9d3c4770df|
+---+-------------+----------+---+-------+------------------------------------+



In [72]:
# percent_rank
# Computes the percent rank of a row within a window partition.

# Window is imported here so the cell does not rely on an import made in another cell.
from pyspark.sql import Window
from pyspark.sql.functions import percent_rank

# No partitionBy(): all rows form a single window, ordered by salary (highest first).
# windowSpec is reused by the cume_dist and ntile cells that follow.
windowSpec = Window.orderBy(col("salary").desc())

df_percent_rank = df.withColumn("percent_rank", percent_rank().over(windowSpec))
df_percent_rank.show()


+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|       percent_rank|
+---+-------------+----------+---+-------+-------------------+
|  4|   Jill Black|2024-08-04| 45|4000.25|                0.0|
|  3|   Jake White|2024-08-03| 18| 3000.1|0.14285714285714285|
|  6|Madhav Mishra|2024-08-06| 31|2500.45| 0.2857142857142857|
|  8|   Jane Smith|2024-08-08| 30|2200.75|0.42857142857142855|
|  2|Alina Pradhan|2024-08-02| 34|2000.75| 0.5714285714285714|
|  5|  James Brown|2024-08-05| 29| 1500.3| 0.7142857142857143|
|  7|     John Doe|2024-08-07| 20| 1100.5| 0.8571428571428571|
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|                1.0|
+---+-------------+----------+---+-------+-------------------+



In [73]:
# cume_dist
# Cumulative distribution of each value within its window (uses windowSpec defined earlier).

from pyspark.sql.functions import cume_dist

df_cume_dist = df.select("*", cume_dist().over(windowSpec).alias("cume_dist"))
df_cume_dist.show()


+---+-------------+----------+---+-------+---------+
| id|         name|       dob|age| salary|cume_dist|
+---+-------------+----------+---+-------+---------+
|  4|   Jill Black|2024-08-04| 45|4000.25|    0.125|
|  3|   Jake White|2024-08-03| 18| 3000.1|     0.25|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|    0.375|
|  8|   Jane Smith|2024-08-08| 30|2200.75|      0.5|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|    0.625|
|  5|  James Brown|2024-08-05| 29| 1500.3|     0.75|
|  7|     John Doe|2024-08-07| 20| 1100.5|    0.875|
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|      1.0|
+---+-------------+----------+---+-------+---------+



In [74]:
# ntile
# Splits the ordered rows of a window into a given number of buckets (here 3).

from pyspark.sql.functions import ntile

df_ntile = df.select("*", ntile(3).over(windowSpec).alias("ntile"))
df_ntile.show()

+---+-------------+----------+---+-------+-----+
| id|         name|       dob|age| salary|ntile|
+---+-------------+----------+---+-------+-----+
|  4|   Jill Black|2024-08-04| 45|4000.25|    1|
|  3|   Jake White|2024-08-03| 18| 3000.1|    1|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|    1|
|  8|   Jane Smith|2024-08-08| 30|2200.75|    2|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|    2|
|  5|  James Brown|2024-08-05| 29| 1500.3|    2|
|  7|     John Doe|2024-08-07| 20| 1100.5|    3|
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|    3|
+---+-------------+----------+---+-------+-----+



In [75]:
# flatten
# Flattens an array of arrays into a single array.

# array is imported here so the cell does not rely on an import made in another cell.
from pyspark.sql.functions import array, flatten

# The nested literal [[1, 2], [3]] is the same for every row, so each row gets [1, 2, 3].
df_flatten = df.withColumn(
    "flattened_array",
    flatten(array(array(lit(1), lit(2)), array(lit(3)))),
)

df_flatten.show()

+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|flattened_array|
+---+-------------+----------+---+-------+---------------+
|  1|   Anuj Yadav|2024-08-01| 23| 1000.5|      [1, 2, 3]|
|  2|Alina Pradhan|2024-08-02| 34|2000.75|      [1, 2, 3]|
|  3|   Jake White|2024-08-03| 18| 3000.1|      [1, 2, 3]|
|  4|   Jill Black|2024-08-04| 45|4000.25|      [1, 2, 3]|
|  5|  James Brown|2024-08-05| 29| 1500.3|      [1, 2, 3]|
|  6|Madhav Mishra|2024-08-06| 31|2500.45|      [1, 2, 3]|
|  7|     John Doe|2024-08-07| 20| 1100.5|      [1, 2, 3]|
|  8|   Jane Smith|2024-08-08| 30|2200.75|      [1, 2, 3]|
+---+-------------+----------+---+-------+---------------+



In [76]:
# grouping_id
# Returns the level of grouping applied to each aggregated row.

from pyspark.sql.functions import grouping_id
from pyspark.sql import functions as F

# cube() aggregates over every combination of the grouping columns;
# grouping_id() identifies which combination produced a row (in the recorded
# output: 0 when both columns are present, 3 for the grand total).
df_grouping_sets = df.cube("age", "salary").agg(
    grouping_id().alias("grouping_id"),
    F.sum("salary"),
)
df_grouping_sets.show()

+----+-------+-----------+-----------+
| age| salary|grouping_id|sum(salary)|
+----+-------+-----------+-----------+
|NULL| 1000.5|          2|     1000.5|
|  45|   NULL|          1|    4000.25|
|NULL|4000.25|          2|    4000.25|
|  34|   NULL|          1|    2000.75|
|NULL| 3000.1|          2|     3000.1|
|NULL|   NULL|          3|    17303.6|
|  23|   NULL|          1|     1000.5|
|  45|4000.25|          0|    4000.25|
|  34|2000.75|          0|    2000.75|
|NULL|2000.75|          2|    2000.75|
|  23| 1000.5|          0|     1000.5|
|  18| 3000.1|          0|     3000.1|
|  18|   NULL|          1|     3000.1|
|  29| 1500.3|          0|     1500.3|
|NULL| 1500.3|          2|     1500.3|
|NULL| 1100.5|          2|     1100.5|
|  29|   NULL|          1|     1500.3|
|  30|2200.75|          0|    2200.75|
|NULL|2200.75|          2|    2200.75|
|  20|   NULL|          1|     1100.5|
+----+-------+-----------+-----------+
only showing top 20 rows



In [77]:
# rollup
# Multi-dimensional aggregation producing hierarchical subtotals:
# (age, salary), (age), and the grand total - a subset of what cube() returns.

# rollup() is a DataFrame method, so nothing needs to be imported from pyspark.sql.functions.
df_rollup = df.rollup("age", "salary").count()
df_rollup.show()


+----+-------+-----+
| age| salary|count|
+----+-------+-----+
|  45|   NULL|    1|
|  34|   NULL|    1|
|NULL|   NULL|    8|
|  23|   NULL|    1|
|  45|4000.25|    1|
|  34|2000.75|    1|
|  23| 1000.5|    1|
|  18| 3000.1|    1|
|  18|   NULL|    1|
|  29| 1500.3|    1|
|  29|   NULL|    1|
|  30|2200.75|    1|
|  20|   NULL|    1|
|  20| 1100.5|    1|
|  30|   NULL|    1|
|  31|   NULL|    1|
|  31|2500.45|    1|
+----+-------+-----+



In [78]:
# corr
# Pearson correlation coefficient between two numeric columns.

from pyspark.sql.functions import corr

df_corr = df.agg(corr(col("age"), col("salary")))
df_corr.show()

+------------------+
| corr(age, salary)|
+------------------+
|0.5991996464292686|
+------------------+



In [79]:

# collect_list
# Returns all values from an aggregated group as a list.

from pyspark.sql.functions import collect_list

# Keep the DataFrame in df_all and call show() separately: show() returns None,
# so chaining it onto the assignment would leave df_all bound to None.
df_all = df.groupBy("age").agg(collect_list("name").alias("names"))
df_all.show()


+---+---------------+
|age|          names|
+---+---------------+
| 34|[Alina Pradhan]|
| 18|   [Jake White]|
| 23|   [Anuj Yadav]|
| 45|   [Jill Black]|
| 29|  [James Brown]|
| 31|[Madhav Mishra]|
| 30|   [Jane Smith]|
| 20|     [John Doe]|
+---+---------------+

