## Setting Environment Variables

In [1]:
import os
import sys
# Make the PySpark workers and the driver use the same interpreter as this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

## Creating SparkSession

In [2]:
from pyspark.sql import SparkSession
# Build the session for this notebook (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName("built-in2")
    .getOrCreate()
)

## split() function

### Dataframe

In [3]:
# Sample employee rows. The first two names are written with ", " as the separator,
# the remaining ones with a bare ",".
columns = ["name", "dob_year", "gender", "salary"]
data = [
    ("James, A, Smith", "2018", "M", 3000),
    ("Michael, Rose, Jones", "2010", "M", 4000),
    ("Robert,K,Williams", "2010", "M", 4000),
    ("Maria,Anne,Jones", "2005", "F", 4000),
    ("Jen,Mary,Brown", "2010", "", -1),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+--------------------+--------+------+------+
|                name|dob_year|gender|salary|
+--------------------+--------+------+------+
|     James, A, Smith|    2018|     M|  3000|
|Michael, Rose, Jones|    2010|     M|  4000|
|   Robert,K,Williams|    2010|     M|  4000|
|    Maria,Anne,Jones|    2005|     F|  4000|
|      Jen,Mary,Brown|    2010|      |    -1|
+--------------------+--------+------+------+



### Converting String to Array

In [4]:
from pyspark.sql.functions import split
# split() turns a delimited string column into an array column.
# The select() projects only the aliased array, so "name" is already absent from
# the result and no drop() is needed afterwards.
# The pattern here is a bare comma, so names written with ", " keep a leading
# space in their elements; a pattern such as r",\s*" would strip it.
df.select(split(df.name, ",").alias("full_name")).show(truncate=False)

+------------------------+
|full_name               |
+------------------------+
|[James,  A,  Smith]     |
|[Michael,  Rose,  Jones]|
|[Robert, K, Williams]   |
|[Maria, Anne, Jones]    |
|[Jen, Mary, Brown]      |
+------------------------+



### Converting String to Array using SQL Query

In [5]:
# Register the DataFrame as a temp view so the same split can be written in SQL.
df.createOrReplaceTempView("employee_hr")

split_query = """
            SELECT name, SPLIT(name, ",") as full_name FROM employee_hr
            """
spark.sql(split_query).show()

+--------------------+--------------------+
|                name|           full_name|
+--------------------+--------------------+
|     James, A, Smith| [James,  A,  Smith]|
|Michael, Rose, Jones|[Michael,  Rose, ...|
|   Robert,K,Williams|[Robert, K, Willi...|
|    Maria,Anne,Jones|[Maria, Anne, Jones]|
|      Jen,Mary,Brown|  [Jen, Mary, Brown]|
+--------------------+--------------------+



## concat_ws() function

### Dataframe

In [6]:
# One row per student: a comma-separated name, an array of languages, and a state code.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], "NV"),
]
columns = ["name", "languagesAtSchool", "currentState"]

df = spark.createDataFrame(data, columns)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|name            |languagesAtSchool |currentState|
+----------------+------------------+------------+
|James,,Smith    |[Java, Scala, C++]|CA          |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |
|Robert,,Williams|[CSharp, VB]      |NV          |
+----------------+------------------+------------+



### Combining array elements

In [7]:
from pyspark.sql.functions import concat_ws
# concat_ws() joins the elements of the array column into one ";"-separated string.
df.select(
    "languagesAtSchool",
    concat_ws(";", "languagesAtSchool").alias("new_col"),
).show()

+------------------+--------------+
| languagesAtSchool|       new_col|
+------------------+--------------+
|[Java, Scala, C++]|Java;Scala;C++|
|[Spark, Java, C++]|Spark;Java;C++|
|      [CSharp, VB]|     CSharp;VB|
+------------------+--------------+



### Using SQL

In [8]:
# Expose the DataFrame to SQL and join the language array with CONCAT_WS.
df.createOrReplaceTempView("student")

concat_query = """
            SELECT name, CONCAT_WS(";", languagesAtSchool) as lang
            FROM student
            """
spark.sql(concat_query).show()

+----------------+--------------+
|            name|          lang|
+----------------+--------------+
|    James,,Smith|Java;Scala;C++|
|   Michael,Rose,|Spark;Java;C++|
|Robert,,Williams|     CSharp;VB|
+----------------+--------------+



## substring() function

### Dataframe

In [9]:
# Two rows whose "calendar" value is an 8-character yyyyMMdd string.
columns = ["id", "calendar"]
data = [
    (1, "20200828"),
    (2, "20180525"),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------+
| id|calendar|
+---+--------+
|  1|20200828|
|  2|20180525|
+---+--------+



### withColumn using substring

In [10]:
from pyspark.sql.functions import substring
# substring(col, pos, len) uses a 1-based start position.
with_year = df.withColumn("Year", substring("calendar", 1, 4))
with_month = with_year.withColumn("Month", substring("calendar", 5, 2))
with_date = with_month.withColumn("Date", substring("calendar", 7, 2))
with_date.show()

+---+--------+----+-----+----+
| id|calendar|Year|Month|Date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



### select using substring

In [11]:
# Same slices as above, expressed as aliased columns in a single select().
df.select(
    "calendar",
    substring("calendar", 1, 4).alias("Year"),
    substring("calendar", 5, 2).alias("Month"),
    substring("calendar", 7, 2).alias("Date"),
).show()

+--------+----+-----+----+
|calendar|Year|Month|Date|
+--------+----+-----+----+
|20200828|2020|   08|  28|
|20180525|2018|   05|  25|
+--------+----+-----+----+



### with selectExpr()

In [12]:
# selectExpr() takes SQL expression strings, so substring and the alias live in the text.
date_part_exprs = [
    "calendar",
    "substring(calendar, 1, 4) as year",
    "substring(calendar, 5, 2) as month",
    "substring(calendar, 7, 2) as date",
]
df.selectExpr(*date_part_exprs).show()

+--------+----+-----+----+
|calendar|year|month|date|
+--------+----+-----+----+
|20200828|2020|   08|  28|
|20180525|2018|   05|  25|
+--------+----+-----+----+



### substr() from column

In [13]:
# Column.substr(startPos, length) is the method form of substring(); no import is needed.
(
    df.withColumn("year", df["calendar"].substr(1, 4))
    .withColumn("month", df["calendar"].substr(5, 2))
    .withColumn("date", df["calendar"].substr(7, 2))
    .show()
)

+---+--------+----+-----+----+
| id|calendar|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



### Using SQL

In [14]:
# Register the calendar DataFrame as the "details" view and slice it with SQL SUBSTRING.
df.createOrReplaceTempView("details")

date_parts_query = """
            SELECT id, calendar,
            SUBSTRING(calendar, 1, 4) as year,
            SUBSTRING(calendar, 5, 2) as month,
            SUBSTRING(calendar, 7, 2) as date
            from details
        """
spark.sql(date_parts_query).show()

+---+--------+----+-----+----+
| id|calendar|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



In [15]:
# Query text held in a variable; it reads from the "details" temp view.
# Adjacent string literals are concatenated by Python into one single-line statement.
select_statement = (
    "SELECT id, calendar, "
    "SUBSTRING(calendar, 1, 4) as year, "
    "SUBSTRING(calendar, 5, 2) as month, "
    "SUBSTRING(calendar, 7, 2) as date "
    "from details"
)
# Pass the string straight through: wrapping it in an f-string adds nothing.
spark.sql(select_statement).show()

+---+--------+----+-----+----+
| id|calendar|year|month|date|
+---+--------+----+-----+----+
|  1|20200828|2020|   08|  28|
|  2|20180525|2018|   05|  25|
+---+--------+----+-----+----+



## regexp_replace() function

### DataFrame

In [16]:
# Three addresses, each ending in an abbreviated street type, plus a state code.
address = [
    (1, "14851 Jeffrey Rd", "DE"),
    (2, "43421 Margarita St", "NY"),
    (3, "13111 Siemon Ave", "CA"),
]
address_columns = ["id", "address", "state"]

df = spark.createDataFrame(address, schema=address_columns)
df.show()

+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



### Replace a String in a Column

In [17]:
from pyspark.sql.functions import regexp_replace
# Replace "Rd" with "Road" and show the result as new_addr in place of address.
df.select(
    "id",
    "state",
    regexp_replace("address", "Rd", "Road").alias("new_addr"),
).show()

+---+-----+------------------+
| id|state|          new_addr|
+---+-----+------------------+
|  1|   DE|14851 Jeffrey Road|
|  2|   NY|43421 Margarita St|
|  3|   CA|  13111 Siemon Ave|
+---+-----+------------------+



### Replace Column values conditionally

In [18]:
from pyspark.sql.functions import when
# Expand the street-type abbreviation only when it is the suffix of the address.
# Each pattern is anchored with `$` so it rewrites just the trailing token. The
# condition tests the suffix, so an unanchored pattern would also rewrite the same
# letters earlier in the string (e.g. "12 Stone St" -> "12 Streetone Street").
df.withColumn(
    "address",
    when(df.address.endswith("Rd"), regexp_replace(df.address, "Rd$", "Road"))
    .when(df.address.endswith("St"), regexp_replace(df.address, "St$", "Street"))
    .when(df.address.endswith("Ave"), regexp_replace(df.address, "Ave$", "Avenue"))
    .otherwise(df.address),  # anything else is passed through unchanged
).show(truncate=False)

+---+----------------------+-----+
|id |address               |state|
+---+----------------------+-----+
|1  |14851 Jeffrey Road    |DE   |
|2  |43421 Margarita Street|NY   |
|3  |13111 Siemon Avenue   |CA   |
+---+----------------------+-----+



### Replace column with another column value

In [23]:
# Single-row frame: col1 is the text, col2 the pattern to find, col3 its replacement.
df3 = spark.createDataFrame(
    data=[("ABCDE_XYZ", "XYZ", "FGH")],
    schema=("col1", "col2", "col3"),
)

In [28]:
from pyspark.sql.functions import expr
# For each row, take the text in col1 and replace every match of that row's col2
# value with that row's col3 value. The call is written as a SQL expression so the
# pattern and the replacement can both come from columns.
replaced_col = expr("regexp_replace(col1, col2, col3)")
df3.withColumn("new_col", replaced_col).show()

+---------+----+----+---------+
|     col1|col2|col3|  new_col|
+---------+----+----+---------+
|ABCDE_XYZ| XYZ| FGH|ABCDE_FGH|
+---------+----+----+---------+



### Replace column values with dictionary values

In [19]:
# Map state abbreviations to full names without leaving the DataFrame API.
# DataFrame.replace() with a dict swaps matching values in the chosen columns and
# leaves any value that is not a key untouched. The earlier RDD lambda
# (stateDict[x.state]) raised KeyError on any state missing from the dict.
stateDict = {"DE": "Delaware", "NY": "NewYork", "CA": "California"}
df2 = df.replace(stateDict, subset=["state"])
df2.show()

+---+------------------+----------+
| id|           address|     state|
+---+------------------+----------+
|  1|  14851 Jeffrey Rd|  Delaware|
|  2|43421 Margarita St|   NewYork|
|  3|  13111 Siemon Ave|California|
+---+------------------+----------+



## translate() function

In [20]:
from pyspark.sql.functions import translate
# translate() substitutes character by character: 1 -> A, 2 -> B, 3 -> C.
translated_address = translate("address", "123", "ABC")
df.withColumn("new_add", translated_address).show()

+---+------------------+-----+------------------+
| id|           address|state|           new_add|
+---+------------------+-----+------------------+
|  1|  14851 Jeffrey Rd|   DE|  A485A Jeffrey Rd|
|  2|43421 Margarita St|   NY|4C4BA Margarita St|
|  3|  13111 Siemon Ave|   CA|  ACAAA Siemon Ave|
+---+------------------+-----+------------------+



## overlay() function

In [30]:
# Single-row frame for overlay(): col1 is the base text, col2 the text laid over it.
df4 = spark.createDataFrame(
    data=[("ABCDE_XYZ", "FGH")],
    schema=("col1", "col2"),
)
df4.printSchema()

root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)



In [37]:
from pyspark.sql.functions import overlay
# overlay(src, replace, pos) writes `replace` over `src` starting at 1-based `pos`.
# Position 9 is the last character of col1, so the result shown is "ABCDE_XYFGH".
overlaid = overlay("col1", "col2", 9)
df4.withColumn("new_overlay", overlaid).show()

+---------+----+-----------+
|     col1|col2|new_overlay|
+---------+----+-----------+
|ABCDE_XYZ| FGH|ABCDE_XYFGH|
+---------+----+-----------+

