# Split Function In DataFrame

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, size, array_contains, col

# Start (or reuse) the Spark session used by every cell in this notebook
spark = (
    SparkSession.builder
    .appName("PySparkSplitFunctions")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each record below
columns = ["EmployeeID", "Name", "Department", "Skills"]

# Employee records; the Skills field holds space-separated skill names
data = [
    (1, "Alice", "HR", "Communication Management"),
    (2, "Bob", "IT", "Programming Networking"),
    (3, "Charlie", "Finance", "Accounting Analysis"),
    (4, "David", "HR", "Recruiting Communication"),
    (5, "Eve", "IT", "Cloud DevOps"),
]

# Build the DataFrame from the records and their column names
df = spark.createDataFrame(data, schema=columns)

# Print all rows with full-width columns (long strings are not truncated)
df.show(truncate=False)

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 3, Finished, Available, Finished)

+----------+-------+----------+------------------------+
|EmployeeID|Name   |Department|Skills                  |
+----------+-------+----------+------------------------+
|1         |Alice  |HR        |Communication Management|
|2         |Bob    |IT        |Programming Networking  |
|3         |Charlie|Finance   |Accounting Analysis     |
|4         |David  |HR        |Recruiting Communication|
|5         |Eve    |IT        |Cloud DevOps            |
+----------+-------+----------+------------------------+



## 1. Split the 'Skills' Column

In [2]:
# Add Skills_Array: the Skills string broken into an array of words.
# split() interprets its pattern as a regular expression; a lone space
# matches exactly one space character.
df_split = df.withColumn(
    "Skills_Array",
    split(col("Skills"), " "),
)
df_split.show(truncate=False)

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 4, Finished, Available, Finished)

+----------+-------+----------+------------------------+---------------------------+
|EmployeeID|Name   |Department|Skills                  |Skills_Array               |
+----------+-------+----------+------------------------+---------------------------+
|1         |Alice  |HR        |Communication Management|[Communication, Management]|
|2         |Bob    |IT        |Programming Networking  |[Programming, Networking]  |
|3         |Charlie|Finance   |Accounting Analysis     |[Accounting, Analysis]     |
|4         |David  |HR        |Recruiting Communication|[Recruiting, Communication]|
|5         |Eve    |IT        |Cloud DevOps            |[Cloud, DevOps]            |
+----------+-------+----------+------------------------+---------------------------+



## 2. Select the First Skill from 'Skills_Array'

In [3]:
# Keep each employee's name alongside element 0 of their skills array
df_first_skill = df_split.select(
    "Name",
    col("Skills_Array")[0].alias("First_Skill"),
)
df_first_skill.show()

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 5, Finished, Available, Finished)

+-------+-------------+
|   Name|  First_Skill|
+-------+-------------+
|  Alice|Communication|
|    Bob|  Programming|
|Charlie|   Accounting|
|  David|   Recruiting|
|    Eve|        Cloud|
+-------+-------------+



## 3. Calculate the Size of 'Skills_Array'

In [4]:
# Pair each employee's name with the number of elements in their skills array
df_skill_count = df_split.select(
    "Name",
    size(col("Skills_Array")).alias("Skills_Count"),
)
df_skill_count.show()

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 6, Finished, Available, Finished)

+-------+------------+
|   Name|Skills_Count|
+-------+------------+
|  Alice|           2|
|    Bob|           2|
|Charlie|           2|
|  David|           2|
|    Eve|           2|
+-------+------------+



## 4. Check if 'Skills_Array' Contains a Specific Skill (e.g., 'Cloud')

In [5]:
# Flag, per employee, whether the skills array includes the exact value "Cloud"
df_cloud_check = df_split.select(
    "Name",
    array_contains(col("Skills_Array"), "Cloud").alias("Has_Cloud_Skill"),
)
df_cloud_check.show()

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 7, Finished, Available, Finished)

+-------+---------------+
|   Name|Has_Cloud_Skill|
+-------+---------------+
|  Alice|          false|
|    Bob|          false|
|Charlie|          false|
|  David|          false|
|    Eve|           true|
+-------+---------------+



## 5. Explode 'Skills_Array' into Individual Rows

In [6]:
# Emit one output row per array element, repeating the employee's ID and name
df_exploded = df_split.select(
    "EmployeeID",
    "Name",
    explode(col("Skills_Array")).alias("Individual_Skill"),
)
df_exploded.show()

StatementMeta(, 2d819380-a991-4dff-8a36-a11b3198763e, 8, Finished, Available, Finished)

+----------+-------+----------------+
|EmployeeID|   Name|Individual_Skill|
+----------+-------+----------------+
|         1|  Alice|   Communication|
|         1|  Alice|      Management|
|         2|    Bob|     Programming|
|         2|    Bob|      Networking|
|         3|Charlie|      Accounting|
|         3|Charlie|        Analysis|
|         4|  David|      Recruiting|
|         4|  David|   Communication|
|         5|    Eve|           Cloud|
|         5|    Eve|          DevOps|
+----------+-------+----------------+



## Summary of Key Functions

- **split()**: Splits a string column into an array using a delimiter pattern, which is interpreted as a regular expression (in this case, a single space).
- **explode()**: Converts an array column into multiple rows, one for each element in the array.
- **size()**: Returns the number of elements in an array.
- **array_contains()**: Checks if a specific value exists in the array.
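- **Array indexing (`col("Skills_Array")[0]`)**: Selects a single element of an array column by position, as done in step 2. Alternatively, **selectExpr()** allows you to use SQL expressions (like `Skills_Array[0]`) to select array elements.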