# Transformation part 6
### SPLIT, INDEXING, EXPLODE, ARRAY_CONTAINS

In [0]:
# Load the students sample CSV. The first line of the file supplies the column
# names, and Spark infers each column's type from the data.
df = spark.read.csv(
  "/Volumes/learn_spark/bronze/datasets/Datasets/students_sample.csv",
  header=True,
  inferSchema=True,
)


In [0]:
# Preview the first 10 rows of the students DataFrame.
df.show(n=10)

+----------+------+------+---+-----+-----+---------+
|student_id|  name|gender|age|class|marks|     city|
+----------+------+------+---+-----+-----+---------+
|         1| Aarav|     M| 15|  10A|   85|    Delhi|
|         2|Ananya|     F| 14|   9B|   78|   Mumbai|
|         3| Rohan|     M| 16|  10B|   92|     Pune|
|         4| Priya|     F| 15|  10A|   88|Bangalore|
|         5| Karan|     M| 14|   9A|   67|  Chennai|
|         6|  Neha|     F| 15|  10C|   90|Hyderabad|
|         7| Arjun|     M| 16|  11A|   73|    Delhi|
|         8|  Isha|     F| 14|   9C|   81|   Jaipur|
|         9|Vikram|     M| 15|  10B|   59|   Indore|
|        10| Sneha|     F| 16|  11B|   95|     Pune|
+----------+------+------+---+-----+-----+---------+
only showing top 10 rows


### split --> splits a string column on a delimiter pattern and returns an array (list) column

In [0]:
# Import only the functions this notebook uses. A wildcard import from
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
from pyspark.sql.functions import array_contains, explode, split

# split() replaces the string column with an array column. The sample `class`
# values (e.g. 10A, 9B) contain no spaces, so each array holds a single element.
df.withColumn('class', split('class', ' ')).show()

+----------+------+------+---+-----+-----+----------+
|student_id|  name|gender|age|class|marks|      city|
+----------+------+------+---+-----+-----+----------+
|         1| Aarav|     M| 15|[10A]|   85|     Delhi|
|         2|Ananya|     F| 14| [9B]|   78|    Mumbai|
|         3| Rohan|     M| 16|[10B]|   92|      Pune|
|         4| Priya|     F| 15|[10A]|   88| Bangalore|
|         5| Karan|     M| 14| [9A]|   67|   Chennai|
|         6|  Neha|     F| 15|[10C]|   90| Hyderabad|
|         7| Arjun|     M| 16|[11A]|   73|     Delhi|
|         8|  Isha|     F| 14| [9C]|   81|    Jaipur|
|         9|Vikram|     M| 15|[10B]|   59|    Indore|
|        10| Sneha|     F| 16|[11B]|   95|      Pune|
|        11| Rahul|     M| 15|[10C]|   64|    Nagpur|
|        12| Pooja|     F| 14| [9A]|   72|    Bhopal|
|        13|  Amit|     M| 16|[11A]|   86|     Delhi|
|        14| Kavya|     F| 15|[10B]|   91| Ahmedabad|
|        15|Suresh|     M| 14| [9B]|   58|     Surat|
|        16| Meena|     F| 1

In [0]:
# Load the orders sample as multi-line JSON (records may span several lines).
# NOTE(review): inferSchema is a CSV reader option; presumably the JSON reader
# ignores it and always infers the schema — confirm before removing.
df_json = (
  spark.read
  .options(multiline=True, inferSchema=True)
  .json("/Volumes/learn_spark/bronze/datasets/Datasets/orders_sample.json")
)


In [0]:
# Keep only the order/customer columns used below and add `list_name`: the
# customer name split on single spaces into an array of tokens.
# This rebinds df_json, so later cells see only these six columns.
df_json = (
    df_json
    .select('category', 'customer_id', 'customer_name', 'order_date', 'order_id')
    .withColumn('list_name', split('customer_name', ' '))
)

In [0]:
# Preview the first 5 orders, including the new `list_name` array column.
df_json.show(n=5)

+-----------+-----------+-------------+----------+--------+------------+------------+---------------+
|   category|customer_id|customer_name|order_date|order_id|order_status|payment_mode|      list_name|
+-----------+-----------+-------------+----------+--------+------------+------------+---------------+
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|   Delivered| Credit Card|[Aarav, Sharma]|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|   Delivered|         UPI|  [Neha, Verma]|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003|     Shipped|  Debit Card| [Rohit, Gupta]|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004|     Pending| Net Banking| [Pooja, Singh]|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|   Delivered|         UPI|  [Amit, Patel]|
+-----------+-----------+-------------+----------+--------+------------+------------+---------------+



### Indexing --> fetches a single element from an array column by its (0-based) position

In [0]:
# Pull individual tokens out of the split name: position 0 becomes first_name
# and position 1 becomes last_name.
(
    df_json
    .withColumn('first_name', split('customer_name', ' ').getItem(0))
    .withColumn('last_name', split('customer_name', ' ').getItem(1))
    .show()
)
    


+-----------+-----------+-------------+----------+--------+---------------+----------+---------+
|   category|customer_id|customer_name|order_date|order_id|      list_name|first_name|last_name|
+-----------+-----------+-------------+----------+--------+---------------+----------+---------+
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|[Aarav, Sharma]|     Aarav|   Sharma|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|  [Neha, Verma]|      Neha|    Verma|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003| [Rohit, Gupta]|     Rohit|    Gupta|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004| [Pooja, Singh]|     Pooja|    Singh|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|  [Amit, Patel]|      Amit|    Patel|
+-----------+-----------+-------------+----------+--------+---------------+----------+---------+



### explode --> turns each element of an array (list) column into its own row

In [0]:
# Show the DataFrame before exploding: `list_name` is still an array per order.
df_json.show(n=5)

+-----------+-----------+-------------+----------+--------+---------------+
|   category|customer_id|customer_name|order_date|order_id|      list_name|
+-----------+-----------+-------------+----------+--------+---------------+
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|[Aarav, Sharma]|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|  [Neha, Verma]|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003| [Rohit, Gupta]|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004| [Pooja, Singh]|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|  [Amit, Patel]|
+-----------+-----------+-------------+----------+--------+---------------+



In [0]:
# explode() emits one output row per array element, so each order is repeated
# once per name token and `list_name` now holds a single string.
(
    df_json
    .withColumn('list_name', explode('list_name'))
    .show(n=20)
)

+-----------+-----------+-------------+----------+--------+---------+
|   category|customer_id|customer_name|order_date|order_id|list_name|
+-----------+-----------+-------------+----------+--------+---------+
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|    Aarav|
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|   Sharma|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|     Neha|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|    Verma|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003|    Rohit|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003|    Gupta|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004|    Pooja|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004|    Singh|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|     Amit|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|    Patel|
+-----------+-----------+-------------+----------+--------+---------+



### array_contains --> returns true if the array column contains the given value, otherwise false

In [0]:
# (Removed a stray `clear(0)` call: `clear` is not a Python name defined in this
# notebook, and the call only emitted terminal clear-screen escape codes.)

[H[2J

In [0]:
# Add a boolean `present` column: true when the `list_name` array contains the
# exact element 'Aarav', false otherwise.
(
    df_json
    .withColumn('present', array_contains('list_name', 'Aarav'))
    .show(n=10)
)

+-----------+-----------+-------------+----------+--------+---------------+-------+
|   category|customer_id|customer_name|order_date|order_id|      list_name|present|
+-----------+-----------+-------------+----------+--------+---------------+-------+
|Electronics|       C101| Aarav Sharma|2024-01-15|    1001|[Aarav, Sharma]|   true|
|Electronics|       C102|   Neha Verma|2024-01-17|    1002|  [Neha, Verma]|  false|
|  Furniture|       C103|  Rohit Gupta|2024-02-02|    1003| [Rohit, Gupta]|  false|
|  Furniture|       C104|  Pooja Singh|2024-02-10|    1004| [Pooja, Singh]|  false|
|Accessories|       C105|   Amit Patel|2024-03-01|    1005|  [Amit, Patel]|  false|
+-----------+-----------+-------------+----------+--------+---------------+-------+

