### converting csv to json


In [0]:
data=[(100,"Apple","book",5),
      (101,"Banana","Pen",10),
      (100,"Apple","Box",8),
      (101,"Banana","Fan",1),
      (102,"Grapes","Ball",1),
      (103,"jackFruit","Ink",10)]
df=spark.createDataFrame(data,["custid","custname","itemname","quanity"])
display(df)

custid,custname,itemname,quanity
100,Apple,book,5
101,Banana,Pen,10
100,Apple,Box,8
101,Banana,Fan,1
102,Grapes,Ball,1
103,jackFruit,Ink,10


In [0]:
from pyspark.sql import functions as f
df2=df.groupBy("custid","custname").agg(f.map_from_entries(f.collect_list(f.struct("itemname","quanity")).alias("purchase")))
df2.show(truncate=False)
df2.display()

+------+---------+---------------------------------------------------------------------+
|custid|custname |map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)|
+------+---------+---------------------------------------------------------------------+
|100   |Apple    |{book -> 5, Box -> 8}                                                |
|101   |Banana   |{Pen -> 10, Fan -> 1}                                                |
|102   |Grapes   |{Ball -> 1}                                                          |
|103   |jackFruit|{Ink -> 10}                                                          |
+------+---------+---------------------------------------------------------------------+



custid,custname,"map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)"
100,Apple,"Map(book -> 5, Box -> 8)"
101,Banana,"Map(Pen -> 10, Fan -> 1)"
102,Grapes,Map(Ball -> 1)
103,jackFruit,Map(Ink -> 10)


In [0]:
my_format=[i.asDict(recursive=True) for i in df2.collect()]
print(my_format)

[{'custid': 100, 'custname': 'Apple', 'map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)': {'Box': 8, 'book': 5}}, {'custid': 101, 'custname': 'Banana', 'map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)': {'Pen': 10, 'Fan': 1}}, {'custid': 102, 'custname': 'Grapes', 'map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)': {'Ball': 1}}, {'custid': 103, 'custname': 'jackFruit', 'map_from_entries(collect_list(struct(itemname, quanity)) AS purchase)': {'Ink': 10}}]


### spliting a single column value to multiple rows in pyspark

In [0]:
df=spark.read.format("csv").option("header","false").load("dbfs:/FileStore/stream_write_1/sample.csv")
df1=df.toDF("Col1")
df1.show(truncate=False)

+--------------------------------------------------------------------------+
|Col1                                                                      |
+--------------------------------------------------------------------------+
|1001|Ram|28|Java|1002|Raj|24|Database|1004|Jam|28|DotNet|1005|Kesh|25|Java|
+--------------------------------------------------------------------------+



In [0]:
from pyspark.sql import functions as f
df2=df1.withColumn('Col2',f.regexp_replace(f.col("Col1"),"(.*?\\|){4}","$0\n"))
df2.select("Col2").show(truncate=False)


+--------------------------------------------------------------------------------+
|Col2                                                                            |
+--------------------------------------------------------------------------------+
|1001|Ram|28|Java|\n1002|Raj|24|Database|\n1004|Jam|28|DotNet|\n1005|Kesh|25|Java|
+--------------------------------------------------------------------------------+



In [0]:
df_header=['eno','ename','age','tech']
df2.select(f.explode(f.split("Col2","\n")))\
    .select(f.split("col","\|").alias("value"))\
    .select(*map(lambda i: f.col("value").getItem(df_header.index(i)).alias(i),df_header)).show()

+----+-----+---+--------+
| eno|ename|age|    tech|
+----+-----+---+--------+
|1001|  Ram| 28|    Java|
|1002|  Raj| 24|Database|
|1004|  Jam| 28|  DotNet|
|1005| Kesh| 25|    Java|
+----+-----+---+--------+



In [0]:

df_header=['eno','ename','age','tech']
df2.select(f.explode(f.split("Col2","\n")))\
    .select(f.split("col","\|").alias("value"))\
    .select(*map(lambda i: f.col("value").getItem(df_header.index(i)).alias(i),df_header)).show()

+----+-----+----+----+
| eno|ename| age|tech|
+----+-----+----+----+
|1001| 1001|1001|1001|
|1002| 1002|1002|1002|
|1004| 1004|1004|1004|
|1005| 1005|1005|1005|
+----+-----+----+----+



In [0]:
df_header=['eno','ename','age','tech']
df2.select(f.split("Col2","\n")).show(truncate=False)
df2.select(f.explode(f.split("Col2","\n"))).show(truncate=False)
df2.select(f.explode(f.split("Col2","\n")))\
    .select(f.split("col","\|").alias("value")).show(truncate=False)

+----------------------------------------------------------------------------------+
|split(Col2, \n, -1)                                                               |
+----------------------------------------------------------------------------------+
|[1001|Ram|28|Java|, 1002|Raj|24|Database|, 1004|Jam|28|DotNet|, 1005|Kesh|25|Java]|
+----------------------------------------------------------------------------------+

+---------------------+
|col                  |
+---------------------+
|1001|Ram|28|Java|    |
|1002|Raj|24|Database||
|1004|Jam|28|DotNet|  |
|1005|Kesh|25|Java    |
+---------------------+

+---------------------------+
|value                      |
+---------------------------+
|[1001, Ram, 28, Java, ]    |
|[1002, Raj, 24, Database, ]|
|[1004, Jam, 28, DotNet, ]  |
|[1005, Kesh, 25, Java]     |
+---------------------------+



### Regexp_replace and ReplaceAll

In [0]:
df=spark.read.format("csv").option("header","true").load("dbfs:/FileStore/sample.csv")
display(df)

SNo|Sname|Sub1|Sub2|Sub3|Sub4
111|apple|90|70|34|67
112|orange|67|34|98|55
113|grapes|56|89|23|90
114|mango|55|88|33|90
115|linci|89|76|89|45


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, array

spark = SparkSession.builder.appName("example").getOrCreate()

data = [(1, ("John", "Doe")), (2, ("Jane", "Smith")), (3, ("Bob", "Johnson"))]
columns = ["id", "name"]

df = spark.createDataFrame(data, columns)

df_array = df.withColumn("name_array", array(col("name.*")))

df_array.show(truncate=False)

+---+--------------+--------------+
|id |name          |name_array    |
+---+--------------+--------------+
|1  |{John, Doe}   |[John, Doe]   |
|2  |{Jane, Smith} |[Jane, Smith] |
|3  |{Bob, Johnson}|[Bob, Johnson]|
+---+--------------+--------------+

