In [2]:
# Bootstrap step: findspark.init() has to run before the `pyspark` imports in
# the next cell (per the findspark docs it makes the local Spark install's
# `pyspark` package importable).
import findspark
findspark.init()

In [3]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *

# Obtain a SparkSession running on the "local" master under the app name
# "Regexp"; getOrCreate() returns an already-running session when there is one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Regexp")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [26]:
# Session tuning for this small demo.
# With SQL adaptive query execution enabled,
# spark.sql.adaptive.coalescePartitions.enabled makes the number of shuffle
# partitions dynamic.
sc.setLogLevel("Error")

spark.conf.set("spark.sql.shuffle.partitions", 3)
# Read the value back; it is not displayed because this is not the cell's
# last expression.
spark.conf.get("spark.sql.shuffle.partitions")

for aqe_flag in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(aqe_flag, "true")

"""
Open question: which ordering runs fastest?
1. filter > sort > count
2. sort > filter > count
3. cache > filter > sort
4. cache > sort > filter
"""

# Base location of the sample data, as a URI Spark can read. Set the
# DATASET_DIR environment variable to run the notebook on another machine;
# when it is unset the original local path is used.
filepath = os.environ.get(
    "DATASET_DIR",
    "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/",
)
if not filepath.endswith("/"):
    # The file name is appended by plain string concatenation below.
    filepath += "/"

# The records are pipe-delimited, so the default comma-separated parse keeps
# each line in one string column (_c0) for the regex-based splitting done in
# later cells. NOTE: a line containing a comma would be split into several
# columns by this reader.
# .option('delimiter', '~|') was rejected here with a
# "can't be more than one character" error.
df1 = spark.read.csv(filepath + "IntDelim5.csv", header=False, inferSchema=False)

df1.show(truncate=0)

+------------------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                                     |
+------------------------------------------------------------------------------------------------------------------------+
|Azar|BE|8|BigData|9273564531|Ramesh|BTech|3|Java|8439761222|Parthiban|ME|6|dotNet|8876534121|Magesh|MCA|8|DBA|9023451789|
+------------------------------------------------------------------------------------------------------------------------+



In [27]:
from pyspark.sql.functions import regexp_replace, split, explode
# Mark record boundaries: each run of five pipe-terminated fields is written
# back unchanged ($0 is the whole match) followed by "-", so the sequence
# "|-" now sits between consecutive five-field records.
df1 = df1.withColumn(
    "chk",
    regexp_replace("_c0", "(.*?\\|){5}", "$0-"),
)
df1.select("chk").show(truncate=0)

+---------------------------------------------------------------------------------------------------------------------------+
|chk                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------+
|Azar|BE|8|BigData|9273564531|-Ramesh|BTech|3|Java|8439761222|-Parthiban|ME|6|dotNet|8876534121|-Magesh|MCA|8|DBA|9023451789|
+---------------------------------------------------------------------------------------------------------------------------+



In [28]:
# Explode into multiple records with a single column: split the marked line
# on the literal "|-" boundary and emit one row per resulting piece.
# The pattern is a raw string so the regex escape "\|" reaches Spark without
# relying on Python's deprecated handling of unknown escape sequences.
df2 = (
    df1.withColumn("col_explode", explode(split("chk", r"\|-")))
    .select("col_explode")
)
df2.show(truncate=0)

+--------------------------------+
|col_explode                     |
+--------------------------------+
|Azar|BE|8|BigData|9273564531    |
|Ramesh|BTech|3|Java|8439761222  |
|Parthiban|ME|6|dotNet|8876534121|
|Magesh|MCA|8|DBA|9023451789     |
+--------------------------------+



In [31]:
# Show the second pipe-delimited field (index 1) of every record.
# Raw string: "\|" is a regex escape for a literal pipe, not a Python escape.
df2.select(split("col_explode", r"\|")[1]).show()

+-----------------------------+
|split(col_explode, \|, -1)[1]|
+-----------------------------+
|                           BE|
|                        BTech|
|                           ME|
|                          MCA|
+-----------------------------+



In [34]:
# Drop to the RDD API: break each record string into its fields with Python's
# str.split on "|" and bring the resulting lists back to the driver.
(
    df2.select("col_explode")
    .rdd
    .map(lambda record: record[0].split("|"))
    .collect()
)

[['Azar', 'BE', '8', 'BigData', '9273564531'],
 ['Ramesh', 'BTech', '3', 'Java', '8439761222'],
 ['Parthiban', 'ME', '6', 'dotNet', '8876534121'],
 ['Magesh', 'MCA', '8', 'DBA', '9023451789']]

In [36]:
# RDD of field lists: one list of pipe-separated values per record.
df2_rdd = (
    df2.select("col_explode")
    .rdd
    .map(lambda record: record[0].split("|"))
)

# Back to a DataFrame, naming the five positional fields.
df2_rdd.toDF(
    ["Name", "Edu", "YrsofExp", "Tech", "MobNum"]
).show()

+---------+-----+--------+-------+----------+
|     Name|  Edu|YrsofExp|   Tech|    MobNum|
+---------+-----+--------+-------+----------+
|     Azar|   BE|       8|BigData|9273564531|
|   Ramesh|BTech|       3|   Java|8439761222|
|Parthiban|   ME|       6| dotNet|8876534121|
|   Magesh|  MCA|       8|    DBA|9023451789|
+---------+-----+--------+-------+----------+

