In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *

spark = SparkSession.builder.master("local").appName("MergeDataframe").getOrCreate()
sc = spark.sparkContext

In [3]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")

"""
Merging two dataframes 4 options
withColumn, Union
Define Schema,Union
Apply Outer Join
Automate Process
"""

filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"

df1=spark.read.option('delimiter','|').csv(filepath + "MergeInput1.csv",header=True)
#.option('delimiter','~|') can't be more than one character error

df1.show(truncate=0)

+-----------------+---+
|Name             |Age|
+-----------------+---+
|Azarudeen, Shahul|25 |
|Michel, Clarke   |26 |
|Virat, Kohli     |28 |
|Andrew, Simond   |37 |
+-----------------+---+



In [19]:
df2=spark.read.option('delimiter','|').option("header",True).csv(filepath + "MergeInput2.csv")
#.option('delimiter','~|') can't be more than one character error

df2.show(truncate=0)

+---------------+---+----+
|Name           |Age|Sex |
+---------------+---+----+
|Geogre, Bush   | 59|Male|
|Flintoff, David|12 |Male|
|Adam, James    | 20|Male|
+---------------+---+----+



In [5]:
#Method1
df1_add = df1.withColumn("Sex",lit("null"))
df1_add.union(df2).show(truncate=0)

+-----------------+----+----+
|Name             |Age,|Sex |
+-----------------+----+----+
|Azarudeen, Shahul|25  |null|
|Michel, Clarke   |26  |null|
|Virat, Kohli     |28  |null|
|Andrew, Simond   |37  |null|
|Geogre, Bush     | 59 |Male|
|Flintoff, David  |12  |Male|
|Adam, James      | 20 |Male|
+-----------------+----+----+



In [7]:
#Method2
schema = StructType(
[
    StructField("Name",StringType(),True),
    StructField("Age",StringType(),True),
    StructField("Sex",StringType(),True)
]
)
df3=spark.read.option("delimiter","|").option("header",True).schema(schema).csv(filepath+"MergeInput1.csv")
df4=spark.read.option("delimiter","|").csv(filepath+"MergeInput2.csv",header=True,schema=schema)

df3.union(df4).show(truncate=0)

+-----------------+---+----+
|Name             |Age|Sex |
+-----------------+---+----+
|Azarudeen, Shahul|25 |NULL|
|Michel, Clarke   |26 |NULL|
|Virat, Kohli     |28 |NULL|
|Andrew, Simond   |37 |NULL|
|Geogre, Bush     | 59|Male|
|Flintoff, David  |12 |Male|
|Adam, James      | 20|Male|
+-----------------+---+----+



In [13]:
#Method3 Outer join
outerdf1 = spark.read.option("delimiter","|").csv(filepath+"MergeInput1.csv",header=True)
outerdf2 = spark.read.option("delimiter","|").csv(filepath+"MergeInput2.csv",header=True)
outerdf1.join(outerdf2,on=["Name","Age"],how="Outer").show(truncate=0)

+-----------------+---+----+
|Name             |Age|Sex |
+-----------------+---+----+
|Adam, James      | 20|Male|
|Andrew, Simond   |37 |NULL|
|Azarudeen, Shahul|25 |NULL|
|Flintoff, David  |12 |Male|
|Geogre, Bush     | 59|Male|
|Michel, Clarke   |26 |NULL|
|Virat, Kohli     |28 |NULL|
+-----------------+---+----+



In [21]:
#Method4 Automated Approach
autodf1 = spark.read.option("delimiter","|").csv(filepath+"MergeInput1.csv",header=True)
autodf2 = spark.read.option("delimiter","|").csv(filepath+"MergeInput2.csv",header=True)

listA = list(set(autodf1.columns)-set(autodf2.columns))
listB = list(set(autodf2.columns)-set(autodf1.columns))

for i in listA:
    autodf2 = autodf2.withColumn(i,lit("null"))

for i in listB:
    autodf1 = autodf1.withColumn(i,lit("null"))

autodf1.union(autodf2).show(truncate=0)

+-----------------+---+----+
|Name             |Age|Sex |
+-----------------+---+----+
|Azarudeen, Shahul|25 |null|
|Michel, Clarke   |26 |null|
|Virat, Kohli     |28 |null|
|Andrew, Simond   |37 |null|
|Geogre, Bush     | 59|Male|
|Flintoff, David  |12 |Male|
|Adam, James      | 20|Male|
+-----------------+---+----+

