In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as FS


In [2]:
spark = SparkSession.builder.master("local[2]").appName("Spark-Query").getOrCreate()

In [3]:
spark

**1. Given 2 dataframe having different number of columns structure how will you merge it on row basis.**

In [4]:
data1= [(1,"manish","data engineer"),(2,"rani","data analyst"),(3,"manju","data science")]
column1 =["id","name","department"]

data2= [(3,"harish"),(4,"monish"),(5,"priti")]
column2 =["id","name"]
dataframe1 = spark.createDataFrame(data = data1,schema =column1 )
dataframe2 = spark.createDataFrame(data = data2,schema =column2 )

In [5]:
dataframe1.show()

+---+------+-------------+
| id|  name|   department|
+---+------+-------------+
|  1|manish|data engineer|
|  2|  rani| data analyst|
|  3| manju| data science|
+---+------+-------------+



In [6]:
dataframe2.show()

+---+------+
| id|  name|
+---+------+
|  3|harish|
|  4|monish|
|  5| priti|
+---+------+



**Apply Union to merge data on row basis. but, union operation require no: of columns should be same.**

In [7]:
dataframe2 = dataframe2.withColumn("department",FS.lit('null'))

In [8]:
dataframe2.show()

+---+------+----------+
| id|  name|department|
+---+------+----------+
|  3|harish|      null|
|  4|monish|      null|
|  5| priti|      null|
+---+------+----------+



In [9]:
answ1 = dataframe1.union(dataframe2)

In [10]:
answ1.show()

+---+------+-------------+
| id|  name|   department|
+---+------+-------------+
|  1|manish|data engineer|
|  2|  rani| data analyst|
|  3| manju| data science|
|  3|harish|         null|
|  4|monish|         null|
|  5| priti|         null|
+---+------+-------------+



**2. how to convert any dataframe into sql views/tables**

In [11]:
answ1.createOrReplaceTempView("ans_view")

In [12]:
spark.sql("select * from ans_view").show()

+---+------+-------------+
| id|  name|   department|
+---+------+-------------+
|  1|manish|data engineer|
|  2|  rani| data analyst|
|  3| manju| data science|
|  3|harish|         null|
|  4|monish|         null|
|  5| priti|         null|
+---+------+-------------+



**3. how to convert comma separated values into sql columns<br>**
d = ("122334","2221",324251")

In [13]:
d = [{'name':"khan","id":"mango,banana,orange"}]
df1 = spark.createDataFrame(data = d )

In [14]:
df1.show()

+-------------------+----+
|                 id|name|
+-------------------+----+
|mango,banana,orange|khan|
+-------------------+----+



In [15]:
# df1 = df1.withColumn("id", FS.explode(FS.array(df1.id)))
# Note: explode not working here, will check later
df1 = df1.withColumn("id",FS.split("id",","))
df1.show()

+--------------------+----+
|                  id|name|
+--------------------+----+
|[mango, banana, o...|khan|
+--------------------+----+



In [16]:

df1.select("name",FS.explode("id")).show()

+----+------+
|name|   col|
+----+------+
|khan| mango|
|khan|banana|
|khan|orange|
+----+------+



**3. How to convert multiple rows into single rows**

In [17]:
lang_data = [('James','Java'),
  ('James','Python'),
  ('James','Python'),
  ('Anna','PHP'),
  ('Anna','Javascript'),
  ('Maria','Java'),
  ('Maria','C++'),
  ('James','Scala'),
  ('Anna','PHP'),
  ('Anna','HTML')
]
column = ['name','language']
datafrm = spark.createDataFrame(data = lang_data, schema = column)
datafrm.show()

+-----+----------+
| name|  language|
+-----+----------+
|James|      Java|
|James|    Python|
|James|    Python|
| Anna|       PHP|
| Anna|Javascript|
|Maria|      Java|
|Maria|       C++|
|James|     Scala|
| Anna|       PHP|
| Anna|      HTML|
+-----+----------+



In [18]:
datafrm.groupBy("name").agg(FS.collect_set("language").alias("lang")).show()
# datafrm.groupBy('name').agg(FS.collect_list("language").alias("prog")).show()

+-----+--------------------+
| name|                lang|
+-----+--------------------+
|James|[Scala, Java, Pyt...|
| Anna|[PHP, Javascript,...|
|Maria|         [Java, C++]|
+-----+--------------------+



**4. Format the state_dataframe to below format**
```
+---------+
|city_name|
+---------+
|      Goa|
|       AP|
|      Blr|
+---------+

```

In [19]:
state_data = [('Goa',None,None),(None,'AP',None),(None,None,'Blr')]
schema = "city1 string,city2 string,city3 string"
state_df = spark.createDataFrame(data =state_data,schema =  schema)
state_df.show()

+-----+-----+-----+
|city1|city2|city3|
+-----+-----+-----+
|  Goa| NULL| NULL|
| NULL|   AP| NULL|
| NULL| NULL|  Blr|
+-----+-----+-----+



In [20]:
state_df.withColumn("city_name",FS.coalesce(state_df.city1,state_df.city2,state_df.city3)).\
drop("city1").drop("city2").drop("city3").show()

+---------+
|city_name|
+---------+
|      Goa|
|       AP|
|      Blr|
+---------+

