In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local").appName("AmbiguousColumns").getOrCreate()
sc = spark.sparkContext

In [4]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")

"""
Identify ambiguous column names
1. If it is from csv, read csv will append column names by index
2. for json we need to handle
"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = spark.read.option("header",True) \
                .option("delimiter",",") \
                .csv(filepath + "IntAmbiguousCSV.csv",inferSchema=True)
df.show()

+-----------+-----------+------------+---------------+-------+--------+------+
|CustomerId0|CustomerId1|CustomerName|       dateTime| Amount|discount|Member|
+-----------+-----------+------------+---------------+-------+--------+------+
|       1001|       1001|        Arun| 7/15/2020 1:01|2465.22|     10%|  true|
|       1005|       1005|      Barath|7/13/2020 12:15|8399.34|      5%|  true|
|       1003|       1003|      Charle|7/18/2020 20:10|1234.88|      3%| false|
|       1004|       1004|       Gokul|7/15/2020 11:11| 1690.0|      1%|  true|
|       1005|       1005|       Messy|7/18/2020 15:11|  160.0|      3%|  true|
|       1006|       1006|      Gerold| 7/8/2020 14:16| 2546.0|      1%|  true|
+-----------+-----------+------------+---------------+-------+--------+------+



In [12]:
#Json
df_json = spark.read.option("multiline",True).option("escape","\"").json(filepath+"IntAmbiguous.json")

In [13]:
df_json.printSchema()

root
 |-- Delivery: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- mob: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- product: string (nullable = true)



In [9]:
df_json.show()

+--------------------+-------+---------+
|            Delivery|   name|  product|
+--------------------+-------+---------+
|{Chennai, 1234567...|   AZAR|Headphone|
|{Bangalore, 57386...|Bharath|  T-shirt|
+--------------------+-------+---------+



In [10]:
df_json.select("*",col("Delivery.*")).show()

+--------------------+-------+---------+---------+-------+-----------+
|            Delivery|   name|  product|  address|    mob|       name|
+--------------------+-------+---------+---------+-------+-----------+
|{Chennai, 1234567...|   AZAR|Headphone|  Chennai|1234567|  Azarudeen|
|{Bangalore, 57386...|Bharath|  T-shirt|Bangalore|5738612|Bharathiraj|
+--------------------+-------+---------+---------+-------+-----------+



In [27]:
df1 = df_json.select("*",col("Delivery.*")).drop("Delivery")

In [28]:
#withColumns - we can take index of a column, count of a column
df1.columns

['name', 'product', 'address', 'mob', 'name']

In [41]:
lst = []
lst_cols = df1.columns

In [51]:
for i in lst_cols:
    if lst_cols.count(i) == 2:
        ind=lst_cols.index(i)
        lst.append(ind)

In [52]:
lst1=[]
x=1
for i in lst_cols:
    if i in lst:
        i = i+str(x)
        x=x+1
    lst1.append(i)
print(lst1)

['name_0', 'product', 'address', 'mob', 'name']


In [43]:
lst

[0, 0]

In [44]:
lst1 = list(set(lst))
lst1

[0]

In [45]:
for i in lst1:
    lst_cols[i] = lst_cols[i] + "_0"

lst_cols

['name_0', 'product', 'address', 'mob', 'name']

In [49]:
#changing the schema of exisiting df with toDF method
df1 = df1.toDF(*lst_cols)
df1.printSchema()

root
 |-- name_0: string (nullable = true)
 |-- product: string (nullable = true)
 |-- address: string (nullable = true)
 |-- mob: string (nullable = true)
 |-- name: string (nullable = true)



In [50]:
df1.show()

+-------+---------+---------+-------+-----------+
| name_0|  product|  address|    mob|       name|
+-------+---------+---------+-------+-----------+
|   AZAR|Headphone|  Chennai|1234567|  Azarudeen|
|Bharath|  T-shirt|Bangalore|5738612|Bharathiraj|
+-------+---------+---------+-------+-----------+

