In [0]:
# Load the sample JSON file from DBFS. The multiLine option lets one JSON
# record span several physical lines of the file.
df = (
    spark.read
    .option("multiLine", True)
    .json("/FileStore/tables/sample_json_file.json")
)
display(df)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, ArrayType

# Get the Spark session (built with the application name "FlattenJSON").
spark = (
    SparkSession.builder
    .appName("FlattenJSON")
    .getOrCreate()
)

def flatten_df(nested_df, keep_empty_arrays=False):
    """Flatten every top-level struct and array column of a Spark DataFrame.

    The first struct/array column (in schema order) is processed repeatedly
    until none is left:

    * an array column is exploded in place, producing one row per element;
    * a struct column is replaced by one column per field, named
      ``<parent>_<field>`` and placed after the remaining columns.

    Column and field names are used verbatim. Names that contain a dot are
    backtick-quoted (columns) or looked up with ``getField`` (struct fields)
    so that Spark does not interpret the dot as a struct path.

    Args:
        nested_df: The DataFrame to flatten.
        keep_empty_arrays: When False (the default) arrays are expanded with
            ``explode``, which produces no row for a null or empty array.
            When True ``explode_outer`` is used instead, which keeps such
            rows with a null element.

    Returns:
        A DataFrame with no top-level struct or array columns. Other column
        types (including maps) are left untouched.

    Notes:
        * Exploding several array columns multiplies the rows, because each
          explode is applied to the result of the previous one.
        * If a generated ``<parent>_<field>`` name is identical to an
          existing column name, the generated column takes that column's
          place.
    """
    if keep_empty_arrays:
        # Imported here because the notebook's import cell only brings in `explode`.
        from pyspark.sql.functions import explode_outer as explode_array
    else:
        explode_array = explode

    def _quoted(name):
        # Backtick-quote a column name so dots in it are not parsed as a path.
        return "`" + name.replace("`", "``") + "`"

    while True:
        # Re-read the schema on every pass: each step changes the set of columns.
        nested_field = next(
            (
                field
                for field in nested_df.schema.fields
                if isinstance(field.dataType, (StructType, ArrayType))
            ),
            None,
        )
        if nested_field is None:
            return nested_df

        name = nested_field.name

        if isinstance(nested_field.dataType, ArrayType):
            # Same column name, so the exploded column replaces the array in place.
            nested_df = nested_df.withColumn(name, explode_array(col(_quoted(name))))
            continue

        # Struct: build the whole projection once instead of chaining one
        # withColumn per field, which would grow the query plan with every field.
        projection = {existing: col(_quoted(existing)) for existing in nested_df.columns}
        parent = projection.pop(name)
        for child in nested_field.dataType.fields:
            # Assigning to an existing key keeps that key's position, so a
            # name clash replaces the old column where it stood.
            projection[name + "_" + child.name] = parent.getField(child.name)

        nested_df = nested_df.select(
            [expression.alias(alias) for alias, expression in projection.items()]
        )

# Example usage
# Assuming you loaded a nested JSON file into `df`
# df = spark.read.json("path_to_nested.json")
# flat_df = flatten_df(df)
# flat_df.show(truncate=False)


In [0]:
# Flatten the nested DataFrame `df` and render the result.
df2 = flatten_df(df)
display(df2)