In [0]:
array_data =[
    ('John',4, 1),
    ('John', 6, 2),
    ('David', 7, 3),
    ('Mike', 3, 4),
    ('David', 5 , 2),
    ('John', 7, 3),
    ('John',9,7),
    ('David', 1 , 8),
    ('David', 4 , 9),
    ('David', 7 , 4),
    ('Mike',8,5),
    ('Mike',5,2),
    ('Mike',3,8),
    ('John',2,7),
    ('David', 1 , 9)
]

array_schema = ['Name','Score_1','Score_2']
arrayDF = spark.createDataFrame(data = array_data, schema=array_schema)
display(arrayDF)

Name,Score_1,Score_2
John,4,1
John,6,2
David,7,3
Mike,3,4
David,5,2
John,7,3
John,9,7
David,1,8
David,4,9
David,7,4


####Convert Sample DataFrame into Array Dataframe

In [0]:
from pyspark.sql import functions as f 

masterDF = arrayDF.groupby('Name').agg(f.collect_list('Score_1').alias('array_score_1'),(f.collect_list('Score_2').alias('array_score_2')))
display(masterDF)
masterDF.printSchema()

Name,array_score_1,array_score_2
John,"List(4, 6, 7, 9, 2)","List(1, 2, 3, 7, 7)"
David,"List(7, 5, 1, 4, 7, 1)","List(3, 2, 8, 9, 4, 9)"
Mike,"List(3, 8, 5, 3)","List(4, 5, 2, 8)"


root
 |-- Name: string (nullable = true)
 |-- array_score_1: array (nullable = false)
 |    |-- element: long (containsNull = false)
 |-- array_score_2: array (nullable = false)
 |    |-- element: long (containsNull = false)



####Apply array_zip function on array DF

In [0]:
array_zip_df = masterDF.withColumn('Zipped_value',f.arrays_zip('array_score_1','array_score_2'))
array_zip_df.show(10, False)
#display(array_zip_df)

+-----+------------------+------------------+------------------------------------------------+
|Name |array_score_1     |array_score_2     |Zipped_value                                    |
+-----+------------------+------------------+------------------------------------------------+
|John |[4, 6, 7, 9, 2]   |[1, 2, 3, 7, 7]   |[{4, 1}, {6, 2}, {7, 3}, {9, 7}, {2, 7}]        |
|David|[7, 5, 1, 4, 7, 1]|[3, 2, 8, 9, 4, 9]|[{7, 3}, {5, 2}, {1, 8}, {4, 9}, {7, 4}, {1, 9}]|
|Mike |[3, 8, 5, 3]      |[4, 5, 2, 8]      |[{3, 4}, {8, 5}, {5, 2}, {3, 8}]                |
+-----+------------------+------------------+------------------------------------------------+



## Apply Array_Intersect

In [0]:
empDF = [
    ('John',[4,6,7,9,2],[1,2,3,7,7]),
    ('David',[7,5,1,4,7,1],[3,2,8,9,4,9]),
    ('Mike',[3,9,1,6,2],[1,2,3,5,8])
]

df = spark.createDataFrame(data=empDF, schema=['Name','Array_1','Array_2'])
display(df)

Name,Array_1,Array_2
John,"List(4, 6, 7, 9, 2)","List(1, 2, 3, 7, 7)"
David,"List(7, 5, 1, 4, 7, 1)","List(3, 2, 8, 9, 4, 9)"
Mike,"List(3, 9, 1, 6, 2)","List(1, 2, 3, 5, 8)"


In [0]:
##Apply Array_Intersect
from pyspark.sql import functions as f
outputDF = df.withColumn('Intersect', f.array_intersect('Array_1','Array_2'))
display(outputDF)

Name,Array_1,Array_2,Intersect
John,"List(4, 6, 7, 9, 2)","List(1, 2, 3, 7, 7)","List(7, 2)"
David,"List(7, 5, 1, 4, 7, 1)","List(3, 2, 8, 9, 4, 9)",List(4)
Mike,"List(3, 9, 1, 6, 2)","List(1, 2, 3, 5, 8)","List(3, 1, 2)"


## Apply Array_Except

In [0]:
empDF = [
    ('John',[4,6,7,9,2],[1,2,3,7,7]),
    ('David',[7,5,1,4,7,1],[3,2,8,9,4,9]),
    ('Mike',[3,9,1,6,2],[1,2,3,5,8])
]

df = spark.createDataFrame(data=empDF, schema=['Name','Array_1','Array_2'])
display(df)

Name,Array_1,Array_2
John,"List(4, 6, 7, 9, 2)","List(1, 2, 3, 7, 7)"
David,"List(7, 5, 1, 4, 7, 1)","List(3, 2, 8, 9, 4, 9)"
Mike,"List(3, 9, 1, 6, 2)","List(1, 2, 3, 5, 8)"


In [0]:
##Apply Array_Except
from pyspark.sql import functions as f
outputDF = df.withColumn('Intersect', f.array_except('Array_1','Array_2'))
display(outputDF)

Name,Array_1,Array_2,Intersect
John,"List(4, 6, 7, 9, 2)","List(1, 2, 3, 7, 7)","List(4, 6, 9)"
David,"List(7, 5, 1, 4, 7, 1)","List(3, 2, 8, 9, 4, 9)","List(7, 5, 1)"
Mike,"List(3, 9, 1, 6, 2)","List(1, 2, 3, 5, 8)","List(9, 6)"


##Apply Array_Sort

In [0]:
empDF = [
    ('John',[4,6,7,9,2]),
    ('David',[7,5,1,4,7,1]),
    ('Mike',[3,9,1,6,2])
]

df = spark.createDataFrame(data=empDF, schema=['Name','Array_1'])
display(df)

Name,Array_1
John,"List(4, 6, 7, 9, 2)"
David,"List(7, 5, 1, 4, 7, 1)"
Mike,"List(3, 9, 1, 6, 2)"


In [0]:
##Apply Array_Sort
from pyspark.sql import functions as f
outputDF = df.withColumn('Sorted', f.array_sort('Array_1'))
display(outputDF)

Name,Array_1,Sorted
John,"List(4, 6, 7, 9, 2)","List(2, 4, 6, 7, 9)"
David,"List(7, 5, 1, 4, 7, 1)","List(1, 1, 4, 5, 7, 7)"
Mike,"List(3, 9, 1, 6, 2)","List(1, 2, 3, 6, 9)"
