In [51]:
# findspark.init() locates the local Spark installation and adds PySpark to
# sys.path, so it has to run before the `import pyspark` lines that follow.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, ArrayType
from pyspark.sql.functions import col, lit, array

# Obtain the notebook's SparkSession: getOrCreate() returns the session that is
# already active, or starts a new one registered under the app name "show".
spark = (
    SparkSession.builder
    .appName("show")
    .getOrCreate()
)

In [52]:
from pyspark.sql.functions import explode

In [53]:
# Sample rows as (id, numbers) pairs; each `numbers` value is a list of ints
# that becomes an array column in the DataFrame.
data = [
    ('abc', [1, 2]),
    ('mno', [4, 5]),
    ('xyz', [7, 8]),
]

In [54]:
# Explicit schema for `data`: a string `id` column and a `numbers` column
# holding an array of integers.
id_field = StructField("id", StringType())
numbers_field = StructField("numbers", ArrayType(IntegerType()))
schema = StructType([id_field, numbers_field])

In [55]:
# Build the base DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

In [56]:
# Print the rows of the base DataFrame (columns `id` and `numbers`).
df.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
|mno| [4, 5]|
|xyz| [7, 8]|
+---+-------+



In [57]:
# Print the column tree to confirm the declared schema was applied:
# `id` as string and `numbers` as an array of integers.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



### explode()

In [58]:
# explode() produces one output row per element of the `numbers` array; the
# remaining columns (including the original array) are repeated on each row.
numbers_exploded = explode(col('numbers'))
df1 = df.withColumn('explodedCol', numbers_exploded)
df1.show()

+---+-------+-----------+
| id|numbers|explodedCol|
+---+-------+-----------+
|abc| [1, 2]|          1|
|abc| [1, 2]|          2|
|mno| [4, 5]|          4|
|mno| [4, 5]|          5|
|xyz| [7, 8]|          7|
|xyz| [7, 8]|          8|
+---+-------+-----------+



In [59]:
# Same explode as above, but project only `id` and the exploded value so the
# original array column is dropped from the result.
df2 = df.select(
    'id',
    explode(col('numbers')).alias('explodedCol'),
)
df2.show()

+---+-----------+
| id|explodedCol|
+---+-----------+
|abc|          1|
|abc|          2|
|mno|          4|
|mno|          5|
|xyz|          7|
|xyz|          8|
+---+-----------+



### split()

In [60]:
from pyspark.sql.functions import split

# Sample rows as (id, name, skills), where `skills` is one comma-separated string.
datas = [
    (1, 'maheer', '.net, azure, sql'),
    (2, 'wafa', 'java, aws, sql'),
]

# Column names only; no types are declared here.
schemas = ["id", "name", "skills"]

In [61]:
# Build the DataFrame for the split() example; `schemas` supplies the column
# names for the sample rows.
dtf = spark.createDataFrame(data=datas, schema=schemas)

In [62]:
# Show the input rows; `skills` is still a single comma-separated string here.
dtf.show()

+---+------+----------------+
| id|  name|          skills|
+---+------+----------------+
|  1|maheer|.net, azure, sql|
|  2|  wafa|  java, aws, sql|
+---+------+----------------+



In [63]:
# split() turns the comma-separated `skills` string into an array column.
#
# The result is bound to a new name instead of overwriting `dtf`: split() only
# accepts a string column, so a cell that reassigns `dtf` to its own output
# fails when it is run a second time (by then `skills` is already an array).
# Keeping `dtf` untouched makes this cell safe to re-run.
#
# NOTE: the pattern ',' does not consume the space that follows each comma, so
# every element after the first keeps a leading space (' azure', ' sql'). Use
# the regex pattern r',\s*' instead if trimmed values are required.
dtf_split = dtf.withColumn('skills', split('skills', ','))

dtf_split.show()

+---+------+--------------------+
| id|  name|              skills|
+---+------+--------------------+
|  1|maheer|[.net,  azure,  sql]|
|  2|  wafa|  [java,  aws,  sql]|
+---+------+--------------------+



### array()

In [64]:
from pyspark.sql.functions import array

In [65]:
# Sample rows as (id, name, primarySkill, secondarySkill).
dta = [
    (1, 'maheer', '.net', 'azure'),
    (2, 'wafa', 'java', 'aws'),
]

# Column names matching the tuple positions above.
sch = ["id", "name", "primarySkill", "secondarySkill"]

# Build the DataFrame for the array() example, with one column per skill.
dfa = spark.createDataFrame(data=dta, schema=sch)

In [66]:
# Show the input rows; the two skills are still separate string columns.
dfa.show()

+---+------+------------+--------------+
| id|  name|primarySkill|secondarySkill|
+---+------+------------+--------------+
|  1|maheer|        .net|         azure|
|  2|  wafa|        java|           aws|
+---+------+------------+--------------+



In [67]:
# array() combines several columns into one array column, with elements in the
# order the columns are listed.
skill_columns = ['primarySkill', 'secondarySkill']
dfa2 = dfa.withColumn("skills", array(*skill_columns))

dfa2.show()

+---+------+------------+--------------+-------------+
| id|  name|primarySkill|secondarySkill|       skills|
+---+------+------------+--------------+-------------+
|  1|maheer|        .net|         azure|[.net, azure]|
|  2|  wafa|        java|           aws|  [java, aws]|
+---+------+------------+--------------+-------------+



### array_contains()

In [68]:
from pyspark.sql.functions import array_contains

In [70]:
# Sample rows as (id, name, skills), where `skills` is already a list of strings.
d = [
    (1, 'maheer', ['.net', 'azure']),
    (2, 'wafa', ['java', 'sql']),
]

# Column names matching the tuple positions above.
s = ['id', 'name', 'skills']

# Build and display the DataFrame for the array_contains() example; the Python
# lists in `d` become the array column `skills`.
dfs = spark.createDataFrame(data=d, schema=s)
dfs.show()

+---+------+-------------+
| id|  name|       skills|
+---+------+-------------+
|  1|maheer|[.net, azure]|
|  2|  wafa|  [java, sql]|
+---+------+-------------+



In [71]:
# array_contains() yields a boolean per row: true when the `skills` array holds
# the exact value 'java', false otherwise.
has_java = array_contains(col('skills'), 'java')
dfc = dfs.withColumn('HasJavaSkill', has_java)

dfc.show()

+---+------+-------------+------------+
| id|  name|       skills|HasJavaSkill|
+---+------+-------------+------------+
|  1|maheer|[.net, azure]|       false|
|  2|  wafa|  [java, sql]|        true|
+---+------+-------------+------------+

