In [27]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, ArrayType
from pyspark.sql.functions import col, lit, array

# Obtain the notebook's Spark session: getOrCreate() reuses a session that is
# already running, otherwise it starts a new one under the app name "show".
spark = (
    SparkSession.builder
    .appName("show")
    .getOrCreate()
)

In [28]:
# Sample rows: a string id paired with a list of integers.
data = [
    ('abc', [1, 2]),
    ('mno', [4, 5]),
    ('xyz', [7, 8]),
]

# Column names only -- no types are declared here.
schema = ['id', 'numbers']

In [29]:
# `schema` is just a list of column names, so the column types are left for
# Spark to infer from the Python values in `data`.
df = spark.createDataFrame(data=data, schema=schema)

In [30]:
# Print the DataFrame as a text table to check the rows that were loaded.
df.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
|mno| [4, 5]|
|xyz| [7, 8]|
+---+-------+



In [31]:
# Print the column types. Only column names were supplied when `df` was built,
# so the types reported here are the ones Spark inferred from the data.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)



### ArrayType

In [32]:
# Same sample rows as before; this copy is paired with an explicit schema.
data2 = [
    ('abc', [1, 2]),
    ('mno', [4, 5]),
    ('xyz', [7, 8]),
]

In [33]:
# Explicit schema: `id` is a string and `numbers` is an array whose elements
# are declared as IntegerType instead of being inferred.
schema2 = StructType([
    StructField("id", StringType()),
    StructField("numbers", ArrayType(IntegerType())),
])

In [34]:
# Build the DataFrame against the explicit StructType schema.
df2 = spark.createDataFrame(data=data2, schema=schema2)

In [35]:
# Print the DataFrame as a text table to check the rows that were loaded.
df2.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
|mno| [4, 5]|
|xyz| [7, 8]|
+---+-------+



In [36]:
# Print the column types; `schema2` declared the array elements as
# IntegerType, so that is what should be reported for `numbers`.
df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [37]:
# Add a column holding the element at index 0 of each row's `numbers` array,
# then display the result (df2 itself is not modified).
(
    df2
    .withColumn('firstNumber', col('numbers').getItem(0))
    .show()
)

+---+-------+-----------+
| id|numbers|firstNumber|
+---+-------+-----------+
|abc| [1, 2]|          1|
|mno| [4, 5]|          4|
|xyz| [7, 8]|          7|
+---+-------+-----------+



In [39]:
# Two plain integer columns that will be combined into one array column.
d = [
    (1, 2),
    (3, 4),
]
s = ['num1', 'num2']

# NOTE: this rebinds `df`, which an earlier cell bound to the id/numbers frame.
df = spark.createDataFrame(data=d, schema=s)

# array() packs the given columns, row by row, into a single array column.
df1 = df.withColumn(
    'numbers',
    array(col('num1'), col('num2')),
)
df1.show()

+----+----+-------+
|num1|num2|numbers|
+----+----+-------+
|   1|   2| [1, 2]|
|   3|   4| [3, 4]|
+----+----+-------+

