In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Create (or reuse) the local Spark session and expose its SparkContext.
# Going through the builder's getOrCreate() keeps this cell safe to re-run:
# constructing SparkContext("local") directly raises on a second execution
# because only one SparkContext may be active at a time.
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

In [3]:
# Build a small demo DataFrame from plain Python data: every row is an
# (id, features) pair, where features is a list of three floats.
data2 = [
    (1, [2.0, 2.0, 3.0]),
    (1, [2.0, 3.0, 3.0]),
    (2, [3.0, 2.0, 3.0]),
    (2, [3.0, 3.0, 3.0]),
]

df = spark.createDataFrame(data2, schema=["id", "features"])

# Show the rows first, then the schema Spark inferred for them.
df.show()
df.printSchema()

+---+---------------+
| id|       features|
+---+---------------+
|  1|[2.0, 2.0, 3.0]|
|  1|[2.0, 3.0, 3.0]|
|  2|[3.0, 2.0, 3.0]|
|  2|[3.0, 3.0, 3.0]|
+---+---------------+

root
 |-- id: long (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [3]:
# Re-type the array column with an identity UDF: the values pass through
# untouched, but the declared return type turns the elements into floats
# that are marked as non-null.
# Caveat: this cell rebinds `df` and drops 'features', so running it again
# without re-creating `df` first will not find the 'features' column.
float_array_type = ArrayType(FloatType(), containsNull=False)
convert_Array_udf = udf(lambda values: values, float_array_type)

df = (
    df
    .withColumn('features_array', convert_Array_udf('features'))
    .drop('features')
)

df.show()
df.printSchema()

+---+---------------+
| id| features_array|
+---+---------------+
|  1|[2.0, 2.0, 3.0]|
|  1|[2.0, 3.0, 3.0]|
|  2|[3.0, 2.0, 3.0]|
|  2|[3.0, 3.0, 3.0]|
+---+---------------+

root
 |-- id: long (nullable = true)
 |-- features_array: array (nullable = true)
 |    |-- element: float (containsNull = false)



In [4]:
# Prepare the DataFrame used for gradient descent: for each row, multiply the
# two input arrays element-wise and put the step size in front of the products.

step = 10.0

multiplyPlusStep_udf = udf(
    lambda left, right: [step, *np.multiply(left, right).tolist()],
    ArrayType(FloatType(), containsNull=False),
)

# Both UDF arguments are the same column here, so each element gets squared.
df2 = df.withColumn(
    'multiply',
    multiplyPlusStep_udf('features_array', 'features_array'),
)

df2.show()

+---+---------------+--------------------+
| id| features_array|            multiply|
+---+---------------+--------------------+
|  1|[2.0, 2.0, 3.0]|[10.0, 4.0, 4.0, ...|
|  1|[2.0, 3.0, 3.0]|[10.0, 4.0, 9.0, ...|
|  2|[3.0, 2.0, 3.0]|[10.0, 9.0, 4.0, ...|
|  2|[3.0, 3.0, 3.0]|[10.0, 9.0, 9.0, ...|
+---+---------------+--------------------+



In [5]:
# Turn each feature array into 0/1 flags: 1 where the element equals 3.0,
# 0 everywhere else.
int_array_type = ArrayType(IntegerType(), containsNull=False)
binary_udf = udf(
    lambda values: np.where(np.array(values) == 3.0, 1, 0).tolist(),
    int_array_type,
)

# Only displayed; the flagged column is not stored back into `df`.
df.withColumn('bin', binary_udf('features_array')).show()


+---+---------------+---------+
| id| features_array|      bin|
+---+---------------+---------+
|  1|[2.0, 2.0, 3.0]|[0, 0, 1]|
|  1|[2.0, 3.0, 3.0]|[0, 1, 1]|
|  2|[3.0, 2.0, 3.0]|[1, 0, 1]|
|  2|[3.0, 3.0, 3.0]|[1, 1, 1]|
+---+---------------+---------+



In [6]:
# Row-wise sum: collapse each 'multiply' array into a single number.
# np.sum yields a NumPy scalar, so float() converts it to a plain Python
# float before it is returned for the FloatType column.
sumRows_udf = udf(lambda values: float(np.sum(values)), FloatType())

# Replace the array column 'multiply' with its per-row total.
df3 = df2.withColumn('multiply', sumRows_udf('multiply'))

df3.show()

+---+---------------+--------+
| id| features_array|multiply|
+---+---------------+--------+
|  1|[2.0, 2.0, 3.0]|    27.0|
|  1|[2.0, 3.0, 3.0]|    32.0|
|  2|[3.0, 2.0, 3.0]|    32.0|
|  2|[3.0, 3.0, 3.0]|    37.0|
+---+---------------+--------+



In [7]:
# Column-wise sum: add up the arrays position by position across all rows.

from pyspark.sql import functions as F

# The array length is read from the first row and used for every position.
first_row = df.select('features_array').first()
n = len(first_row[0])

# One sum aggregate per array position, packed back into a single array column.
position_sums = [F.sum(F.col("features_array")[i]) for i in range(n)]
resultDF = df.agg(F.array(*position_sums).alias("sum"))

resultDF.show(truncate=False)

+------------------+
|sum               |
+------------------+
|[10.0, 10.0, 12.0]|
+------------------+



In [8]:
# Total the 'multiply' column over the whole DataFrame: give every row the
# same constant key, group on that key, and sum within the single group.

keyed_rows = df3.withColumn('dummyKey', lit(1.0))
df4 = keyed_rows.groupBy('dummyKey').agg({'multiply': 'sum'})

df4.show()


+--------+-------------+
|dummyKey|sum(multiply)|
+--------+-------------+
|     1.0|        128.0|
+--------+-------------+

