In [1]:
# findspark.init() has to run before the pyspark imports below.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# StructField is defined in pyspark.sql.types; `pyspark.sql` does not export
# it, so importing it from there raises ImportError.
from pyspark.sql.types import StructField

# getOrCreate() reuses an already-running context, so re-running this cell no
# longer fails with "Cannot run multiple SparkContexts at once". The master is
# still "local", as before.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local"))
spark = SparkSession.builder.getOrCreate()

In [5]:
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
# Sample employee records, one tuple per person in the order
# (EmployeeID, Name, Age, Department). EmployeeID is stored as a string.
employee_data = [
    ("1", "Alice", 30, "Engineering"),
    ("2", "Bob", 25, "Engineering"),
    ("3", "Charlie", 35, "HR"),
    ("4", "David", 28, "Finance"),
    ("5", "Eve", 22, "HR"),
    # Append further rows here to grow the sample.
]

# Explicit schema for the employee rows; every column is nullable.
employee_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("EmployeeID", StringType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Department", StringType()),
    )
])

# Turn the rows into a Spark DataFrame using that schema.
employee_df = spark.createDataFrame(data=employee_data, schema=employee_schema)

# Department lookup rows in the order (Department, Location).
department_data = [
    ("Engineering", "New York"),
    ("HR", "San Francisco"),
    ("Finance", "Los Angeles"),
    # Append further departments here if required.
]

# Both department columns are nullable strings.
department_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("Department", "Location")
])
department_df = spark.createDataFrame(data=department_data, schema=department_schema)

# Print both tables.
employee_df.show()
department_df.show()

+----------+-------+---+-----------+
|EmployeeID|   Name|Age| Department|
+----------+-------+---+-----------+
|         1|  Alice| 30|Engineering|
|         2|    Bob| 25|Engineering|
|         3|Charlie| 35|         HR|
|         4|  David| 28|    Finance|
|         5|    Eve| 22|         HR|
+----------+-------+---+-----------+

+-----------+-------------+
| Department|     Location|
+-----------+-------------+
|Engineering|     New York|
|         HR|San Francisco|
|    Finance|  Los Angeles|
+-----------+-------------+



In [6]:
# Right join on Department: every row of department_df is kept. Both inputs
# are aliased ('emp' / 'dept') because the joined frame carries two
# Department columns, one from each side.
emp_dept = (
    employee_df.alias('emp')
    .join(
        department_df.alias('dept'),
        on=employee_df["Department"] == department_df["Department"],
        how='right',
    )
)
emp_dept.show()

# Pick the department-side columns through the 'dept' alias so the
# duplicated Department name is unambiguous.
emp_dept.select('Name', 'dept.Department', 'dept.Location').show()

+----------+-------+---+-----------+-----------+-------------+
|EmployeeID|   Name|Age| Department| Department|     Location|
+----------+-------+---+-----------+-----------+-------------+
|         1|  Alice| 30|Engineering|Engineering|     New York|
|         2|    Bob| 25|Engineering|Engineering|     New York|
|         4|  David| 28|    Finance|    Finance|  Los Angeles|
|         3|Charlie| 35|         HR|         HR|San Francisco|
|         5|    Eve| 22|         HR|         HR|San Francisco|
+----------+-------+---+-----------+-----------+-------------+

+-------+-----------+-------------+
|   Name| Department|     Location|
+-------+-----------+-------------+
|  Alice|Engineering|     New York|
|    Bob|Engineering|     New York|
|  David|    Finance|  Los Angeles|
|Charlie|         HR|San Francisco|
|    Eve|         HR|San Francisco|
+-------+-----------+-------------+



In [3]:
# Re-type the `features` column as a Spark array-of-float column named
# `features_array`.
# NOTE(review): `df`, `udf`, `ArrayType` and `FloatType` are not defined in any
# cell visible here -- confirm that an earlier cell creates/imports them.

# Identity UDF: values pass through unchanged; only the declared return type
# (array of non-null floats) determines the new column's Spark type.
convert_Array_udf = udf(lambda x: x, ArrayType(FloatType(), containsNull=False))

# `df` is rebound to a frame without `features`, so running this cell a second
# time used to fail on the missing column. Convert only while the target column
# does not exist yet; a missing `features` input on the first run still raises.
if 'features_array' not in df.columns:
    df = df.withColumn('features_array', convert_Array_udf('features')).drop('features')

df.show()
df.printSchema()

+---+---------------+
| id| features_array|
+---+---------------+
|  1|[2.0, 2.0, 3.0]|
|  1|[2.0, 3.0, 3.0]|
|  2|[3.0, 2.0, 3.0]|
|  2|[3.0, 3.0, 3.0]|
+---+---------------+

root
 |-- id: long (nullable = true)
 |-- features_array: array (nullable = true)
 |    |-- element: float (containsNull = false)



In [4]:
# Build the DataFrame used for the gradient-descent step.

step = 10.0


def _multiply_plus_step(x, y):
    """Return [step] followed by the element-wise product of x and y."""
    return [step] + np.multiply(x, y).tolist()


# The UDF result is declared as an array of non-null floats.
multiplyPlusStep_udf = udf(_multiply_plus_step, ArrayType(FloatType(), containsNull=False))

# The same column is passed for both arguments, so each element is squared.
df2 = df.withColumn('multiply', multiplyPlusStep_udf('features_array', 'features_array'))

df2.show()

+---+---------------+--------------------+
| id| features_array|            multiply|
+---+---------------+--------------------+
|  1|[2.0, 2.0, 3.0]|[10.0, 4.0, 4.0, ...|
|  1|[2.0, 3.0, 3.0]|[10.0, 4.0, 9.0, ...|
|  2|[3.0, 2.0, 3.0]|[10.0, 9.0, 4.0, ...|
|  2|[3.0, 3.0, 3.0]|[10.0, 9.0, 9.0, ...|
+---+---------------+--------------------+



In [5]:
# Map each array element to 1 when it equals 3.0 and to 0 otherwise.

def _flag_threes(values):
    """Return a list of ints: 1 where an element equals 3.0, else 0."""
    is_three = np.array(values) == 3.0
    return np.where(is_three, 1, 0).tolist()


binary_udf = udf(_flag_threes, ArrayType(IntegerType(), containsNull=False))

# Display only: the frame with the extra 'bin' column is not assigned.
df.withColumn('bin', binary_udf('features_array')).show()



+---+---------------+---------+
| id| features_array|      bin|
+---+---------------+---------+
|  1|[2.0, 2.0, 3.0]|[0, 0, 1]|
|  1|[2.0, 3.0, 3.0]|[0, 1, 1]|
|  2|[3.0, 2.0, 3.0]|[1, 0, 1]|
|  2|[3.0, 3.0, 3.0]|[1, 1, 1]|
+---+---------------+---------+



In [6]:
# Collapse each row's 'multiply' array into a single number.

def _row_total(values):
    """Sum the elements; float() converts the NumPy result to a Python float."""
    return float(np.sum(values))


# Wrap the helper as a UDF that returns a Spark float.
sumRows_udf = udf(_row_total, FloatType())

# Replace the array column 'multiply' with its per-row sum.
df3 = df2.withColumn('multiply', sumRows_udf('multiply'))

df3.show()

+---+---------------+--------+
| id| features_array|multiply|
+---+---------------+--------+
|  1|[2.0, 2.0, 3.0]|    27.0|
|  1|[2.0, 3.0, 3.0]|    32.0|
|  2|[3.0, 2.0, 3.0]|    32.0|
|  2|[3.0, 3.0, 3.0]|    37.0|
+---+---------------+--------+



In [7]:
# Column-wise sum: add the arrays up position by position across all rows.

from pyspark.sql import functions as F

# Array length, read from the first row (assumes every row has this length).
n = len(df.select('features_array').first()['features_array'])

# One sum() aggregate per array position, packed back into one array column.
resultDF = df.agg(
    F.array(*(F.sum(F.col("features_array")[i]) for i in range(n))).alias("sum")
)

resultDF.show(truncate=False)

+------------------+
|sum               |
+------------------+
|[10.0, 10.0, 12.0]|
+------------------+



In [8]:
# Total the 'multiply' column over all rows.

# Every row gets the same constant key, so groupBy() yields a single group
# and the sum covers the whole DataFrame.
df4 = (
    df3
    .withColumn('dummyKey', lit(1.0))
    .groupBy('dummyKey')
    .sum('multiply')
)

df4.show()


+--------+-------------+
|dummyKey|sum(multiply)|
+--------+-------------+
|     1.0|        128.0|
+--------+-------------+

