In [0]:
# #### JOIN ####
# Sample data for the join demo: roles, users, and the user<->role link table.
columns = ["RId", "RName", "RActive"]
data = [
    (1, "R1", 1),
    (2, "R2", 1),
    (3, "R3", 1),
]
df_role = spark.createDataFrame(schema=columns, data=data)

columns = ["UId", "UName", "UActive"]
data = [
    (1, "U1", 1),
    (2, "U2", 1),
    (3, "U3", 0),
    (4, "U4", 1),
]
df_user = spark.createDataFrame(schema=columns, data=data)

# Link table: one row per (user, role) assignment.
columns = ["UId", "RId"]
data = [
    (1, 1), (1, 2),
    (2, 3),
    (3, 1), (3, 2), (3, 3),
    (4, 1), (4, 3),
]
df_user_role = spark.createDataFrame(schema=columns, data=data)

# Register every frame as a temp view so it can be queried through spark.sql.
for view_name, frame in (
    ("roles", df_role),
    ("users", df_user),
    ("user_role", df_user_role),
):
    frame.createOrReplaceTempView(view_name)

df_role.show()

df_user.show()

# Approach 1: Spark SQL over the temp views "users", "roles" and "user_role".
# Returns the (user, role) pairs of active users only (UActive = 1).
print("Spark-Sql Approach")
new_df = spark.sql("select u.UName, r.RName, u.UActive, r.RActive from users as u, roles as r, user_role as ur where ur.UId = u.UId and ur.RId = r.RId and UActive=1")
new_df.show()

# Approach 2: the same result built step by step with DataFrame joins.
print("Dataframe-Join Approach")
print("df_user_role.join(df_user)")
# Join on the column *name* rather than an equality expression: the join key
# then appears once in the result instead of as two identically named columns.
df1 = df_user_role.join(df_user, on="UId", how="inner")
df1.show()

print("user_role_user.join(df_role)")
df2 = df1.join(df_role, on="RId", how="inner")
df2.show()

# Apply the same filter and projection as the SQL query so that both
# approaches end with the same rows and columns.
print("filter(UActive == 1).select(UName, RName, UActive, RActive)")
df_active = df2.filter(df2.UActive == 1).select("UName", "RName", "UActive", "RActive")
df_active.show()

In [0]:
# SQL and DataFrame APIs side by side: number of users per zone.
columns = ["UId", "Name", "Zone"]
data = [
    ("1", "User1", "Z1"),
    ("2", "User2", "Z1"),
    ("3", "User3", "Z2"),
]

df = spark.createDataFrame(schema=columns, data=data)
# Register the view up front so the SQL variant below queries the same rows.
df.createOrReplaceTempView("user_zone")

# DataFrame API: show the raw rows, then the row count per Zone.
df.show(truncate=False)
users_per_zone = df.groupBy("Zone").count()
users_per_zone.show(truncate=False)

# Spark SQL: the equivalent aggregation expressed as a query on the view.
zone_query = "SELECT Zone, count(Name) FROM user_zone group by Zone"
df_sql = spark.sql(zone_query)
df_sql.show()

In [0]:
# UDF - Single Row/Record
# UDFs, a.k.a. User-Defined Functions,
# extend Spark SQL with custom functions that can be reused across queries.

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg

def uppercase(str):
    """Return ``str`` converted to upper case.

    Args:
        str: The text to convert, or ``None``. The parameter name shadows the
            built-in ``str`` type; it is kept so existing keyword callers
            continue to work.

    Returns:
        The upper-cased text, or ``None`` when ``str`` is ``None``.
    """
    # A NULL cell reaches a Python UDF as None; pass it through instead of
    # failing with AttributeError on None.upper().
    if str is None:
        return None
    return str.upper()
  
# `udf` is not among the names this cell imports from pyspark.sql.functions
# (only col, sum and avg are), so import it here to keep the cell runnable on
# a fresh kernel.
from pyspark.sql.functions import udf

# Wrap the plain Python function as a Spark UDF. No return type is given, so
# the UDF uses pyspark's default return type (string).
convertUDF = udf(uppercase)

# Apply the UDF to every Name value; `df` is the user/zone frame built in an
# earlier cell.
df.select(convertUDF(col("Name")).alias("Name")).show(truncate=False)

In [0]:
#Aggregations
import pyspark
from pyspark.sql import functions as F
from pyspark.sql import types as T

columns = ["UId", "Name", "Zone"]
data = [
    ("1", "User1", "Z1"),
    ("2", "User2", "Z1"),
    ("3", "User3", "Z2"),
]

df = spark.createDataFrame(schema=columns, data=data)

# Gather every Name within a Zone into a single array column.
# NOTE: per the pyspark docs, collect_list does not guarantee element order.
names_per_zone = F.collect_list('Name').alias('value_list')
df.groupBy('Zone').agg(names_per_zone).show()


In [0]:
#UDF Aggregation
import pyspark
from pyspark.sql import functions as F
from pyspark.sql import types as T

columns = ["UId", "Name", "Zone"]
data = [
    ("1", "User1", "Z1"),
    ("2", "User2", "Z1"),
    ("3", "User3", "Z2"),
]

# Rebuild the sample frame inside this cell (rows first, then column names).
df = spark.createDataFrame(data, columns)

def concat(x):
    """Concatenate the items of ``x``, writing a "|" before every item.

    Args:
        x: Iterable of strings. In this notebook it is the list produced by
            ``collect_list`` for one group.

    Returns:
        str: Each item prefixed with "|" and joined together, so
        ``["a", "b"]`` gives ``"|a|b"`` (note the leading separator) and an
        empty iterable gives ``""``.

    Raises:
        TypeError: If an item is not a string.
    """
    # Trace the value and type handed to the function.
    print("concat:", type(x), x)
    # str.join builds the result in one pass instead of re-copying the growing
    # string on every loop iteration; the output is unchanged.
    return "".join("|" + i for i in x)

# Wrap `concat` as a Spark UDF: first the Python function, then its return type.
concat_udf = F.udf(concat, T.StringType())

# Collect each zone's names into an array, then pass that array through the UDF.
names_in_zone = F.collect_list('Name')
users_in_zone = concat_udf(names_in_zone).alias('users_in_zone')
df.groupBy('Zone').agg(users_in_zone).show()
