In [0]:
pip install json5

In [0]:
#Use case
#Community Master Data: Users, Roles, UserRole
#Transaction Data: Transactions (transId, userId, some details)
#From userId, get the relevant username and perform any other lookups

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg
from pyspark.sql.types import Row
import json5 as json

# Master data. Each table below is (a) registered as a temp view and
# (b) collected as a list of JSON strings that is broadcast for use inside RDD functions.

# Roles
columns = ["RId", "RName", "RActive"]
data = [
    (1, "Role1", 1),
    (2, "Role2", 1),
    (3, "Role3", 1),
]
df_role = spark.createDataFrame(data, columns)
df_role.createOrReplaceTempView("roles")
bc_role = spark.sparkContext.broadcast(df_role.toJSON().collect())

# Users (User3 is the only one with UActive = 0)
columns = ["UId", "UName", "UActive"]
data = [
    (1, "User1", 1),
    (2, "User2", 1),
    (3, "User3", 0),
    (4, "User4", 1),
]
df_user = spark.createDataFrame(data, columns)
df_user.createOrReplaceTempView("users")
bc_user = spark.sparkContext.broadcast(df_user.toJSON().collect())

# User -> role assignments (many-to-many)
columns = ["UId", "RId"]
data = [
    (1, 1), (1, 2),
    (2, 3),
    (3, 1), (3, 2), (3, 3),
    (4, 1), (4, 3),
]
df_user_role = spark.createDataFrame(data, columns)
df_user_role.createOrReplaceTempView("user_role")
bc_user_role = spark.sparkContext.broadcast(df_user_role.toJSON().collect())


def myMapFunction(rec, bc_user, bc_role, bc_user_role):
  """Enrich one transaction record with the name of the user it refers to.

  Args:
    rec: object exposing asDict() (a pyspark Row in this notebook); its "UId"
      entry is the user to look up.
    bc_user: broadcast whose .value is an iterable of user records, each either
      a JSON string or an already decoded dict-like object.
    bc_role: accepted but not used.
    bc_user_role: accepted but not used.

  Returns:
    A Row holding every field of rec plus "ENRICH_1", which is the "UName" of
    the user whose "UId" equals rec's "UId". It is "" when no user matches;
    when several users match, the last one in bc_user.value wins.
  """
  print("myMapFunction::Type:%s, Data:%s" % (type(rec), rec))
  fields = rec.asDict()
  wanted_uid = fields.get("UId")
  print("myMapFunction::user:%s, bc_user:%s, bc_user.value:%s" % (wanted_uid, type(bc_user), type(bc_user.value)))

  user_name = ""
  for entry in bc_user.value:
    print("myMapFunction::type(v):%s, v:%s" % (type(entry), entry))
    # Broadcast entries built with toJSON() are strings and must be decoded first.
    user = json.loads(entry) if type(entry) is str else entry
    if user.get("UId") == wanted_uid:
      user_name = user.get("UName")  # no break: a later duplicate overrides an earlier one

  # Add the looked-up name as a new column.
  fields["ENRICH_1"] = user_name
  print("myMapFunction::row_dict:%s" % fields)
  return Row(**fields)

# Transaction data: (TId, UId, TDesc, TDate)
columns = ["TId", "UId", "TDesc", "TDate"]
data = [
    (1, 1, "TDesc1", "2021-04-16T17:11:00"),
    (2, 1, "TDesc2", "2021-04-16T17:12:00"),
    (3, 4, "TDesc3", "2021-04-16T17:13:00"),
    (4, 3, "TDesc4", "2021-04-16T17:14:00"),
    (5, 2, "TDesc5", "2021-04-16T17:15:00"),
]
df_trans = spark.createDataFrame(data, columns)

# Run every transaction row through myMapFunction, which appends ENRICH_1.
df_new_rdd = df_trans.rdd.map(
    lambda trans_row: myMapFunction(trans_row, bc_user, bc_role, bc_user_role)
)
print("df_new_rdd:", df_new_rdd)
# The names given to toDF() are applied by position, so they must follow the
# field order of the Rows that myMapFunction returns.
df_new_rdd.toDF(columns + ["ENRICH_1"]).show()

In [0]:
from pyspark.sql.types import StructType,StructField, StringType, LongType

def myFlatMapFunction(rec):
  """Log the type and value of rec, then return rec itself unchanged.

  Example log line for a transaction Row:
  myFlatMapFunction::Type:<class 'pyspark.sql.types.Row'>, Data:Row(TId=4, UId=3, TDesc='TDesc4', TDate='2021-04-16T17:14:00')
  """
  log_line = "myFlatMapFunction::Type:%s, Data:%s" % (type(rec), rec)
  print(log_line)
  return rec

print("Source Dataframe::df_trans:", df_trans)
df_trans.printSchema()
df_trans.show()
# flatMap iterates over whatever the mapped function returns and emits each
# element as a separate RDD entry. myFlatMapFunction returns the Row itself, and
# a Row is a tuple, so using its result directly explodes every transaction into
# its individual field values (this is what caused
# "Can not infer schema for type: <class 'int'>" on df_new_rdd.toDF()).
# Wrapping the result in a one-element list keeps each transaction as one Row,
# which is the shape a 4-column (TId, UId, TDesc, TDate) schema requires.
df_new_rdd = df_trans.rdd.flatMap(lambda r: [myFlatMapFunction(r)])
print("df_new_rdd:", df_new_rdd)

def myForEach(rec):
  """Print the Python type and the value of one RDD element; returns None.

  For an int element 4 the line reads: myForEach::type:<class 'int'>, rec:4
  """
  described = (type(rec), rec)
  print("myForEach::type:%s, rec:%s" % described)

#Foreach: print every element of the RDD via myForEach
df_new_rdd.foreach(myForEach)

# Explicit schema for turning the RDD back into a DataFrame (all columns nullable).
# NOTE(review): the schema is only checked against the data when an action runs;
# printSchema() below does not touch the rows.
myschema = (
    StructType()
    .add('TId', LongType(), True)
    .add('UId', LongType(), True)
    .add('TDesc', StringType(), True)
    .add('TDate', StringType(), True)
)
df_new_rdd_df = spark.createDataFrame(df_new_rdd, myschema)
print("RDD to DF::df_new_rdd_df:", df_new_rdd_df)
df_new_rdd_df.printSchema()



In [0]:
import pyspark.sql.functions as f

# Stores master data, shown and then broadcast as a list of JSON strings.
columns = ["Id", "Name", "City", "State", "Country", "Active"]
data = [
    (1, "Store-A", "New York", "New York", "USA", 1),
    (2, "Store-B", "Washington", "Seattle", "USA", 1),
    (3, "Store-C", "Dallas", "Texas", "USA", 1),
]
df_store = spark.createDataFrame(data, columns)
print("df_store")
df_store.show()
bc_store = spark.sparkContext.broadcast(df_store.toJSON().collect())

# Items master data (note: Active is a string here, unlike the stores table).
columns = ["Id", "Name", "Desc", "Active"]
data = [
    (1, "Bananas", "Bananas", "1"),
    (2, "Apples", "Gala Apples", "1"),
    (3, "Organes", "Naval Oranges", "1"),
]
df_items = spark.createDataFrame(data, columns)
print("df_items")
df_items.show()
bc_items = spark.sparkContext.broadcast(df_items.toJSON().collect())
# Show what the broadcast payload looks like: one JSON string per item, e.g.
# '{"Id":1,"Name":"Bananas","Desc":"Bananas","Active":"1"}'
print("df_items.toJSON().collect():", df_items.toJSON().collect())

def myTransEnrich(row, bc_store, bc_items):
  print("myTransEnrich::type:%s, data:%s" % (type(row), row))
  #myTransEnrich::type:<class 'pyspark.sql.types.Row'>, data:Row(TId=1, StoreId=1, ItemId=2, TDesc='Apples for StoreB@WAS', TDate='2021-04-19T01:01:01')
  print("myTransEnrich::bc_user.value::", type(bc_store.value), bc_store.value)
  if type(bc_store.value) == list and len(bc_store.value) > 0:
    json_stores = bc_store.value[0]
    if type(json_stores) == str:
      json_stores = json.loads(json_stores)
  #myTransEnrich::bc_user.value:: <class 'list'> ['{"Id":1,"Name":"Store-A","City":"New York","State":"New York","Country":"USA","Active":1}', '{"Id":2,"Name":"Store-B","City":"Washington","State":"Seattle","Country":"USA","Active":1}', '{"Id":3,"Name":"Store-C","City":"Dallas","State":"Texas","Country":"USA","Active":1}']

  print("myTransEnrich::bc_items.value::", type(bc_items.value), bc_items.value)
  if type(bc_items.value) == list and len(bc_items.value) > 0:
    json_items = bc_items.value[0]
    if type(json_items) == str:
      json_items = json.loads(json_items)
  #myTransEnrich::bc_items.value:: <class 'list'> ['{"Id":1,"Name":"Bananas","Desc":"Bananas","Active":"1"}', '{"Id":2,"Name":"Apples","Desc":"Gala Apples","Active":"1"}', '{"Id":3,"Name":"Organes","Desc":"Naval Oranges","Active":"1"}']
        
  #Store for-loop
  #Items for-loop
  return row
  
# Transactions referencing a store and an item.
# NOTE(review): all three rows carry TId = 1 - confirm whether distinct ids were intended.
columns = ["TId", "StoreId", "ItemId", "TDesc", "TDate"]
data = [
    (1, 1, 2, "Apples for StoreB@WAS", "2021-04-19T01:01:01"),
    (1, 3, 3, "Oranges for StoreC@DAL", "2021-04-19T02:01:01"),
    (1, 2, 1, "Bananas for StoreA@NYC", "2021-04-19T03:01:01"),
]
df_trans = spark.createDataFrame(data, columns)
print("df_trans")
df_trans.show()

#Using Map
# myTransEnrich returns each row unchanged, so the original column names still apply.
df_enriched = df_trans.rdd.map(
    lambda trans_row: myTransEnrich(trans_row, bc_store, bc_items)
)
print("df_enriched:0:", df_enriched)
df_enriched.toDF(columns).show()


In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import StructType,StructField, StringType, LongType

print("================== DataFrame.JOIN ==================")

# Stores
columns = ["Id", "Name", "City", "State", "Country", "Active"]
data = [
    (1, "Store-A", "New York", "New York", "USA", 1),
    (2, "Store-B", "Washington", "Seattle", "USA", 1),
    (3, "Store-C", "Dallas", "Texas", "USA", 1),
]
df_store = spark.createDataFrame(data, columns)

# Items
columns = ["Id", "Name", "Desc", "Active"]
data = [
    (1, "Bananas", "Bananas", "1"),
    (2, "Apples", "Gala Apples", "1"),
    (3, "Organes", "Naval Oranges", "1"),
]
df_items = spark.createDataFrame(data, columns)

# Transactions (StoreId -> stores.Id, ItemId -> items.Id)
columns = ["TId", "StoreId", "ItemId", "TDesc", "TDate"]
data = [
    (1, 1, 2, "Apples for StoreB@WAS", "2021-04-19T01:01:01"),
    (1, 3, 3, "Oranges for StoreC@DAL", "2021-04-19T02:01:01"),
    (1, 2, 1, "Bananas for StoreA@NYC", "2021-04-19T03:01:01"),
]
df_trans = spark.createDataFrame(data, columns)

#using dataframe joins

# Step 1: transactions + items -> item name per transaction.
#Trans: ["TId", "StoreId", "ItemId", "TDesc", "TDate"]
#Items: ["Id", "Name", "Desc", "Active"]
columns = ["TId", "TDesc", "TDate", "Name", "StoreId"]
df_trans_items = (
    df_trans.join(df_items, df_items.Id == df_trans.ItemId, "inner")
    .select(*columns)  # pass the names directly; no comprehension reusing the name `col`
    .withColumnRenamed("Name", "ItemName")
)
print("df_trans_items:", df_trans_items)

# Step 2: (transactions + items) + stores -> store name and location.
#Trans:        ["TId", "StoreId", "ItemId", "TDesc", "TDate"]
#Trans+Items:  ["TId", "TDesc", "TDate", "ItemName", "StoreId"]
#Store:        ["Id", "Name", "City", "State", "Country", "Active"]
columns = ["TId", "TDesc", "TDate", "ItemName", "Name", "City", "State"]
df_trans_items_stores = (
    df_trans_items.join(df_store, df_store.Id == df_trans_items.StoreId, "inner")
    .select(*columns)
    .withColumnRenamed("Name", "StoreName")
    # concat() glued City and State together with no separator ("New YorkNew York");
    # concat_ws puts ", " between them.
    .withColumn("Location", f.concat_ws(", ", "City", "State"))
)
print("df_trans_items_stores:", df_trans_items_stores)

df_trans_items_stores.show()



In [0]:
#SQL Style
# Register stores, items and transactions as temp views so they can be joined in SQL.

# Stores -> view "stores"
columns = ["Id", "Name", "City", "State", "Country", "Active"]
data = [
    (1, "Store-A", "New York", "New York", "USA", 1),
    (2, "Store-B", "Washington", "Seattle", "USA", 1),
    (3, "Store-C", "Dallas", "Texas", "USA", 1),
]
df_store = spark.createDataFrame(data, columns)
print("df_store")
df_store.show()
df_store.createOrReplaceTempView("stores")

# Items -> view "items"
columns = ["Id", "Name", "Desc", "Active"]
data = [
    (1, "Bananas", "Bananas", "1"),
    (2, "Apples", "Gala Apples", "1"),
    (3, "Organes", "Naval Oranges", "1"),
]
df_items = spark.createDataFrame(data, columns)
print("df_items")
df_items.show()
df_items.createOrReplaceTempView("items")

# Transactions -> view "transactions"
columns = ["TId", "StoreId", "ItemId", "TDesc", "TDate"]
data = [
    (1, 1, 2, "Apples for StoreB@WAS", "2021-04-19T01:01:01"),
    (1, 3, 3, "Oranges for StoreC@DAL", "2021-04-19T02:01:01"),
    (1, 2, 1, "Bananas for StoreA@NYC", "2021-04-19T03:01:01"),
]
df_trans = spark.createDataFrame(data, columns)
df_trans.createOrReplaceTempView("transactions")
print("df_trans")
df_trans.show()

# Join each transaction to its item and store through the temp views.
# The adjacent literals concatenate to one single-line SQL statement.
enrich_df = spark.sql(
    "select t.TId, s.Name as Store_Name, i.Name as Item_name, t.TDesc, t.TDate, "
    "s.Active as Store_Active, i.Active as Item_Active, s.City "
    "from items i, stores s, transactions t "
    "where t.ItemId = i.Id and t.StoreId = s.Id"
)
print("enrich_df::", enrich_df)
enrich_df.show()

In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import StructType,StructField, StringType, LongType

#Using WithColumn & UDF
# Section banner, followed by the repr of df_trans, the DataFrame that the
# withColumn/UDF steps in this cell start from.
print("================== DataFrame.WithColumn ==================")
print("df_trans:", df_trans)

def getItemValues(itemId):
  """Log the type and value of itemId and return it unchanged (pass-through placeholder).

  Output recorded when called directly with a Column:
  getItemValues: <class 'pyspark.sql.column.Column'> Column<b'ItemId'>
  Shape of one broadcast item record, for reference:
  '{"Id":1,"Name":"Bananas","Desc":"Bananas","Active":"1"}'
  """
  arg_type = type(itemId)
  print("getItemValues:", arg_type, itemId)
  return itemId

bc_items = spark.sparkContext.broadcast(df_items.toJSON().collect())
# Use f.udf: this notebook only imports the module alias `f`, never the bare name `udf`.
# The name getItemValues is rebound to the UDF wrapper, as before.
getItemValues = f.udf(getItemValues, StringType())
# ItemName currently just echoes ItemId (getItemValues is a pass-through), rendered as a string.
df_new = df_trans.withColumn("ItemName", getItemValues(df_trans.ItemId)) #f.lit("ITEM_NAME")
df_new = df_new.drop("ItemId")

def getStoreValues(storeId):
  """Log the type and value of storeId and return it unchanged (pass-through placeholder).

  Output recorded when called directly with a Column:
  getStoreValues: <class 'pyspark.sql.column.Column'> Column<b'StoreId'>
  """
  arg_type = type(storeId)
  print("getStoreValues:", arg_type, storeId)
  return storeId

# Broadcast the store records under their own name. Assigning the Broadcast back to
# df_store replaced the DataFrame, so re-running this cell failed on df_store.toJSON().
bc_store = spark.sparkContext.broadcast(df_store.toJSON().collect())
# Use f.udf: this notebook only imports the module alias `f`, never the bare name `udf`.
getStoreValues = f.udf(getStoreValues, StringType())
# StoreName is still a constant placeholder; the UDF-based lookup is not wired in yet.
df_new = df_new.withColumn("StoreName", f.lit("SITE_NAME")) #getStoreValues(df_trans.StoreId, bc_store)
df_new = df_new.drop("StoreId")

print("df_enriched:2:", df_new)
df_new.show()