This Notebook contains the following functions:<br>
1. Python UDF Function
2. Generic Framework - Business specific 
3. Generic Framework - Common Functions

In [0]:
# Banner printed when this utility notebook is executed, so the run output
# shows that the shared functions below are being (re)defined.
print("Running Utility Notebook to initialize all functions to use further")

This cell creates a UDF that converts number words into integers, so we do not have to filter out string values or translate them by hand with a lookup dictionary such as word_to_num={'one':'1','two':'2'}.<br>
E.g. passing "twenty thousand two hundred and one" returns 20201.

In [0]:
%pip install word2number
#print(w2n.word_to_num("two hundred and thirty five")) ---->235

In [0]:
from word2number import w2n
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def word_to_num(value):
    """Convert a numeric value or an English number phrase to an ``int``.

    The value is first handed to ``int()``. If that fails, it is lower-cased
    and parsed with ``w2n.word_to_num`` (word2number), e.g. a phrase such as
    "two hundred and thirty five".

    Args:
        value: Anything ``int()`` accepts, or a string of number words.

    Returns:
        The converted integer, or ``None`` when neither conversion succeeds
        (this includes a ``None`` input).
    """
    try:
        # Already numeric (or a digit string): no word parsing needed.
        return int(value)
    except Exception:
        # Not directly convertible; fall through to the word parser.
        # ``Exception`` (not a bare ``except``) keeps the best-effort behaviour
        # while letting KeyboardInterrupt / SystemExit propagate.
        pass

    try:
        return w2n.word_to_num(value.lower())
    except Exception:
        # Unparseable text or a non-string value: report "no number".
        return None
# Wrap the Python function as a Spark UDF. The return type must be declared
# up front so Spark knows the SQL type of the produced column.
# NOTE(review): IntegerType is a 32-bit integer, while standardize_staff casts
# shipment_id to long afterwards - confirm the ids always fit in 32 bits.
word_to_num_udf = udf(word_to_num, returnType=IntegerType())

### Inline/Business-Specific Functions

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import col

def standardize_staff(df):
    """Standardise the staff DataFrame.

    Steps, in order:
      * ``shipment_id`` and ``age`` go through ``word_to_num_udf`` and are
        cast to ``long`` and ``int`` respectively.
      * ``role`` is lower-cased; ``origin_hub_city`` and ``hub_location`` are
        title-cased.
      * ``load_dt`` is set to the current timestamp.
      * ``first_name`` and ``last_name`` are joined with a space into
        ``staff_full_name`` and then dropped.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        The transformed DataFrame.
    """
    # Numeric clean-up: number words / digit strings -> typed integers.
    numeric_df = df.withColumn(
        "shipment_id", word_to_num_udf(f.col("shipment_id")).cast("long")
    ).withColumn(
        "age", word_to_num_udf(f.col("age")).cast("int")
    )

    # Text normalisation plus the load timestamp.
    normalised_df = (
        numeric_df
        .withColumn("role", f.lower("role"))
        .withColumn("origin_hub_city", f.initcap(f.col("origin_hub_city")))
        .withColumn("load_dt", f.current_timestamp())
    )

    # Build the full name under a temporary column, then swap it in.
    named_df = normalised_df.withColumn(
        "full_name", f.concat_ws(" ", "first_name", "last_name")
    )
    named_df = named_df.withColumn("hub_location", f.initcap("hub_location"))
    trimmed_df = named_df.drop("first_name", "last_name")
    return trimmed_df.withColumnRenamed("full_name", "staff_full_name")

def scrub_geotag(df):
    """Title-case the ``city_name`` and ``masked_hub_location`` columns.

    Args:
        df: DataFrame containing both columns.

    Returns:
        DataFrame with the two columns replaced by their ``initcap`` form.
    """
    result = df
    for column_name in ("city_name", "masked_hub_location"):
        result = result.withColumn(column_name, f.initcap(column_name))
    return result

def standardize_shipments(df):
    """Add standard metadata columns and normalise shipment column types.

    Adds ``domain`` ("Logistics"), ``ingestion_timestamp`` (current
    timestamp) and ``is_expedited`` (False); parses ``shipment_date`` as a
    date, rounds ``shipment_cost`` to 2 decimals and casts
    ``shipment_weight_kg`` to double.

    Args:
        df: Shipments DataFrame with ``shipment_date``, ``shipment_cost`` and
            ``shipment_weight_kg`` columns.

    Returns:
        The transformed DataFrame.
    """
    # Uses the lower-case ``f`` alias: this notebook imports
    # ``pyspark.sql.functions as f``; an upper-case ``F`` is never defined.
    return (
        df
        .withColumn("domain", f.lit("Logistics"))
        .withColumn("ingestion_timestamp", f.current_timestamp())
        .withColumn("is_expedited", f.lit(False).cast("boolean"))
        # NOTE(review): "yy-MM-dd" expects a two-digit year - confirm this
        # matches the source files.
        .withColumn("shipment_date", f.to_date("shipment_date", "yy-MM-dd"))
        .withColumn("shipment_cost", f.round("shipment_cost", 2))
        .withColumn("shipment_weight_kg", f.col("shipment_weight_kg").cast("double"))
    )

def enrich_shipments(df, tax_rate=0.18, high_value_threshold=50000):
    """Derive analytical columns on the shipments DataFrame.

    Added / replaced columns:
      * ``route_segment``: ``source_city`` and ``destination_city`` joined by "-".
      * ``vehicle_identifier``: ``vehicle_type`` and ``shipment_id`` joined by "_".
      * ``shipment_year`` / ``shipment_month``: year and month of ``shipment_date``.
      * ``is_weekend``: ``dayofweek(shipment_date)`` is 1 or 7.
      * ``is_expedited``: ``shipment_status`` is IN_TRANSIT or DELIVERED.
      * ``cost_per_kg``: ``shipment_cost / shipment_weight_kg`` rounded to 2.
      * ``tax_amount``: ``shipment_cost * tax_rate`` rounded to 2.
      * ``days_since_shipment``: days between today and ``shipment_date``.
      * ``is_high_value``: ``shipment_cost > high_value_threshold``.

    Args:
        df: Shipments DataFrame with the source columns named above.
        tax_rate: Multiplier applied to ``shipment_cost`` (default 0.18).
        high_value_threshold: Cost above which a shipment is flagged as high
            value (default 50000).

    Returns:
        The enriched DataFrame.
    """
    cost = f.col("shipment_cost")
    return (
        df
        .withColumn("route_segment", f.concat_ws("-", "source_city", "destination_city"))
        .withColumn("vehicle_identifier", f.concat_ws("_", "vehicle_type", "shipment_id"))
        # Year and month both come from shipment_date and land in separate
        # columns, so the month no longer overwrites the year.
        .withColumn("shipment_year", f.year("shipment_date"))
        .withColumn("shipment_month", f.month("shipment_date"))
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
        .withColumn("is_weekend", f.dayofweek("shipment_date").isin([1, 7]))
        # NOTE(review): "expedited" is derived purely from shipment_status -
        # confirm this is the intended business rule.
        .withColumn("is_expedited", f.col("shipment_status").isin("IN_TRANSIT", "DELIVERED"))
        .withColumn("cost_per_kg", f.round(cost / f.col("shipment_weight_kg"), 2))
        .withColumn("tax_amount", f.round(cost * tax_rate, 2))
        .withColumn("days_since_shipment", f.datediff(f.current_date(), "shipment_date"))
        .withColumn("is_high_value", cost > high_value_threshold)
    )

def split_columns(df):
    """Split ``order_id`` and ``shipment_date`` into parts and build a route lane.

    Added columns:
      * ``order_prefix``: first 3 characters of ``order_id``.
      * ``order_sequence``: up to 10 characters of ``order_id`` from position 4.
      * ``ship_year`` / ``ship_month``: year and month of ``shipment_date``.
      * ``ship_day``: ``dayofweek(shipment_date)``.
      * ``route_lane``: ``source_city`` and ``destination_city`` joined by "->".

    Args:
        df: DataFrame with ``order_id``, ``shipment_date``, ``source_city``
            and ``destination_city`` columns.

    Returns:
        The DataFrame with the extra columns.
    """
    return (
        df.withColumn("order_prefix", f.substring("order_id", 1, 3))
        .withColumn("order_sequence", f.substring("order_id", 4, 10))
        .withColumn("ship_year", f.year("shipment_date"))
        .withColumn("ship_month", f.month("shipment_date"))
        # NOTE(review): this is the day of the week, not the day of the month -
        # confirm which one "ship_day" is meant to hold.
        .withColumn("ship_day", f.dayofweek("shipment_date"))
        # concat_ws takes the separator first; plain concat would read "->"
        # as a column name and fail analysis.
        .withColumn("route_lane", f.concat_ws("->", "source_city", "destination_city"))
    )

def mask_name(col):
    """Build a masked version of a name column.

    The result keeps the first two characters and the last character of the
    value and puts the literal "****" between them.

    Args:
        col: Column (or column name) holding the value to mask.

    Returns:
        A Column expression producing the masked string.
    """
    visible_prefix = f.substring(col, 1, 2)
    visible_suffix = f.substring(col, -1, 1)  # negative position counts from the end
    mask = f.lit("****")
    return f.concat(visible_prefix, mask, visible_suffix)

### Generic Functions

In [0]:
from pyspark.sql.session import SparkSession
def get_spark_session(app_name="Some Anonymous Data Engineering Project"):
    """Return the active SparkSession, creating one if none is active.

    Args:
        app_name: Application name used only when a new session is built.

    Returns:
        The active session when ``SparkSession.getActiveSession()`` yields
        one; otherwise a session from ``SparkSession.builder`` with
        ``spark.sql.shuffle.partitions`` set to "1".
    """
    try:
        active_session = SparkSession.getActiveSession()
    except Exception:
        # Best effort: if the lookup itself fails, fall back to the builder.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        active_session = None

    if active_session:
        return active_session

    return (
        SparkSession.builder
        .config("spark.sql.shuffle.partitions", "1")
        .appName(app_name)
        .getOrCreate()
    )

Think of shuffle as:

Spark collects data from all workers → redistributes it → creates N buckets (partitions).

spark.sql.shuffle.partitions = N
means: “After shuffle, split the result into N pieces.”

Each partition is processed by one task, and each task runs on one core at a time.

In [0]:
# Generic helpers for reading data from files and tables
def read_csv_df(spark, path, header=True, infer_schema=True, sep=","):
    """Read a CSV file (or directory of CSV files) into a DataFrame.

    Args:
        spark: SparkSession used for the read.
        path: Location of the CSV data.
        header: Passed as the reader's ``header`` option.
        infer_schema: Passed as the reader's ``inferSchema`` option.
        sep: Passed as the reader's ``sep`` (delimiter) option.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    csv_reader = spark.read.options(header=header, inferSchema=infer_schema, sep=sep)
    return csv_reader.csv(path)

def read_json_df(spark, path, mline=True):
    """Read JSON data into a DataFrame in PERMISSIVE mode.

    Args:
        spark: SparkSession used for the read.
        path: Location of the JSON data.
        mline: Passed as the reader's ``multiLine`` setting.

    Returns:
        The DataFrame produced by the JSON reader.
    """
    json_reader = spark.read
    return json_reader.json(path, mode="PERMISSIVE", multiLine=mline)


def read_delta_df(spark, path):
    """Load the Delta data stored at ``path`` into a DataFrame.

    Args:
        spark: SparkSession used for the read.
        path: Location of the Delta data.

    Returns:
        The loaded DataFrame.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.load(path)

def read_file(spark, filetype, path, header=True, infer_schema=True, mline=True):
    """Read a file into a DataFrame, dispatching on ``filetype``.

    Args:
        spark: SparkSession used for the read.
        filetype: One of "csv", "json", "delta" or "orc".
        path: Location of the data.
        header: CSV only - forwarded to ``read_csv_df``.
        infer_schema: CSV only - forwarded to ``read_csv_df``.
        mline: JSON only - forwarded to ``read_json_df``.

    Returns:
        The DataFrame produced by the matching reader.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    if filetype == "csv":
        # Go through the shared helper so CSV options live in one place.
        return read_csv_df(spark, path, header=header, infer_schema=infer_schema)
    if filetype == "json":
        # Forward mline; without this the caller's value is silently ignored.
        return read_json_df(spark, path, mline=mline)
    if filetype == "delta":
        return read_delta_df(spark, path)
    if filetype == "orc":
        return spark.read.orc(path)
    # ValueError is an Exception subclass, so existing handlers still match.
    raise ValueError("File type not supported")

def read_table(spark, table_name):
    """Return the table ``table_name`` as a DataFrame via ``spark.table``.

    Args:
        spark: SparkSession used for the lookup.
        table_name: Name of the table to read.

    Returns:
        The DataFrame for that table.
    """
    table_df = spark.table(table_name)
    return table_df


In [0]:
# Return the joined DataFrame
def join_df(df1, df2, how="inner", on="shipment_id"):
    """Join ``df1`` with ``df2``.

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.
        how: Join type handed to ``DataFrame.join`` (default "inner").
        on: Join key(s) handed to ``DataFrame.join``. Defaults to
            "shipment_id" so that a join without an explicit key does not
            turn into a cartesian/cross join.

    Returns:
        The joined DataFrame.
    """
    joined_df = df1.join(df2, on=on, how=how)
    return joined_df

def unionDf(df1,df2):
    """Append the rows of ``df2`` to ``df1`` using ``DataFrame.union``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        The result of ``df1.union(df2)``.
    """
    combined_df = df1.union(df2)
    return combined_df
def unionDfSql(spark, view1, view2):
    """Union two views/tables with a SQL ``UNION`` query.

    Args:
        spark: SparkSession used to run the query.
        view1: Name of the first view or table.
        view2: Name of the second view or table.

    Returns:
        The DataFrame returned by ``spark.sql`` for the union query.
    """
    # Interpolate the arguments: the names used to be hard-coded in the query
    # text, so the parameters had no effect.
    # NOTE(review): names are inserted verbatim into the SQL text - pass only
    # trusted identifiers.
    query = f"select * from {view1} union select * from {view2}"
    return spark.sql(query)

def mergeDf(df1,df2,allowmissingcol=True):
    """Combine two DataFrames with ``DataFrame.unionByName``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        allowmissingcol: Forwarded as ``allowMissingColumns`` (default True).

    Returns:
        The result of ``df1.unionByName(df2, ...)``.
    """
    merged_df = df1.unionByName(df2, allowMissingColumns=allowmissingcol)
    return merged_df