In [0]:
# Log that this utility notebook has started running.
print("Running Utility Notebook to initialize all functions to use further")

In [0]:
%pip install word2number
# Example: w2n.word_to_num("two hundred and thirty five") returns 235

In [0]:
from word2number import w2n
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def word_to_num(value):
    """Convert a numeric value or an English number phrase to a number.

    Args:
        value: Anything ``int()`` accepts (e.g. ``7`` or ``"42"``), or a string
            of number words such as ``"two hundred and thirty five"``.

    Returns:
        ``int(value)`` when that conversion succeeds; otherwise whatever
        ``w2n.word_to_num`` returns for the lower-cased text; ``None`` when
        the value is missing or cannot be interpreted.
    """
    if value is None:
        # Nothing to convert; avoids raising and catching two exceptions.
        return None
    try:
        # Fast path: the value is already numeric (or a numeric string).
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Not directly convertible; fall through to the word parser below.
        pass
    try:
        return w2n.word_to_num(value.lower())
    except Exception:
        # Best effort: unparseable input yields None instead of failing the
        # caller. Unlike a bare ``except``, KeyboardInterrupt and SystemExit
        # still propagate.
        return None
# Wrap word_to_num as a Spark UDF so it can be applied to DataFrame columns.
# udf(function, returnType): the second argument declares the Spark type of the result.
# NOTE(review): IntegerType is 32-bit, yet standardize_staff casts shipment_id to
# long after calling this UDF -- confirm whether larger ids need a LongType return.
word_to_num_udf=udf(word_to_num,IntegerType())

### Inline / business-specific functions

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import col

def standardize_staff(df):
    """Return ``df`` with the staff columns normalised.

    Steps, in order:
      * ``shipment_id`` and ``age`` go through ``word_to_num_udf`` and are cast
        to ``long`` and ``int`` respectively.
      * ``role`` is lower-cased; ``origin_hub_city`` and ``hub_location`` are
        title-cased with ``initcap``.
      * ``load_dt`` is set to the current timestamp.
      * ``first_name`` and ``last_name`` are joined with a space, the two source
        columns are dropped, and the result is exposed as ``staff_full_name``.
    """
    # (target column, expression) pairs; applied strictly in this order so the
    # resulting column layout is unchanged.
    column_rules = [
        ("shipment_id", word_to_num_udf(col("shipment_id")).cast("long")),
        ("age", word_to_num_udf(col("age")).cast("int")),
        ("role", f.lower("role")),
        ("origin_hub_city", f.initcap(col("origin_hub_city"))),
        ("load_dt", f.current_timestamp()),
        ("full_name", f.concat_ws(" ", "first_name", "last_name")),
        ("hub_location", f.initcap("hub_location")),
    ]

    staff = df
    for target, expression in column_rules:
        staff = staff.withColumn(target, expression)

    without_name_parts = staff.drop("first_name", "last_name")
    return without_name_parts.withColumnRenamed("full_name", "staff_full_name")

def scrub_geotag(df):
    """Title-case the ``city_name`` and ``masked_hub_location`` columns of ``df``."""
    with_city = df.withColumn("city_name", f.initcap("city_name"))
    return with_city.withColumn(
        "masked_hub_location", f.initcap("masked_hub_location")
    )

def standardize_shipments(df):
    """Add constant metadata columns and normalise shipment column types.

    Adds ``domain`` (literal ``"Logistics"``), ``ingestion_timestamp`` (current
    timestamp) and ``is_expedited`` (literal ``False``); parses
    ``shipment_date`` into a date, rounds ``shipment_cost`` to two decimals and
    casts ``shipment_weight_kg`` to double.

    Args:
        df: Spark DataFrame containing ``shipment_date``, ``shipment_cost`` and
            ``shipment_weight_kg`` columns.

    Returns:
        A new DataFrame with the columns above added or replaced.
    """
    # Uses the ``f`` alias imported by this notebook; the upper-case ``F`` alias
    # is never imported here and raised NameError on a fresh kernel.
    return (
        df
        .withColumn("domain", f.lit("Logistics"))
        .withColumn("ingestion_timestamp", f.current_timestamp())
        .withColumn("is_expedited", f.lit(False).cast("boolean"))
        # NOTE(review): the "yy-MM-dd" pattern expects a two-digit year --
        # confirm against the source data.
        .withColumn("shipment_date", f.to_date("shipment_date", "yy-MM-dd"))
        .withColumn("shipment_cost", f.round("shipment_cost", 2))
        .withColumn("shipment_weight_kg", f.col("shipment_weight_kg").cast("double"))
    )

def enrich_shipments(df, tax_rate=0.18, high_value_threshold=50000):
    """Derive routing, calendar and cost columns for shipments.

    Args:
        df: Spark DataFrame with ``source_city``, ``destination_city``,
            ``vehicle_type``, ``shipment_id``, ``shipment_date``,
            ``shipment_status``, ``shipment_cost`` and ``shipment_weight_kg``.
        tax_rate: Fraction of ``shipment_cost`` stored in ``tax_amount``.
        high_value_threshold: ``is_high_value`` is true when ``shipment_cost``
            is strictly greater than this amount.

    Returns:
        A new DataFrame with these columns added or replaced:
        ``route_segment``, ``vehicle_identifier``, ``shipment_year``,
        ``shipment_month``, ``is_weekend``, ``is_expedited``, ``cost_per_kg``,
        ``tax_amount``, ``days_since_shipment`` and ``is_high_value``.
    """
    cost = f.col("shipment_cost")
    return (
        df.withColumn("route_segment", f.concat_ws("-", "source_city", "destination_city"))
        .withColumn("vehicle_identifier", f.concat_ws("_", "vehicle_type", "shipment_id"))
        # Year and month are both taken from shipment_date and stored in
        # separate columns, so neither overwrites the other.
        .withColumn("shipment_year", f.year("shipment_date"))
        .withColumn("shipment_month", f.month("shipment_date"))
        # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
        .withColumn("is_weekend", f.dayofweek("shipment_date").isin([1, 7]))
        .withColumn("is_expedited", f.col("shipment_status").isin("IN_TRANSIT", "DELIVERED"))
        .withColumn("cost_per_kg", f.round(cost / f.col("shipment_weight_kg"), 2))
        .withColumn("tax_amount", f.round(cost * tax_rate, 2))
        .withColumn("days_since_shipment", f.datediff(f.current_date(), "shipment_date"))
        .withColumn("is_high_value", cost > high_value_threshold)
    )