In [20]:
import findspark
findspark.init()

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

spark = SparkSession.builder.master("local").appName("MaskData").getOrCreate()
sc = spark.sparkContext

In [22]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

"""
Create output data frame with email id and mobile num masked
It is good to avoid using udf which degrades performance. Use spark inbuilt functions
"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = spark.read.option("header",True) \
            .option("inferSchema",True) \
            .option("delimiter",",") \
            .csv(filepath + "IntMaskData.csv")

In [4]:
df.show()

+---------------+-------------+------------+-------------------+----------+
|Customer_number|Customer_name|customer_age|              email|    mobile|
+---------------+-------------+------------+-------------------+----------+
|         301053|    Azarudeen|          28|     azar@gmail.com|9842356173|
|         305123|       Sakthi|          29|sakthi.m2@gmail.com|9764510298|
+---------------+-------------+------------+-------------------+----------+



In [23]:
def email_mask_func(colValue):
    mail_usr = colValue.split("@")[0]
    n = len(mail_usr)
    charList = list(mail_usr)
    charList[1:int(n)-1]='*'*int(n-2)
    out = "".join(charList)+"@"+colValue.split("@")[1]
    return out

In [24]:
#mask
print(email_mask_func("mailid@mail.com"))

m****d@mail.com


In [25]:
def mob_mask_func(colValue):
    n = len(colValue)
    charList = list(colValue)
    charList[2:int(n)-2]='x'*int(n-4)
    out = "".join(charList)
    return out

In [12]:
mob_mask_func("9898989896")

'98xxxxxx96'

In [26]:
#RegisterUDF
from pyspark.sql.functions import udf
mask_func_mob_udf = udf(mob_mask_func,StringType())
mask_func_email_udf = udf(email_mask_func,StringType())

In [28]:
df1 = df.withColumn("emailmask",mask_func_email_udf(df["email"])) \
        .withColumn("mobmask",mask_func_mob_udf(df["mobile"].cast(StringType()))) \
        .drop("email","mobile")
df1.show()

+---------------+-------------+------------+-------------------+----------+
|Customer_number|Customer_name|customer_age|          emailmask|   mobmask|
+---------------+-------------+------------+-------------------+----------+
|         301053|    Azarudeen|          28|     a**r@gmail.com|98xxxxxx73|
|         305123|       Sakthi|          29|s*******2@gmail.com|97xxxxxx98|
+---------------+-------------+------------+-------------------+----------+



In [35]:
#It is good to avoid UDF and use spark inbuilt functions for performance

df.createOrReplaceTempView("temp_table")

query = (
    "SELECT *, "
    "substring(split(email, '@')[0], 1, 1) || regexp_replace(substring(split(email, '@')[0], 2, length(split(email, '@')[0]) - 1), '[A-Za-z0-9_.]', '*') || "
    "substring(split(email, '@')[0], -1, 1) || '@' || split(email, '@')[1] AS masked_mail_usr, "
    
    "substring(mobile, 1, 2) || regexp_replace(substring(mobile, 2, length(mobile) - 1), '[A-Za-z0-9_.]', '*') || "
    "substring(mobile, -2, 2) AS masked_phone_nbr "
    "FROM temp_table"
)

result_df = spark.sql(query)
result_df.show()

+---------------+-------------+------------+-------------------+----------+--------------------+----------------+
|Customer_number|Customer_name|customer_age|              email|    mobile|     masked_mail_usr|masked_phone_nbr|
+---------------+-------------+------------+-------------------+----------+--------------------+----------------+
|         301053|    Azarudeen|          28|     azar@gmail.com|9842356173|     a***r@gmail.com|   98*********73|
|         305123|       Sakthi|          29|sakthi.m2@gmail.com|9764510298|s********2@gmail.com|   97*********98|
+---------------+-------------+------------+-------------------+----------+--------------------+----------------+

