### 0.1 Setup

In [1]:
from pyspark.sql import SparkSession

#Create SparkSession
spark = SparkSession.builder.appName('govtech_challenge').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/26 11:05:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark

In [3]:
spark.sparkContext.getConf().getAll()

[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'govtech_challenge'),
 ('spark.app.startTime', '1669431910567'),
 ('spark.app.id', 'local-1669431911555'),
 ('spark.driver.port', '54685'),
 ('spark.rdd.compress', 'True'),
 ('spa

### 0.2 Task

Application mobile number is 8 digits Applicant is over 18 years old as of 1 Jan 2022 Applicant has a valid email (email ends with @emailprovider.com or @emailprovider.net) You are required to format datasets in the following manner:

Split name into first_name and last_name Format birthday field into YYYYMMDD Remove any rows which do not have a name field (treat this as unsuccessful applications) Create a new field named above_18 based on the applicant's birthday Membership IDs for successful applications should be the user's last name, followed by a SHA256 hash of the applicant's birthday, truncated to first 5 digits of hash (i.e <last_name>_<hash(YYYYMMDD)>) You are required to consolidate these datasets and output the successful applications into a folder, which will be picked up by downstream engineers. Unsuccessful applications should be condolidated and dropped into a separate folder.

You can use common scheduling solutions such as cron or airflow to implement the scheduling component. Please provide a markdown file as documentation.

### 0.3 Read

In [82]:
import os
upstream_dir = "/Users/tonyngmk/repo/Data-Engineer-Tech-Challenge/1-data-pipelines/data/input"
upstream_csv = [f for f in os.listdir(upstream_dir) if f.endswith(".csv")]
upstream_csv

['applications_dataset_1.csv', 'applications_dataset_2.csv']

In [93]:
from pyspark.sql.types import StructType, StringType

defined_schema = StructType() \
.add("name", StringType(), True) \
.add("email", StringType(), True) \
.add("date_of_birth", StringType(), True) \
.add("mobile_no", StringType(), True)

sdf = spark.read.format("csv").load(upstream_dir, header=True, schema=defined_schema) # read entire directory
# problematic if directory is used for multiple input source of different schema

In [76]:
sdf.printSchema()

row = sdf.count()
col = len(sdf.columns)
print(f"Size: {row}*{col}") 

root
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- mobile_no: string (nullable = true)

Size: 4999*4


In [90]:
sdf.limit(5).show(truncate=False) # Peek

+---------------+-----------------------------+-------------+---------+
|name           |email                        |date_of_birth|mobile_no|
+---------------+-----------------------------+-------------+---------+
|Tony Shepherd  |Tony_Shepherd@petersen.com   |07/03/2016   |711447   |
|Sherry Gonzalez|Sherry_Gonzalez@caldwell.biz |14-03-1973   |66744895 |
|Ashlee Austin  |Ashlee_Austin@melendez.com   |12/09/1992   |6454197  |
|David Brown    |David_Brown@jackson-smith.biz|2001-09-22   |69082983 |
|Marc Meyer     |Marc_Meyer@chavez.com        |1996/05/17   |9727376  |
+---------------+-----------------------------+-------------+---------+



In [96]:
# Final function
def read(upstream_dir):
    """
    read folder that specifically contains data of
    membership applications of e-commerce platform
    """
    defined_schema = StructType() \
    .add("name", StringType(), True) \
    .add("email", StringType(), True) \
    .add("date_of_birth", StringType(), True) \
    .add("mobile_no", StringType(), True)

    sdf = spark.read.format("csv").load(upstream_dir, header=True, schema=defined_schema)
    return sdf

sdf = read("/Users/tonyngmk/repo/Data-Engineer-Tech-Challenge/1-data-pipelines/data/input")

### 1. `mobile_no`
- 8 digits

In [112]:
from pyspark.sql.functions import length, regexp_extract

sdf2 = sdf.withColumn("mobile_no", regexp_extract(sdf.mobile_no, r'\d+', 0).alias('mobile_no'))
sdf2 = sdf2.filter(length(sdf.mobile_no) == 8)
sdf2.limit(5).show(truncate=False)

print(f"Remaining: {sdf2.count()}/{sdf.count()}") 

+---------------+-----------------------------+-------------+---------+
|name           |email                        |date_of_birth|mobile_no|
+---------------+-----------------------------+-------------+---------+
|Sherry Gonzalez|Sherry_Gonzalez@caldwell.biz |14-03-1973   |66744895 |
|David Brown    |David_Brown@jackson-smith.biz|2001-09-22   |69082983 |
|Denise Glass   |Denise_Glass@calderon.org    |1982-05-31   |77863943 |
|Shane Davis    |Shane_Davis@smith.net        |2006/06/25   |42361783 |
|Laura Harding  |Laura_Harding@jackson.com    |2017-12-18   |32048503 |
+---------------+-----------------------------+-------------+---------+

Remaining: 937/4999


### 2. `date_of_birth`

- over 18 years old as of 1 Jan 2022
- In other words, assert DOB < 1 Jan 2004

In [184]:
from pyspark.sql.functions import coalesce, to_date

def parse_date(date_col):
    """
    parse all date formats from str to date type
    """
    date_formats = ["yyyy-MM-dd",
                    "yyyy MM dd",
                    "MM/dd/yyyy",
                    "yyyy/MM/dd",
                    "dd-MM-yyyy"]
    return coalesce(*[to_date(date_col, format) for format in date_formats])

sdf3 = sdf2.withColumn("date_of_birth2", parse_date(sdf.date_of_birth).alias('date_of_birth2'))
sdf3.filter(sdf3.date_of_birth2.isNull()).limit(5).show(truncate=False) # check for remaining unparsed

+----+-----+-------------+---------+--------------+
|name|email|date_of_birth|mobile_no|date_of_birth2|
+----+-----+-------------+---------+--------------+
+----+-----+-------------+---------+--------------+



In [198]:
acceptable_date_from = '2022-01-01'

sdf3 = sdf2.withColumn("date_of_birth", parse_date(sdf.date_of_birth).alias('date_of_birth'))
sdf3 = sdf3.filter(f"date_of_birth < DATE '{acceptable_date_from}' - interval '18' years")
sdf3.limit(5).show(truncate=False)

print(f"Remaining: {sdf3.count()}/{sdf2.count()}") 

+---------------+--------------------------------+-------------+---------+
|name           |email                           |date_of_birth|mobile_no|
+---------------+--------------------------------+-------------+---------+
|Sherry Gonzalez|Sherry_Gonzalez@caldwell.biz    |1973-03-14   |66744895 |
|David Brown    |David_Brown@jackson-smith.biz   |2001-09-22   |69082983 |
|Denise Glass   |Denise_Glass@calderon.org       |1982-05-31   |77863943 |
|Lori Zimmerman |Lori_Zimmerman@smith-bradley.biz|1998-02-17   |14572007 |
|Aaron Wilson   |Aaron_Wilson@burke-benton.net   |1987-05-17   |75053391 |
+---------------+--------------------------------+-------------+---------+

Remaining: 758/937


### 3. `email`
- Applicant has a valid email (email ends with @emailprovider.com or @emailprovider.net)

In [200]:

email_regex_pattern = "^[-_A-Za-z0-9]+@[-_A-Za-z0-9]+\\.(?:com|net)$"
sdf4 = sdf3.filter(sdf.email.rlike(email_regex_pattern))
sdf4.limit(5).show(truncate=False)

print(f"Remaining: {sdf4.count()}/{sdf3.count()}") 

+----------------+--------------------------------+-------------+---------+
|name            |email                           |date_of_birth|mobile_no|
+----------------+--------------------------------+-------------+---------+
|Aaron Wilson    |Aaron_Wilson@burke-benton.net   |1987-05-17   |75053391 |
|Samuel Lee      |Samuel_Lee@gardner-rodriguez.net|1974-09-02   |57636831 |
|Jennifer Weaver |Jennifer_Weaver@patterson.com   |1978-10-15   |74152151 |
|Leonard Hamilton|Leonard_Hamilton@brown.com      |1988-02-29   |76082775 |
|Kathy Phillips  |Kathy_Phillips@floyd.com        |1984-12-16   |57829247 |
+----------------+--------------------------------+-------------+---------+

Remaining: 526/758


### 4. `name`

- Split name into `first_name` and `last_name`

In [214]:
from pyspark.sql.functions import length, regexp_extract
# from pyspark.sql import functions as F

first_last_name_regex = r'([A-Za-z]+)\s+([A-Za-z]+)'

sdf5 = sdf4\
.withColumn("first_name", regexp_extract(sdf.name, first_last_name_regex, 1).alias('first_name'))\
.withColumn("last_name", regexp_extract(sdf.name, first_last_name_regex, 2).alias('last_name'))
# sdf5 = sdf4.withColumn('output', F.expr(r"regexp_extract_all(name, '([A-Za-z]+)\s+([A-Za-z]+)', 1)"))
sdf5.limit(5).show(truncate=False)

+----------------+--------------------------------+-------------+---------+----------+---------+
|name            |email                           |date_of_birth|mobile_no|first_name|last_name|
+----------------+--------------------------------+-------------+---------+----------+---------+
|Aaron Wilson    |Aaron_Wilson@burke-benton.net   |1987-05-17   |75053391 |Aaron     |Wilson   |
|Samuel Lee      |Samuel_Lee@gardner-rodriguez.net|1974-09-02   |57636831 |Samuel    |Lee      |
|Jennifer Weaver |Jennifer_Weaver@patterson.com   |1978-10-15   |74152151 |Jennifer  |Weaver   |
|Leonard Hamilton|Leonard_Hamilton@brown.com      |1988-02-29   |76082775 |Leonard   |Hamilton |
|Kathy Phillips  |Kathy_Phillips@floyd.com        |1984-12-16   |57829247 |Kathy     |Phillips |
+----------------+--------------------------------+-------------+---------+----------+---------+



In [205]:
regexp_extract(sdf.name, r'([A-Za-z]+)[ ]+([A-Za-z]+)', 1).alias('last_name')


# sdf4 = sdf3.filter(sdf.email.rlike(email_regex_pattern))
# sdf4.limit(5).show(truncate=False)

# print(f"Remaining: {sdf4.count()}/{sdf3.count()}") 

Column<'regexp_extract(name, ([A-Za-z]+)[ ]+([A-Za-z]+), 1) AS last_name'>

### Implement unit test

---
### Regular git commits

In [188]:
! git add .
! git commit -m "feat: add parsing of email and first and last name in notebook"
! git push origin feat-1.0-pipeline-spark

[feat-1.0-pipeline-spark c4ab88a] feat: add parsing of date of birth, filtering for above 18 years old from specified date logic in notebook
 Committer: Tony Ng <tonyngmk@MBA.home>
Your name and email address were configured automatically based
on your username and hostname. Please check that they are accurate.
You can suppress this message by setting them explicitly. Run the
following command and follow the instructions in your editor to edit
your configuration file:

    git config --global --edit

After doing this, you may fix the identity used for this commit with:

    git commit --amend --reset-author

 2 files changed, 168 insertions(+), 32 deletions(-)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 1.40 KiB | 1.40 MiB/s, done.
Total 5 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To