## Data Transformation On The Fly

#### Objective
   ###### 1. Read streaming data from CSV files
   ###### 2. Clean the MobilePhone, email and Landline fields as the data arrives

In [0]:
# import libraries
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
import requests
import json
import re
import datetime
import schedule
import time
import pandas as pd
import phonenumbers
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import  col
from pyspark.sql.functions import unix_timestamp, from_unixtime, explode
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id

In [0]:
# Schema for the incoming stream.
# The two integer identifiers are declared explicitly (only "id" is marked
# non-nullable); every remaining column is read as a nullable string.
stream_schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("Userid", IntegerType(), True),
    ]
    + [
        StructField(column_name, StringType(), True)
        for column_name in (
            "SkinID", "username", "first_name", "last_name", "email",
            "gender", "ip_address", "RegDate", "RegIP", "LastIP", "DOB",
            "Postcode", "MobilePhone", "Landline", "Address1", "City",
            "County", "Country", "SelfExcludedUntil", "Status", "batch",
        )
    ]
)
# The batch field is added to show the batch for a record

In [0]:
# Streaming starts here: open a streaming reader over the CSV files in the input folder.
inputPath = "/FileStore/users/inprogress/"
streamingInputDF = (
  spark.readStream
    .schema(stream_schema)
    # one file per trigger; the first line of each file is a header row
    .options(maxFilesPerTrigger="1", header="true")
    .csv(inputPath)
)

In [0]:
# This function cleans the user MobilePhone
def fixUserMobile(number, country):
  """Normalize a mobile number to "<country code><national number>" digits.

  Args:
    number: Raw phone number string, or None.
    country: Region code passed to ``phonenumbers.parse`` as the default
      region (e.g. "US" in the sample data).

  Returns:
    The concatenated country calling code and national number as a string,
    or None when ``number`` is None or cannot be parsed.
  """
  if number is None:
    return None

  try:
    parsed = phonenumbers.parse(number, country)
  except phonenumbers.NumberParseException:
    # Unparseable input yields no cleaned value; any other exception is a
    # genuine error and is no longer silently swallowed.
    return None

  # truncate_too_long_number mutates `parsed` in place when a valid shorter
  # number can be extracted, so it affects the value returned below.
  if not phonenumbers.is_valid_number(parsed):
    phonenumbers.truncate_too_long_number(parsed)

  # The number is returned even if it is still not valid, as before.
  return "%s%s" % (parsed.country_code, parsed.national_number)

In [0]:
# This function cleans the user Landline field
def fixUserLandline(phone_no):
  """Strip hyphens from a landline number and validate it.

  Args:
    phone_no: Raw landline string (e.g. "585-522-4351"), or None.

  Returns:
    The number with hyphens removed when it consists of exactly ten ASCII
    digits; otherwise None. None input returns None.
  """
  if phone_no is None:
    return None

  digits = phone_no.replace('-', '')
  # Require ten digits, not merely ten characters, so values such as
  # "abcdefghij" are rejected. \Z (not $) also rejects a trailing newline.
  if re.match(r"[0-9]{10}\Z", digits) is None:
    return None

  return digits

In [0]:
# This function cleans the user Email
def fixUserEmail(email):
  """Validate an email address against a simple pattern.

  Args:
    email: Raw email string, or None.

  Returns:
    ``email`` unchanged when it matches the pattern, otherwise None.
    None input returns None instead of raising inside ``re.match``.
  """
  # The email column is nullable, so guard before handing the value to re.match.
  if email is None:
    return None

  # The top-level domain needs at least one letter ("+" rather than "*"),
  # so addresses ending in a bare dot such as "user@domain." are rejected.
  if re.match(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]+$", email):
    return email
  return None

In [0]:
# Convert the data pre-processing functions into Spark UDFs that return strings.
# F.udf is used because pyspark.sql.functions is imported as F in this notebook,
# whereas a bare `udf` name is not imported anywhere in it. The functions are
# passed directly; wrapping them in a lambda added nothing.
udf_fixUserMobile = F.udf(fixUserMobile, returnType=StringType())
udf_fixUserLandline = F.udf(fixUserLandline, returnType=StringType())
udf_fixUserEmail = F.udf(fixUserEmail, returnType=StringType())

In [0]:
# Build the clean stream: each contact column is overwritten with the result
# of its UDF applied to the corresponding column of the original stream.
clean_stream = (
    streamingInputDF
    .withColumn(
        "MobilePhone",
        udf_fixUserMobile(streamingInputDF["MobilePhone"], streamingInputDF["Country"]),
    )
    .withColumn("email", udf_fixUserEmail(streamingInputDF["email"]))
    .withColumn("Landline", udf_fixUserLandline(streamingInputDF["Landline"]))
)

In [0]:
# Display the pre-processed streaming data.
# Compare the email, MobilePhone and Landline columns with the raw stream:
# these are the three columns rewritten by the UDFs.
display(clean_stream)

id,Userid,SkinID,username,first_name,last_name,email,gender,ip_address,RegDate,RegIP,LastIP,DOB,Postcode,MobilePhone,Landline,Address1,City,County,Country,SelfExcludedUntil,Status,batch
1,1,Khaki,ghelder0,Ginevra,Helder,ghelder0@163.com,Female,62.10.105.198,03/18/2018,97.155.122.41,6.255.217.169,02/02/1976,10175,16468574360,5855224351,4952 Hovde Pass,New York City,NY,US,12/12/2017,True,2018_06_25_05_48_05
2,2,Crimson,jalexsandrovich1,Juieta,Alexsandrovich,jalexsandrovich1@who.int,Female,70.140.145.104,11/28/2017,18.88.184.200,24.140.71.73,03/11/1970,13205,13152834415,3158360709,06 Forster Terrace,Syracuse,NY,US,3/17/2018,False,2018_06_25_05_48_05
3,3,Violet,jwaldren2,Jourdain,Waldren,jwaldren2@hostgator.com,Male,237.6.226.81,01/21/2017,162.235.104.85,161.3.27.45,05/25/1979,10029,12129111820,9175804992,3 Main Avenue,New York City,NY,US,12/11/2017,True,2018_06_25_05_48_05
4,4,Fuscia,fcromly3,Franciskus,Cromly,fcromly3@pinterest.com,Male,85.62.213.243,03/01/2018,233.208.64.178,123.126.218.213,06/06/1981,11355,19178454400,3159923012,7 Vidon Pass,Flushing,NY,US,4/9/2018,True,2018_06_25_05_48_05
5,5,Maroon,aessery4,Ange,Essery,aessery4@yolasite.com,Male,182.189.82.211,10/30/2017,119.204.180.54,119.4.135.188,04/04/1996,10014,13472743326,2126575133,66033 Quincy Point,New York City,NY,US,4/24/2017,True,2018_06_25_05_48_05
6,6,Turquoise,mmcmurty5,Marcia,McMurty,mmcmurty5@dyndns.org,Female,217.8.226.0,03/07/2017,154.131.188.170,0.209.193.69,05/17/1995,11215,19174303667,3472477054,4552 Clove Alley,Brooklyn,NY,US,5/4/2017,False,2018_06_25_05_48_05
7,7,Maroon,jtams6,Justinian,Tams,jtams6@google.co.uk,Male,239.135.224.122,04/15/2018,143.88.132.192,223.66.38.180,11/30/1997,12325,15189528841,9172303048,5045 Eagle Crest Road,Schenectady,NY,US,7/11/2017,False,2018_06_25_05_48_05
8,8,Teal,bocurrigan7,Ben,O'Currigan,bocurrigan7@xing.com,Male,138.12.104.157,09/15/2017,204.211.99.201,170.145.191.192,05/28/1988,14220,17167951262,5182924574,33 Clemons Lane,Buffalo,NY,US,10/3/2017,False,2018_06_25_05_48_05
9,9,Indigo,smozzi8,Sansone,Mozzi,smozzi8@tumblr.com,Male,151.20.31.114,07/29/2017,159.195.181.20,104.7.123.142,12/01/1971,12222,15188880666,7184249138,29 Straubel Junction,Albany,NY,US,11/12/2017,False,2018_06_25_05_48_05
10,10,Green,gloveguard9,Gasper,Loveguard,gloveguard9@xrea.com,Male,77.147.214.207,01/31/2018,226.241.187.121,97.248.213.244,03/29/1994,11436,17184705571,9146586994,351 West Street,Jamaica,NY,US,8/31/2017,True,2018_06_25_05_48_05


In [0]:
# Display the unprocessed stream for comparison.
# Observe the email, MobilePhone and Landline columns in their raw form.
display(streamingInputDF)

id,Userid,SkinID,username,first_name,last_name,email,gender,ip_address,RegDate,RegIP,LastIP,DOB,Postcode,MobilePhone,Landline,Address1,City,County,Country,SelfExcludedUntil,Status,batch
1,1,Khaki,ghelder0,Ginevra,Helder,ghelder0@163.com,Female,62.10.105.198,03/18/2018,97.155.122.41,6.255.217.169,02/02/1976,10175,(646) 8574360,585-522-4351,4952 Hovde Pass,New York City,NY,US,12/12/2017,True,2018_06_25_05_48_05
2,2,Crimson,jalexsandrovich1,Juieta,Alexsandrovich,jalexsandrovich1@who.int,Female,70.140.145.104,11/28/2017,18.88.184.200,24.140.71.73,03/11/1970,13205,(315) 2834415,315-836-0709,06 Forster Terrace,Syracuse,NY,US,3/17/2018,False,2018_06_25_05_48_05
3,3,Violet,jwaldren2,Jourdain,Waldren,jwaldren2@hostgator.com,Male,237.6.226.81,01/21/2017,162.235.104.85,161.3.27.45,05/25/1979,10029,(212) 9111820,917-580-4992,3 Main Avenue,New York City,NY,US,12/11/2017,True,2018_06_25_05_48_05
4,4,Fuscia,fcromly3,Franciskus,Cromly,fcromly3@pinterest.com,Male,85.62.213.243,03/01/2018,233.208.64.178,123.126.218.213,06/06/1981,11355,(917) 8454400,315-992-3012,7 Vidon Pass,Flushing,NY,US,4/9/2018,True,2018_06_25_05_48_05
5,5,Maroon,aessery4,Ange,Essery,aessery4@yolasite.com,Male,182.189.82.211,10/30/2017,119.204.180.54,119.4.135.188,04/04/1996,10014,(347) 2743326,212-657-5133,66033 Quincy Point,New York City,NY,US,4/24/2017,True,2018_06_25_05_48_05
6,6,Turquoise,mmcmurty5,Marcia,McMurty,mmcmurty5@dyndns.org,Female,217.8.226.0,03/07/2017,154.131.188.170,0.209.193.69,05/17/1995,11215,(917) 4303667,347-247-7054,4552 Clove Alley,Brooklyn,NY,US,5/4/2017,False,2018_06_25_05_48_05
7,7,Maroon,jtams6,Justinian,Tams,jtams6@google.co.uk,Male,239.135.224.122,04/15/2018,143.88.132.192,223.66.38.180,11/30/1997,12325,(518) 9528841,917-230-3048,5045 Eagle Crest Road,Schenectady,NY,US,7/11/2017,False,2018_06_25_05_48_05
8,8,Teal,bocurrigan7,Ben,O'Currigan,bocurrigan7@xing.com,Male,138.12.104.157,09/15/2017,204.211.99.201,170.145.191.192,05/28/1988,14220,(716) 7951262,518-292-4574,33 Clemons Lane,Buffalo,NY,US,10/3/2017,False,2018_06_25_05_48_05
9,9,Indigo,smozzi8,Sansone,Mozzi,smozzi8@tumblr.com,Male,151.20.31.114,07/29/2017,159.195.181.20,104.7.123.142,12/01/1971,12222,(518) 8880666,718-424-9138,29 Straubel Junction,Albany,NY,US,11/12/2017,False,2018_06_25_05_48_05
10,10,Green,gloveguard9,Gasper,Loveguard,gloveguard9@xrea.com,Male,77.147.214.207,01/31/2018,226.241.187.121,97.248.213.244,03/29/1994,11436,(718) 4705571,914-658-6994,351 West Street,Jamaica,NY,US,8/31/2017,True,2018_06_25_05_48_05
