In [0]:
# import libraries
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
import requests
import json
import phonenumbers 
import re
import datetime
import pandas as pd
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import  col
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id

In [0]:
# Schema of the user records handled by the SCV (single customer view) load.
# "id" is the only non-nullable field; "id" and "Userid" are integers and
# every other field is a nullable string.
user_schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("Userid", IntegerType(), True),
    ]
    + [
        StructField(field_name, StringType(), True)
        for field_name in (
            "SkinID",
            "username",
            "first_name",
            "last_name",
            "email",
            "gender",
            "ip_address",
            "RegDate",
            "RegIP",
            "LastIP",
            "DOB",
            "Postcode",
            "MobilePhone",
            "Landline",
            "Address1",
            "City",
            "County",
            "Country",
            "SelfExcludedUntil",
            "Status",
        )
    ]
)
            

In [0]:
# (Re)create the base directory tree used to stage the user CSV files.
# WARNING: the first call recursively deletes /FileStore/users, so anything
# already stored under inprogress/ or completed/ is removed every time this
# cell runs.
dbutils.fs.rm("/FileStore/users",recurse=True)
dbutils.fs.mkdirs("/FileStore/users")
dbutils.fs.mkdirs("/FileStore/users/inprogress")
dbutils.fs.mkdirs("/FileStore/users/completed")


In [0]:
# List the contents of the base directory; the result is displayed as the
# cell output.
dbutils.fs.ls("dbfs:/FileStore/users")


In [0]:
def fixUserRow(c):
    """Cleanse one user record and return it as a new ``Row``.

    ``c`` is read by attribute and must expose every field of ``user_schema``.
    Three fields are rewritten; all others are copied through unchanged:

    * ``MobilePhone`` - parsed with ``phonenumbers`` using ``c.Country`` as the
      default region and re-emitted as "<country_code><national_number>".
      ``None`` when the value is missing or cannot be parsed.
    * ``Landline`` - dashes are removed; ``None`` unless exactly 10 characters
      remain.
    * ``email`` - kept only when it matches the address pattern below,
      otherwise ``None``. A missing email is treated as invalid.

    NOTE: this notebook defines ``fixUserRow`` again in a later cell; the
    later definition is the one in effect once that cell has run. Keep the
    two in sync or remove one of them.
    """
    # --- Mobile number ------------------------------------------------------
    clean_number = None
    number = c.MobilePhone
    if number is not None:
        try:
            parsed = phonenumbers.parse(number, c.Country)
            # When the number is not valid as parsed, try dropping trailing
            # digits; truncate_too_long_number() updates `parsed` in place
            # when it succeeds.
            if not phonenumbers.is_valid_number(parsed):
                phonenumbers.truncate_too_long_number(parsed)
            clean_number = "%s%s" % (parsed.country_code, parsed.national_number)
        except Exception:
            # Best effort: an unparseable number is stored as None instead of
            # failing the whole load. `Exception` (rather than a bare except)
            # keeps KeyboardInterrupt/SystemExit from being swallowed.
            clean_number = None

    # --- Landline -----------------------------------------------------------
    phone_no = c.Landline
    if phone_no is not None:
        phone_no = phone_no.replace('-', '')
        if len(phone_no) != 10:
            phone_no = None

    # --- Email --------------------------------------------------------------
    # The column is nullable, so guard against None before calling re.match
    # (which raises TypeError on a non-string argument).
    # NOTE(review): the trailing `[a-zA-Z]*` also accepts an empty top-level
    # domain such as "user@host." - confirm whether that is intended.
    valid_mail = None
    email = c.email
    if email is not None and re.match(
            r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$", email):
        valid_mail = email

    return Row(
        id=c.id,
        Userid=c.Userid,
        SkinID=c.SkinID,
        username=c.username,
        first_name=c.first_name,
        last_name=c.last_name,
        email=valid_mail,
        gender=c.gender,
        ip_address=c.ip_address,
        RegDate=c.RegDate,
        RegIP=c.RegIP,
        LastIP=c.LastIP,
        DOB=c.DOB,
        Postcode=c.Postcode,
        MobilePhone=clean_number,
        Landline=phone_no,
        Address1=c.Address1,
        City=c.City,
        County=c.County,
        Country=c.Country,
        SelfExcludedUntil=c.SelfExcludedUntil,
        Status=c.Status,
    )


In [0]:
# insert matching records into UserSCV table
def insertNewVersionOfUser(tableName):
  """Append the incoming version of every matched user to the UserSCV table.

  ``tableName`` names a table or view whose rows pair an existing UserSCV
  record with a newly loaded record; the new record's columns carry a "1"
  suffix (``Userid1``, ``first_name1``, ...). For each row one UserSCV record
  is built from:

  * ``ID`` of the existing record, which is also copied to ``RelatedID``;
  * the "1"-suffixed user fields, renamed back to their base names;
  * ``EntityId``, the ``Original*`` columns and the composite key columns of
    the existing record, carried over unchanged;
  * ``Load_date`` and ``LastModifiedDate`` set to the current local time
    (formatted "%m-%d-%Y %H:%M:%S") and ``CompareStatus`` set to 0.

  The rows are appended with ``insertInto``, which resolves columns by
  position, so the order of the select below must match UserSCV's columns.

  The previous implementation also ran ``select max(ID) ... from UserSCV`` and
  collected the result without using it; that extra Spark action is removed.
  """
  matched = spark.sql("select  * from " + tableName)
  load_ts = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

  # User fields taken from the incoming ("1"-suffixed) side of the match.
  user_fields = [
      "Userid", "SkinID", "username", "first_name", "last_name", "email",
      "gender", "ip_address", "RegDate", "RegIP", "LastIP", "DOB", "Postcode",
      "MobilePhone", "Landline", "Address1", "City", "County", "Country",
      "SelfExcludedUntil", "Status",
  ]
  # Columns carried over unchanged from the existing UserSCV side.
  carried_over = [
      "EntityId",
      "OriginalEmail", "OriginalFirstname", "OriginalLastname",
      "OriginalRegDate", "OriginalDOB", "OriginalPostcode",
      "OriginalMobilePhone", "OriginalAddress1", "OriginalCity",
      "Firstname_Lastname_RegIP", "Firstname_Lastname_LastIP",
      "Firstname_Lastname_Username", "Firstname_DOB_City",
      "Firstname_Postcode", "Firstname_Mobilephone", "DOB_Postcode",
      "Address1_Postcode", "Firstname_Lastname_Address1_City",
  ]

  df_userSCV = matched.select(
      [col("ID")]
      + [col(name + "1").alias(name) for name in user_fields]
      + [col("ID").alias("RelatedID")]
      + [col(name) for name in carried_over]
      + [
          lit(load_ts).alias("Load_date"),
          lit(load_ts).alias("LastModifiedDate"),
          lit(0).alias("CompareStatus"),
      ]
  )

  df_userSCV.write.insertInto("UserSCV")
  
  

In [0]:
def getDataFrameFromCSV(csvFileLocation, schema):
  """Load a user CSV, cleanse it and return it with "1"-suffixed column names.

  The file at ``csvFileLocation`` is read with a header row using ``schema``.
  Every row is passed through ``fixUserRow`` and the result is rebuilt as a
  DataFrame with ``user_schema``. Each column is then renamed by appending
  "1" (``ID`` -> ``ID1``, ``Address1`` -> ``Address11``, ...) so the frame
  can be joined to UserSCV without column-name clashes.
  """
  from pyspark.sql.functions import col

  raw_users = (
      sqlContext.read.format('com.databricks.spark.csv')
      .options(header=True)
      .schema(schema)
      .load(csvFileLocation)
  )

  # Cleanse row by row, then restore the typed columns.
  cleansed_rows = raw_users.rdd.map(fixUserRow)
  cleansed_users = sqlContext.createDataFrame(cleansed_rows, user_schema)

  source_names = (
      "ID", "Userid", "SkinID", "username", "first_name", "last_name",
      "email", "gender", "ip_address", "RegDate", "RegIP", "LastIP", "DOB",
      "Postcode", "MobilePhone", "Landline", "Address1", "City", "County",
      "Country", "SelfExcludedUntil", "Status",
  )
  return cleansed_users.select(
      [col(name).alias(name + "1") for name in source_names]
  )
  

In [0]:
def createOutputTable(tableName):
  """Derive the UserSCV columns for the rows of ``tableName`` and store them.

  Starting from ``select * from <tableName>``, the following columns are set,
  in this order (the order matters because ``insertInto`` is positional):

  * ``ID`` - generated with ``monotonically_increasing_id()``; presumably
    this replaces the incoming ``id`` column, since Spark resolves column
    names case-insensitively by default - TODO confirm;
  * ``RelatedID`` - integer -1; ``EntityId`` - copy of ``ID``;
  * ``Original*`` - copies of the corresponding user fields;
  * composite key columns - the named fields concatenated with "_";
  * ``Load_date`` / ``LastModifiedDate`` - current local time formatted
    "%m-%d-%Y %H:%M:%S"; ``CompareStatus`` - null string.

  The rows are appended to UserSCV when SHOW TABLES reports exactly one
  matching table, otherwise UserSCV is created with ``saveAsTable``.

  NOTE(review): the generated IDs are produced independently on every call,
  so rows appended by a later call may reuse ID values already present in
  UserSCV - confirm whether that is acceptable.
  """
  users = spark.sql("select * from " + tableName)
  load_ts = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

  def underscore_joined(*names):
    # Build concat(col(a), '_', col(b), ...) for the given column names.
    pieces = [col(names[0])]
    for name in names[1:]:
      pieces += [lit('_'), col(name)]
    return F.concat(*pieces)

  userSCV = (
      users.withColumn("ID", F.monotonically_increasing_id())
      .withColumn("RelatedID", lit(-1).cast(IntegerType()))
      .withColumn("EntityId", col("ID"))
  )

  # Keep a copy of the values as first loaded (these are copies, not renames).
  original_copies = [
      ("OriginalEmail", "email"),
      ("OriginalFirstname", "first_name"),
      ("OriginalLastname", "last_name"),
      ("OriginalRegDate", "RegDate"),
      ("OriginalDOB", "DOB"),
      ("OriginalPostcode", "Postcode"),
      ("OriginalMobilePhone", "MobilePhone"),
      ("OriginalAddress1", "Address1"),
      ("OriginalCity", "City"),
  ]
  for new_name, source_name in original_copies:
    userSCV = userSCV.withColumn(new_name, col(source_name))

  # Composite keys built from several user fields.
  composite_keys = [
      ("Firstname_Lastname_RegIP", ("first_name", "last_name", "RegIP")),
      ("Firstname_Lastname_LastIP", ("first_name", "last_name", "LastIP")),
      ("Firstname_Lastname_Username", ("first_name", "last_name", "Username")),
      ("Firstname_DOB_City", ("first_name", "DOB", "City")),
      ("Firstname_Postcode", ("first_name", "Postcode")),
      ("Firstname_Mobilephone", ("first_name", "MobilePhone")),
      ("DOB_Postcode", ("DOB", "Postcode")),
      ("Address1_Postcode", ("Address1", "Postcode")),
      ("Firstname_Lastname_Address1_City",
       ("first_name", "last_name", "Address1", "City")),
  ]
  for key_name, parts in composite_keys:
    userSCV = userSCV.withColumn(key_name, underscore_joined(*parts))

  userSCV = userSCV.withColumn("Load_date", lit(load_ts))
  userSCV = userSCV.withColumn("LastModifiedDate", lit(load_ts))
  # The earlier lit(0) assignment was immediately overwritten by this one, so
  # only the effective value (a null string) is kept.
  userSCV = userSCV.withColumn("CompareStatus", lit(None).cast(StringType()))

  # Append to the existing table, or create it from this DataFrame.
  if (len(spark.sql("SHOW TABLES LIKE '" + "UserSCV"+ "'").collect()) == 1):
    userSCV.write.insertInto("UserSCV")
  else:
    userSCV.write.saveAsTable("UserSCV")



In [0]:
def compareData(tableName):
  """Insert new user versions when the comparison table ``tableName`` has rows.

  The table is refreshed and counted; when it holds at least one row,
  ``insertNewVersionOfUser(tableName)`` is called. Nothing is returned.
  """
  spark.sql("REFRESH TABLE  " + tableName)
  matches = spark.sql ("select * from " + tableName)
  if matches.count() <= 0:
    return
  insertNewVersionOfUser(tableName)
  

In [0]:
def processUserInfo(filePath):
  """Load one user CSV file into users_load and merge it into UserSCV.

  Steps:

  1. Read ``filePath`` with ``user_schema`` and cleanse every row with
     ``fixUserRow``.
  2. Append the cleansed rows to the ``users_load`` table, creating the table
     when SHOW TABLES does not report it.
  3. When ``UserSCV`` already exists, join it to the freshly loaded file on
     first name + IP address and then on first name + username; each join is
     registered as a temp view and handed to ``compareData``.
  4. In both cases, register the cleansed rows as the ``output_user`` view
     and pass it to ``createOutputTable``.
  """
  def table_exists(name):
    # True when SHOW TABLES reports exactly one table matching `name`.
    return len(spark.sql("SHOW TABLES LIKE '" + name + "'").collect()) == 1

  print("-----------------------------------------------------------------------------------")
  print("Reading csv file from :" + filePath)

  raw_users = (
      sqlContext.read.format('com.databricks.spark.csv')
      .options(header=True)
      .schema(user_schema)
      .load(filePath)
  )

  # Cleanse row by row, then restore the typed columns.
  cleansed_rows = raw_users.rdd.map(lambda c: fixUserRow(c))
  df_user = sqlContext.createDataFrame(cleansed_rows, user_schema)

  # Stage the cleansed rows in the intermediate table.
  staging_writer = df_user.write
  if table_exists("users_load"):
    staging_writer.insertInto("users_load")
  else:
    staging_writer.saveAsTable("users_load")

  print("After saving data to userload")

  if table_exists("UserSCV"):
    # Compare the new file with the records already held in UserSCV.
    userSCV = spark.sql("select * from UserSCV")
    df_new = getDataFrameFromCSV(filePath, user_schema)

    # (progress message, temp view name, second join column)
    match_criteria = (
        ("1. checking for firstName + IP", "c1_FN_IP", "ip_address"),
        ("1. checking for firstName + username", "c1_FN_username", "username"),
    )
    for message, view_name, second_key in match_criteria:
      print(message)
      same_user = (userSCV.first_name == df_new.first_name1) & \
                  (userSCV[second_key] == df_new[second_key + "1"])
      userSCV.join(df_new, same_user).createOrReplaceTempView(view_name)
      compareData(view_name)
  else:
    print("UserSCV table do not exist; hence creating it")

  # Whether or not UserSCV existed, the cleansed rows are written to it.
  df_user.createOrReplaceTempView("output_user")
  createOutputTable("output_user")

  print("-----------------------------------------------------------------------------------")

 

In [0]:
def fixUserRow(c):
    """Cleanse one user record and return it as a new ``Row``.

    ``c`` is read by attribute and must expose every field of ``user_schema``.
    Three fields are rewritten; all others are copied through unchanged:

    * ``MobilePhone`` - parsed with ``phonenumbers`` using ``c.Country`` as the
      default region and re-emitted as "<country_code><national_number>".
      ``None`` when the value is missing or cannot be parsed.
    * ``Landline`` - dashes are removed; ``None`` unless exactly 10 characters
      remain.
    * ``email`` - kept only when it matches the address pattern below,
      otherwise ``None``. A missing email is treated as invalid.

    NOTE: ``fixUserRow`` is also defined in an earlier cell of this notebook;
    this later definition replaces it once this cell has run. Keep the two in
    sync or remove one of them.
    """
    # --- Mobile number ------------------------------------------------------
    clean_number = None
    number = c.MobilePhone
    if number is not None:
        try:
            parsed = phonenumbers.parse(number, c.Country)
            # When the number is not valid as parsed, try dropping trailing
            # digits; truncate_too_long_number() updates `parsed` in place
            # when it succeeds.
            if not phonenumbers.is_valid_number(parsed):
                phonenumbers.truncate_too_long_number(parsed)
            clean_number = "%s%s" % (parsed.country_code, parsed.national_number)
        except Exception:
            # Best effort: an unparseable number is stored as None instead of
            # failing the whole load. `Exception` (rather than a bare except)
            # keeps KeyboardInterrupt/SystemExit from being swallowed.
            clean_number = None

    # --- Landline -----------------------------------------------------------
    phone_no = c.Landline
    if phone_no is not None:
        phone_no = phone_no.replace('-', '')
        if len(phone_no) != 10:
            phone_no = None

    # --- Email --------------------------------------------------------------
    # The column is nullable, so guard against None before calling re.match
    # (which raises TypeError on a non-string argument).
    # NOTE(review): the trailing `[a-zA-Z]*` also accepts an empty top-level
    # domain such as "user@host." - confirm whether that is intended.
    valid_mail = None
    email = c.email
    if email is not None and re.match(
            r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$", email):
        valid_mail = email

    return Row(
        id=c.id,
        Userid=c.Userid,
        SkinID=c.SkinID,
        username=c.username,
        first_name=c.first_name,
        last_name=c.last_name,
        email=valid_mail,
        gender=c.gender,
        ip_address=c.ip_address,
        RegDate=c.RegDate,
        RegIP=c.RegIP,
        LastIP=c.LastIP,
        DOB=c.DOB,
        Postcode=c.Postcode,
        MobilePhone=clean_number,
        Landline=phone_no,
        Address1=c.Address1,
        City=c.City,
        County=c.County,
        Country=c.Country,
        SelfExcludedUntil=c.SelfExcludedUntil,
        Status=c.Status,
    )


In [0]:
import schedule
import time
import requests
import datetime

# (Re)create the base directory tree used to stage the user CSV files.
# WARNING: the first call recursively deletes /FileStore/users, so files
# already stored under inprogress/ or completed/ are removed every time this
# cell runs. The same four calls also appear in an earlier cell.
dbutils.fs.rm("/FileStore/users",recurse=True)
dbutils.fs.mkdirs("/FileStore/users")
dbutils.fs.mkdirs("/FileStore/users/inprogress")
dbutils.fs.mkdirs("/FileStore/users/completed")

 
def job():
  """Download one batch of users and stage it as a single CSV file.

  The batch is fetched from the Mockaroo endpoint with ``pandas.read_csv``,
  converted to a Spark DataFrame and written (one partition, with header,
  quoting disabled) to a temporary directory named after the current minute
  under /FileStore/users/inprogress/. The first ``*.csv`` file Spark produced
  there is copied next to that directory, and the temporary directory is then
  removed.

  Changes from the previous version:

  * only names ending in ".csv" are treated as the data file (a substring
    test would also match names such as "*.csv.crc");
  * the temporary directory is removed once, after the scan, instead of
    inside the loop, so a later match can no longer be copied from a
    directory that has already been deleted;
  * the copy destination is the full target file path.
  """
  print("calling CSV load function")
  # SECURITY NOTE(review): the API key is embedded in this URL; consider
  # loading it from a secret store or environment variable instead.
  # NOTE(review): the endpoint name ends in ".json" but the response is parsed
  # as CSV - confirm the endpoint really returns CSV.
  url = "https://my.api.mockaroo.com/users_load.json?key=6af9c3e0"
  import pandas as pd
  df = spark.createDataFrame(pd.read_csv(url))
  ts = time.time()
  st = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H_%M')
  fileName = '/FileStore/users/inprogress/'+ st + '.tmp'
  fileprefix = '/FileStore/users/inprogress/'
  df.coalesce(1).write.format("com.databricks.spark.csv") \
    .option("header", True) \
    .option("quote", "") \
    .save(fileName)  #saved to the FileStore

  # Spark wrote a directory; pick the data file out of it.
  csvFileLocation = ''
  for fileInfo in dbutils.fs.ls(fileName):
    if fileInfo.name.endswith(".csv"):
      print("this file is csv file.." )
      print(fileInfo.path)
      csvFileLocation = fileprefix + fileInfo.name
      dbutils.fs.cp(fileInfo.path, csvFileLocation)
      break

  # Drop the temporary output directory now that the scan is finished.
  dbutils.fs.rm(fileName,recurse=True)

  # TODO: processing of the staged file is currently disabled.
  #if (len(csvFileLocation) >0):
  #  processUserInfo(csvFileLocation)
  #  dbutils.fs.mv(csvFileLocation, '/FileStore/users/completed/')
      
# Register job() with the scheduler to run every 10 seconds.
schedule.every(10).seconds.do(job)
 

# Poll the scheduler once per second. The loop has no exit condition, so this
# cell keeps running until it is interrupted, and cells placed after it are
# not reached by "Run All".
while True:
    schedule.run_pending()
    time.sleep(1)

In [0]:
# Refresh the UserSCV table, then load it and display its row count as the
# cell output.
spark.sql("REFRESH TABLE UserSCV")
df1 = spark.sql ("select  * from UserSCV")

df1.count()