In [0]:
# http://optimus-ironmussa.readthedocs.io/en/latest/sections/transforming.html#dataframetransformer-replace-na-value-columns-none
#d_frontend_t = finaldf.select( "frontend_id","network_id","frontend_name","parent_frontend")
#windowSpec = W.orderBy("frontend_id","network_id","frontend_name","parent_frontend")
#d_frontend = d_frontend_t.distinct().withColumn("seq", F.row_number().over(windowSpec))
#d_frontend.write.mode("overwrite").saveAsTable("d_frontend")

#windowSpec = W.orderBy("affiliate_id","tracker_id","referral_link")

#d_affl = finaldf.select("affiliate_id","tracker_id","referral_link").distinct().withColumn("seq", F.row_number().over(windowSpec))
#d_affl.write.mode("overwrite").saveAsTable("d_affl")

import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime

from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col

import time



In [0]:
# Create (or reuse) the Spark session.
# NOTE: despite its name, `sparkContext` holds a SparkSession; the name is kept
# unchanged so that anything else referring to it keeps working.
sparkContext = (
    SparkSession.builder
    .appName("Apartment POC")
    .getOrCreate()
)

# HiveContext and SQLContext take a SparkContext, so hand them the session's
# underlying context rather than the SparkSession object itself.
hivecontext = HiveContext(sparkContext.sparkContext)

# Configure the Hive context.
hivecontext.setConf('hive.support.concurrency', 'true')
hivecontext.setConf('hive.enforce.bucketing', 'true')
hivecontext.setConf('hive.exec.dynamic.partition.mode', 'nostrict')
hivecontext.setConf('hive.compactor.initiator.on', 'true')
hivecontext.setConf('hive.compactor.worker.threads', '1')

sqlContext = SQLContext(sparkContext.sparkContext)

In [0]:
# Each schema below lists (field name, Spark type, nullable) in column order.

# Landlord records as delivered by the API
landlord_schema = (
    StructType()
    .add("Landlord_id", IntegerType(), False)
    .add("Password", StringType(), True)
    .add("Landlord_name", StringType(), False)
    .add("Address_line_1", StringType(), False)
    .add("City", StringType(), False)
    .add("Post_code", StringType(), True)
    .add("Region", StringType(), True)
)

# Landlord records prefixed with the landlord_seq column
landlordSeq_schema = (
    StructType()
    .add("landlord_seq", IntegerType(), False)
    .add("Landlord_id", IntegerType(), False)
    .add("Password", StringType(), True)
    .add("Landlord_name", StringType(), False)
    .add("Address_line_1", StringType(), False)
    .add("City", StringType(), False)
    .add("Post_code", StringType(), True)
    .add("Region", StringType(), True)
)

# Building records
building_schema = (
    StructType()
    .add("Landlord_id", IntegerType(), False)
    .add("Building_name", StringType(), True)
    .add("Address_line_1", StringType(), False)
    .add("City", StringType(), False)
    .add("Post_code", StringType(), True)
    .add("Region", StringType(), True)
)

# Apartment records
apartment_schema = (
    StructType()
    .add("Apartment_number", IntegerType(), True)
    .add("Type", StringType(), True)
    .add("Rent_fee", StringType(), True)
    .add("Building_name", StringType(), True)
    .add("Appt_details", StringType(), True)
)

# Contractor records
contractor_schema = (
    StructType()
    .add("Contract_id", IntegerType(), False)
    .add("Name", StringType(), True)
    .add("Address_line_1", StringType(), False)
    .add("City", StringType(), False)
    .add("Post_code", StringType(), True)
    .add("Region", StringType(), True)
)

# Tenant records
tenant_schema = (
    StructType()
    .add("Tenant_id", IntegerType(), False)
    .add("First_name", StringType(), True)
    .add("Last_name", StringType(), False)
    .add("Ssn", StringType(), True)
    .add("Phone", StringType(), True)
    .add("Email", StringType(), True)
    .add("Mobile", StringType(), True)
)

# Lease records
lease_schema = (
    StructType()
    .add("Lease_id", IntegerType(), False)
    .add("Start", StringType(), True)
    .add("End", StringType(), False)
    .add("Deposit", StringType(), True)
    .add("Tenant_id", IntegerType(), True)
    .add("Apartment_id", IntegerType(), True)
)

# Rent records
rent_schema = (
    StructType()
    .add("Rent_id", IntegerType(), False)
    .add("Rent_fee", StringType(), True)
    .add("Late_fee", StringType(), False)
    .add("Due_date", TimestampType(), True)
    .add("Lease_id", IntegerType(), True)
    .add("Pay_id", IntegerType(), True)
)

# Payment records
payment_schema = (
    StructType()
    .add("Payment_id", IntegerType(), False)
    .add("Pay_date", TimestampType(), True)
    .add("Pay_amount", StringType(), False)
    .add("Method", StringType(), True)
    .add("Rent_id", IntegerType(), True)
)

# Apartment maintenance records
apt_maintenance_schema = (
    StructType()
    .add("Maintenance_id", IntegerType(), False)
    .add("Apartment_number", IntegerType(), True)
    .add("Mdate", StringType(), False)
    .add("Issue_reported", StringType(), True)
    .add("Contractor_id", IntegerType(), True)
    .add("Resolution", StringType(), True)
    .add("Status", StringType(), True)
    .add("Charges_incurred", StringType(), True)
)

# Building maintenance records
building_maintenance_schema = (
    StructType()
    .add("Maintenance_id", IntegerType(), False)
    .add("Building_name", StringType(), True)
    .add("Ndate", StringType(), False)
    .add("Issue_reported", StringType(), True)
    .add("Contractor_id", IntegerType(), True)
    .add("Resolution", StringType(), True)
    .add("Status", StringType(), True)
)


In [0]:
# Function to get SparkDataFrame after reading JSON data from API
def getSparkDataFrame(url, schema, timeout=30):
  """Fetch JSON from `url` and load it into a Spark DataFrame using `schema`.

  Args:
    url: HTTP(S) endpoint returning a JSON document.
    schema: StructType applied when Spark parses the JSON.
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    A Spark DataFrame read from the response body with the given schema.

  Raises:
    requests.HTTPError: when the endpoint answers with an error status.
    ValueError: when the response body is not valid JSON.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on HTTP errors instead of trying to parse an error payload.
  response.raise_for_status()
  # Re-serialise the parsed document so Spark receives one JSON string.
  payload = json.dumps(response.json())
  # Resolve the session explicitly rather than relying on `sc` / `spark`
  # globals that only exist when the notebook host injects them.
  session = SparkSession.builder.getOrCreate()
  json_rdd = session.sparkContext.parallelize([payload])
  return session.read.schema(schema).json(json_rdd)

# convert string value to Float value
def string_to_float(x):
  """Convert a currency-style string such as "$12.50" to a float.

  Args:
    x: text holding a number, optionally prefixed by one currency symbol;
       may be None.

  Returns:
    The numeric value as a float, or None when `x` is None or blank.

  Raises:
    ValueError: when the remaining text is not a valid number.
  """
  if x is None:
    return None
  text = x.strip()
  if not text:
    return None
  # Drop a single leading symbol (e.g. "$"), but keep the first character
  # when it already belongs to the number (digit, sign or decimal point).
  if not (text[0].isdigit() or text[0] in "+-."):
    text = text[1:]
  return float(text)

# Get DataFrame without new Line characters
# Especially for Apartment, ApartmentMaintenance

def getSparkDataFrameWithoutLFChar(url, schema, timeout=30):
  """Fetch JSON from `url`, strip escaped line feeds, and build a DataFrame.

  Args:
    url: HTTP(S) endpoint returning a JSON document.
    schema: StructType applied when creating the DataFrame.
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    A Spark DataFrame created from the decoded JSON records.

  Raises:
    requests.HTTPError: when the endpoint answers with an error status.
    ValueError: when the cleaned body is not valid JSON.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  # Keep every line of the body; the previous loop overwrote its buffer on
  # each iteration, so only the last line of a multi-line response survived.
  body = ''.join(
      line.decode(encoding='utf-8', errors='strict')
      for line in response.iter_lines()
  )
  # Remove the escaped "\n" sequences embedded in the JSON text.
  body = body.replace('\\n', '')
  records = json.loads(body)
  # Resolve the session explicitly instead of relying on a `spark` global.
  session = SparkSession.builder.getOrCreate()
  return session.createDataFrame(records, schema)

# Spark UDF wrapper around string_to_float. The Python function returns a
# float, so the declared return type is DoubleType rather than StringType.
udfstring_to_float = udf(string_to_float, DoubleType())

def fixTenantRow(c):
    """Return a cleaned copy of one tenant row.

    Cleaning rules:
      * Mobile: parsed with `phonenumbers`; on success it is rewritten as
        "<country_code><national_number>", otherwise it becomes None.
      * Phone: spaces are removed; anything that is not exactly 10 characters
        long afterwards becomes None.
      * Email: kept only when it matches the e-mail pattern, else None.

    Args:
        c: a row exposing Tenant_id, First_name, Last_name, Ssn, Phone, Email
           and Mobile attributes.

    Returns:
        A Row with the same seven fields, holding the cleaned values.
    """
    # --- Mobile ---------------------------------------------------------
    clean_number = None
    number = c.Mobile
    if number is not None:
        try:
            parsed = phonenumbers.parse(number, None)
            # truncate_too_long_number edits `parsed` in place, so it must
            # still run for invalid numbers even though its result is unused.
            if not phonenumbers.is_valid_number(parsed):
                phonenumbers.truncate_too_long_number(parsed)
            clean_number = "%s%s" % (parsed.country_code, parsed.national_number)
        except Exception:
            # Best effort: an unparseable mobile number is stored as None.
            clean_number = None

    # --- Phone ----------------------------------------------------------
    phone_no = c.Phone
    if phone_no is not None:
        phone_no = phone_no.replace(' ', '')
        if len(phone_no) != 10:
            phone_no = None

    # --- Email ----------------------------------------------------------
    # Guard against a missing e-mail: re.match raises TypeError on None.
    valid_mail = None
    email = c.Email
    if email is not None and re.match(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$", email):
        valid_mail = email

    return Row(
        Tenant_id=c.Tenant_id,
        First_name=c.First_name,
        Last_name=c.Last_name,
        Ssn=c.Ssn,
        Phone=phone_no,
        Email=valid_mail,
        Mobile=clean_number
    )

  
# Create function to check each row if it exists in db
def checkIsRowExists(c, table_df):
  """Build a landlord Row, attaching landlord_seq when the id is already stored.

  Args:
    c: incoming landlord row (Landlord_id, Password, Landlord_name,
       Address_line_1, City, Post_code, Region attributes).
    table_df: DataFrame with at least Landlord_id and landlord_seq columns.

  Returns:
    A Row with landlord_seq and Landlord_id filled in when a row with the
    same Landlord_id exists in `table_df`, and both set to None otherwise.
  """
  # Fetch the stored sequence value itself; a Column object cannot be put
  # into a Row, and first() also avoids a separate count() job.
  match = (
      table_df
      .filter(table_df.Landlord_id.isin(c.Landlord_id))
      .select("landlord_seq")
      .first()
  )

  if match is not None:
    landlord_seq = match.landlord_seq
    landlord_id = c.Landlord_id
  else:
    # NOTE(review): the original cleared Landlord_id as well as landlord_seq
    # for unknown rows; kept as is - confirm that this is intended.
    landlord_seq = None
    landlord_id = None

  return Row(
      landlord_seq=landlord_seq,
      Landlord_id=landlord_id,
      Password=c.Password,
      Landlord_name=c.Landlord_name,
      # Previously filled from Landlord_name by mistake.
      Address_line_1=c.Address_line_1,
      City=c.City,
      Post_code=c.Post_code,
      Region=c.Region
  )


# function validating Post Code  
def validatePostCode(postCode):
  """Return `postCode` when it is a 5-digit or 5+4 ("12345-6789") code.

  Args:
    postCode: the post code text, or None.

  Returns:
    The unchanged post code when it is well formed, otherwise None
    (including when the input is None).
  """
  if postCode is None:
    return None
  # fullmatch rejects a trailing newline, which "$" with match() would accept.
  if re.fullmatch(r"[0-9]{5}(-[0-9]{4})?", postCode):
    return postCode
  return None
  
def unionAll(*dfs):
  """Fold the given DataFrames left to right with DataFrame.unionAll."""
  def _append(combined, following):
    return DataFrame.unionAll(combined, following)

  return reduce(_append, dfs)


# Spark UDF wrapping validatePostCode; yields a string column.
udfValidatePostCode = udf(validatePostCode, returnType=StringType())

In [0]:
# NOTE(review): the next cell starts with this exact same call and reassigns
# `df`, so this cell only causes an extra API request.
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
df = getSparkDataFrame("https://my.api.mockaroo.com/landlord.json?key=6af9c3e0", landlord_schema)


In [0]:
# Get Landlord JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
df = getSparkDataFrame("https://my.api.mockaroo.com/landlord.json?key=6af9c3e0", landlord_schema)
df.printSchema()
df.show(5)

# Validate Post_code through the UDF into a temporary "PostCode" column, keep
# the wanted columns, then rename the temporary column back to "Post_code".
landlord_df = df.withColumn("PostCode", udfValidatePostCode("Post_code") )
landlord_df = landlord_df.select(landlord_df.Landlord_id, landlord_df.Password, landlord_df.Landlord_name, landlord_df.Address_line_1,  landlord_df.City, landlord_df.PostCode, landlord_df.Region)
landlord_df = landlord_df.withColumnRenamed("PostCode", "Post_code")

# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# landlord_df, so the clean-ups below may not affect landlord_df - confirm
# against the Optimus API.
transformer = op.DataFrameTransformer(landlord_df)
# Replace NA with 0's
transformer.replace_na(0.0, columns="*")
# Clear accents: requested for every column ('*')
transformer.clear_accents(columns='*')
# Remove special characters from the listed text columns
transformer.remove_special_chars(columns=['Landlord_name', 'Address_line_1', 'City', 'Region'])
# NOTE: no temp table is registered for landlord_df in this cell.


In [0]:
# Today's date; used below to select the rows whose Load_date starts with it.
dateStr = datetime.date.today().strftime("%m-%d-%Y")
print(dateStr)
# Current date and time as text.
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

selectSql = " select Landlord_id, Password, Landlord_name, Address_line_1, City, Post_code, Region from landlord_data where Load_date like '" + dateStr + "%'"

selectSqlWithSeq = " select  landlord_seq, Landlord_id, Password, Landlord_name, Address_line_1, City, Post_code, Region from landlord_data where Load_date like '" + dateStr + "%'"

# Rows already stored in the landlord table for today, without and with landlord_seq.
landlord_table_df = hivecontext.sql(selectSql)
landlord_table_full_df = hivecontext.sql(selectSqlWithSeq)

# Count once and reuse the value (the count was previously computed twice).
existing_rows = landlord_table_df.count()
print(existing_rows)

# Incoming rows that are not already present, column for column, in the table.
new_df = landlord_df.subtract(landlord_table_df)
print(new_df.count())

In [0]:
# Merge the incoming landlord rows with the rows already stored, number the
# result with landlord_seq, stamp it, and insert it into landlord_data.
landlord_existing_rows = None
if existing_rows > 0:
  landlord_update_cols = ["Password", "Landlord_name", "Address_line_1", "City", "Post_code", "Region"]

  # Suffix the incoming columns with "_1" so they can sit next to the table's
  # columns after the join.
  new_rows_df = new_df
  for field_name in landlord_update_cols:
    new_rows_df = new_rows_df.withColumnRenamed(field_name, field_name + "_1")

  # Join on the incoming frame's own key. The original referenced
  # landlord_df here, a name that a later cell rebinds to a different frame.
  # For each column take the incoming value when it differs from the stored one.
  # NOTE(review): the final filter keeps only rows whose Landlord_id exists on
  # the table side, so incoming rows without a table match are dropped here -
  # confirm that this is intended.
  final_join_df = (
      landlord_table_df
      .join(new_rows_df, landlord_table_df.Landlord_id == new_rows_df.Landlord_id, 'outer')
      .select(
          landlord_table_df.Landlord_id,
          *[
              F.when(new_rows_df[field_name + "_1"] != landlord_table_df[field_name],
                     new_rows_df[field_name + "_1"])
               .otherwise(landlord_table_df[field_name])
               .alias(field_name)
              for field_name in landlord_update_cols
          ]
      )
      .filter(landlord_table_df.Landlord_id.isNotNull())
  )

  # Incoming rows whose (Landlord_id, Landlord_name) pair is already stored.
  landlord_existing_rows_df = (landlord_table_full_df.select('Landlord_id', 'Landlord_name')).intersect(new_df.select('Landlord_id', 'Landlord_name'))
  landlord_existing_count = landlord_existing_rows_df.count()
  print(landlord_existing_count)
  if landlord_existing_count > 0:
    # Fixes: the original used a misspelled, undefined variable name here and
    # joined in a way that duplicated column names, which made the final
    # select ambiguous. Join on both key columns to pick up the remaining
    # incoming columns, and take only landlord_seq from the stored table.
    landlord_existing_rows = (
        landlord_existing_rows_df
        .join(new_df, ["Landlord_id", "Landlord_name"])
        .join(landlord_table_full_df.select("landlord_seq", "Landlord_id"), "Landlord_id")
        .select("landlord_seq", "Landlord_id", "Password", "Landlord_name",
                "Address_line_1", "City", "Post_code", "Region")
    )
else:
  final_join_df = new_df

# Add landlord_seq column
windowSpec = W.orderBy("Landlord_id","Landlord_name","City","Region")
final_join_df = final_join_df.distinct().withColumn("landlord_seq", F.row_number().over(windowSpec))
# make sure columns are in order
final_join_df = final_join_df.select("landlord_seq", "Landlord_id", "Password", "Landlord_name", "Address_line_1", "City", "Post_code", "Region" )

# Append the rows that already had a sequence number
if landlord_existing_rows is not None:
  landlord_existing_rows.show()
  final_join_df = unionAll(landlord_existing_rows, final_join_df)

# Stamp every row with the load time, both as a timestamp and as text.
timestamp = datetime.datetime.fromtimestamp(time.time())
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

final_join_df = final_join_df.withColumn("EventTimestamp", lit(timestamp))
final_join_df = final_join_df.withColumn("Load_date", lit(dateTimeStr))

# insertInto matches columns by position, so fix the column order explicitly.
final_join_df = final_join_df.select("landlord_seq", "Landlord_id", "Password", "Landlord_name", "Address_line_1", "City", "Post_code", "Region", "EventTimestamp", "Load_date" )
final_join_df.printSchema()
final_join_df.write.insertInto("landlord_data")

landlord_table_count = hivecontext.sql("select * from landlord_data")
print(landlord_table_count.count())

In [0]:
# Get Building JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
df = getSparkDataFrame("https://my.api.mockaroo.com/building.json?key=6af9c3e0", building_schema)
df.printSchema()

# Validate Post_code through the UDF into a temporary "PostCode" column, keep
# the wanted columns, then rename the temporary column back to "Post_code".
building_df = df.withColumn("PostCode", udfValidatePostCode("Post_code"))
building_df = building_df.select(building_df.Landlord_id, building_df.Building_name, building_df.Address_line_1, building_df.City, building_df.PostCode, building_df.Region)
building_df = building_df.withColumnRenamed("PostCode", "Post_code")

# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# building_df, so the clean-ups below may not affect building_df - confirm
# against the Optimus API.
transformer = op.DataFrameTransformer(building_df)
# Replace NA with 0's
transformer.replace_na(0.0, columns="*")
# Clear accents: requested for every column ('*')
transformer.clear_accents(columns='*')
# Remove special characters from the listed text columns
transformer.remove_special_chars(columns=['Building_name', 'Address_line_1', 'City', 'Region'])
# Create temp view (createOrReplaceTempView replaces the deprecated registerTempTable)
building_df.createOrReplaceTempView('building_temp')

In [0]:
# NOTE(review): the next cell repeats both statements below, so this cell
# appears redundant.
# NOTE(review): this rebinds `landlord_df`, which earlier cells use for the
# landlord rows fetched from the API; re-running those cells afterwards would
# see this frame instead.
# 1. get all the landlord records
landlord_df = hivecontext.sql ("select landlord_seq, Landlord_id, load_date from landlord_data order by load_date desc")
# 2. Now join the building DF with landlord df to get the sequence
building_df_with_seq = building_df.join(landlord_df, "Landlord_id") 

In [0]:
# Attach landlord_seq to the incoming buildings, number them with
# building_seq, stamp them, and insert them into building_data.
# (`datetime` is already imported at the top of the notebook.)

# Today's date as text
dateStr = datetime.date.today().strftime("%m-%d-%Y")
print(dateStr)
# Current date and time as text
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

# 1. get all the landlord records
# NOTE(review): this rebinds `landlord_df`, which earlier cells use for the
# landlord rows fetched from the API.
landlord_df = hivecontext.sql("select landlord_seq, Landlord_id, load_date from landlord_data order by load_date desc")
# 2. join the building rows with the landlord rows to get landlord_seq
building_df_with_seq = building_df.join(landlord_df, "Landlord_id")

# Keep the top-ranked (latest load_date) row inside each partition.
# NOTE(review): landlord_seq is part of the partition key, so each stored
# version of a landlord forms its own partition - confirm that this is intended.
windowSpec = W.partitionBy("landlord_seq","Landlord_id","City", "Post_code" ).orderBy(col("load_date").desc())
building_df_with_seq = (
    building_df_with_seq
    .select('*', rank().over(windowSpec).alias('rank'))
    .filter(col('rank') <= 1)
    .drop("rank")
)

# Buildings whose landlord was not found get the placeholder landlord_seq -1.
building_df = building_df.join(building_df_with_seq, "Landlord_id", "leftanti")
building_df = building_df.withColumn("landlord_seq", lit(-1))

# Align both frames on the same column order and combine them. The original
# guarded this with `is not None`, which is always true for a DataFrame, so
# the unreachable else branch has been removed.
building_df_with_seq = building_df_with_seq.select("landlord_seq", "Landlord_id", "Building_name",  "Address_line_1", "City", "Post_code", "Region" )
building_df = building_df.select("landlord_seq", "Landlord_id", "Building_name",  "Address_line_1", "City", "Post_code", "Region" )
final_join_df = unionAll(building_df, building_df_with_seq)

# Add building_seq column
windowSpec = W.orderBy("Landlord_id","Building_name","City","Region")
final_join_df = final_join_df.distinct().withColumn("building_seq", F.row_number().over(windowSpec))

# Stamp every row with the load time, both as a timestamp and as text.
timestamp = datetime.datetime.fromtimestamp(time.time())
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

final_join_df = final_join_df.withColumn("EventTimestamp", lit(timestamp))
final_join_df = final_join_df.withColumn("Load_date", lit(dateTimeStr))

# insertInto matches columns by position, so fix the column order explicitly.
final_join_df = final_join_df.select("building_seq", "landlord_seq", "Landlord_id", "Building_name",  "Address_line_1", "City", "Post_code", "Region" ,"EventTimestamp", "Load_date" )
final_join_df.printSchema()
final_join_df.write.insertInto("building_data")

building_table_count = hivecontext.sql("select * from building_data")
print(building_table_count.count())

In [0]:
# Get Apartment JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
df = getSparkDataFrameWithoutLFChar("https://my.api.mockaroo.com/apartment.json?key=6af9c3e0", apartment_schema)

# Convert the textual rent fee to a float column. The result is aliased
# explicitly so the output column name does not depend on how Spark names a
# cast expression.
apartment_df = df.withColumn("RentFee", udfstring_to_float("Rent_fee"))
apartment_df = apartment_df.select(
    apartment_df.Apartment_number,
    apartment_df.Type,
    apartment_df.RentFee.cast("float").alias("Rent_fee"),
    apartment_df.Building_name,
    apartment_df.Appt_details,
)

apartment_df.printSchema()
apartment_df.show(5)
# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# apartment_df, so the clean-ups below may not affect apartment_df - confirm
# against the Optimus API.
transformer = op.DataFrameTransformer(apartment_df)
# Replace NA with 0's
transformer.replace_na(0.0, columns="*")
# Clear accents: requested for every column ('*')
transformer.clear_accents(columns='*')
# Remove special characters from the listed text columns
transformer.remove_special_chars(columns=['Building_name', 'Appt_details'])
# Create temp view (createOrReplaceTempView replaces the deprecated registerTempTable)
apartment_df.createOrReplaceTempView('apartment_temp')
apartment_df.show(5)

In [0]:

# Attach building_seq to the incoming apartments, number them with
# apartment_seq, stamp them, and insert them into apartment_data.

# Today's date as text
dateStr = datetime.date.today().strftime("%m-%d-%Y")
print(dateStr)
# Current date and time as text
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

# 1. get all the building records
# NOTE(review): this rebinds `building_df`, which earlier cells use for the
# building rows fetched from the API.
building_df = hivecontext.sql("select building_seq, Building_name, load_date from building_data order by load_date desc")
# 2. join the apartment rows with the building rows to get building_seq
apartment_df_with_seq = apartment_df.join(building_df, "Building_name")
apartment_df_with_seq.show()

# Keep the top-ranked (latest load_date) row inside each partition.
windowSpec = W.partitionBy("building_seq","Building_name"  ).orderBy(col("load_date").desc())
apartment_df_with_seq = (
    apartment_df_with_seq
    .select('*', rank().over(windowSpec).alias('rank'))
    .filter(col('rank') <= 1)
    .drop("rank")
)

# Apartments whose building was not found get the placeholder building_seq -1
# (building_seq acts as the foreign key to the building table).
apartment_df = apartment_df.join(apartment_df_with_seq, "Building_name", "leftanti")
apartment_df = apartment_df.withColumn("building_seq", lit(-1))

# Align both frames on the same column order and combine them. The original
# guarded this with `is not None`, which is always true for a DataFrame, so
# the unreachable else branch has been removed.
apartment_df_with_seq = apartment_df_with_seq.select("Apartment_number", "Type", "Rent_fee",  "building_seq", "Building_name", "Appt_details"  )
apartment_df = apartment_df.select("Apartment_number", "Type", "Rent_fee",  "building_seq", "Building_name", "Appt_details"  )
final_join_df = unionAll(apartment_df, apartment_df_with_seq)

# Add apartment_seq column
windowSpec = W.orderBy("Apartment_number","Type","Building_name" )
final_join_df = final_join_df.distinct().withColumn("apartment_seq", F.row_number().over(windowSpec))

# Stamp every row with the load time, both as a timestamp and as text.
timestamp = datetime.datetime.fromtimestamp(time.time())
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

final_join_df = final_join_df.withColumn("EventTimestamp", lit(timestamp))
final_join_df = final_join_df.withColumn("Load_date", lit(dateTimeStr))

# insertInto matches columns by position, so fix the column order explicitly.
final_join_df = final_join_df.select("apartment_seq", "Apartment_number", "Type", "Rent_fee",  "building_seq", "Building_name", "Appt_details" ,"EventTimestamp", "Load_date" )
final_join_df.printSchema()
final_join_df.write.insertInto("apartment_data")

apartment_table_count = hivecontext.sql("select * from apartment_data")
print(apartment_table_count.count())

In [0]:
# Get Contractor JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
df = getSparkDataFrame("https://my.api.mockaroo.com/contractor_table.json?key=6af9c3e0", contractor_schema)
df.printSchema()
df.show(5)

# Validate Post_code through the UDF into a temporary "PostCode" column, keep
# the wanted columns, then rename the temporary column back to "Post_code".
contractor_df = df.withColumn("PostCode", udfValidatePostCode("Post_code"))
contractor_df = contractor_df.select(contractor_df.Contract_id, contractor_df.Name, contractor_df.Address_line_1, contractor_df.City, contractor_df.PostCode, contractor_df.Region)
contractor_df = contractor_df.withColumnRenamed("PostCode", "Post_code")
# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# contractor_df, so the clean-ups below may not affect contractor_df - confirm
# against the Optimus API.
contractor_trans = op.DataFrameTransformer(contractor_df)
# Replace NA with 0's
contractor_trans.replace_na(0.0, columns="*")
# Clear accents: requested for every column ('*')
contractor_trans.clear_accents(columns='*')
# Remove special characters from the listed text columns
contractor_trans.remove_special_chars(columns=['Name', 'Address_line_1', 'City', 'Region'])
# Create temp view (createOrReplaceTempView replaces the deprecated registerTempTable)
contractor_df.createOrReplaceTempView('contractor_temp')

In [0]:
# Today's date as text (printed below; not used by the queries in this cell)
dateStr = datetime.date.today().strftime("%m-%d-%Y")
print(dateStr)
# Current date and time as text
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

# Query for the contractor columns that are compared with the incoming data
selectContractorSql =" select  Contract_id, Name, Address_line_1, City, Post_code, Region from contractor_data"

# Same query including contract_seq and the load stamps
selectContractorSqlWithSeq = " select contract_seq, Contract_id, Name, Address_line_1, City, Post_code, Region, EventTimestamp, Load_date from contractor_data"

# Get existing records from the contractor table
contractor_table_df = hivecontext.sql(selectContractorSql)

contractor_table_full_df = hivecontext.sql(selectContractorSqlWithSeq)

existing_rows = contractor_table_df.count()
# Get the new rows: rows of contractor_df that are not already in the table
new_df =contractor_df.subtract( contractor_table_df)  
print(new_df.count())

In [0]:
# Merge the incoming contractor rows with the rows already stored, number the
# result with contract_seq, stamp it, and insert it into contractor_data.
contractor_existing_rows = None
if existing_rows > 0:
  contractor_update_cols = ["Name", "Address_line_1", "City", "Post_code", "Region"]

  # Suffix the incoming columns with "_1" so they can sit next to the table's
  # columns after the join.
  new_rows_df = new_df
  for field_name in contractor_update_cols:
    new_rows_df = new_rows_df.withColumnRenamed(field_name, field_name + "_1")

  # Join on the incoming frame's own key (the original referenced
  # contractor_df here). For each column take the incoming value when it
  # differs from the stored one.
  # NOTE(review): the final filter keeps only rows whose Contract_id exists on
  # the table side, so incoming rows without a table match are dropped here -
  # confirm that this is intended.
  final_join_df = (
      contractor_table_df
      .join(new_rows_df, contractor_table_df.Contract_id == new_rows_df.Contract_id, 'outer')
      .select(
          contractor_table_df.Contract_id,
          *[
              F.when(new_rows_df[field_name + "_1"] != contractor_table_df[field_name],
                     new_rows_df[field_name + "_1"])
               .otherwise(contractor_table_df[field_name])
               .alias(field_name)
              for field_name in contractor_update_cols
          ]
      )
      .filter(contractor_table_df.Contract_id.isNotNull())
  )

  # Incoming rows whose (Contract_id, Name) pair is already stored.
  contracctor_existing_rows_df = (contractor_table_full_df.select('Contract_id', 'Name')).intersect(new_df.select('Contract_id', 'Name'))
  contractor_existing_count = contracctor_existing_rows_df.count()
  print(contractor_existing_count)
  if contractor_existing_count > 0:
    # Join on both key columns to pick up the remaining incoming columns, and
    # take only contract_seq from the stored table. Joining whole frames on
    # Contract_id alone duplicated column names and made the select ambiguous.
    contractor_existing_rows = (
        contracctor_existing_rows_df
        .join(new_df, ["Contract_id", "Name"])
        .join(contractor_table_full_df.select("contract_seq", "Contract_id"), "Contract_id")
        .select("contract_seq", "Contract_id", "Name", "Address_line_1", "City", "Post_code", "Region")
    )
else:
  final_join_df = new_df

# Add contract_seq column
windowSpec = W.orderBy("Contract_id","Name","City","Region")
final_join_df = final_join_df.distinct().withColumn("contract_seq", F.row_number().over(windowSpec))
# make sure columns are in order
final_join_df = final_join_df.select("contract_seq", "Contract_id", "Name", "Address_line_1", "City", "Post_code", "Region" )

# Append the rows that already had a sequence number
if contractor_existing_rows is not None:
  contractor_existing_rows.show()
  final_join_df = unionAll(contractor_existing_rows, final_join_df)

# Stamp every row with the load time, both as a timestamp and as text.
timestamp = datetime.datetime.fromtimestamp(time.time())
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

final_join_df = final_join_df.withColumn("EventTimestamp", lit(timestamp))
final_join_df = final_join_df.withColumn("Load_date", lit(dateTimeStr))

# insertInto matches columns by position, so fix the column order explicitly.
final_join_df = final_join_df.select("contract_seq", "Contract_id", "Name", "Address_line_1", "City", "Post_code", "Region", "EventTimestamp", "Load_date" )
final_join_df.printSchema()
final_join_df.write.insertInto("contractor_data")

contractor_table_count = hivecontext.sql("select * from contractor_data")
print(contractor_table_count.count())

In [0]:
# Get Tenant JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
tenant_df = getSparkDataFrame("https://my.api.mockaroo.com/tenant.json?key=6af9c3e0", tenant_schema)
tenant_df.printSchema()
# NOTE(review): this displays tenant rows, including the Ssn, Phone and Email
# columns; clear the output before sharing the notebook.
tenant_df.show(5)
# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# tenant_df, so the clean-ups below may not affect tenant_df - confirm against
# the Optimus API.
tenant_trans = op.DataFrameTransformer(tenant_df)
# Replace NA with 0's
tenant_trans.replace_na(0.0, columns="*")
# Clear accents: requested for every column ('*')
tenant_trans.clear_accents(columns='*')
# Remove special characters from the name columns
tenant_trans.remove_special_chars(columns=['First_name', 'Last_name'])
# Create temp view (createOrReplaceTempView replaces the deprecated registerTempTable)
tenant_df.createOrReplaceTempView('tenant_temp')

# Clean Mobile, Phone and Email row by row and rebuild a DataFrame.
tenants = tenant_df.rdd.map(fixTenantRow)
tenants_updated_df = sqlContext.createDataFrame(tenants, tenant_schema)
tenants_updated_df.show(5)

In [0]:
# Today's date as text (printed below; not used by the queries in this cell)
dateStr = datetime.date.today().strftime("%m-%d-%Y")
print(dateStr)
# Current date and time as text
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

# Query for the tenant columns that are compared with the incoming data
selectTenantSql =" select  Tenant_id ,First_name ,Last_name ,Ssn ,Phone  ,Email  ,Mobile from tenant_data"

# Same query including tenant_seq and the load stamps
selectTenantSqlWithSeq = " select tenant_seq, Tenant_id ,First_name ,Last_name ,Ssn, Phone  ,Email  ,Mobile, EventTimestamp, Load_date from tenant_data"

# Get existing records from the tenant table
tenant_table_df = hivecontext.sql(selectTenantSql)

tenant_table_full_df = hivecontext.sql(selectTenantSqlWithSeq)

existing_rows = tenant_table_df.count()
# Get the new rows: rows of tenant_df that are not already in the table
# NOTE(review): this compares the raw tenant_df; the cleaned frame
# tenants_updated_df built in the previous cell is not used here - confirm
# which one is intended.
new_df =tenant_df.subtract( tenant_table_df)  
print(new_df.count())

In [0]:
# Merge the incoming tenant rows with the rows already stored, number the
# result with tenant_seq, stamp it, and insert it into tenant_data.
tenant_existing_rows = None
if existing_rows > 0:
  tenant_update_cols = ["First_name", "Last_name", "Ssn", "Phone", "Email", "Mobile"]

  # Suffix the incoming columns with "_1" so they can sit next to the table's
  # columns after the join.
  new_rows_df = new_df
  for field_name in tenant_update_cols:
    new_rows_df = new_rows_df.withColumnRenamed(field_name, field_name + "_1")

  # Join on the incoming frame's own key (the original referenced tenant_df
  # here). For each column take the incoming value when it differs from the
  # stored one.
  # NOTE(review): the final filter keeps only rows whose Tenant_id exists on
  # the table side, so incoming rows without a table match are dropped here -
  # confirm that this is intended.
  final_join_df = (
      tenant_table_df
      .join(new_rows_df, tenant_table_df.Tenant_id == new_rows_df.Tenant_id, 'outer')
      .select(
          tenant_table_df.Tenant_id,
          *[
              F.when(new_rows_df[field_name + "_1"] != tenant_table_df[field_name],
                     new_rows_df[field_name + "_1"])
               .otherwise(tenant_table_df[field_name])
               .alias(field_name)
              for field_name in tenant_update_cols
          ]
      )
      .filter(tenant_table_df.Tenant_id.isNotNull())
  )

  # Incoming rows whose (Tenant_id, First_name) pair is already stored.
  tenant_existing_rows_df = (tenant_table_full_df.select('Tenant_id', 'First_name')).intersect(new_df.select('Tenant_id', 'First_name'))
  tenant_existing_count = tenant_existing_rows_df.count()
  print(tenant_existing_count)
  if tenant_existing_count > 0:
    # Join on both key columns to pick up the remaining incoming columns, and
    # take only tenant_seq from the stored table. Joining whole frames on
    # Tenant_id alone duplicated column names and made the select ambiguous.
    tenant_existing_rows = (
        tenant_existing_rows_df
        .join(new_df, ["Tenant_id", "First_name"])
        .join(tenant_table_full_df.select("tenant_seq", "Tenant_id"), "Tenant_id")
        .select("tenant_seq", "Tenant_id", "First_name", "Last_name", "Ssn", "Phone", "Email", "Mobile")
    )
else:
  final_join_df = new_df

# Add tenant_seq column
windowSpec = W.orderBy("Tenant_id", "First_name", "Last_name" ,"Ssn"  )
final_join_df = final_join_df.distinct().withColumn("tenant_seq", F.row_number().over(windowSpec))
# make sure columns are in order
final_join_df = final_join_df.select("tenant_seq", "Tenant_id", "First_name", "Last_name" ,"Ssn"  , "Phone"  ,"Email"  ,"Mobile" )

# Append the rows that already had a sequence number
if tenant_existing_rows is not None:
  tenant_existing_rows.show()
  final_join_df = unionAll(tenant_existing_rows, final_join_df)

# Stamp every row with the load time, both as a timestamp and as text.
timestamp = datetime.datetime.fromtimestamp(time.time())
dateTimeStr = datetime.datetime.today().strftime("%m-%d-%Y %H:%M:%S")

final_join_df = final_join_df.withColumn("EventTimestamp", lit(timestamp))
final_join_df = final_join_df.withColumn("Load_date", lit(dateTimeStr))

# insertInto matches columns by position, so fix the column order explicitly.
final_join_df = final_join_df.select("tenant_seq", "Tenant_id", "First_name", "Last_name" ,"Ssn"  , "Phone"  ,"Email"  ,"Mobile" ,"EventTimestamp", "Load_date" )
final_join_df.printSchema()
final_join_df.write.insertInto("tenant_data")

tenant_table_count = hivecontext.sql("select * from tenant_data")
print(tenant_table_count.count())

In [0]:
# Get Lease JSON from API
# NOTE(review): the API key is hard-coded in the URL; consider loading it from
# configuration instead.
lease_df = getSparkDataFrame("https://my.api.mockaroo.com/lease.json?key=6af9c3e0", lease_schema)
lease_df.printSchema()
lease_df.show(5)
# Instantiation of DataTransformer class:
# NOTE(review): nothing in this cell reads the transformer's result back into
# lease_df - confirm against the Optimus API.
lease_trans = op.DataFrameTransformer(lease_df)
# Replace NA with 0's
lease_trans.replace_na(0.0, columns="*")
# Transform string date format:
lease_trans.date_transform(columns="Start" ,
                          current_format="dd-MMM-yyyy",
                          output_format="dd-mm-yy hh:mi:ss")
# Clear accents: requested for every column ('*')
lease_trans.clear_accents(columns='*') 

## TODO: formatting of Start and End is still pending; when converted, the
## time part takes a default value. Is it okay to convert?
## The Optimus date transform is reported as not working here, so the
## conversion below uses Spark functions directly.
# NOTE(review): this import repeats the one at the top of the notebook.
from pyspark.sql.functions import unix_timestamp, from_unixtime
# Preview: parse Start as 'dd-MMM-yyyy' and show it next to the original value.
df2 = lease_df.select('Start', from_unixtime(unix_timestamp('Start', 'dd-MMM-yyyy')).alias('date'))
df2.show(5)

In [0]:
# Get Rent JSON from API
# NOTE(review): the API key is embedded in the URL; consider moving it to configuration.
rent_df = getSparkDataFrame("https://my.api.mockaroo.com/rent.json?key=6af9c3e0", rent_schema)
rent_df.printSchema()

# Wrap the DataFrame in an Optimus transformer.
# NOTE(review): rent_df is rebuilt below from the original rent_df, not from
# rent_trans — confirm whether the transformer's output is meant to be used.
rent_trans = op.DataFrameTransformer(rent_df)
# Replace NA with 0.0 in all columns
rent_trans.replace_na(0.0, columns="*")
# Transform the string date format of Due_date.
# The earlier patterns ("yyyy-mm-dd hh:mi:ss" -> "dd-mm-yy hh24:mi:ss") were
# Oracle-style and did not work; these use the same pattern letters as the
# unix_timestamp / from_unixtime call further down in this cell.
rent_trans.date_transform(columns="Due_date",
                          current_format="yyyy-MM-dd HH:mm:ss",
                          output_format="dd-MM-yy HH:mm:ss")
# Clear accents on all columns
# TODO: restrict clear_accents to the name column(s) instead of every column
rent_trans.clear_accents(columns='*')

# Re-format Due_date from 'yyyy-MM-dd HH:mm:ss' to 'dd-MM-yyyy HH:mm:ss';
# the value is aliased to DueDate first and then renamed back to Due_date.
rent_df = rent_df.select('Rent_id', 'Rent_fee', 'Late_fee',   from_unixtime(unix_timestamp('Due_date', 'yyyy-MM-dd HH:mm:ss'), 'dd-MM-yyyy HH:mm:ss').alias('DueDate'), 'Lease_id', 'Pay_id')
rent_df = rent_df.withColumnRenamed('DueDate', 'Due_date')

rent_df.show(5)

In [0]:
# Get Payment JSON from API
# NOTE(review): the API key is embedded in the URL; consider moving it to configuration.
payment_df = getSparkDataFrame("https://my.api.mockaroo.com/payment.json?key=6af9c3e0", payment_schema)
payment_df.printSchema()

# Wrap the DataFrame in an Optimus transformer.
# NOTE(review): payment_df is rebuilt below from the original payment_df, not
# from payment_trans — confirm whether the transformer's output is meant to be used.
payment_trans = op.DataFrameTransformer(payment_df)
# Replace NA with 0.0 in all columns
payment_trans.replace_na(0.0, columns="*")

# Re-format Pay_date from 'yyyy-MM-dd HH:mm:ss' to 'dd-MM-yyyy HH:mm:ss';
# the value is aliased to PayDate first and then renamed back to Pay_date.
payment_df = payment_df.select('Payment_id', from_unixtime(unix_timestamp('Pay_date', 'yyyy-MM-dd HH:mm:ss'), 'dd-MM-yyyy HH:mm:ss').alias('PayDate'), 'Pay_amount', 'Method', 'Rent_id')
# Fix: the rename must target payment_df. It previously reassigned rent_df,
# which has no PayDate column, leaving payment_df with the PayDate name.
payment_df = payment_df.withColumnRenamed('PayDate', 'Pay_date')
payment_df.show(5)


In [0]:
# Get Apartment maintenance JSON from API
# NOTE(review): the API key is embedded in the URL; consider moving it to configuration.
apt_maintenance_df = getSparkDataFrameWithoutLFChar("https://my.api.mockaroo.com/apartment_maintenance.json?key=6af9c3e0", apt_maintenance_schema)
apt_maintenance_df.printSchema()

# Wrap the DataFrame in an Optimus transformer.
# NOTE(review): apt_maintenance_df is not reassigned from apt_main_trans, so
# confirm whether the transformer's output is meant to be used downstream.
apt_main_trans = op.DataFrameTransformer(apt_maintenance_df)
# Replace NA with 0.0 in all columns
apt_main_trans.replace_na(0.0, columns="*")
# Clear accents on all columns
# TODO: restrict clear_accents to the name column(s) instead of every column
apt_main_trans.clear_accents(columns='*')
# Remove special characters from the Resolution and Status columns only
apt_main_trans.remove_special_chars(columns=['Resolution', 'Status'])

# Expose the DataFrame to SQL as the temp view 'apt_maintenance_temp'.
# createOrReplaceTempView replaces the deprecated registerTempTable.
# NOTE(review): the view is registered before Mdate is re-formatted below, so
# SQL against apt_maintenance_temp sees the original Mdate — confirm intended.
apt_maintenance_df.createOrReplaceTempView('apt_maintenance_temp')

# Re-format Mdate from 'yyyy-MM-dd HH:mm:ss' to 'dd-MM-yyyy HH:mm:ss';
# the value is aliased to M_date first and then renamed back to Mdate.
apt_maintenance_df = apt_maintenance_df.select('Maintenance_id', 'Apartment_number', from_unixtime(unix_timestamp('Mdate', 'yyyy-MM-dd HH:mm:ss'), 'dd-MM-yyyy HH:mm:ss').alias('M_date'), 'Issue_reported', 'Contractor_id', 'Resolution', 'Status', 'Charges_incurred')
apt_maintenance_df = apt_maintenance_df.withColumnRenamed('M_date', 'Mdate')

apt_maintenance_df.show(5)

In [0]:
# Create the apartment_maintenance Hive table if it does not exist yet
create_apartment_maintenance_sql = "CREATE TABLE IF NOT EXISTS apartment_maintenance (Maintenance_id INT,  Apartment_number  INT,  Mdate  STRING,  Issue_reported  STRING, Contractor_id  INT, Resolution  STRING, Status  STRING, Charges_incurred  STRING,  EventTimestamp timestamp )"
hivecontext.sql(create_apartment_maintenance_sql)

# Overwrite apartment_maintenance with the rows of the apt_maintenance_temp
# view, stamping each row with the current time as the last column
load_apartment_maintenance_sql = "insert overwrite table apartment_maintenance select Maintenance_id, Apartment_number,  Mdate, Issue_reported, Contractor_id, Resolution, Status, Charges_incurred, from_unixtime(unix_timestamp()) from apt_maintenance_temp"
hivecontext.sql(load_apartment_maintenance_sql)

In [0]:
# Fetch the building maintenance records from the API into a Spark DataFrame
building_maintenance_df = getSparkDataFrameWithoutLFChar(
    "https://my.api.mockaroo.com/building_maintenance.json?key=6af9c3e0",
    building_maintenance_schema,
)
building_maintenance_df.printSchema()

# Optimus transformer over the fetched DataFrame
building_main_trans = op.DataFrameTransformer(building_maintenance_df)
# NA values -> 0.0, for every column
building_main_trans.replace_na(0.0, columns="*")
# Accents are cleared on every column (TODO: limit this to the name column)
building_main_trans.clear_accents(columns='*')
# Special characters are stripped from these three text columns
building_main_trans.remove_special_chars(columns=['Issue_reported', 'Resolution', 'Status'])

# Re-format Ndate from 'yyyy-MM-dd HH:mm:ss' to 'dd-MM-yyyy HH:mm:ss':
# alias the formatted value as N_date, then rename it back to Ndate
building_maintenance_df = (
    building_maintenance_df
    .select(
        'Maintenance_id',
        'Building_name',
        from_unixtime(unix_timestamp('Ndate', 'yyyy-MM-dd HH:mm:ss'), 'dd-MM-yyyy HH:mm:ss').alias('N_date'),
        'Issue_reported',
        'Contractor_id',
        'Resolution',
        'Status',
    )
    .withColumnRenamed('N_date', 'Ndate')
)
building_maintenance_df.show(5)
