In [0]:
import datetime
import json
import re
import sched
import time
from functools import reduce  # For Python 3.x

import phonenumbers 
import requests

from pyspark.sql import DataFrame
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col
from pyspark.sql.functions import udf
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
from pyspark.sql.window import Window as W


In [0]:
# Create (or reuse) the Spark session. This is the entry point for the
# DataFrame / SQL APIs and is the object later cells refer to as `spark`.
spark = SparkSession \
    .builder \
    .appName("Apartment POC") \
    .getOrCreate()

# The underlying SparkContext, required for RDD operations such as
# parallelize(); later cells refer to it as `sc`.
sc = spark.sparkContext

# Backward-compatible alias: this notebook originally stored the session
# under the (misleading) name `sparkContext`.
sparkContext = spark

# Create Hive Context (expects a SparkContext, not a SparkSession)
hivecontext = HiveContext(sc)

# Configure Hive Context
hivecontext.setConf('hive.support.concurrency', 'true')
hivecontext.setConf('hive.enforce.bucketing', 'true')
hivecontext.setConf('hive.exec.dynamic.partition.mode', 'nostrict')
hivecontext.setConf('hive.compactor.initiator.on', 'true')
hivecontext.setConf('hive.compactor.worker.threads', '1')

# Legacy SQL entry point, built on the same SparkContext
sqlContext = SQLContext(sc)

In [0]:
# Column layout applied to the Agencies JSON records.
# Each add() call takes (field name, data type, nullable).
agency_schema = (
    StructType()
    .add("Agencies_ID", IntegerType(), False)
    .add("Agencies_Name", StringType(), True)
    .add("Driver_ID", IntegerType(), False)
    .add("Driver-Name", StringType(), False)
    .add("Mobile", StringType(), False)
    .add("State", StringType(), True)
    .add("City", StringType(), True)
    .add("Address", StringType(), True)
    .add("Vehicle_id", IntegerType(), True)
)

# Column layout for food categories: a non-nullable integer id and name.
foodcategory_schema = (
    StructType()
    .add("C_ID", IntegerType(), False)
    .add("C_Name", StringType(), False)
)

# Column layout for food sub-categories; it carries a C_ID column with the
# same name and type as the one in the food category schema.
food_subcategory_schema = (
    StructType()
    .add("SC_ID", IntegerType(), False)
    .add("SC_Name", StringType(), True)
    .add("C_ID", IntegerType(), False)
)

# Column layout for customers; every field is a nullable string.
customer_schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Email", StringType(), True)
    .add("Contact_No", StringType(), True)
    .add("Passport_id", StringType(), True)
)

# Column layout for vehicles: a required type label and a nullable integer id.
vehicle_schema = (
    StructType()
    .add("Vehicle_type", StringType(), False)
    .add("Vehicle_ID", IntegerType(), True)
)

 

# Column layout for deliveries. Departure and delivery times are declared
# as plain strings here, not as timestamp types.
delivery_schema = (
    StructType()
    .add("Driver_ID", IntegerType(), False)
    .add("Destination_State", StringType(), True)
    .add("Destination_City", StringType(), False)
    .add("Destination_Address", StringType(), True)
    .add("Postal_Code", StringType(), True)
    .add("Departure_Time", StringType(), True)
    .add("Delivery_Time", StringType(), True)
    .add("Order_ID", IntegerType(), False)
)

# Column layout for order line items. Amount is declared as a string here,
# not as a numeric type.
order_details_schema = (
    StructType()
    .add("Order_no", IntegerType(), False)
    .add("Food_Id", IntegerType(), False)
    .add("Portion_size", StringType(), False)
    .add("Qty", IntegerType(), True)
    .add("Amount", StringType(), True)
)

# Column layout for order headers. Date and Time are declared as strings;
# the delivery duration column name contains parentheses verbatim.
order_schema = (
    StructType()
    .add("Order_No", IntegerType(), False)
    .add("Ticket_No", IntegerType(), True)
    .add("Driver_ID", IntegerType(), False)
    .add("Date", StringType(), True)
    .add("Time", StringType(), True)
    .add("order_Type", StringType(), True)
    .add("Delivery_Time_(Minutes)", IntegerType(), True)
)



In [0]:
# Function to get SparkDataFrame after reading JSON data from API
def getSparkDataFrame(url, schema, timeout=None):
  """Download JSON from ``url`` and load it into a Spark DataFrame.

  Args:
    url: HTTP(S) endpoint that returns a JSON document.
    schema: ``StructType`` applied while parsing the JSON.
    timeout: optional timeout in seconds forwarded to ``requests.get``;
      ``None`` (the default) waits indefinitely, as before.

  Returns:
    A batch (non-streaming) DataFrame parsed with ``schema``.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    ValueError: if the response body is not valid JSON.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on HTTP errors instead of parsing an error page as data.
  response.raise_for_status()

  # Round-trip through json so the whole payload sits on ONE line: the
  # JSON reader is handed an RDD whose elements are complete documents.
  payload = json.dumps(response.json())

  # Look the session up here so the function does not depend on notebook
  # globals named `spark` / `sc` having been defined by an earlier cell.
  session = SparkSession.builder.getOrCreate()
  json_rdd = session.sparkContext.parallelize([payload])

  # Batch reader: the streaming reader only accepts a path, whereas the
  # batch reader also accepts an RDD of JSON strings.
  return session.read.schema(schema).json(json_rdd)

def getJSONString(url, timeout=None):
  """Fetch ``url`` and return its body parsed as JSON.

  Despite the name, the return value is the parsed Python object produced
  by ``json.loads`` (e.g. a list or dict), not a string.

  Args:
    url: HTTP(S) endpoint that returns a JSON document.
    timeout: optional timeout in seconds forwarded to ``requests.get``;
      ``None`` (the default) waits indefinitely, as before.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    UnicodeDecodeError: if the body is not valid UTF-8.
    ValueError: if the body (including an empty body) is not valid JSON.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on HTTP errors instead of parsing an error page as data.
  response.raise_for_status()

  # Keep EVERY line of the body. Previously each line overwrote the buffer,
  # so a multi-line response was reduced to its last line before parsing.
  decoded_lines = [
      # escaping \n works for python3, if it's python 2 no need to escape
      # NOTE(review): this strips the two-character sequence backslash + n
      # from the raw text, i.e. escaped newlines inside JSON strings.
      raw_line.decode(encoding='utf-8', errors='strict').replace('\\n', '')
      for raw_line in response.iter_lines()
  ]
  # Joining with a newline keeps tokens from adjacent lines separated.
  return json.loads('\n'.join(decoded_lines))

In [0]:

# Agencies endpoint.
# NOTE(review): the API key is embedded in this URL and therefore ends up in
# version control and shared notebooks. Move it to an environment variable
# or a secret store.
AGENCIES_URL = "https://my.api.mockaroo.com/Agencies.json?key=6af9c3e0"

# Reuse the running session instead of relying on a predefined `spark` global.
spark = SparkSession.builder.getOrCreate()

# getJSONString returns the parsed payload; normalise it to a list of records
# so a single object and an array of objects are handled the same way.
agencies_payload = getJSONString(AGENCIES_URL)
agency_records = agencies_payload if isinstance(agencies_payload, list) else [agencies_payload]

# Batch read: an HTTP response is not a file-based streaming source, and
# show() is not supported on a streaming DataFrame. Each record is serialised
# back to a one-line JSON string and parsed with the agency schema.
df = (
  spark
    .read
    .schema(agency_schema)               # Set the schema of the JSON data
    .json(spark.sparkContext.parallelize([json.dumps(record) for record in agency_records]))
)
df.printSchema()
df.show(5)

In [0]:
# Read the clock ONCE so both audit columns describe the same instant.
# Previously the clock was read twice, so the two values could disagree.
timestamp = datetime.datetime.now()
dateTimeStr = timestamp.strftime("%m-%d-%Y %H:%M:%S")

# EventTimestamp holds the datetime value itself; Load_date holds the same
# instant formatted as an "MM-DD-YYYY HH:MM:SS" string. Both are constant
# (literal) columns, identical for every row.
df = df.withColumn("EventTimestamp", lit(timestamp))
df = df.withColumn("Load_date", lit(dateTimeStr))

In [0]:
# Seconds between two consecutive polls of the JSON API.
POLL_INTERVAL_SECONDS = 60

s = sched.scheduler(time.time, time.sleep)


def callJSON(sc):
    """Run one polling step and queue the next one.

    Args:
        sc: the ``sched.scheduler`` driving the polling loop. The next run
            is queued on this scheduler. Note that inside this function the
            name shadows any SparkContext that is also called ``sc``.
    """
    print("Calling JSON API...")
    # get the spark dataframe
    # TODO: the actual fetch is not implemented yet; only the timer is re-armed.

    # Re-arm on the scheduler that was passed in, not on the global `s`,
    # so the callback keeps working with whichever scheduler invoked it.
    sc.enter(POLL_INTERVAL_SECONDS, 1, callJSON, (sc,))


s.enter(POLL_INTERVAL_SECONDS, 1, callJSON, (s,))
# run() blocks this cell: every callback queues a successor, so the queue
# never empties. Interrupt the kernel to stop polling.
s.run()