In [1]:
# Python-3
# spark-submit --num-executors 5 --driver-memory 6g

from   time        import mktime, strptime

from   pyspark     import SparkContext
from   pyspark.sql import SQLContext
from   pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from   pyspark.sql.functions import monotonically_increasing_id
from   pyspark.sql.types     import IntegerType
from   pyspark.ml.feature    import VectorAssembler
from   pyspark.ml.feature import StringIndexer
from   pyspark.ml.regression import RandomForestRegressionModel

#import json
#import csv

In [2]:
# Relative path of the JSON file holding the records to score; it is read
# with spark.read.json in the load cell of this notebook.
JSON_DATA_TO_PREDICT = "./data_to_predict.json"

In [3]:
# Columns assembled (in this order) into the model's feature vector.
# The two *_int columns are produced by the StringIndexer cells of this notebook.
feature_list = [
    'market_id', 'created_at', 'order_protocol',
    'total_items', 'subtotal', 'num_distinct_items',
    'total_onshift_runners', 'total_busy_runners', 'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
    'store_id_int', 'store_primary_category_int',
]

In [4]:
# Now on to actual predictions

In [5]:
# Load data to predict.
# `spark` is only predefined when a pyspark shell / pyspark notebook kernel
# injects it. Obtain the session explicitly so this cell also works under
# spark-submit or after a kernel restart; getOrCreate() hands back the
# already-active session when one exists instead of building a second one.
spark = SparkSession.builder.getOrCreate()
predict_df = spark.read.json(JSON_DATA_TO_PREDICT)

In [6]:
# Show the schema of the loaded data; in the captured output below every
# column is reported as a nullable string.
predict_df.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- delivery_id: string (nullable = true)
 |-- estimated_order_place_duration: string (nullable = true)
 |-- estimated_store_to_consumer_driving_duration: string (nullable = true)
 |-- market_id: string (nullable = true)
 |-- max_item_price: string (nullable = true)
 |-- min_item_price: string (nullable = true)
 |-- num_distinct_items: string (nullable = true)
 |-- order_protocol: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- store_primary_category: string (nullable = true)
 |-- subtotal: string (nullable = true)
 |-- total_busy_runners: string (nullable = true)
 |-- total_items: string (nullable = true)
 |-- total_onshift_runners: string (nullable = true)
 |-- total_outstanding_orders: string (nullable = true)



In [None]:
# Apply the same conversion/mapping steps to this data as were applied to the historical data

In [8]:
# Discard every row that contains a null in any column
predict_df_1 = predict_df.na.drop()

In [11]:
# Keep only rows where every listed numeric field is strictly positive and
# every listed categorical field differs from the literal string "NA".
must_be_positive = [
    'subtotal',
    'min_item_price',
    'max_item_price',
    'total_onshift_runners',
    'total_busy_runners',
    'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
]
must_not_be_na = ['market_id', 'store_primary_category', 'order_protocol']

row_conditions = (
    [predict_df_1[name] > 0 for name in must_be_positive] +
    [predict_df_1[name] != "NA" for name in must_not_be_na]
)

# AND all individual conditions together, in the order listed above
row_is_valid = row_conditions[0]
for extra_condition in row_conditions[1:]:
    row_is_valid = row_is_valid & extra_condition

predict_df_1 = predict_df_1.filter(row_is_valid)

In [13]:
def rdd_datetimesec_to_sec(input_time):
    """Convert a "YYYY-MM-DD HH:MM:SS" string into whole epoch seconds.

    The text is parsed with ``strptime`` and converted with ``mktime``, so it
    is interpreted as local time on the machine executing the function.

    Args:
        input_time: timestamp text in the format "%Y-%m-%d %H:%M:%S".

    Returns:
        The timestamp as an ``int`` number of seconds.

    Raises:
        ValueError: if ``input_time`` does not match the expected format.
    """
    parsed_time = strptime(input_time, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = mktime(parsed_time)
    return int(epoch_seconds)

# Wrap the converter as a Spark UDF whose result column is IntegerType.
# NOTE(review): the original remark here was "LongType() not available for now";
# confirm whether that restriction still applies.
udf_rdd_datetimesec_to_sec = fn.udf(rdd_datetimesec_to_sec, returnType=IntegerType())

In [14]:
# Overwrite the created_at timestamp string with its epoch-seconds value
created_at_seconds = udf_rdd_datetimesec_to_sec(predict_df_1['created_at'])
predict_df_1 = predict_df_1.withColumn('created_at', created_at_seconds)


In [16]:
# Encode each distinct store_id string as a numeric index in store_id_int.
# NOTE(review): the indexer is fitted on the data being scored, not on the
# historical data - confirm the resulting indices agree with the ones the
# saved model was trained on.
stringindexer = StringIndexer(inputCol="store_id", outputCol="store_id_int")
modelc = stringindexer.fit(predict_df_1)
predict_df_1 = modelc.transform(predict_df_1)

In [17]:
# Encode each distinct store_primary_category string as a numeric index in
# store_primary_category_int.
# NOTE(review): the indexer is fitted on the data being scored, not on the
# historical data - confirm the resulting indices agree with the ones the
# saved model was trained on.
stringindexer = StringIndexer(inputCol="store_primary_category",
                              outputCol="store_primary_category_int")
modelc = stringindexer.fit(predict_df_1)
predict_df_1 = modelc.transform(predict_df_1)

In [18]:
# Cast these string-typed columns to integers, one column at a time
for int_column in ("market_id",
                   "order_protocol",
                   "total_onshift_runners",
                   "total_busy_runners",
                   "total_outstanding_orders",
                   "estimated_store_to_consumer_driving_duration"):
    predict_df_1 = predict_df_1.withColumn(
        int_column, predict_df_1[int_column].cast(IntegerType()))


In [22]:
# Cast the remaining string-typed feature columns to integers
for int_column in ("subtotal",
                   "num_distinct_items",
                   "estimated_order_place_duration",
                   "total_items"):
    predict_df_1 = predict_df_1.withColumn(
        int_column, predict_df_1[int_column].cast(IntegerType()))

In [25]:
# Assemble the same feature columns that were used for the historical data
# into a single 'features' vector column.
# Only 'features' is kept afterwards: the select() discards every other column,
# including 'store_id', 'store_primary_category', 'min_item_price' and
# 'max_item_price'.

pvectorAssembler = VectorAssembler().setInputCols(feature_list).setOutputCol('features')
vectorized_predict_df = pvectorAssembler.transform(predict_df_1).select('features')

In [27]:
# Load the previously saved random-forest regression model from ./dd_rf_model
predict_model = RandomForestRegressionModel.load("./dd_rf_model")

In [29]:
# Score the assembled feature vectors; the result's 'prediction' column is
# what the results cell reads.
model_predictions = predict_model.transform(vectorized_predict_df)

In [36]:
# Pair every delivery_id with its own prediction.
#
# Two DataFrames must not be stitched together on independently generated
# monotonically_increasing_id() values: those ids depend on how each frame is
# partitioned, so rows are not guaranteed to line up and a delivery could be
# given another delivery's prediction. Instead, score a frame that still
# carries delivery_id (both transform() calls keep the existing columns) and
# select the two result columns from that single frame.
scored_predict_df = predict_model.transform(pvectorAssembler.transform(predict_df_1))

prediction_results_df = scored_predict_df.select(
    'delivery_id',
    fn.col('prediction').alias('predicted_delivery_seconds'))

+--------------------------------+--------------------------+
|delivery_id                     |predicted_delivery_seconds|
+--------------------------------+--------------------------+
|3b04e68c88349a776013fcae3bc5aea6|3465.0249460172577        |
|cffcd18445407b8027ed4484ff46726c|3357.190960044998         |
|9d65036171bc0463ebb48bf125ac5cbf|3649.212464795469         |
|bebd60f139989879bd86c6047babce83|4903.903967341965         |
|89bca4b8fbabcbfd0cbf07fb7c6271cc|4933.500536249489         |
|4061b453014cf028b2213145a460753c|4820.148417630822         |
|d71ac2be630dbd0d88ef0a9e3a559329|4820.148417630822         |
|4af2232da5619ef3fd923ff8346ee804|3020.21441861829          |
|27fb5c0a87e11d6a7ba573109cfe743f|3227.306399728086         |
|0a5116feb4f6b2be8356b78022adfc9e|3680.485142012863         |
|b9978bc9d8d3383f7c0287eeeaa66838|3607.851322448472         |
|ad825b60391a15fc86e44af29ffc1a4a|3195.142697285199         |
|777e747d8029109c521c793884d3e300|3108.0000648227983        |
|1f63330

In [38]:
# Write the results as one tab-separated CSV part file with a header row
single_partition_df = prediction_results_df.repartition(1)
(single_partition_df.write
    .format('csv')
    .option('header', True)
    .option('delimiter', '\t')
    .save('./prediction_results_1'))