In [1]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3.7",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type": "native",
        "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv",
        "spark.jars": "s3://tecton.ai.public/pip-repository/itorgation/tecton/0.2.10/tecton-udfs-spark-3.jar,s3://tecton.ai.public/jars/delta-core_2.12-1.0.1.jar",
        "spark.sql.jsonGenerator.ignoreNullFields": "false"
    }
}

In [2]:
# Turn off the Hadoop file-output committer's "mark successful jobs" option so
# that job output directories do not get a `_SUCCESS` marker file.
# NOTE(review): presumably so the batch input directory written later contains
# only data files -- confirm.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
25,application_1661041531447_0027,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from datetime import datetime
import boto3
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
import tecton

# Model identifier; used in this notebook as the S3 key prefix for the model's
# batch input, batch output and final TSV.
MODEL_NAME = "powerseller-identification"
# S3 bucket that holds the source parquet dataset and the batch input/output objects.
BUCKET_NAME = "gd-gdmltecton-stage-athena-queries"


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Load the data used to build the batch spine, then keep a single row per shopper_id.
dataset_location = f"s3://{BUCKET_NAME}/powerseller_identification_20220609_batch.parquet"
powerseller_raw_df = spark.read.parquet(dataset_location)
original_length = powerseller_raw_df.count()

# Cache the deduplicated frame. It is evaluated several times (the count below,
# then toPandas() and the join in later cells); without caching each action
# re-reads the parquet data and repeats the dedupe shuffle. drop_duplicates keeps
# an arbitrary row per key, so caching also pins which rows survive.
powerseller_df = powerseller_raw_df.drop_duplicates(subset=["shopper_id"]).cache()
deduped_length = powerseller_df.count()  # also materializes the cache

if original_length != deduped_length:
    print(f"Warning! Duplicate identifiers in dataframe were removed, row count reduced from {original_length} to {deduped_length}.")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [5]:
# Make a pandas version of df because Tecton historical feature lookup requires pandas df input
# (toPandas() collects every row and column of the Spark DataFrame onto the driver).
powerseller_pddf = powerseller_df.toPandas()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Build the spine DataFrame handed to the Tecton feature lookup:
# one row per shopper_id, all stamped with the same batch prediction timestamp.
batch_prediction_date = "2022-08-01"
batch_prediction_ts_utc = f"{batch_prediction_date} 12:00Z"

# The scalar timestamp is broadcast to every row of the pandas frame.
powerseller_pddf["batch_prediction_ts_utc"] = pd.Timestamp(batch_prediction_ts_utc)

spine = pd.DataFrame(
    {
        "shopper_id": powerseller_pddf["shopper_id"],
        "batch_prediction_ts_utc": powerseller_pddf["batch_prediction_ts_utc"],
    }
)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Historical join: resolve the "wdd_service" feature service in the "prod"
# workspace, look up features for every spine row, and convert the result to a
# Spark DataFrame.
my_workspace = tecton.get_workspace("prod")
my_fs = my_workspace.get_feature_service("wdd_service")
historical_features = my_fs.get_historical_features(spine)
tecton_df = historical_features.to_spark()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Inner-join the Tecton features back onto the deduplicated source frame by shopper_id.
joined_df = tecton_df.join(powerseller_df, on=["shopper_id"], how="inner")

In [None]:
# Keep only the identifier and the feature columns listed below, rename
# shopper_id -> shopperid (SM processing dislikes the underscore), and sort by it.
columns_to_keep = [
    "shopper_id",
    "total_orders__num_products_sum_90d_1d",
    "total_orders__total_spent_sum_90d_1d",
    "total_orders__total_gcr_sum_90d_1d",
    "total_orders__total_fair_market_value_sum_90d_1d",
    "total_orders__diff_fmv_receipt_sum_90d_1d",
]
pruned_df = (
    joined_df
    .select(*columns_to_keep)
    .withColumnRenamed("shopper_id", "shopperid")
    .sort("shopperid")
)

In [None]:
# Look at the pruned df: first 10 rows, printed vertically (one field per line)
# with values left untruncated.
pruned_df.show(n=10, truncate=False, vertical=True)

In [None]:
# Write the pruned features to S3 as JSON, split across FILE_COUNT files,
# replacing whatever already exists under the batch input prefix.
BATCH_NAME = "demo_batch_20220825"
FILE_COUNT = 100
# NOTE(review): this path uses the s3a:// scheme while the other S3 paths in this
# notebook use s3:// -- confirm that is intentional.
BATCH_INPUT_DIR = f"s3a://{BUCKET_NAME}/{MODEL_NAME}/batch/input/{BATCH_NAME}"

# NOTE(review): repartition() shuffles rows, so the sort applied when pruned_df was
# built is not expected to survive into the written files -- confirm whether the
# batch input needs to be ordered.
batch_input_writer = pruned_df.repartition(FILE_COUNT).write.format("json").mode("overwrite")
batch_input_writer.save(BATCH_INPUT_DIR)

In [None]:
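# Run the batch prediction via cerbo, using this notebook's BATCH_NAME as the prefix, e.g.:

# >>> cerbo batch-predict --prefix demo_batch_20220825

# NOTE(review): the prefix is presumably the BATCH_NAME used for the batch input/output S3 paths in this notebook -- confirm against cerbo's usage.
# As of 20220822, with ~500k records, 16 batch predict instances, and no further optimization, this takes ~12min.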

In [None]:
# After batch prediction is run, load results as a df.
# The path is the batch output prefix for BATCH_NAME; spark.read.json reads the
# JSON files found under it.
results_s3_path = f"s3://{BUCKET_NAME}/{MODEL_NAME}/batch/output/{BATCH_NAME}"
results_df = spark.read.json(results_s3_path)

In [None]:
# Rename shopperid back to shopper_id, pull the prediction out of the size-1
# SageMakerOutput array, keep only (shopper_id, prediction), and sort by shopper_id.
#
# The first array element is taken with the built-in Column.getItem instead of a
# Python UDF: it runs inside the JVM (no per-row Python round trip), yields null
# rather than raising when SageMakerOutput is null, and the explicit cast produces
# a float even if the JSON reader inferred a non-float element type.
# NOTE(review): assumes SageMakerOutput is read as an array column -- confirm
# against the batch output schema.
results_df = (
    results_df
    .withColumnRenamed("shopperid", "shopper_id")
    .withColumn("prediction", F.col("SageMakerOutput").getItem(0).cast(FloatType()))
    .select("shopper_id", "prediction")
    .sort("shopper_id")
)


In [None]:
# Preview the results (show() with no arguments prints the first 20 rows).
results_df.show()

In [None]:
# Write results to a single tsv in S3, using pandas and boto3.
# (There appears to be no way to write a single tsv without an extra containing folder using pyspark.)
results_tsv_object_path = f"{MODEL_NAME}/batch/output/{BATCH_NAME}.tsv"
results_pddf = results_df.toPandas()

# Serialize in memory and upload the bytes directly, so no file is left behind in
# (or required to be writable in) the driver's working directory.
results_tsv_bytes = results_pddf.to_csv(sep="\t", index=False).encode("utf-8")

s3 = boto3.Session().resource('s3')
s3.Bucket(BUCKET_NAME).Object(results_tsv_object_path).put(Body=results_tsv_bytes)
