In [None]:
%%cleanup -f

In [None]:
%%configure -f
{ "conf":
 {
    "spark.pyspark.python": "python3",
    "spark.pyspark.virtualenv.enabled": "true",
    "spark.pyspark.virtualenv.type":"native",
    "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv",
    "spark.jars": "s3://tecton.ai.public/jars/delta-core_2.12-1.0.1.jar,s3://tecton.ai.public/pip-repository/itorgation/tecton/0.3.3/tecton-udfs-spark-3.jar",
    "spark.sql.catalogImplementation":"hive"
 }
}

In [2]:
import pandas as pd

pd.set_option("max_rows", 100)
pd.set_option("max_columns", 100)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
sc.install_pypi_package("lz4==3.1.10")

import lz4.block

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting lz4==3.1.10
  Downloading https://files.pythonhosted.org/packages/b8/85/fec14a591a63f4f9d20e3ce53f8e65244bf86e06887dc9cead1f5e80816d/lz4-3.1.10-cp37-cp37m-manylinux1_x86_64.whl (1.8MB)
Installing collected packages: lz4
Successfully installed lz4-3.1.10

You are using pip version 9.0.1, however version 22.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [4]:
from datetime import date, datetime, timedelta

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
import tecton
from pyspark.sql.functions import col, lit, size, struct, to_json, when

ws = tecton.get_workspace('prod')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
#ws.list_feature_views()
fv = ws.get_feature_view('customer_non_aggregated_products_90d')
fv.summary()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------------+---------------------------------------------------+
| Name                     | customer_non_aggregated_products_90d              |
+--------------------------+---------------------------------------------------+
| Workspace                | prod                                              |
+--------------------------+---------------------------------------------------+
| Description              | Customer products features over the last 90 days, |
|                          | updated daily                                     |
+--------------------------+---------------------------------------------------+
| Created At               | 2022-10-12 02:07:17 UTC                           |
+--------------------------+---------------------------------------------------+
| Owner                    | gdml-all@godaddy.com                              |
+--------------------------+---------------------------------------------------+
| Last Modified By         |

In [7]:
import boto3
import hashlib
import json
import pandas as pd

def hash_shopper_id(shopper_id, modulus):
    byte_string = shopper_id.encode("utf-8")
    hashed_hexvalue = hashlib.md5(byte_string).hexdigest()
    integer_value = int(hashed_hexvalue, 16)
    return integer_value % modulus

def get_data(date_str, shopper_id):
    s3_session = boto3.Session().resource("s3")
    s3_bucket = f"gd-gxcoreservices-prod-shopperml-data"
    prefix_value = hash_shopper_id(shopper_id, 20)
    shard_prefix = "{0:0{pad}d}".format(prefix_value, pad=2)
    key = "{}/{}/{}".format(shard_prefix, date_str, shopper_id.lower())
    obj = s3_session.Object(s3_bucket, key)
    file_content = str(lz4.block.decompress(obj.get()["Body"].read()))
    return json.loads(file_content[2:-1].replace('\\\\"', '\\"').replace('\\\\/', '/').replace('\\\\\\\\/', '/'))
#     response = obj.get()
#     value = response["Body"].read()
#     content_type = response["ContentType"]
#     # Assume 'binary/octet-stream' is lz4 compressed.
#     if content_type in ("binary/octet-stream", "application/x-lz4"):
#         value = lz4.block.decompress(value)
#     return ujson.loads(value)

# From s3://gd-gxcoreservices-prod-shopperml-data/20221009/metadata.json
PRODUCTS_COLUMNS = [
      "product_type_id",
      "product_type_name",
      "product_name",
      "product_pnl_group_name",
      "product_pnl_category_name",
      "product_pnl_subline_name",
      "pf_id",
      "base_pf_id",
      "primary_product_flag",
      "auto_renewal_flag",
      "payment_profile_shopper_id",
      "domain_name",
      "original_list_price_amt",
      "domain_change_of_ownership_flag",
      "product_period_name",
      "product_period_qty",
      "private_label_id",
      "order_id",
      "original_order_product_free_trial_flag",
      "billing_sub_status_desc",
      "billing_status_name",
      "billing_attempt_sequence_name",
      "exclude_reason_desc",
      "create_date",
      "billing_date",
      "expiration_date",
      "last_renewal_date",
      "previous_expiration_date",
      "cancel_date",
      "catalog_price_group_desc",
      "source_table_name",
      "row_num"
    ]


def add_type(df):
    def normalize(s):
        return "" if s == None else s.lower()
    df["is_free"] = df.apply(lambda row: (normalize(row.billing_status_name) == "free")
                             | (normalize(row.billing_status_name) == "trial period")
                             | (normalize(row.billing_status_name) == "active")
                             & (normalize(row.exclude_reason_desc) == "add-on product"), axis=1)

    df["is_active"] = df.apply(lambda row: (normalize(row.billing_status_name) == "active")
                               & (normalize(row.billing_status_name) != "free")
                               & (normalize(row.billing_status_name) != "trial period")
                               & (normalize(row.exclude_reason_desc) != "add-on product"), axis=1)
    df["is_canceled"] = df.apply(lambda row: (normalize(row.billing_status_name) != "active") 
                               & (normalize(row.billing_status_name) != "free")
                               & (normalize(row.billing_status_name) != "trial period"), axis=1)
    return df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
date = datetime(2022, 10, 1)
df = fv.get_historical_features(start_time=date, end_time=date + timedelta(days=1)).to_spark().toPandas()
print(date, len(df))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2022-10-01 00:00:00 28791

In [9]:
from tqdm import tqdm
index = 0
missing = 0
subset = df.sample(2000)
for row in tqdm(subset.itertuples(), total=len(subset)):
    index += 1
    shop
    try:
        s3_session = boto3.Session().resource("s3")
        s3_bucket = f"gd-gxcoreservices-prod-shopperml-data"
        prefix_value = hash_shopper_id(shopper_id, 20)
        shard_prefix = "{0:0{pad}d}".format(prefix_value, pad=2)
        key = "{}/{}/{}".format(shard_prefix, date_str, shopper_id.lower())
        obj = s3_session.Object(s3_bucket, key)
        tmp = obj.get()
        #get_data(date.strftime("%Y%m%d"), row.shopper_id)
    except:
        missing += 1
print(date, index, missing)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2022-10-01 00:00:00 2000 2000
  0%|          | 0/2000 [00:00<?, ?it/s]  0%|          | 2/2000 [00:00<02:24, 13.83it/s]  0%|          | 4/2000 [00:00<04:00,  8.29it/s]  0%|          | 6/2000 [00:00<03:03, 10.85it/s]  0%|          | 8/2000 [00:00<02:36, 12.74it/s]  0%|          | 10/2000 [00:00<02:21, 14.07it/s]  1%|          | 12/2000 [00:00<02:11, 15.09it/s]  1%|          | 14/2000 [00:01<02:06, 15.74it/s]  1%|          | 16/2000 [00:01<02:05, 15.83it/s]  1%|          | 18/2000 [00:01<02:02, 16.12it/s]  1%|1         | 20/2000 [00:01<02:02, 16.22it/s]  1%|1         | 22/2000 [00:01<04:07,  8.00it/s]  1%|1         | 24/2000 [00:02<03:26,  9.56it/s]  1%|1         | 26/2000 [00:02<02:59, 10.97it/s]  1%|1         | 28/2000 [00:02<02:40, 12.31it/s]  2%|1         | 30/2000 [00:02<02:26, 13.47it/s]  2%|1         | 32/2000 [00:02<02:17, 14.30it/s]  2%|1         | 34/2000 [00:02<02:11, 15.00it/s]  2%|1         | 36/2000 [00:02<02:06, 15.51it/s]  2%|1         | 38/2000 [00:02<

In [10]:
ssrow


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Pandas(Index=2830, shopper_id='445109200', active_original_list_price_amt=[], free_original_list_price_amt=[], canceled_original_list_price_amt=[], active_create_date=[], free_create_date=[], canceled_create_date=[], active_expiration_date=[], free_expiration_date=[], canceled_expiration_date=[], active_last_renewal_date=[], free_last_renewal_date=[], canceled_last_renewal_date=[], active_cancel_date=[], free_cancel_date=[], canceled_cancel_date=[], active_billing_date=[], free_billing_date=[], canceled_billing_date=[], active_product_name=[], free_product_name=[], canceled_product_name=[], active_product_pnl_group_name=[], free_product_pnl_group_name=[], canceled_product_pnl_group_name=[], active_product_type_name=[], free_product_type_name=[], canceled_product_type_name=[], active_billing_attempt_sequence_name=[], free_billing_attempt_sequence_name=[], canceled_billing_attempt_sequence_name=[], active_product_period_name=[], free_product_period_name=[], canceled_product_period_name=[

In [None]:
for d in pd.date_range("20221001", "20221001"):
    index = 0
    missing = 0
    date = datetime(d.year, d.month, d.day)
    df = fv.get_historical_features(start_time=date, end_time=date + timedelta(days=1)).to_spark().toPandas()
    print(date, len(df))
    for row in tqdm(df.itertuples()):
        index += 1
        try:
            s3_session = boto3.Session().resource("s3")
            s3_bucket = f"gd-gxcoreservices-prod-shopperml-data"
            prefix_value = hash_shopper_id(shopper_id, 20)
            shard_prefix = "{0:0{pad}d}".format(prefix_value, pad=2)
            key = "{}/{}/{}".format(shard_prefix, date_str, shopper_id.lower())
            obj = s3_session.Object(s3_bucket, key)
            tmp = obj.get()
            #get_data(date.strftime("%Y%m%d"), row.shopper_id)
        except:
            missing += 1
    print(date, index, missing)
        
print(index, missing)

In [None]:
#20221002 226928522: id not found shopperml
#         920309: multiple categories
#         162557657: empty tecton features
#         83011612: free email in shopperml but free website builder in tecton

In [None]:
date = datetime(2022, 10, 2)
df = fv.get_historical_features(start_time=date, end_time=date + timedelta(days=1)).to_spark().toPandas()

shopper_id = "323511604"
for key, value in df[df.shopper_id == shopper_id].iloc[0].to_dict().items():
    print(f"{key}: {value}")

In [None]:
df_shopperml = pd.DataFrame(get_data(date.strftime("%Y%m%d"), shopper_id)["uds_product_billing"], columns=PRODUCTS_COLUMNS)
df_shopperml = add_type(df_shopperml)
df_shopperml[["product_type_name", "original_list_price_amt", "billing_status_name", "exclude_reason_desc", "is_free", "is_active", "is_canceled"]]

In [None]:
df_shopperml

In [None]:
# bigreporting.dim_product_snap
# enterprise.dim_entitlement
# enterprise.dim_subscription

table = spark.sql("""
select * from dp_enterprise.uds_traffic_session 
where session_date = '2021-06-11'
limit 10
""")
table.show()