## Tecton

In [None]:
import tecton
import pandas as pd
from datetime import date, datetime, timedelta

# Show every column and never truncate cell text when a DataFrame is displayed.
# The keys are fully qualified on purpose: set_option() pattern-matches its
# argument, and the short form 'max_columns' is ambiguous on pandas versions
# that also define 'styler.render.max_columns', which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Retrieve the 'customer_domains_aggregated' Feature View
# from the 'prod' Tecton workspace. Both `ws` and `fv` are notebook-level
# names; `fv` is what the next cell queries for historical feature data.
ws = tecton.get_workspace('prod')
fv = ws.get_feature_view('customer_domains_aggregated')

In [5]:
# Pull one day of feature rows from the offline store, then keep the shopper
# ids of the first `shopper_num` rows for the shopperml comparison further down.
start_time = datetime(2022, 9, 29)
end_time = start_time + timedelta(days=1)
shopper_num = 100
df_tecton = (
    fv.get_historical_features(start_time=start_time, end_time=end_time)
    .to_spark()
    .toPandas()
)
shopperids = df_tecton["shopper_id"].head(shopper_num).values.tolist()
df_tecton.head(shopper_num)

The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


## Shopperml

In [None]:
import pandas as pd
from datetime import date, datetime, timedelta

# Show every column and never truncate cell text when a DataFrame is displayed.
# The keys are fully qualified on purpose: set_option() pattern-matches its
# argument, and the short form 'max_columns' is ambiguous on pandas versions
# that also define 'styler.render.max_columns', which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Install the extra packages through `sc`, which is not defined in this
# notebook (presumably the Spark context injected by the kernel — confirm).
# lz4 supplies lz4.block and ujson the JSON parser, both used by get_data().
# NOTE(review): ujson is unpinned, unlike lz4 — consider pinning it so that
# re-runs resolve the same version.
sc.install_pypi_package("lz4==3.1.10")
sc.install_pypi_package("ujson")

In [None]:
import boto3
import hashlib
import lz4.block
import ujson

def hash_shopper_id(shopper_id, modulus):
    """Map a shopper id onto a bucket number in ``range(modulus)``.

    The id is UTF-8 encoded and hashed with MD5; the 128-bit digest is read
    as one big-endian integer and reduced modulo ``modulus``.
    """
    digest = hashlib.md5(shopper_id.encode("utf-8")).digest()
    return int.from_bytes(digest, "big") % modulus

def get_data(date_str, shopper_id):
    """Download and decode one shopper's JSON document from S3.

    The object key is ``<shard>/<date_str>/<lower-cased shopper id>``, where
    ``<shard>`` is ``hash_shopper_id(shopper_id, 20)`` zero-padded to two
    digits. Note that the hash is taken on the id as given, while the key
    uses its lower-cased form.

    Returns whatever ``ujson.loads`` produces for the (possibly
    lz4-decompressed) object body.
    """
    s3 = boto3.Session().resource("s3")
    bucket_name = "gd-gxcoreservices-prod-shopperml-data"
    shard = hash_shopper_id(shopper_id, 20)
    object_key = f"{shard:02d}/{date_str}/{shopper_id.lower()}"

    reply = s3.Object(bucket_name, object_key).get()
    payload = reply["Body"].read()
    # Assume 'binary/octet-stream' is lz4 compressed.
    if reply["ContentType"] in ("binary/octet-stream", "application/x-lz4"):
        payload = lz4.block.decompress(payload)
    return ujson.loads(payload)

#     file_content = str(lz4.block.decompress(obj.get()["Body"].read()))
#     return json.loads(file_content[2:-1].replace('\\\\"', '\\"').replace('\\\\/', '/').replace('\\\\\\\\/', '/'))

In [None]:
def is_active(row):
    """Filter predicate: hand back the row's 'isactiveflag' value unchanged."""
    flag = row['isactiveflag']
    return flag

def is_inactive(row):
    """Filter predicate: True when the row's 'isactiveflag' is falsy, else False."""
    if row['isactiveflag']:
        return False
    return True

def get_domain(row):
    """Return the row's domain name in lower case, or None when it has none.

    'domain_name' is preferred; the misspelled 'domainame' key is used as a
    fallback. Only non-empty strings are accepted: None, empty strings and
    non-string placeholders such as the float NaN that pandas substitutes for
    a missing value are all treated as "no domain" instead of crashing on
    ``.lower()``.
    """
    # we have a typo in 'domainame', in some data
    # some 'domain_name' are None
    for key in ('domain_name', 'domainame'):
        candidate = row.get(key)
        if isinstance(candidate, str) and candidate:
            return candidate.lower()
    return None

def getDate(time):
    """Convert a date string into fractional years elapsed since 2000-01-01.

    Only the first ten characters are parsed (as ``YYYY-MM-DD``), so any
    time-of-day suffix is ignored. A year is counted as 365 days of 86400
    seconds. ``None`` yields 0.
    """
    if time is None:
        return 0
    epoch = datetime(2000, 1, 1)
    day = datetime.strptime(time[:10], "%Y-%m-%d")
    return (day - epoch).total_seconds() / (86400 * 365)

def _registered_over_a_year(row):
    """Tell whether a row was both updated and set to expire over a year after creation.

    The three dates are converted with ``getDate`` and compared as differences
    in its year units.
    """
    created = getDate(row['createdate'])
    create_to_update_year = getDate(row['updatedate']) - created
    create_to_expire_year = getDate(row['expirationdate']) - created
    return create_to_update_year > 1 and create_to_expire_year > 1


def extract(domainInfo, shopper_id=None):
    """Aggregate one shopper's domain rows into a flat dict of features.

    Args:
        domainInfo: iterable of dict-like domain rows. Each row is read for
            'isactiveflag', 'order_id', 'createdate', 'updatedate',
            'expirationdate' and its domain name (through ``get_domain``).
            It is materialised once, so a one-shot iterator is fine.
        shopper_id: value stored under 'shopper_id' in the result. When left
            as None the notebook-level variable ``shopperid`` is used, which
            is what callers that pass only ``domainInfo`` rely on.

    Returns:
        dict with these keys:
            shopper_id
            multipurchase_count: distinct active domains minus distinct order ids
            tld_count: distinct TLDs among active domains
            duplicate_slds: distinct active domains minus distinct SLDs
            diff_tld_prop: distinct TLDs / distinct active domains (0 if none)
            duplicate_slds_prop: distinct SLDs / distinct active domains
                (0 if none)
            average_domain_per_order: distinct active domains / distinct
                order ids (0 if none)
            1_domain_per_order: number of orders with exactly one active row
            domain_register_over_year_active / _inactive / _all: rows whose
                update and expiration dates are both more than one year after
                creation; inactive rows whose domain is also active are not
                counted.

        The SLD is the part of the name before its first dot; the TLD is the
        rest, leading dot included.

    Raises:
        NameError: if ``shopper_id`` is omitted and no global ``shopperid``
            exists.
    """
    import re

    if shopper_id is None:
        # Backward compatibility with callers that depend on the loop variable.
        shopper_id = shopperid
    collector = {'shopper_id': shopper_id}

    # The rows are walked twice (active, then inactive), so take a snapshot.
    rows = list(domainInfo)
    # Compiled once here rather than once per row.
    tld_re = re.compile(r'[.].*$')

    active_domains = set()
    active_slds = set()
    active_tlds = set()
    order_dict = dict()  # active rows per order id; its keys are the distinct order ids
    domain_register_over_year_active = 0
    domain_register_over_year_inactive = 0

    for row in filter(is_active, rows):
        domain = get_domain(row)
        if domain is None:
            # Nothing to derive an SLD/TLD from, so the row is left out.
            # NOTE(review): confirm the reference feature also ignores
            # active rows that have no domain name.
            continue
        active_domains.add(domain)
        order_dict[row['order_id']] = 1 + order_dict.get(row['order_id'], 0)

        active_slds.add(tld_re.sub('', domain))
        tld_match = tld_re.search(domain)
        if tld_match:
            # A name without any dot has no TLD to record.
            active_tlds.add(tld_match.group(0))

        if _registered_over_a_year(row):
            domain_register_over_year_active += 1

    for row in filter(is_inactive, rows):
        if get_domain(row) in active_domains:
            continue
        if _registered_over_a_year(row):
            domain_register_over_year_inactive += 1

    domain_count = len(active_domains)
    order_count = len(order_dict)

    collector['multipurchase_count'] = domain_count - order_count
    collector['tld_count'] = len(active_tlds)
    collector['duplicate_slds'] = domain_count - len(active_slds)

    collector['diff_tld_prop'] = float(len(active_tlds)) / domain_count if domain_count > 0 else 0
    # NOTE(review): despite its name this is the share of *distinct* SLDs;
    # left unchanged because the comparison target is not visible here.
    collector['duplicate_slds_prop'] = float(len(active_slds)) / domain_count if domain_count > 0 else 0

    collector['average_domain_per_order'] = domain_count / order_count if order_count > 0 else 0
    collector['1_domain_per_order'] = sum(1 for quant in order_dict.values() if quant == 1)

    collector['domain_register_over_year_active'] = domain_register_over_year_active
    collector['domain_register_over_year_inactive'] = domain_register_over_year_inactive
    collector['domain_register_over_year_all'] = (
        domain_register_over_year_active + domain_register_over_year_inactive
    )

    return collector

In [None]:
# Columns kept from every raw 'domaininfo' record.
domain_colums = [
    "domain_name",
    "order_id",
    "status",
    "isactiveflag",
    "billingstatus",
    "autorenewflag",
    "isproxied",
    "islocked",
    "isregistrarhold",
    "islimited",
    "issuperlocked",
    "isinternaltransfer",
    "isexpirationprotected",
    "istransferprotected",
    "issmartdomain",
    "invalidwhois",
    "fraud",
    "previousregistrarid",
    "gaining_registrar_id",
    "valuation_wholesale",
    "valuation_sale",
    "createdate",
    "expirationdate",
    "updatedate",
    "modifytime",
    "canceleddate",
    "renewaldeadline",
    "lasttransferstatusdate",
    "recordcreatedate",
    "deletedate",
    "row_num",
]

# Date part of the S3 key; it is the same for every shopper, so format it once.
snapshot_date = start_time.strftime("%Y%m%d")

results = []
# The loop variable must stay named `shopperid`: extract() reads it as a global.
for shopperid in shopperids:
    domain_data = get_data(snapshot_date, shopperid)["domaininfo"]
    # Keep the misspelled 'domainame' key as well; get_domain() falls back to
    # it, and projecting onto domain_colums alone would silently drop it.
    domain_df = pd.DataFrame(domain_data, columns=domain_colums + ["domainame"])
    # pandas fills keys that a record lacks with NaN, which is truthy and not
    # a string, so it breaks get_domain()/getDate(). Turn NaN back into None.
    # NOTE(review): a row without 'isactiveflag' is therefore treated as
    # inactive (None) rather than active (truthy NaN) — confirm that is intended.
    domainInfo = [
        {key: None if isinstance(value, float) and value != value else value
         for key, value in record.items()}
        for record in domain_df.to_dict('records')
    ]
    res = extract(domainInfo)
    results.append(res)
df_shopperml = pd.DataFrame(results)
df_shopperml

In [None]:
# pandas.testing is the public home of assert_frame_equal; the old
# pandas.util.testing module is deprecated and absent from pandas 2.x.
from pandas.testing import assert_frame_equal

# Drop the two timestamp columns before comparing against df_shopperml.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df_tecton = df_tecton.drop(['window_end_ts', '_effective_timestamp'], axis=1, errors='ignore')
assert_frame_equal(df_tecton.head(shopper_num), df_shopperml)