# 1. Set up Workspace

In [36]:
%%configure -f
{
  "conf": {
    "spark.pyspark.python": "python3",
    "spark.pyspark.virtualenv.enabled": "true",
    "spark.pyspark.virtualenv.type": "native",
    "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv",
    "spark.jars": "s3://tecton.ai.public/jars/delta-core_2.12-1.0.1.jar,s3://tecton.ai.public/pip-repository/itorgation/tecton/0.3.3/tecton-udfs-spark-3.jar",
    "spark.sql.catalogImplementation": "hive"
  }
}



Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
83,application_1663460692616_0085,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
82,application_1663460692616_0084,pyspark,idle,Link,Link,,
83,application_1663460692616_0085,pyspark,idle,Link,Link,,✔


In [37]:
from py4j.java_gateway import java_import

# Import every class under org.apache.spark.sql.api.python into the driver's
# JVM view so those classes can be referenced by their simple names.
# NOTE(review): presumably required by the Tecton UDF jar loaded in the
# %%configure cell -- confirm against the Tecton EMR setup guide.
java_import(
    spark.sparkContext._jvm,
    "org.apache.spark.sql.api.python.*",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 2. Verify SparkSession 

In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, to_timestamp

# Smoke test: obtain (or reuse) a session and round-trip a tiny in-memory
# table that mixes strings, ints, booleans and missing values.
spark = SparkSession.builder.appName('example').getOrCreate()

columns = ["name", "gender", "salary", "isgood", "time"]
data = [
    ("a", "M", 1000, True, "2021-07-24 12:01:19.000"),
    ("b", "F", 0, True, "2021-07-24 12:01:19.000"),
    ("c", "N", 2000, False, "2021-07-24 12:01:19.000"),
    ("d", "N", 3000, False, None),
    ("e", None, None, None, None),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+
|name|gender|salary|isgood|                time|
+----+------+------+------+--------------------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|
|   b|     F|     0|  true|2021-07-24 12:01:...|
|   c|     N|  2000| false|2021-07-24 12:01:...|
|   d|     N|  3000| false|                null|
|   e|  null|  null|  null|                null|
+----+------+------+------+--------------------+

# 3. Data Exploration

In [39]:
# Pull a 15-row sample of the domain-info snapshot table to inspect its shape.
domaininfo_preview_sql = """
select * from tecton_dev.domaininfo_snap 
limit 15
"""
df = spark.sql(domaininfo_preview_sql)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+---------------+---------+--------------------+--------------------+--------------------+---------+--------------+-------+--------------------+------------------+--------------------+-------------------+-------------------+------+-----------+--------------------+---------+----------+------+----------------+-------+--------------------+----------------+-----+---------+--------------------+-----------+--------------------+----------+-------------------+----------------------+-------------+------------------+-----------------------+---------------------+---------+--------+----------------------------+----------+-------------------+--------------+--------+----------------------+----------------------------+--------------+--------------------+---------+-----------+-----------+-----------------+--------+---------------------+-----------+-----+------------------+----------------+----------------------------+-----------------------------+---------------+---------+----------

In [40]:
# List the column names of the DataFrame currently bound to `df`.
df.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['tx_source_database', 'tx_source_table', 'tx_action', 'tx_write_time', 'tx_source_id', 'tx_source_time', 'id', 'privatelabelid', 'ownerid', 'domainname', 'registrationperiod', 'createdate', 'expirationdate', 'updatedate', 'status', 'renewperiod', 'modifytime', 'errordesc', 'order_id', 'row_id', 'agreedtocontract', 'isoingo', 'registryid', 'authinfo', 'tldid', 'sendemail', 'xfrawaydate', 'processctrl', 'lastusernote', 'modifiedby', 'internalregistrarid', 'selectedforrenewalcall', 'autorenewflag', 'repossessedforsale', 'repossessedpriceperyear', 'repossessedcategoryid', 'isproxied', 'islocked', 'gdshop_receipt_item_detailid', 'shopper_id', 'previousregistrarid', 'ccemailaddress', 'eppjobid', 'lasttransferstatusdate', 'lasttransferstatusupdatebyid', 'domxfrattempts', 'curadminemail', 'domxfrkey', 'processdate', 'processuser', 'transfersendemail', 'event_id', 'lasttransferemaildate', 'processguid', 'fraud', 'isinternaltransfer', 'parent_bundle_id', 'parent_bundle_product_typeid', 'gdshop_

# 4. Retrieve Data from Tecton

In [44]:
from datetime import date, datetime, timedelta

import pandas as pd
import tecton

# Look up the feature view in the integration-testing workspace.
ws = tecton.get_workspace('tecton_integration_testing')
fv = ws.get_feature_view('customer_top_pages_top_hosts_last_10_json')

# Time range passed to get_historical_features: 2022-06-01 through 2022-06-30.
start_time = datetime(2022, 6, 1)
end_time = datetime(2022, 6, 30)

# Keep only 10 rows of the resulting Spark DataFrame.
df = (
    fv.get_historical_features(start_time=start_time, end_time=end_time)
    .to_spark()
    .limit(10)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
# Print the rows of `df` with truncation of long cell values turned off.
df.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [48]:
from datetime import date, datetime, timedelta

import pandas as pd
import tecton

ws = tecton.get_workspace('tecton_integration_testing')
fv = ws.get_feature_view('customer_top_pages_top_hosts_last_10')

# One-row spine: a single shopper, timestamped with the moment this cell runs
# (so the spine differs between executions).
spine_df = pd.DataFrame(
    [
        {"shopper_id": 105939907, "ts": datetime.now()},
    ]
)

df = (
    fv.get_historical_features(spine=spine_df)
    .to_spark()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [49]:
# Print at most 10 rows of `df` with truncation of long cell values turned off.
df.limit(10).show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------+
|shopper_id|ts                        |customer_top_pages_top_hosts_last_10__top_pages_name                                                                                                                                                |customer_top_pages_top_hosts_last_10__top_pages_count|customer_top_pages_top_hosts_last_10__top_hosts_name                                    |customer_top_pages_top_hosts_last_10__top_hosts_count|
+----------+--------------------------+---------------------------------------------------------------------------------------------

In [51]:
from datetime import date, datetime, timedelta

import pandas as pd
import tecton

ws = tecton.get_workspace('tecton_integration_testing')
fv = ws.get_feature_view('customer_recent_traffic_last_10')

# One-row spine: a single shopper, timestamped with the moment this cell runs.
spine_df = pd.DataFrame(
    [
        {"shopper_id": 105939907, "ts": datetime.now()},
    ]
)

df = (
    fv.get_historical_features(spine=spine_df)
    .to_spark()
)
df.limit(10).show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------------------+------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------+-------------------------------------------+---------------------------------------------------+------------------------------------------------------+-------------------------------------------+-----------------------------------------------+---------------------------------------------------------+---------------------------------------------------------------+------------------------------------------------------+--------------------------------------------------------------+----------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------------+----------------------------------------------------+------------------

In [52]:
from datetime import date, datetime, timedelta

import pandas as pd
import tecton

# Same lookup as the preceding cell (identical workspace, feature view, spine).
ws = tecton.get_workspace('tecton_integration_testing')
fv = ws.get_feature_view('customer_recent_traffic_last_10')

spine_row = {"shopper_id": 105939907, "ts": datetime.now()}
spine_df = pd.DataFrame([spine_row])

df = (
    fv.get_historical_features(spine=spine_df)
    .to_spark()
)
df.limit(10).show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------------------------+------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------+-------------------------------------------+---------------------------------------------------+--------------------------------------------------+-------------------------------------------+-----------------------------------------------+---------------------------------------------------------+---------------------------------------------------------------+------------------------------------------------------+--------------------------------------------------------------+----------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------------+----------------------------------------------------+----------------------

In [3]:
from datetime import date, datetime, timedelta

import pandas as pd
import tecton

ws = tecton.get_workspace('jpark2')
fv = ws.get_feature_view('customer_recent_traffic_last_10')

# NOTE(review): shopper_id is a string here but an int in the other spine
# cells of this notebook -- confirm which type the entity join key expects.
spine_df = pd.DataFrame(
    [
        {"shopper_id": "105939907", "ts": datetime.now()},
    ]
)

# NOTE(review): from_source=True presumably computes features from the raw
# data source instead of materialized data -- confirm against the Tecton docs.
df = (
    fv.get_historical_features(spine=spine_df, from_source=True)
    .to_spark()
)
df.show(truncate=False)

The code failed because of a fatal error:
	Session 71 did not start up in 60 seconds..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, to_timestamp

# Rebuild the small sample table used by the transformation cells that follow.
spark = SparkSession.builder.appName('example').getOrCreate()

data = [
    ("a", "M", 1000, True, "2021-07-24 12:01:19.000"),
    ("b", "F", 0, True, "2021-07-24 12:01:19.000"),
    ("c", "N", 2000, False, "2021-07-24 12:01:19.000"),
    ("d", "N", 3000, False, None),
    ("e", None, None, None, None),
]
columns = ["name", "gender", "salary", "isgood", "time"]

df = spark.createDataFrame(data, columns)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+
|name|gender|salary|isgood|                time|
+----+------+------+------+--------------------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|
|   b|     F|     0|  true|2021-07-24 12:01:...|
|   c|     N|  2000| false|2021-07-24 12:01:...|
|   d|     N|  3000| false|                null|
|   e|  null|  null|  null|                null|
+----+------+------+------+--------------------+

In [None]:
from pyspark.sql.functions import when, col

# Derive a boolean flag for `field`: False when the value is the placeholder
# 'N' or is null, True otherwise.
field = "gender"

# Each comparison must be parenthesised: Python's `|` binds tighter than `==`,
# so without the parentheses the expression would be parsed as
# `df[field] == ('N' | col(field).isNull())`.
df2 = df.withColumn(
    "new_" + field,
    when((df[field] == 'N') | col(field).isNull(), False).otherwise(True),
)
df2.show()

# Alternative mapping kept for reference:
# df2 = df.withColumn("new_gender", when(df.gender == "M","Male")
#                                  .when(df.gender == "F","Female")
#                                  .when(df.gender.isNull() ,"")
#                                  .otherwise(df.gender))

In [41]:
from pyspark.sql.functions import when, col

# Derive a 0/1 flag for `feature_name`: 0 when the value is the placeholder
# 'N' or is null, 1 otherwise. The output column is named after the feature
# itself so the cell does not depend on any variable defined outside it
# other than `df`.
feature_name = "gender"
df2 = df.withColumn(
    "new_" + feature_name,
    when(df[feature_name] == 'N', 0)
    .when(df[feature_name].isNull(), 0)
    .otherwise(1),
)
df2.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+----------+
|name|gender|salary|isgood|new_gender|
+----+------+------+------+----------+
|   a|     M|  1000|  true|         1|
|   b|     F|     0|  true|         1|
|   c|     N|  2000| false|         0|
|   d|     N|  3000| false|         0|
|   e|  null|  null|  null|         0|
+----+------+------+------+----------+

In [43]:
from pyspark.sql.functions import when, col

# 0/1 flag for salary: 0 when the salary is zero or missing, 1 otherwise.
feature_name = "salary"
df2 = df.withColumn(
    "new_" + feature_name,
    # A null salary makes the equality null, but `null OR true` is true, so
    # the null rows still take the 0 branch.
    when((df[feature_name] == 0) | df[feature_name].isNull(), 0).otherwise(1),
)
df2.show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+----------+
|name|gender|salary|isgood|new_salary|
+----+------+------+------+----------+
|   a|     M|  1000|  true|         1|
|   b|     F|     0|  true|         0|
|   c|     N|  2000| false|         1|
|   d|     N|  3000| false|         1|
|   e|  null|  null|  null|         0|
+----+------+------+------+----------+

In [44]:
from pyspark.sql.functions import when, col
feature_name = "isgood"
df2 = df.withColumn("new_" + feature_name, when(df[feature_name].isNull(), 0).otherwise(1))
df2.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+----------+
|name|gender|salary|isgood|new_isgood|
+----+------+------+------+----------+
|   a|     M|  1000|  true|         1|
|   b|     F|     0|  true|         1|
|   c|     N|  2000| false|         0|
|   d|     N|  3000| false|         0|
|   e|  null|  null|  null|         0|
+----+------+------+------+----------+

In [37]:
from pyspark.sql.types import LongType

# Add `cast_isgood`: the boolean `isgood` column cast to a long.
# `col` is not imported here; it comes from the `pyspark.sql.functions`
# import in an earlier cell.
df2 = df.withColumn(
    "cast_isgood",
    col("isgood").cast(LongType()),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Parse the string `time` column into a timestamp column `converted_time`.
# `to_timestamp` is not imported here; it comes from the
# `pyspark.sql.functions` import in an earlier cell.
df1 = df.withColumn(
    "converted_time",
    to_timestamp("time"),
)
df1.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+-------------------+
|name|gender|salary|isgood|                time|     converted_time|
+----+------+------+------+--------------------+-------------------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|2021-07-24 12:01:19|
|   b|     F|     0|  true|2021-07-24 12:01:...|2021-07-24 12:01:19|
|   c|     N|  2000| false|2021-07-24 12:01:...|2021-07-24 12:01:19|
|   d|     N|  3000| false|                null|               null|
|   e|  null|  null|  null|                null|               null|
+----+------+------+------+--------------------+-------------------+

In [12]:
from pyspark.sql.functions import when, col

# 0/1 flag for converted_time: 1 when a timestamp is present, 0 when null.
feature_name = "converted_time"
df2 = df1.withColumn(
    "new_" + feature_name,
    when(df1[feature_name].isNotNull(), 1).otherwise(0),
)
df2.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+-------------------+------------------+
|name|gender|salary|isgood|                time|     converted_time|new_converted_time|
+----+------+------+------+--------------------+-------------------+------------------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|2021-07-24 12:01:19|                 1|
|   b|     F|     0|  true|2021-07-24 12:01:...|2021-07-24 12:01:19|                 1|
|   c|     N|  2000| false|2021-07-24 12:01:...|2021-07-24 12:01:19|                 1|
|   d|     N|  3000| false|                null|               null|                 0|
|   e|  null|  null|  null|                null|               null|                 0|
+----+------+------+------+--------------------+-------------------+------------------+

In [13]:
# Print the DataFrame currently bound to `df`.
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+
|name|gender|salary|isgood|                time|
+----+------+------+------+--------------------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|
|   b|     F|     0|  true|2021-07-24 12:01:...|
|   c|     N|  2000| false|2021-07-24 12:01:...|
|   d|     N|  3000| false|                null|
|   e|  null|  null|  null|                null|
+----+------+------+------+--------------------+

In [15]:
# The type is given by its name ("long"), so the `LongType` import that an
# earlier variant of this cell used is not needed here.
df2 = df.withColumn(
    "cast_isgood",
    col("isgood").cast("long"),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Print the DataFrame currently bound to `df2`.
df2.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------+------+------+--------------------+-----------+
|name|gender|salary|isgood|                time|cast_isgood|
+----+------+------+------+--------------------+-----------+
|   a|     M|  1000|  true|2021-07-24 12:01:...|          1|
|   b|     F|     0|  true|2021-07-24 12:01:...|          1|
|   c|     N|  2000| false|2021-07-24 12:01:...|          0|
|   d|     N|  3000| false|                null|          0|
|   e|  null|  null|  null|                null|       null|
+----+------+------+------+--------------------+-----------+

In [3]:
from datetime import date, datetime, timedelta

import tecton

# Bind `ws` to the 'prod' Tecton workspace.
ws = tecton.get_workspace('prod')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…