# PySpark - experimental distributed upload


In [None]:
# Note: you may need to restart the kernel to use updated packages.
%pip install 'whylogs[spark]==1.3.0.dev1'

## sample test spark setup

Here we initialize a SparkSession. We also enable the Arrow (`pyarrow`) execution config, because it makes our methods even more performant.

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('whylogs-testing')
    .getOrCreate()
)

# Turn on Arrow-based execution for PySpark.
arrow_config_key = "spark.sql.execution.arrow.pyspark.enabled"
spark.conf.set(arrow_config_key, "true")

# Fetch the sample wine-quality CSV through Spark's file distribution
# mechanism, then load it as a DataFrame (semicolon-delimited, header row,
# column types inferred).
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
spark.sparkContext.addFile(data_url)
spark_dataframe = (
    spark.read
    .option("delimiter", ";")
    .option("inferSchema", "true")
    .csv(SparkFiles.get("winequality-red.csv"), header=True)
)

## Environment variables for writing to WhyLabs

The driver needs environment variables set in order to write to WhyLabs. If you already have these set, you can skip the next cell.

In [8]:
import getpass
import os

# Prompt for the two WhyLabs identifiers and export each one as an
# environment variable:
#   - the org ID should look something like "org-xxxx"
#   - the dataset ID (or model ID) should look something like "model-xxxx"
for prompt_text, env_name in (
    ("Enter your WhyLabs Org ID", "WHYLABS_DEFAULT_ORG_ID"),
    ("Enter your WhyLabs Dataset ID", "WHYLABS_DEFAULT_DATASET_ID"),
):
    print(prompt_text)
    os.environ[env_name] = input()

# The API key is read with getpass rather than input(); only its first
# ten characters are printed back as a confirmation.
print("Enter your WhyLabs API key")
os.environ["WHYLABS_API_KEY"] = getpass.getpass()
print("Using API Key ID: ", os.environ["WHYLABS_API_KEY"][:10])

In [None]:
from whylogs.api.pyspark.experimental import profile_distributed_upload

# Profile the Spark DataFrame and upload the resulting profiles to WhyLabs.
# This relies on the WhyLabs environment variables being set on the driver.
#
# A list of statuses is returned, one for each file written to WhyLabs.
# Each status is a 2-tuple of (success, notes): on success, notes is a
# reference id; otherwise notes contains an error string.
profile_distributed_upload(spark_dataframe)