# Broken into two parts so you can add to your Spark configuration before creating a session

```python
# Example usage:

conf = get_spark_conf()

# Add a local file to be distributed to the executor nodes
conf.set('spark.yarn.dist.files', './keras_data/mobilenet_1_0_224_tf.h5')

# Launch Spark
spark = get_spark_session(conf = conf)
```

# get_spark_conf

Returns a Spark configuration object set up with UMBC Big Data Cluster parameters.

In [1]:
import os
import pyarrow as pa
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import venv_pack

def get_spark_conf():
    """
    Builds a Spark configuration with the UMBC Big Data Cluster parameters.

    Nothing is launched here; the configuration is only created and returned,
    so further settings can be added with conf.set(...) before it is used.

    Returns:
        A SparkConf with the YARN master, application name, environment
        variables, packed virtual environment archive, resource sizes and
        JVM options filled in.

    Raises:
        KeyError: if the USER environment variable is not set.
    """
    print("Creating Spark Configuration")
    spark_conf = SparkConf()
    spark_conf.setMaster('yarn')

    # Values that are reused by several settings below
    user = os.environ["USER"]
    java_home = '/usr/java/jdk1.8.0_181-cloudera'
    cdh_home = '/opt/cloudera/parcels/CDH'
    hadoop_conf_dir = '/etc/hadoop/conf'
    library_path = (
        f'{cdh_home}/lib64'
        f':{java_home}/jre/lib/amd64'
        f':{java_home}/jre/lib/amd64/server'
    )
    # Activates the unpacked virtual environment, then names its interpreter
    venv_python = f'"source {user}/bin/activate && {user}/bin/python3"'
    jvm_options = (
        '-XX:ReservedCodeCacheSize=256M'
        ' -XX:MaxMetaspaceSize=512m'
        ' -XX:CompressedClassSpaceSize=512m'
    )
    # Class path is pinned to the versioned parcel: CDH-6.2.0-1.cdh6.2.0.p0.967373
    class_path = f'{hadoop_conf_dir}:/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/jars'

    spark_conf.setAppName(f'{user}_data603_spark')

    settings = [
        # Application Master Environment Variables
        ('spark.yarn.appMasterEnv.JAVA_HOME', java_home),
        ('spark.yarn.appMasterEnv.LD_LIBRARY_PATH', library_path),

        # Executor Environment Variables
        ('spark.executorEnv.JAVA_HOME', java_home),
        ('spark.executorEnv.LD_LIBRARY_PATH', library_path),
        ('spark.executorEnv.HADOOP_HOME', cdh_home),
        ('spark.executorEnv.ARROW_LIBHDFS_DIR', f'{cdh_home}/lib64'),
        ('spark.executorEnv.HADOOP_CONF_DIR', hadoop_conf_dir),

        # Packed virtual environment, unpacked on the cluster under the user's name
        ('spark.yarn.dist.archives', f'{user}.tar.gz#{user}'),
        ('spark.pyspark.driver.python', venv_python),
        ('spark.yarn.appMasterEnv.PYSPARK_PYTHON', venv_python),
        ('spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON', venv_python),

        ('spark.yarn.appMasterEnv.HIVE_CONF_DIR', '/etc/hive/conf'),

        # Resource limits ('spark.dynamicAllocation.minExecutors' is left at its default)
        ('spark.dynamicAllocation.maxExecutors', '16'),
        ('spark.executor.cores', '16'),
        ('spark.executor.memory', '25g'),
        ('spark.executor.memoryOverhead', '2g'),
        ('spark.yarn.am.memoryOverhead', '1g'),
        ('spark.yarn.am.memory', '2g'),

        ('spark.driver.log.dfsDir', '/user/spark/driverLogs'),

        # JVM options and class paths, identical for driver and executors
        ('spark.driver.extraJavaOptions', jvm_options),
        ('spark.executor.extraJavaOptions', jvm_options),
        ('spark.driver.extraClassPath', class_path),
        ('spark.executor.extraClassPath', class_path),

        ('spark.port.maxRetries', 100),
    ]
    for key, value in settings:
        spark_conf.set(key, value)

    return spark_conf

## get_spark_session

If `pack_venv = True`, your virtual environment gets packed into a `.tar.gz` file and pushed to the cluster.

Set `conf` to the configuration you created with `get_spark_conf()` (for example, after adding your own settings to it).

NOTE: you must run _at least once_ with `pack_venv = True` so a `.tar.gz` file exists to be uploaded to the cluster.



In [1]:
def get_spark_session(pack_venv=True, conf=None):
    """
    Launches Spark Context using UMBC Big Data Cluster YARN and returns a Spark Session.

    Args:
        pack_venv: when True, the virtual environment is packed into
            "<USER>.tar.gz" with venv_pack (force=True) before launching.
        conf: the SparkConf to launch with. When None (the default), a new
            configuration is built with get_spark_conf() at call time.

    Returns:
        The SparkSession produced by SparkSession.builder.getOrCreate(),
        with Hive support enabled.

    Raises:
        KeyError: if the USER or PATH environment variable is not set.
    """
    # Build the default configuration lazily. A default of get_spark_conf() in
    # the signature would be evaluated once, when the function is defined, and
    # that single object would then be shared by every call that omits conf.
    if conf is None:
        conf = get_spark_conf()

    user = os.environ['USER']

    # Pack Virtual Environment
    if pack_venv:
        packed_environment_file = f"{user}.tar.gz"
        print(f"Packing Virtual Environment: {packed_environment_file}")
        venv_pack.pack(output=packed_environment_file, force=True)

    # Set local environment variables
    # for people that just won't follow directions and setup BASH
    java_home = "/usr/java/jdk1.8.0_181-cloudera"
    os.environ['JAVA_HOME'] = java_home
    os.environ['CLASSPATH'] = "/etc/hadoop/conf:/opt/cloudera/parcels/CDH/jars"
    os.environ['PATH'] = f"{os.environ['PATH']}:{java_home}/bin"
    # Any existing LD_LIBRARY_PATH is replaced, not extended
    os.environ['LD_LIBRARY_PATH'] = ":".join([
        "/opt/cloudera/parcels/CDH/lib64",
        f"{java_home}/jre/lib/amd64",
        f"{java_home}/jre/lib/amd64/server",
    ])

    print("Setting Environment Variables")
    os.environ['HADOOP_HOME'] = "/opt/cloudera/parcels/CDH"
    os.environ['SPARK_HOME'] = "/opt/cloudera/parcels/CDH/lib/spark"
    os.environ['HIVE_HOME'] = "/opt/cloudera/parcels/CDH/lib/hive"

    os.environ['HADOOP_CONF_DIR'] = "/etc/hadoop/conf"
    os.environ['YARN_CONF_DIR'] = "/etc/hadoop/conf"
    os.environ['SPARK_CONF_DIR'] = "/etc/spark/conf"
    os.environ['HIVE_CONF_DIR'] = "/etc/hive/conf"

    os.environ['PYSPARK_PYTHON'] = f'{user}/bin/python3'

    # Create SparkSession
    session_name = f"{user}_data603_spark_session"
    print(f"Creating Spark Session: {session_name}")
    # appName is applied after the configuration, as in the builder chain below
    spark = (
        SparkSession.builder
        .config(conf=conf)
        .appName(session_name)
        .enableHiveSupport()
        .getOrCreate()
    )

    return spark