# Pyspark Fu

## 1. Initialising the Spark Session

In [6]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark settings applied by spark_session(); every value is a string.
CONF = {
    'spark.ui.showConsoleProgress': 'false',
    # All of the UI "retained*" history settings share the value '1'
    **dict.fromkeys(
        (
            'spark.ui.dagGraph.retainedRootRDDs',
            'spark.ui.retainedJobs',
            'spark.ui.retainedStages',
            'spark.ui.retainedTasks',
            'spark.sql.ui.retainedExecutions',
            'spark.worker.ui.retainedExecutors',
            'spark.worker.ui.retainedDrivers',
        ),
        '1',
    ),
    'spark.executor.instances': '1',
}

def spark_session() -> SparkSession:
    '''
    Build (or reuse) a deliberately lightweight local SparkSession.

    - every entry of the module-level CONF mapping is applied as spark config,
      to help lighten the load
    - the 'local[1]' master locks the spark runtime to a single core
    - the log level is set to ERROR to silence noisy warning logs
    '''
    settings = SparkConf()
    settings.setAll(list(CONF.items()))

    builder = SparkSession.builder.master('local[1]').config(conf=settings)
    session = builder.getOrCreate()
    session.sparkContext.setLogLevel('ERROR')
    return session

In [7]:
# Create the shared session that the remaining cells refer to as `spark`
spark = spark_session()

FileNotFoundError: [Errno 2] No such file or directory

## 2. Create a simple dataframe for debugging


- The official PySpark docs often don't "create" the dataframe that their code examples refer to, so here is a small one to experiment with

In [5]:
# One plain string column ('a') plus a nested mapping column ('n') whose
# keys and values differ from row to row
sample_rows = [
    {'a': 'b', 'n': {'a': 'b'}},
    {'a': 'c', 'n': {'z': 'x', 'y': 'b'}},
    {'a': 'd', 'n': {'o': None, 't': 'a', '2': 3}},
]
df = spark.createDataFrame(sample_rows)

df.show(truncate=False)

NameError: name 'spark' is not defined

## 3. Joins

### 3.1. Avoid duplicate column names

In [1]:
# Let's construct two dataframes that share an 'id' column to join on.
# The second frame's other column is called 'trainer' (not 'name') so that
# 'id' is the only column name the two frames have in common.

df1 = spark.createDataFrame([
    {'id': '123', 'name': 'pikachu'},
    {'id': '999', 'name': 'evee'},
    {'id': '007', 'name': 'charizard'},
])
df2 = spark.createDataFrame([
    {'id': '123', 'trainer': 'ash'},
    {'id': '999', 'trainer': 'chloe'},
    {'id': '007', 'trainer': 'ash'},
])

# show() prints the table and returns None, so call it once per statement
# rather than building a throwaway (None, None) tuple for the notebook to echo
df1.show()
df2.show()

NameError: name 'spark' is not defined

In [None]:
# Now, let's join them together into a combined pokemon-and-trainer table,
# matching rows with an explicit column-equality expression
ids_match = df1['id'] == df2['id']
joined = df1.join(df2, on=ids_match, how='inner')
joined.show()

This _seems_ fine initially, but Spark blows up as soon as you try to use the 'id' column in an expression

This example will produce the error:

`` [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`]. ``

This is particularly annoying because the error only appears when you actually reference the ambiguous column; until then, the duplicate column name goes undetected

In [None]:
import pyspark.sql.utils
from pyspark.sql import DataFrame
from typing import List

def try_select(df: DataFrame, cols: List[str]):
    '''
    Select `cols` from `df` and show the result.

    If spark raises an AnalysisException, print it instead of letting it
    propagate.
    '''
    try:
        projection = df.select(*cols)
        projection.show()
    except pyspark.sql.utils.AnalysisException as err:
        print('select failed!', err)

In [None]:
# Ask for columns from the expression-joined table; any AnalysisException
# is printed by try_select instead of being raised
try_select(joined, ['id', 'name', 'trainer'])

The solution: pass the join column names to `on`, rather than a column expression

### 3.1.2 Join using a list of column names

In [None]:
# Join again, this time naming the shared column instead of comparing columns
key_columns = ['id']
joined = df1.join(df2, on=key_columns, how='inner')
joined.show()

# Now let's try that same select again
try_select(joined, ['id', 'name', 'trainer'])

### 3.1.3 Dataframe aliasing is a bit weird

In [None]:
# Tag df1 with the alias 'pokemon', then display all of its columns
pokemon = df1.alias('pokemon')
pokemon.select('*').show()

In [None]:
import pyspark.sql.functions as F

# Alias both sides, then join on the alias-qualified id columns
pokemon = df1.alias('pokemon')
trainers = df2.alias('trainers')
ids_match = F.col('pokemon.id') == F.col('trainers.id')

joined = pokemon.join(trainers, on=ids_match, how='inner')
joined.show()
joined.columns

Now the error message is much more useful: it includes the dataframe aliases, which identify the table each duplicate column name comes from

In [None]:
# Select the bare 'id' name from the aliased join; try_select prints the
# AnalysisException, if one is raised, rather than propagating it
try_select(joined, ['id'])

Confusingly, `DataFrame.columns` does not show the aliases, but they are usable when selecting

In [None]:
# Column names as reported by the dataframe itself
print(joined.columns)

# Select 'id' qualified with the 'pokemon' alias given to df1 before the join
try_select(joined, ['pokemon.id'])