# PySpark Examples

In [0]:
# Build a tiny demo DataFrame: an integer `id` plus two categorical string
# columns (`label1`, `label2`) that the feature transformers below operate on.
from pyspark.sql import Row

# Each (label1, label2) pair becomes one Row; its position in the list is the id.
# The RDD is explicitly split into 3 partitions.
testData = sc.parallelize(
    [
        Row(id=row_id, label1=first, label2=second)
        for row_id, (first, second) in enumerate(
            [("a", "e"), ("b", "f"), ("c", "e"), ("a", "f"), ("a", "f"), ("c", "f")]
        )
    ],
    numSlices=3,
)

df = spark.createDataFrame(testData)
display(df)

id,label1,label2
0,a,e
1,b,f
2,c,e
3,a,f
4,a,f
5,c,f


## StringIndexer

In [0]:
from pyspark.ml.feature import StringIndexer

# IPython help query (`name?`): prints the type, source file and docstring of
# StringIndexer. This is notebook-only syntax, not valid plain Python.
StringIndexer?

Type:           ABCMeta
String form:    abc.ABCMeta instance
File:           /databricks/spark/python/pyspark/ml/feature.py
Line:           4521
Docstring:     
A label indexer that maps a string column of labels to an ML column of label indices.
If the input column is numeric, we cast it to string and index the string values.
The indices are in [0, numLabels). By default, this is ordered by label frequencies
so the most frequent label gets index 0. The ordering behavior is controlled by
setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'.

.. versionadded:: 1.4.0

Examples
--------
>>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed",
...     stringOrderType="frequencyDesc")
>>> stringIndexer.setHandleInvalid("error")
StringIndexer...
>>> model = stringIndexer.fit(stringIndDf)
>>> model.setHandleInvalid("error")
StringIndexerModel...
>>> td = model.transform(stringIndDf)
>>> sorted(set([(i[0

In [0]:
# The plural `inputCols`/`outputCols` params index several columns in one stage;
# the singular `inputCol`/`outputCol` params handle a single column.
inputs, outputs = ["label1", "label2"], ["index1", "index2"]
stringIndexer = StringIndexer().setInputCols(inputs).setOutputCols(outputs)

# Fit on the data to learn the label -> index mapping for each column, then
# apply that mapping to append the index columns.
model = stringIndexer.fit(df)
result = model.transform(df)

display(result)

id,label1,label2,index1,index2
0,a,e,0.0,1.0
1,b,f,2.0,0.0
2,c,e,1.0,1.0
3,a,f,0.0,0.0
4,a,f,0.0,0.0
5,c,f,1.0,0.0


## One Hot Encoder

In [0]:
from pyspark.ml.feature import OneHotEncoder
# IPython help query (`name?`): prints the type, source file and docstring of
# OneHotEncoder. This is notebook-only syntax, not valid plain Python.
OneHotEncoder?

Type:           ABCMeta
String form:    abc.ABCMeta instance
File:           /databricks/spark/python/pyspark/ml/feature.py
Line:           3080
Docstring:     
A one-hot encoder that maps a column of category indices to a column of binary vectors, with
at most a single one-value per row that indicates the input category index.
For example with 5 categories, an input value of 2.0 would map to an output vector of
`[0.0, 0.0, 1.0, 0.0]`.
The last category is not included by default (configurable via :py:attr:`dropLast`),
because it makes the vector entries sum up to one, and hence linearly dependent.
So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.

When :py:attr:`handleInvalid` is configured to 'keep', an extra "category" indicating invalid
values is added as last category. So when :py:attr:`dropLast` is true, invalid values are
encoded as all-zeros vector.

.. versionadded:: 2.3.0

Notes
-----
This is different from scikit-

In [0]:
from pyspark.ml import Pipeline

# Stage 1: OneHotEncoder works on category indices, so the string label
# columns are first converted to numeric indices with a StringIndexer.
indexer_inputs = ["label1", "label2"]
indexer_outputs = ["index1", "index2"]
indexer = StringIndexer(
    inputCols=indexer_inputs, outputCols=indexer_outputs, handleInvalid="keep"
)

# Stage 2: one-hot encode the index columns produced by stage 1.
# As with StringIndexer, the plural `inputCols`/`outputCols` params handle
# several columns at once (vs. the singular `inputCol`/`outputCol`).
ohe_inputs = list(indexer_outputs)  # the encoder consumes the StringIndexer outputs
ohe_outputs = ["ohe1", "ohe2"]
ohe = OneHotEncoder(
    inputCols=ohe_inputs, outputCols=ohe_outputs, handleInvalid="keep", dropLast=True
)

# Chain both stages so they are fitted and applied in sequence.
pipe = Pipeline(stages=[indexer, ohe])

# Fit the whole pipeline, then transform the same DataFrame.
model = pipe.fit(df)
result = model.transform(df)

display(result)

Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

id,label1,label2,index1,index2,ohe1,ohe2
0,a,e,0.0,1.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
1,b,f,2.0,0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
2,c,e,1.0,1.0,"Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(1), values -> List(1.0))"
3,a,f,0.0,0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
4,a,f,0.0,0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
5,c,f,1.0,0.0,"Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(0), values -> List(1.0))"
