# Prepare example data and model

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import RFormula

In [22]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 500000)
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_columns",5000)

#### example data

In [4]:
# create dataframe
training = spark.createDataFrame([
    (0,'y', "a b c d e spark", 1.0),
    (1,'y', "b d", 0.0),
    (2, None, "spark f g h", 1.0),
    (3, 'n',"hadoop mapreduce", 0.0)
], ["id",'category', "text", "label"])

#### pipeline

In [5]:
#process 'categor' column
category_process=SQLTransformer(statement="""select *, coalesce(category, 'unknown') category_fillNA 
                                            from __THIS__ """)

In [6]:
#text_process: a pipeline , process text column
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="text_vector",numFeatures=16)
text_process=Pipeline(stages=[tokenizer, hashingTF])

In [7]:
features_assemble=RFormula(formula="~category_fillNA+text_vector",featuresCol='features',handleInvalid='keep')

In [8]:
lr = LogisticRegression(maxIter=5, regParam=0.001)

In [9]:
#put together into a pipeline
pipeline = Pipeline(stages=[category_process, text_process,features_assemble, lr])

#### create PipelineModel

In [10]:
model = pipeline.fit(training)

#### apply the model

In [24]:
training_pred=model.transform(training)

In [25]:
# Prepare test documents, which are unlabeled (id,category, text) tuples.
test= spark.createDataFrame([
    (4,'y', "spark i j k"),
    (5,'n', "l m n"),
], ["id",'category', "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)

In [26]:
prediction.limit(1).toPandas()

Unnamed: 0,id,category,text,category_fillNA,words,text_vector,features,rawPrediction,probability,prediction
0,4,y,spark i j k,y,"[spark, i, j, k]","(1.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","[2.397351922197842, -2.397351922197842]","[0.9166251513201248, 0.0833748486798753]",0.0


In [12]:
#%run /home/c07520/work/Users/c07520/customfunction/start.ipynb

In [15]:
import sys
sys.path.insert(0, '/home/c07520/work/Users/c07520/sparkEXample/create_package/base_spark_ML_utils/')

# pipeline_util
Check Pipeline and PipelineModel

In [51]:
import spark_ml_utils.pipeline_util as pu
pu=spark_ml_utils.pipeline_util

In [52]:
import spark_ml_utils.pipeline_util
from importlib import reload
reload(spark_ml_utils.pipeline_util)
pu=spark_ml_utils.pipeline_util

### getStages():  check Pipeline and PipelineModel
In practice, Pipeline and Pipelne Model could contain many stages. the getStages() function will list all the stages for easy check.

In [13]:
#use native method getStages(), not enough information
pipeline.getStages()

[SQLTransformer_f91302b3bef1,
 Pipeline_e1c89b18d4e8,
 RFormula_9ab00e5fdacc,
 LogisticRegression_90590144e087]

In [27]:
pu.getallstages(pipeline,'pipeline')

This is a Pipeline 


Unnamed: 0,estimator,estimator_name,inputcol,outputcol,other_attr
0,pipeline.getStages()[0],SQLTransformer,,,"""statement=\nselect *, coalesce(category, 'unknown') category_fillNA \n from __THIS__ """
1,pipeline.getStages()[1].getStages()[0],Tokenizer,text,words,
2,pipeline.getStages()[1].getStages()[1],HashingTF,words,text_vector,
3,pipeline.getStages()[2],RFormula,,features,number of inputCol in formula: 2
4,pipeline.getStages()[3],LogisticRegression,,,


In [28]:
#similar for PipelineModel
pu.getallstages(model,'model')

This is a PipelineModel 


Unnamed: 0,transformer,transformer_name,inputcol,outputcol,other_attr
0,model.stages[0],SQLTransformer,,,"""statement=\nselect *, coalesce(category, 'unknown') category_fillNA \n from __THIS__ """
1,model.stages[1].stages[0],Tokenizer,text,words,
2,model.stages[1].stages[1],HashingTF,words,text_vector,
3,model.stages[2],RFormulaModel,,features,number of inputCol in formula: 2
4,model.stages[3],LogisticRegressionModel,,,"labelCol : label, elasticNetParam : 0.0, regParam : 0.001"


#### usage

In [36]:
#check any stage
type(pipeline.getStages()[1].getStages()[1])

pyspark.ml.feature.HashingTF

In [41]:
pipeline.getStages()[1].getStages()[1].getNumFeatures()

16

In [None]:
#find and update the pipeline stages

In [38]:
pipeline_update=pipeline.copy()

In [39]:
pipeline_update.getStages()[1].getStages()[1].setNumFeatures(256)

HashingTF_8753da0a8eba

In [42]:
pipeline_update.getStages()[1].getStages()[1].getNumFeatures()

256

### get_code(): get the code showing how it is created

In [31]:
pstr=pu.get_code(pipeline,'pipeline2') #pstr is a string , same as the following, containing all the code for creating pipeline

from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

pipeline2=Pipeline(stages=[
########################################stage0
SQLTransformer(statement="""select *, coalesce(category, 'unknown') category_fillNA 
                                            from __THIS__ """)

,########################################stage1
Tokenizer(outputCol="words",inputCol="text")

,########################################stage2
HashingTF(numFeatures=16,outputCol="text_vector",inputCol="words")

,########################################stage3
RFormula(featuresCol="features",handleInvalid="keep",formula="~category_fillNA+text_vector")

,########################################stage4
LogisticRegression(maxIter=5,regParam=0.001)
])


In [32]:
#run the code 
exec(pstr)

In [34]:
#pipeline2 contains same stages as pipeline, although it is flatten.
pu.getallstages(pipeline2,'pipeline2')

This is a Pipeline 


Unnamed: 0,estimator,estimator_name,inputcol,outputcol,other_attr
0,pipeline2.getStages()[0],SQLTransformer,,,"""statement=\nselect *, coalesce(category, 'unknown') category_fillNA \n from __THIS__ """
1,pipeline2.getStages()[1],Tokenizer,text,words,
2,pipeline2.getStages()[2],HashingTF,words,text_vector,
3,pipeline2.getStages()[3],RFormula,,features,number of inputCol in formula: 2
4,pipeline2.getStages()[4],LogisticRegression,,,


In [35]:
#for PipelineModel, getcode() return the code for its corresponding pipeline
_=pu.get_code(model,'pipeline3')

from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

pipeline3=Pipeline(stages=[
########################################stage0
SQLTransformer(statement="""select *, coalesce(category, 'unknown') category_fillNA 
                                            from __THIS__ """)

,########################################stage1
Tokenizer(outputCol="words",inputCol="text")

,########################################stage2
HashingTF(numFeatures=16,outputCol="text_vector",inputCol="words")

,########################################stage3
RFormula(featuresCol="features",handleInvalid="keep",formula="~category_fillNA+text_vector")

,########################################stage4
LogisticRegression(maxIter=5,regParam=0.001)
])


In [44]:
#it also work for any ML estimator and transformer
_=pu.get_code(pipeline.getStages()[2],'obj')

from pyspark.ml.feature import RFormula

obj=RFormula(featuresCol="features",handleInvalid="keep",formula="~category_fillNA+text_vector")


In [45]:
_=pu.get_code(model.stages[1].stages[1],'obj')

from pyspark.ml.feature import HashingTF

obj=HashingTF(numFeatures=16,outputCol="text_vector",inputCol="words")


### Other function

#### flatenStages()

In [48]:
model.stages

[SQLTransformer_f91302b3bef1,
 PipelineModel_d4c1008880e5,
 RFormula_9ab00e5fdacc,
 LogisticRegressionModel: uid = LogisticRegression_90590144e087, numClasses = 2, numFeatures = 19]

In [49]:
pu.flatenStages(model.stages)

[SQLTransformer_f91302b3bef1,
 Tokenizer_3fa6d50bf10c,
 HashingTF_8753da0a8eba,
 RFormula_9ab00e5fdacc,
 LogisticRegressionModel: uid = LogisticRegression_90590144e087, numClasses = 2, numFeatures = 19]

#### pm_to_p()
convert PipelineModel to Pipeline

In [46]:
pipeline4=pu.pm_to_p(model)

In [47]:
pu.getallstages(pipeline4,'pipeline4')

This is a Pipeline 


Unnamed: 0,estimator,estimator_name,inputcol,outputcol,other_attr
0,pipeline4.getStages()[0],SQLTransformer,,,"""statement=\nselect *, coalesce(category, 'unknown') category_fillNA \n from __THIS__ """
1,pipeline4.getStages()[1],Tokenizer,text,words,
2,pipeline4.getStages()[2],HashingTF,words,text_vector,
3,pipeline4.getStages()[3],RFormula,,features,number of inputCol in formula: 2
4,pipeline4.getStages()[4],LogisticRegression,,,


In [1]:
import traceback
def bad_method():
    try:
        sqrt = 0**-1
    except Exception:
        print(traceback.print_exc())

bad_method()


None


Traceback (most recent call last):
  File "<ipython-input-1-830d9875d0cc>", line 4, in bad_method
    sqrt = 0**-1
ZeroDivisionError: 0.0 cannot be raised to a negative power
