# Spark ML Intro

In [1]:
# HDFS path of the training CSV for the toxic-comments dataset
# (it is inspected with `hdfs dfs -head` later in the notebook).
TOXIC_COMMENTS_TRAIN = "/datasets/spark/toxic/train.csv"

In [2]:
import os
import sys

# Locations of the cluster's Spark client and of the Python interpreter Spark should use.
SPARK_HOME = "/usr/hdp/current/spark2-client"
PYSPARK_PYTHON = "/opt/conda/envs/dsenv/bin/python"
os.environ.update({"PYSPARK_PYTHON": PYSPARK_PYTHON, "SPARK_HOME": SPARK_HOME})

# Make the bundled PySpark and Py4J archives importable. Each insert goes to the
# front of sys.path, so pyspark.zip (inserted last) ends up ahead of the py4j archive.
PYSPARK_HOME = os.path.join(SPARK_HOME, "python/lib")
for archive in ("py4j-0.10.7-src.zip", "pyspark.zip"):
    sys.path.insert(0, os.path.join(PYSPARK_HOME, archive))

In [3]:
import random

# Draw the Spark UI port uniformly from [10000, 10200) -- presumably so that
# several sessions on the same host are unlikely to ask for the same port.
SPARK_UI_PORT = random.randrange(10000, 10200)
print(f"Spark UI port is: {SPARK_UI_PORT}")

Spark UI port is: 10154


In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# SparkConf.set returns the conf itself, so creation and configuration chain.
conf = SparkConf().set("spark.ui.port", SPARK_UI_PORT)

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Spark ML Intro")
    .getOrCreate()
)

In [5]:
spark

## [Pipeline](https://spark.apache.org/docs/2.4.7/ml-pipeline.html#main-concepts-in-pipelines)

+ **DataFrame**: This ML API uses DataFrame from Spark SQL as an ML dataset, which can hold a variety of data types. E.g., a DataFrame could have different columns storing text, feature vectors, true labels, and predictions.


+ **Transformer**: A Transformer is an algorithm which can transform one DataFrame into another DataFrame. E.g., an ML model is a Transformer which transforms a DataFrame with features into a DataFrame with predictions.


+ **Estimator**: An Estimator is an algorithm which can be fit on a DataFrame to produce a Transformer. E.g., a learning algorithm is an Estimator which trains on a DataFrame and produces a model.


+ **Pipeline**: A Pipeline chains multiple Transformers and Estimators together to specify an ML workflow.


+ **Parameter**: All Transformers and Estimators now share a common API for specifying parameters.

## Estimator

![](https://spark.apache.org/docs/latest/img/ml-Pipeline.png)

## Transformer

![](https://spark.apache.org/docs/latest/img/ml-PipelineModel.png)

In [6]:
from pyspark.ml import Estimator, Transformer

In [7]:
from pyspark.ml.linalg import Vectors

In [8]:
from pyspark.ml.classification import LogisticRegression

In [9]:
# Four hand-written (label, feature vector) rows used as a toy training set.
toy_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training = spark.createDataFrame(toy_rows, schema=["label", "features"])

In [10]:
# Logistic-regression estimator reading inputs from "features" and labels from
# "label", limited to 3 iterations with regParam=0.01.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=3, regParam=0.01)

In [11]:
isinstance(lr, Estimator)

True

In [12]:
model = lr.fit(training)

In [13]:
model.coefficients

DenseVector([-2.6617, 2.3222, -0.4828])

In [14]:
model.interceptVector

DenseVector([0.0334])

In [15]:
isinstance(model, Transformer)

True

In [16]:
predict = model.transform(training)

In [17]:
predict.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [18]:
predict.show(3, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------
 label         | 1.0                                       
 features      | [0.0,1.1,0.1]                             
 rawPrediction | [-2.5395738919642197,2.5395738919642197]  
 probability   | [0.07313005064401076,0.9268699493559893]  
 prediction    | 1.0                                       
-RECORD 1--------------------------------------------------
 label         | 0.0                                       
 features      | [2.0,1.0,-1.0]                            
 rawPrediction | [2.4849264125830595,-2.4849264125830595]  
 probability   | [0.9230783263405788,0.0769216736594211]   
 prediction    | 0.0                                       
-RECORD 2--------------------------------------------------
 label         | 0.0                                       
 features      | [2.0,1.3,1.0]                             
 rawPrediction | [2.7539392318408518,-2.7539392318408518]  
 probability   | [0.9401354376992936,0.0

## Params demystified
A [`Param`](https://spark.apache.org/docs/2.4.7/api/python/pyspark.ml.html#pyspark.ml.param.Param) is a named parameter with self-contained documentation. Estimators and Transformers use a uniform API for specifying parameters.

In [19]:
lr.params

[Param(parent='LogisticRegression_329642cb3f5f', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'),
 Param(parent='LogisticRegression_329642cb3f5f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'),
 Param(parent='LogisticRegression_329642cb3f5f', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'),
 Param(parent='LogisticRegression_329642cb3f5f', name='featuresCol', doc='features column name.'),
 Param(parent='LogisticRegression_329642cb3f5f', name='fitIntercept', doc='whether to fit an intercept term.'),
 Param(parent='LogisticRegression_329642cb3f5f', name='labelCol', doc='label column name.'),
 Param(parent='LogisticRegression_329642cb3f5f', name='lowerBoundsOnCoefficients', doc='The lower bounds on coefficients if fitting under bound cons

In [20]:
lr.explainParam("maxIter")

'maxIter: max number of iterations (>= 0). (default: 100, current: 3)'

In [21]:
lr.setParams(maxIter=10)

LogisticRegression_329642cb3f5f

In [22]:
lr.explainParam("maxIter")

'maxIter: max number of iterations (>= 0). (default: 100, current: 10)'

In [23]:
lr.setMaxIter(5)

LogisticRegression_329642cb3f5f

In [24]:
lr.getMaxIter()

5

In [25]:
paramMap = lr.extractParamMap()

In [26]:
# Param maps are keyed by Param objects, not by parameter names, so a lookup by
# string fails. Catch and display the error so that "Run All" continues past
# this demonstration instead of stopping here.
try:
    paramMap["maxIter"]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'maxIter'

In [30]:
paramMap[lr.maxIter]

10

In [28]:
# Override maxIter in the extracted map only; `lr` itself is not changed
# (its explainParam output still reports "current: 5" afterwards).
paramMap[lr.maxIter] = 10

In [29]:
lr.explainParam("maxIter")

'maxIter: max number of iterations (>= 0). (default: 100, current: 5)'

In [31]:
# Fit again, passing the param map so its values (maxIter=10) are used for this fit.
model = lr.fit(training, params=paramMap)

In [32]:
model.explainParam("maxIter")

'maxIter: maximum number of iterations (>= 0) (default: 100, current: 10)'

## Toxic Comment Classification Challenge

In [33]:
# Import only the schema types used in this notebook rather than a wildcard,
# so it stays obvious where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [34]:
# Two string columns (id, comment text) followed by six integer label columns,
# in the same order as the CSV header.
label_fields = [
    StructField(label_name, IntegerType())
    for label_name in ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
]
schema = StructType(
    [StructField("id", StringType()), StructField("comment_text", StringType())] + label_fields
)

In [35]:
# First attempt: explicit schema and header row, all other CSV options left at their defaults.
dataset = spark.read.csv(TOXIC_COMMENTS_TRAIN, schema=schema, header=True)

In [36]:
dataset.show(2, vertical=True)

-RECORD 0-----------------------------
 id            | 0000997932d777bf     
 comment_text  | Explanation          
 toxic         | null                 
 severe_toxic  | null                 
 obscene       | null                 
 threat        | null                 
 insult        | null                 
 identity_hate | null                 
-RECORD 1-----------------------------
 id            | Why the edits mad... 
 comment_text  |  just closure on ... 
 toxic         | 0                    
 severe_toxic  | 0                    
 obscene       | 0                    
 threat        | 0                    
 insult        | 0                    
 identity_hate | 0                    
only showing top 2 rows



In [37]:
!hdfs dfs -head $TOXIC_COMMENTS_TRAIN

"id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"
"0000997932d777bf","Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
"000103f0d9cfb60f","D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
"000113f07ec002fd","Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
"0001b41b1c6bb37e","""
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents""""  -I think the

In [38]:
# Second attempt: multiLine=True, because comments in the file contain line breaks
# inside quoted fields.
dataset = spark.read.csv(TOXIC_COMMENTS_TRAIN, schema=schema, header=True, multiLine=True)

In [39]:
dataset.select("id").show(10)

+--------------------+
|                  id|
+--------------------+
|    0000997932d777bf|
|    000103f0d9cfb60f|
|    000113f07ec002fd|
|    0001b41b1c6bb37e|
|                More|
|I can't make any ...|
|There appears to ...|
|    0001d958c54c6e35|
|    00025465d4725e87|
|Congratulations f...|
+--------------------+
only showing top 10 rows



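### You need to add the `escape` parameter!

Quotes inside a quoted field are written doubled (`""`) in this file, so the reader must be told that `"` is the escape character; otherwise it splits such comments into several broken rows, as seen above.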

In [40]:
# Final read: multi-line records plus '"' as the escape character, since quotes
# inside a quoted field are written doubled ("") in this file.
csv_options = dict(schema=schema, header=True, multiLine=True, escape='"')
dataset = spark.read.csv(TOXIC_COMMENTS_TRAIN, **csv_options)

In [41]:
dataset.select("id").show(10)

+----------------+
|              id|
+----------------+
|0000997932d777bf|
|000103f0d9cfb60f|
|000113f07ec002fd|
|0001b41b1c6bb37e|
|0001d958c54c6e35|
|00025465d4725e87|
|0002bcb3da6cb337|
|00031b1e95af7921|
|00037261f536c51d|
|00040093b2687caa|
+----------------+
only showing top 10 rows



In [42]:
dataset.show(2, vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id            | 0000997932d777bf                                                                                                                                                                                                                                                         
 comment_text  | Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27 
 toxic         | 0                                                                                                                                     

In [43]:
dataset.rdd.getNumPartitions()

1

In [44]:
dataset.count()

159571

In [45]:
# The read above yielded a single partition, so spread the rows over 4 partitions
# and cache the result for the repeated use that follows.
dataset = dataset.repartition(4).cache()

### Let's define a binary target (toxic/non-toxic)

In [46]:
from pyspark.sql import functions as f

In [47]:
# A comment is labelled 0 only when every one of the six label columns is 0;
# any other row is labelled 1.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
all_labels_zero = dataset[label_columns[0]] == 0
for label_name in label_columns[1:]:
    all_labels_zero = all_labels_zero & (dataset[label_name] == 0)
target = f.when(all_labels_zero, 0).otherwise(1)

In [48]:
dataset = dataset.withColumn("target", target)

In [49]:
dataset.select("id", "target").show(10)

+----------------+------+
|              id|target|
+----------------+------+
|6fdb7b6734f8bf40|     0|
|26e1b63617df36b1|     0|
|85e4f353ca4b2bde|     0|
|9d2196265213dce8|     0|
|fb7a63a8e287b2d1|     0|
|fd42fd6a1ea341c4|     0|
|54f9e59924682c6e|     0|
|01c0ae884d69319b|     0|
|f7fec98d6aac8ce3|     0|
|25553d990b245467|     1|
+----------------+------+
only showing top 10 rows



In [50]:
targets = dict(dataset.groupBy("target").count().collect())

In [51]:
targets[1] / (targets[0] + targets[1])

0.10167887648758234

In [52]:
# The six original label columns are now summarised by `target`; drop them and cache.
raw_label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
dataset = dataset.drop(*raw_label_columns).cache()

In [53]:
dataset

DataFrame[id: string, comment_text: string, target: int]

In [54]:
dataset.show(2, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 6fdb7b6734f8bf40                                                                                                                                                                                                                                                                                                                    
 comment_text | "

""Katara""
I've removed the section entirely. I don't care if you like to pretend that Katara and Zuko are meant for each other. It's still not case, and there has been no indication whatsoever. Thus, there's little point to actually have the section and exempt Toph and Sokka beyond the insane delu

In [None]:
# Save the prepared (id, comment_text, target) frame as Parquet, overwriting any
# previous copy; the pipeline section reads it back from this path.
dataset.write.parquet("/user/pklemenkov/toxic", mode="overwrite")

### Let's fit the simplest binary bag-of-words (BoW) logistic regression

In [55]:
# Import the feature transformers used in this notebook by name instead of a
# wildcard, so it stays obvious where each one comes from.
from pyspark.ml.feature import (
    CountVectorizer,
    HashingTF,
    StopWordsRemover,
    Tokenizer,
    VectorAssembler,
)

### Split comments into words

In [56]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [57]:
dataset2 = tokenizer.transform(dataset)

In [58]:
dataset2.select("id", "words").show(2, False, True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id    | 6fdb7b6734f8bf40                                                                                                                                                                                                                                                                                                                                                                               
 words | [", , ""katara"", i've, removed, the, section, entirely., i, don't, care, if, you, like, to, pretend, that, katara, and, zuko, are, meant, for, each, other., it's, still, not, case,, and, there, has, been,

### Convert texts into binary vectors using Hashing trick

In [59]:
# Hash each token list into a 100-dimensional vector. With binary=True every
# non-zero count is set to 1, so the vector records presence, not frequency.
hasher = HashingTF(numFeatures=100, binary=True, inputCol=tokenizer.getOutputCol(), outputCol="word_vector")
dataset2 = hasher.transform(dataset2)

In [60]:
dataset2.select("id", "word_vector").show(2, False, True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id          | 6fdb7b6734f8bf40                                                                                                                                                                                                                                                                 
 word_vector | (100,[0,3,5,10,12,13,18,19,20,25,26,28,29,30,31,33,35,36,37,38,42,43,46,47,58,59,60,63,70,72,80,82,85,88,91,96,98,99],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------

### Now let's split into train and test. Don't forget that we have imbalanced classes, so let's do stratified sampling

In [61]:
# Sample each class with the same fraction (0.8) so the class ratio is roughly
# preserved in train; the explicit seed keeps the sample repeatable.
train = dataset2.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757)

In [62]:
train_targets = dict(train.groupby("target").count().collect())

In [63]:
train_targets[1] / (train_targets[0] + train_targets[1])

0.10108478558840807

In [64]:
# Test set: every row of dataset2 whose id does not appear in train (left anti join).
test = dataset2.join(train, on="id", how="leftanti")

In [65]:
test_targets = dict(test.groupby("target").count().collect())

In [66]:
test_targets[1] / (test_targets[0] + test_targets[1])

0.10405693503887635

In [67]:
train.rdd.getNumPartitions()

4

In [68]:
test.rdd.getNumPartitions()

200

In [69]:
train = train.drop("comment_text", "words").cache()

In [70]:
# Drop the text columns that are no longer needed, then reduce the 200 partitions
# left by the join to 4 and cache.
test = test.drop("comment_text", "words").coalesce(4).cache()

### Let's fit logistic regression

In [71]:
from pyspark.ml.classification import LogisticRegression

In [72]:
# Logistic regression over the hashed word vectors, predicting the binary
# `target` column, with at most 15 iterations.
lr = LogisticRegression(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=15)

In [73]:
lr_model = lr.fit(train)

In [74]:
lr_model

LogisticRegressionModel: uid = LogisticRegression_c478b558dc8f, numClasses = 2, numFeatures = 100

In [75]:
predictions = lr_model.transform(test)

In [76]:
predictions.printSchema()

root
 |-- id: string (nullable = true)
 |-- target: integer (nullable = false)
 |-- word_vector: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [77]:
predictions.select("id", "target", "prediction", "probability", "rawPrediction").show(5, False, True)

-RECORD 0--------------------------------------------------
 id            | 00b4fb897de56d6d                          
 target        | 0                                         
 prediction    | 0.0                                       
 probability   | [0.8219752022441963,0.17802479775580368]  
 rawPrediction | [1.5297873731912435,-1.5297873731912435]  
-RECORD 1--------------------------------------------------
 id            | 016f929ca882f152                          
 target        | 0                                         
 prediction    | 0.0                                       
 probability   | [0.9682786314416439,0.031721368558356056] 
 rawPrediction | [3.4185293474532754,-3.4185293474532754]  
-RECORD 2--------------------------------------------------
 id            | 01f29c98adb73328                          
 target        | 0                                         
 prediction    | 0.0                                       
 probability   | [0.9283264668215503,0.0

In [78]:
# Cast the double-typed prediction to int so it can be compared with the integer
# target, then count the rows where the two agree.
matching_rows = predictions.select("target", f.col("prediction").cast("int")).filter("target == prediction")
true_predictions = matching_rows.count()
true_predictions

28612

In [79]:
# Accuracy = correctly classified rows / all rows in the test predictions.
print(f"Accuracy is {true_predictions / predictions.count()}")

Accuracy is 0.8970403812390269


### Not bad! Or...

In [80]:
# True positives: rows whose target is 1 and whose (int-cast) prediction matches it.
(
    predictions
    .select("target", f.col("prediction").cast("int"))
    .filter((f.col("target") == 1) & (f.col("prediction") == f.col("target")))
    .count()
)

123

In [81]:
predictions.printSchema()

root
 |-- id: string (nullable = true)
 |-- target: integer (nullable = false)
 |-- word_vector: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [82]:
# Bring only target and the int-cast prediction to the driver as a pandas
# DataFrame, for use with scikit-learn's metrics.
predictions_pd = predictions.select("target", f.col("prediction").cast("int")).toPandas()

In [83]:
predictions_pd.head()

Unnamed: 0,target,prediction
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0


In [84]:
lr.getOrDefault("threshold")

0.5

In [85]:
from sklearn.metrics import classification_report, precision_score

In [86]:
print(classification_report(predictions_pd.target, predictions_pd.prediction))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     28577
           1       0.58      0.04      0.07      3319

    accuracy                           0.90     31896
   macro avg       0.74      0.52      0.51     31896
weighted avg       0.87      0.90      0.85     31896



### What if we want more sophisticated metrics?

In [87]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [88]:
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName='areaUnderROC')

In [89]:
evaluator.evaluate(predictions)

0.7596398688697317

In [90]:
evaluator.setParams(metricName="precision")

BinaryClassificationEvaluator_6b2fe758c76a

In [91]:
# metricName was just set to "precision", which BinaryClassificationEvaluator
# does not support, so this evaluation is expected to fail. Catch and display
# the error so that "Run All" continues past this demonstration.
from pyspark.sql.utils import IllegalArgumentException

try:
    evaluator.evaluate(predictions)
except IllegalArgumentException as err:
    print(f"IllegalArgumentException: {err}")

IllegalArgumentException: 'BinaryClassificationEvaluator_6b2fe758c76a parameter metricName given invalid value precision.'

### `pyspark.ml.evaluation.BinaryClassificationEvaluator` supports only ROC AUC and PR AUC. What if we want more?

In [92]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [93]:
# Use the multiclass evaluator instead; it compares the "prediction" column with
# "target" and starts with the accuracy metric.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="target", metricName="accuracy")

In [94]:
evaluator.evaluate(predictions)

0.8970403812390269

In [95]:
evaluator = evaluator.setMetricName("weightedPrecision")

In [96]:
evaluator.evaluate(predictions)

0.8662299348750517

In [97]:
evaluator = evaluator.setMetricName("weightedRecall")

In [98]:
evaluator.evaluate(predictions)

0.8970403812390269

### Let's define a pipeline!

In [99]:
# Reload the prepared (id, comment_text, target) frame from the Parquet copy
# written earlier in the notebook.
dataset = spark.read.parquet("/user/pklemenkov/toxic")

In [100]:
dataset

DataFrame[id: string, comment_text: string, target: int]

In [101]:
dataset.rdd.getNumPartitions()

2

In [102]:
from pyspark.ml import Pipeline

In [103]:
# Chain the stages in execution order: tokenise -> hash -> logistic regression.
pipeline = Pipeline(stages=[tokenizer, hasher, lr])

In [104]:
# Sample each class with the same fraction (0.8). The seed is fixed so that the
# train/test split, and every metric computed from it, is reproducible across
# runs of the notebook.
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [105]:
# Test set: every row of dataset whose id does not appear in train (left anti join).
test = dataset.join(train, on="id", how="leftanti").cache()

In [106]:
pipeline_model = pipeline.fit(train)

In [107]:
pipeline_model

PipelineModel_713b1959a512

In [108]:
predictions = pipeline_model.transform(test)

In [109]:
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName='areaUnderROC')

In [110]:
evaluator.evaluate(predictions)

0.7637490823741493

### Okay, maybe some more sophisticated stuff?

In [111]:
from pyspark.ml.classification import GBTClassifier

In [112]:
# Gradient-boosted trees on the same hashed features and target:
# maxIter=20 (the fitted model reports 20 trees), each tree at most 3 levels deep.
gbt = GBTClassifier(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=20, maxDepth=3)

In [113]:
# Same preprocessing as before, with the GBT classifier as the final stage.
pipeline = Pipeline(stages=[tokenizer, hasher, gbt])

In [114]:
pipeline_model = pipeline.fit(train)

In [115]:
predictions = pipeline_model.transform(test)

In [116]:
evaluator.evaluate(predictions)

0.7262375658618181

### Let's add more degrees of freedom

In [117]:
pipeline_model.stages

[Tokenizer_9c6ed0d5b7bd,
 HashingTF_aa0e472ff83c,
 GBTClassificationModel (uid=GBTClassifier_acb7ec3cae27) with 20 trees]

In [118]:
# Refit the same pipeline, overriding the hasher's numFeatures with 1000 through
# the params argument of fit.
pipeline_model = pipeline.fit(train, params={hasher.numFeatures: 1000})

In [119]:
pipeline_model.stages[1].extractParamMap()

{Param(parent='HashingTF_aa0e472ff83c', name='binary', doc='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.'): True,
 Param(parent='HashingTF_aa0e472ff83c', name='numFeatures', doc='number of features.'): 1000,
 Param(parent='HashingTF_aa0e472ff83c', name='outputCol', doc='output column name.'): 'word_vector',
 Param(parent='HashingTF_aa0e472ff83c', name='inputCol', doc='input column name.'): 'words'}

In [120]:
predictions = pipeline_model.transform(test)

In [121]:
evaluator.evaluate(predictions)

0.8002614979295494

### Let's remove stopwords

In [122]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [123]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [124]:
# Filter the tokenizer's output against the default English stop-word list,
# writing the remaining tokens to "words_filtered".
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [125]:
# Redefine the hasher: 1000 features, reading the stop-word-filtered tokens.
# The output column name stays "word_vector".
hasher = HashingTF(numFeatures=1000, binary=True, inputCol=swr.getOutputCol(), outputCol="word_vector")

In [126]:
# tokenise -> remove stop words -> hash -> logistic regression.
pipeline = Pipeline(stages=[tokenizer, swr, hasher, lr])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model.stages

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### Need moar features!

In [127]:
import pyspark.sql.functions as f

In [128]:
dataset.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- target: integer (nullable = true)



In [129]:
# Add one hand-crafted numeric feature: the length of the comment text.
dataset = dataset.withColumn("comment_length", f.length(dataset.comment_text))

In [130]:
# Re-split now that dataset carries comment_length. The seed is fixed so that the
# split, and every metric computed from it, is reproducible across runs; test is
# every row whose id was not sampled into train.
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()
test = dataset.join(train, on="id", how="leftanti").cache()

In [None]:
train

### All features must be assembled into one column. `VectorAssembler` to the rescue!

In [131]:
# Concatenate the hashed word vector and comment_length into a single "features" column.
assembler = VectorAssembler(inputCols=[hasher.getOutputCol(), "comment_length"], outputCol="features")

In [132]:
# featuresCol is not passed, so the estimator uses its default input column --
# presumably "features", the assembler's output (the fitted model's
# numFeatures = 1001 is consistent with that).
lr = LogisticRegression(labelCol="target", maxIter=15)

In [133]:
# tokenise -> remove stop words -> hash -> assemble features -> logistic regression.
pipeline = Pipeline(stages=[tokenizer, swr, hasher, assembler, lr])

In [134]:
pipeline_model = pipeline.fit(train)

In [135]:
pipeline_model.stages

[Tokenizer_9c6ed0d5b7bd,
 StopWordsRemover_c2199e8b11e9,
 HashingTF_0ac9f4fe54d1,
 VectorAssembler_295fed63b300,
 LogisticRegressionModel: uid = LogisticRegression_780d2ad5e5fa, numClasses = 2, numFeatures = 1001]

In [136]:
predictions = pipeline_model.transform(test)

In [137]:
evaluator.evaluate(predictions)

0.8548010966885787

### Ok, how do you do it right!? 
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52557

### Very funny, anyway?

In [138]:
# Swap feature hashing for a CountVectorizer over the stop-word-filtered tokens;
# it writes to the same "word_vector" column and is likewise set to binary=True.
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", binary=True)

In [139]:
# Rebuild the assembler so it takes the CountVectorizer output together with comment_length.
assembler = VectorAssembler(inputCols=[count_vectorizer.getOutputCol(), "comment_length"], outputCol="features")

In [140]:
# tokenise -> remove stop words -> count-vectorise -> assemble features -> logistic regression.
pipeline = Pipeline(stages=[tokenizer, swr, count_vectorizer, assembler, lr])

In [141]:
pipeline_model = pipeline.fit(train)

In [142]:
predictions = pipeline_model.transform(test)

In [143]:
evaluator.evaluate(predictions)

0.9148326396758979

![](http://29.media.tumblr.com/tumblr_lltzgnHi5F1qzib3wo1_400.jpg)

### Hyperparameter tuning

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# 2 x 2 grid: vocabulary size for the CountVectorizer and regularisation
# strength for the logistic regression.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(count_vectorizer.vocabSize, [100, 500])
grid_builder.addGrid(lr.regParam, [0.01, 0.05])
paramGrid = grid_builder.build()

In [None]:
paramGrid

In [None]:
# 3-fold cross-validation over the parameter grid, fitting up to 4 candidate
# models in parallel. The seed is fixed so that the fold assignment, and hence
# avgMetrics and the selected best model, is reproducible across runs.
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=3, parallelism=4, seed=5757)

In [None]:
cv_model = crossval.fit(train)

In [None]:
cv_model.avgMetrics

In [None]:
cv_model.bestModel

In [None]:
predictions = cv_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

In [144]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()