# Apache Spark. Spark ML.

In [None]:
import os

# Export JAVA_HOME for this process; presumably so that the JVM started by
# Spark in the cells below uses this JDK (verify the path on your machine).
JAVA_HOME = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ.update(JAVA_HOME=JAVA_HOME)

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session (or reuse an already running one) in local mode.
spark = (
    SparkSession.builder
    .appName("my_spark")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [None]:
spark

In [None]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

## Ключевые понятия

* **Pipeline**  
Организация серии трансформеров и эстиматоров в единый конвейер

* **Estimator**
Алгоритм непосредственно обучения, возвращает модель, которая является трансформером

![](https://spark.apache.org/docs/latest/img/ml-Pipeline.png)

* **Transformer**  
Алгоритм, который трансформирует один датафрейм в другой на основе заданных правил. Это касается как предсказаний, так и фиче-инжиниринга

![](https://spark.apache.org/docs/latest/img/ml-PipelineModel.png)

In [None]:
from pyspark.ml.linalg import Vectors

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
train = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], schema = ["label", "features"])

In [None]:
train.show()

In [None]:
LogisticRegression?

In [None]:
lr = LogisticRegression(maxIter=10, regParam=0.01)

In [None]:
from pyspark.ml import Estimator, Transformer

In [None]:
isinstance(lr, Estimator)

In [None]:
type(lr)

In [None]:
lr.params

In [None]:
lr.getOrDefault("regParam")

In [None]:
train.rdd.getNumPartitions()

In [None]:
train.repartition(10).rdd.getNumPartitions()

In [None]:
model = lr.fit(train)

In [None]:
type(model)

In [None]:
isinstance(model, Transformer)

In [None]:
model.coefficients

In [None]:
model.interceptVector

In [None]:
predict = model.transform(train)

In [None]:
predict.printSchema()

In [None]:
predict.show(10, truncate=False, vertical=True)

In [None]:
model.getOrDefault("threshold")

## Токсичные комментарии!

Давайте изучим этот замечательный датасет!

Скачать его можно тут:
https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge

Давайте загрузим его в DataFrame. А как?

In [None]:
!head toxic/train.csv

schema = ...

In [None]:
schema = StructType([
    StructField("id", StringType()),
    StructField("comment_text", StringType()),
    StructField("toxic", IntegerType()),
    StructField("severe_toxic", IntegerType()),
    StructField("obscene", IntegerType()),
    StructField("threat", IntegerType()),
    StructField("insult", IntegerType()),
    StructField("identity_hate", IntegerType())
])

dataset = spark.read.csv...

In [None]:
spark.read.csv?

In [None]:
dataset = spark.read.csv("toxic/train.csv", schema=schema, header=True)
dataset.show(5)

In [None]:
dataset = spark.read.csv("toxic/train.csv", schema=schema, header=True, multiLine=True)

Некрасиво смотрится, давайте добавим форматирования!

In [None]:
dataset.show(5, False, True)

Ой, кажется, что-то пошло не так. Может, добавим какие-то опции?

In [None]:
!head toxic/train.csv

Давайте посмотрим опции в spark.read.csv?

In [None]:
spark.read.csv?

In [None]:
spark.read.csv("toxic/train.csv", schema=schema, 
                         header=True, multiLine=True).show()

In [None]:
# Re-read the CSV with the options needed for free-text comments:
#   multiLine=True - a quoted field is allowed to span several lines;
#   escape='"'     - a quote character inside a quoted field is escaped by
#                    another quote (i.e. "" stands for a literal ").
dataset = spark.read.csv("toxic/train.csv", schema=schema, 
                         header=True, multiLine=True, escape='"')

In [None]:
dataset.show(5, False, True)

In [None]:
dataset.select("id").show(100)

In [None]:
dataset.show(2, vertical=True, truncate=False)

In [None]:
dataset.rdd.getNumPartitions()

In [None]:
dataset.count()

In [None]:
# Redistribute the rows over 4 partitions and mark the result for caching,
# because the same DataFrame is queried repeatedly in the cells below.
dataset = dataset.repartition(4).cache()

In [None]:
dataset.count()

### Начнем с бинарной классификации (toxic/non-toxic)

Если вы пропустили импорты, то не забудьте про  
`from pyspark.sql import functions as f`

Давайте превратим большую шкалу в бинарную через функцию when

In [None]:
f.when?

In [None]:
dataset.printSchema()

In [None]:
# Binary label expression: 0 when every one of the six toxicity flags is 0,
# 1 otherwise. The columns are referenced through the `dataset` DataFrame
# object itself (dataset.<column>).
target = f.when(
    (dataset.toxic == 0) &
    (dataset.severe_toxic == 0) &
    (dataset.obscene == 0) &
    (dataset.threat == 0) &
    (dataset.insult == 0) &
    (dataset.identity_hate == 0),
    0
).otherwise(1)

In [None]:
# Binary label expression built from column *names* (f.col) rather than from
# attributes of one particular DataFrame, so it can be applied to any frame
# that has these six columns.
toxicity_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# AND together "<column> == 0" for every toxicity flag, left to right.
no_flags_set = f.col(toxicity_columns[0]) == 0
for column_name in toxicity_columns[1:]:
    no_flags_set = no_flags_set & (f.col(column_name) == 0)

# 0 when no flag is set, 1 otherwise.
target_other = f.when(no_flags_set, 0).otherwise(1)

In [None]:
target

In [None]:
target_other

In [None]:
my_dataset = spark.read.csv("toxic/train.csv", schema=schema, 
                         header=True, multiLine=True, escape='"')

In [None]:
my_dataset.withColumn("target", target_other).show()

In [None]:
dataset = dataset.withColumn("target", target)

In [None]:
dataset.select("id", "target").show(10)

In [None]:
targets = dict(dataset.groupBy("target").count().collect())

In [None]:
targets

In [None]:
dataset.filter(f.col("target") == 1).select("comment_text").show(5,False,True)

In [None]:
targets[1] / (targets[0] + targets[1])

In [None]:
dataset = dataset.drop("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate").cache()

In [None]:
dataset

In [None]:
dataset.show(2, False, True)

In [None]:
dataset.write.parquet("toxic_dataset", mode="overwrite")

In [None]:
dataset.printSchema()

In [None]:
dataset = spark.read.parquet("toxic_dataset")

###  Построим простую бинарную логистическую регрессию

In [None]:
from pyspark.ml.feature import *

### Займемся токенами!

In [None]:
Tokenizer?

Нам надо создать токенайзер и натравить его на поле с комментариями, создать с его помощью новый датасет. Если забыли, то вам нужна операция transform

In [None]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [None]:
dataset2 = tokenizer.transform(dataset)

In [None]:
dataset2.select("id", "words").show(2, False, True)

### Конвертируем наши слова в вектора с помощью hashing trick

А кстати, что это?

In [None]:
HashingTF?

Допустим, количество фичей у нас 100. Классификация, напомним, у нас бинарная.

In [None]:
hasher = HashingTF(numFeatures=100, binary=True, inputCol=tokenizer.getOutputCol(), outputCol="word_vector")

In [None]:
dataset2 = hasher.transform(dataset2)

In [None]:
dataset2.select("id", "word_vector").show(2, False, True)

### Настала пора разбить на тест и трейн!
Классы у нас несбалансированные, поэтому надо подкрутить параметры

In [None]:
dataset2.sampleBy?

In [None]:
# Stratified sample keyed by "target": each class is sampled with fraction 0.8,
# so the class ratio is roughly preserved. The fixed seed makes the sample repeatable.
train = dataset2.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=42)

In [None]:
train_targets = dict(train.groupby("target").count().collect())

In [None]:
train_targets

In [None]:
train_targets[1] / (train_targets[0] + train_targets[1])

А как можно с помощью join взять оставшиеся данные?

In [None]:
dataset2.join?

In [None]:
# Left anti join on "id": keep only the rows of dataset2 whose id is absent
# from train, i.e. everything that was not sampled into the training set.
test = dataset2.join(train, on="id", how="leftanti")

In [None]:
test_targets = dict(test.groupby("target").count().collect())

Сохранилось ли распределение?

In [None]:
test_targets[1] / (test_targets[0] + test_targets[1])

In [None]:
train.rdd.getNumPartitions()

In [None]:
test.rdd.getNumPartitions()

In [None]:
train = train.drop("comment_text", "words").cache()

In [None]:
test = test.drop("comment_text", "words").coalesce(4).cache()

In [None]:
test.rdd.getNumPartitions()

### Настало время для фита!

Начнем с логистической регрессии. Ограничимся 15 итерациями, чтобы не усложнять себе жизнь.

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
LogisticRegression?

In [None]:
hasher.getOutputCol()

In [None]:
lr = LogisticRegression(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=15)

Зафитим!

In [None]:
lr_model = lr.fit(train)

In [None]:
lr_model?

Запредиктим!

In [None]:
predictions = lr_model.transform(test)

In [None]:
predictions?

In [None]:
predictions.printSchema()

In [None]:
predictions.select("id", "target", "prediction", "probability", "rawPrediction").show(10, False, True)

А если хотим Accuracy посчитать?

In [None]:
true_predictions = predictions.select("target", f.col("prediction").cast("int")).filter("target == prediction").count()

In [None]:
true_predictions

In [None]:
# Accuracy = number of rows where prediction equals target / total number of predicted rows.
print(f"Accuracy is {true_predictions / predictions.count()}")

### Это хорошо? Может быть...

In [None]:
predictions.select("target", f.col("prediction").cast("int"))\
           .filter((f.col("target") == 1) & (f.col("prediction") == f.col("target")))\
           .count()

In [None]:
predictions.printSchema()

In [None]:
predictions_pd = predictions.select("target", f.col("prediction").cast("int")).toPandas()

In [None]:
predictions_pd.head()

In [None]:
lr.getOrDefault("threshold")

In [None]:
!pip install scikit-learn
from sklearn.metrics import classification_report, precision_score

In [None]:
print(classification_report(predictions_pd.target, predictions_pd.prediction))

In [None]:
train.count()

### А если хочется что-то посложнее?

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
BinaryClassificationEvaluator?

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions)

In [None]:
evaluator.setParams(metricName="precision")

In [None]:
evaluator.evaluate(predictions)

### Упс :(

`spark.ml.evaluation.BinaryClassificationEvaluator` supports only ROC AUC and PR AUC.

### Так а если хотим больше?
Давайте немного поколхозим!

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
MulticlassClassificationEvaluator?

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="target", 
                                              metricName="accuracy")

In [None]:
evaluator.evaluate(predictions)

А если weightedPrecision или weightedRecall?

In [None]:
evaluator?

In [None]:
evaluator = evaluator.setMetricName("weightedPrecision")

In [None]:
evaluator.evaluate(predictions)

In [None]:
evaluator = evaluator.setMetricName("weightedRecall")

In [None]:
evaluator.evaluate(predictions)

### А теперь все вместе!

In [None]:
dataset = spark.read.parquet("toxic_dataset")

In [None]:
!ls toxic_dataset

In [None]:
dataset

In [None]:
dataset.rdd.getNumPartitions()

In [None]:
from pyspark.ml import Pipeline

Соберем пайплайн!

In [None]:
Pipeline?

In [None]:
pipeline = Pipeline(stages=[
    tokenizer,
    hasher,
    lr
])

Разбиваем на трейн и тест. Снова :)

In [None]:
# Stratified sample with fraction 0.8 per class. The seed is fixed (same value
# as in the first split of this notebook) so that the train/test split, and
# therefore every metric computed below, is reproducible between runs.
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=42).cache()

In [None]:
test = dataset.join(train, on="id", how="leftanti").cache()

Фит и предикт. Снова :)

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model

In [None]:
predictions = pipeline_model.transform(test)

Эвалуатор на ваш вкус

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions)

### Так, а есть что-то позаковыристее?

In [None]:
from pyspark.ml.classification import GBTClassifier

In [None]:
GBTClassifier?

In [None]:
gbt = GBTClassifier(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=20, maxDepth=3)

Давайте добавим в пайплайн!

In [None]:
pipeline = Pipeline(stages=[
    tokenizer,
    hasher,
    gbt
])

Фит, предикт, проверка

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### Добавим степеней свободы
Допустим, теперь мы хотим увеличить количество фичей в хэшировании

In [None]:
pipeline_model.stages

In [None]:
pipeline.fit?

In [None]:
hasher?

In [None]:
pipeline_model = pipeline.fit(train, params={hasher.numFeatures: 1000})

In [None]:
pipeline_model.stages[1].extractParamMap()

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### Почистим от стопслов
Давайте почистим датасет от мусорных слов!

In [None]:
StopWordsRemover?

In [None]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [None]:
stop_words

In [None]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", 
                       stopWords=stop_words)

In [None]:
hasher = HashingTF(numFeatures=1000, binary=True, inputCol=swr.getOutputCol(), 
                   outputCol="word_vector")

А куда мы его в пайплайне должны поместить?

In [None]:
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    hasher,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model.stages

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### Даешь еще фичей!

In [None]:
dataset.printSchema()

А давайте добавим длину комментария?

In [None]:
dataset = dataset.withColumn("comment_length", f.length(dataset.comment_text))

In [None]:
dataset.show()

In [None]:
# The previous train/test splits were cached; they are about to be replaced,
# so release their cached data instead of leaving it in storage.
train.unpersist()
test.unpersist()

# Re-split the dataset that now carries "comment_length". The fixed seed keeps
# the split reproducible between runs.
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=42).cache()
# Everything whose id did not land in train becomes the test set.
test = dataset.join(train, on="id", how="leftanti").cache()

### Не забываем, что фичи должны быть в одной колонке. `VectorAssembler` нам поможет!

In [None]:
VectorAssembler?

In [None]:
assembler = VectorAssembler(inputCols=[hasher.getOutputCol(), "comment_length"], outputCol="features")

In [None]:
lr = LogisticRegression(labelCol="target", maxIter=15)

In [None]:
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    hasher,
    assembler,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model.stages

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

In [None]:
pipeline_model.stages[-1].coefficients[-1]

### А что там вообще другие придумали? 
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52557

In [None]:
CountVectorizer?

In [None]:
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", binary=True)

In [None]:
assembler = VectorAssembler(inputCols=[count_vectorizer.getOutputCol(), "comment_length"], outputCol="features")

In [None]:
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer,
    assembler,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### Что ж это за ML без тюнинга гиперпараметров?

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
CrossValidator?

In [None]:
ParamGridBuilder?

In [None]:
paramGrid = ParamGridBuilder().addGrid(count_vectorizer.vocabSize, [100, 500])\
                              .addGrid(lr.regParam, [0.01, 0.05])\
                              .build()

In [None]:
paramGrid

In [None]:
# 3-fold cross-validation of the whole pipeline over paramGrid, evaluating up
# to 4 parameter combinations in parallel. The seed is fixed so that the random
# assignment of rows to folds (and hence avgMetrics / the selected best model)
# is reproducible between runs.
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=3, parallelism=4, seed=42)

In [None]:
cv_model = crossval.fit(train)

In [None]:
cv_model.avgMetrics

In [None]:
cv_model.bestModel

In [None]:
predictions = cv_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

### А если я хочу больше разных моделей?

#### 1. Обучим логистическую регрессию локально (sklearn)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [None]:
X, y = make_classification(random_state=42)

In [None]:
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
est = LogisticRegression(random_state=42)

In [None]:
est.fit(X_train, y_train)

#### 2. Теперь уложим это в Spark

In [None]:
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import VectorUDT

Spark требует немного другой формат. Конвертируем!

In [None]:
schema = StructType(fields=[
    StructField("features", VectorUDT()),
    StructField("label", IntegerType())
])

In [None]:
# Pair every test row (wrapped in a DenseVector) with its label converted to a
# plain Python int, and load the pairs into a Spark DataFrame.
df_test = spark.createDataFrame(
    [(DenseVector(row), int(label)) for row, label in zip(X_test, y_test)],
    schema=schema,
)

In [None]:
df_test.show()

Добавим pandas_udf

In [None]:
est_broadcast = spark.sparkContext.broadcast(est)

In [None]:
import pandas as pd

In [None]:
@f.pandas_udf(FloatType())
def predict(features):
    """First attempt at a pandas UDF that scores rows with the broadcast estimator.

    NOTE(review): the notebook text following the cells that call this version
    indicates that it does not work as-is and is reworked further down; treat
    it as a demonstration step rather than the final implementation.
    """
    # est_broadcast.value is the estimator object that was broadcast earlier.
    predictions = est_broadcast.value.predict(features)
    # Wrap the estimator output in a pandas Series, as returned by this UDF.
    return pd.Series(predictions)

In [None]:
df_test.withColumn("prediction", predict("features")).show()

Ой, в PyArrow нет векторов :(

In [None]:
from pyspark.ml.functions import vector_to_array

In [None]:
df_test.withColumn("prediction", predict(vector_to_array("features"))).show()

И еще немного напильника...

In [None]:
@f.pandas_udf(FloatType())
def predict(series):
    """Score a batch of feature arrays with the broadcast estimator.

    The incoming pandas Series is converted to a plain list before it is handed
    to the estimator's ``predict``; the result is returned as a pandas Series.
    """
    model = est_broadcast.value
    batch = series.tolist()
    return pd.Series(model.predict(batch))

In [None]:
df_test.withColumn("prediction", predict(vector_to_array("features"))).show()

In [None]:
spark.stop()