In [1]:
import pyspark as spark
import pandas as pd
from pyspark.sql import SparkSession
import numpy as np

In [5]:
sc = SparkSession.builder.getOrCreate()

iris = sc.read.csv(
    r"G:\Meu Drive\Data Science\Dados\Classificação\Iris\Iris.csv",
    header = True
)

iris.show(3)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 3 rows



In [7]:
iris.groupBy("Species").count().show()

+---------------+-----+
|        Species|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



# Elaboração de uma Pipeline de Aprendizado de Máquina

In [19]:
# Altera o tipo de dado de uma coluna, aceita como argumento os tipos integer, double, string, etc.
iris = iris.withColumn("SepalLengthCm", iris.Id.cast("double"))
iris = iris.withColumn("SepalWidthCm", iris.Id.cast("double"))
iris = iris.withColumn("PetalLengthCm", iris.Id.cast("double"))
iris = iris.withColumn("PetalWidthCm", iris.Id.cast("double"))

# Para exemplificação queremos apenas duas espécies de Iris para criar um classificador de binário:
iris = iris.filter((iris.Species == "Iris-virginica") | (iris.Species == "Iris-versicolor"))
iris.show(3)

+---+-------------+------------+-------------+------------+---------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|
+---+-------------+------------+-------------+------------+---------------+
| 51|         51.0|        51.0|         51.0|        51.0|Iris-versicolor|
| 52|         52.0|        52.0|         52.0|        52.0|Iris-versicolor|
| 53|         53.0|        53.0|         53.0|        53.0|Iris-versicolor|
+---+-------------+------------+-------------+------------+---------------+
only showing top 3 rows



In [20]:
iris

DataFrame[Id: int, SepalLengthCm: double, SepalWidthCm: double, PetalLengthCm: double, PetalWidthCm: double, Species: string]

## Criando uma Pipeline

In [28]:
training.show()

+---+-------------+------------+-------------+------------+---------------+-------------+-------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|Species_Index| Species_Fact|            features|
+---+-------------+------------+-------------+------------+---------------+-------------+-------------+--------------------+
| 52|         52.0|        52.0|         52.0|        52.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[52.0,52.0,52.0,5...|
| 56|         56.0|        56.0|         56.0|        56.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[56.0,56.0,56.0,5...|
| 57|         57.0|        57.0|         57.0|        57.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[57.0,57.0,57.0,5...|
| 58|         58.0|        58.0|         58.0|        58.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[58.0,58.0,58.0,5...|
| 60|         60.0|        60.0|         60.0|        60.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[60.0,60.0,60.0,6...|


In [42]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
import pyspark.ml.evaluation as evals
import pyspark.ml.tuning as tune

species_indexer = StringIndexer(inputCol = "Species", outputCol = "Species_Index")
species_encoder = OneHotEncoder(inputCol = "Species_Index", outputCol = "Species_Fact")
vec_assembler = VectorAssembler(inputCols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"], outputCol = "features")

iris_pipe = Pipeline(stages = [species_indexer, species_encoder, vec_assembler])

piped_data = iris_pipe.fit(iris).transform(iris)
training, test = piped_data.randomSplit([0.6, 0.4])

lr = LogisticRegression()

evaluator = evals.BinaryClassificationEvaluator(metricName = "areaUnderROC")

grid = tune.ParamGridBuilder()
grid = grid.addGrid(lr.regParam, np.arange(0, 0.1, 0.01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

grid = grid.build()

cv = tune.CrossValidator(
    estimator = lr,
    estimatorParamMaps = grid,
    evaluator = evaluator
)

models = cv.fit(training)
best_lr = models.bestModel

test_results = best_lr.transform(test)

print(evaluator.evaluate(test_results))

IllegalArgumentException: label does not exist. Available: Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species, Species_Index, Species_Fact, features, CrossValidator_a660479720dc_rand

In [38]:
training.show()

+---+-------------+------------+-------------+------------+---------------+-------------+-------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|        Species|Species_Index| Species_Fact|            features|
+---+-------------+------------+-------------+------------+---------------+-------------+-------------+--------------------+
| 51|         51.0|        51.0|         51.0|        51.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[51.0,51.0,51.0,5...|
| 53|         53.0|        53.0|         53.0|        53.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[53.0,53.0,53.0,5...|
| 54|         54.0|        54.0|         54.0|        54.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[54.0,54.0,54.0,5...|
| 57|         57.0|        57.0|         57.0|        57.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[57.0,57.0,57.0,5...|
| 58|         58.0|        58.0|         58.0|        58.0|Iris-versicolor|          0.0|(1,[0],[1.0])|[58.0,58.0,58.0,5...|


https://www.youtube.com/watch?v=EMACjF6eCU4&ab_channel=Stack retomar no tempo 17min 

https://www.youtube.com/watch?v=8esz7IWSbMM&ab_channel=AprenderDados%7CBernardoCambruzzi assistir

https://community.cloud.databricks.com/?o=4566189763914988#notebook/3855061166299265 notebook no databricks

https://www.youtube.com/playlist?list=PLIHpLBNsiHE3Zmdc8Hc8H8n8TVpmrb-fp playlist de engenharia de dados.