#  Carregando os dados e transformando em DataFrame

Configuração do ambiente PySpark

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=7d8c52099bacb9bf798087c0f068ba776d79e9be34c81caa7564adf8216c5558
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


Importando as bibliotecas

In [2]:
from sklearn.datasets import load_wine
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs as spark_abs
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

Iniciar a sessão do Spark

In [3]:
spark = SparkSession.builder.appName("WineDataset").getOrCreate()

Carregar o dataset Wine

In [4]:
wine_data = load_wine()

wine_df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)

Adicionar coluna Qualidade

In [5]:
wine_df['quality'] = wine_data.target

Exibir os valores distintos da coluna de qualidade

In [6]:
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,quality
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


Converter para um DataFrame do Spark

In [7]:
spark_df = spark.createDataFrame(wine_df)

In [8]:
print(spark.sparkContext.appName)

WineDataset


In [9]:
spark_df.show(5)

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-------+
|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|quality|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-------+
|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|      0|
|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|                         3.4| 1050.0|      0|
|  13.16|      2.36|2.67|             18.6|    101.0|          2.8|      3.24|            

# Tratamento dos dados

In [10]:
center_stat = 'median'

Definir as colunas de características

In [11]:
all_cols = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash']

Montar o vetor de características

In [12]:
assembler = VectorAssembler(inputCols=all_cols, outputCol="features")

Aplicar o assembler ao DataFrame

In [13]:
df_features = assembler.transform(spark_df)

Normalizar as características

In [14]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_scaled = scaler_model.transform(df_features)

Selecionar apenas a coluna normalizada e a variável target (quality)

In [15]:
final_df = df_scaled.select("scaledFeatures", "quality")
final_df.show(5)

+--------------------+-------+
|      scaledFeatures|quality|
+--------------------+-------+
|[17.5283750084766...|      0|
|[16.2596310690015...|      0|
|[16.2103594597015...|      0|
|[17.7008256410266...|      0|
|[16.3089026783015...|      0|
+--------------------+-------+
only showing top 5 rows



# Dividindo os dados em treino e teste

Divisão dos dados em 80% treino e 20% teste

In [16]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

# Treinando o modelo de Regressão Logística

Inicializando o modelo de regressão logística

In [17]:
logreg = LogisticRegression(featuresCol="scaledFeatures", labelCol="quality")

Treinando o modelo

In [18]:
logreg_model = logreg.fit(train_df)

Avaliando o modelo no conjunto de teste

In [19]:
predictions = logreg_model.transform(test_df)

Avaliando a acurácia

In [20]:
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [21]:
print(f"Acurácia no conjunto de teste (Regressão Logística): {accuracy:.2f}")

Acurácia no conjunto de teste (Regressão Logística): 0.82


# Avaliação de Métricas e Matriz de Confusão

Gerar previsões no conjunto de teste

In [22]:
predictions.select("quality", "prediction").show(10)

+-------+----------+
|quality|prediction|
+-------+----------+
|      1|       1.0|
|      1|       1.0|
|      1|       1.0|
|      1|       1.0|
|      1|       1.0|
|      0|       0.0|
|      0|       2.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
+-------+----------+
only showing top 10 rows



Calcular a matriz de confusão

In [23]:
predictions.groupBy("quality", "prediction").count().show()

+-------+----------+-----+
|quality|prediction|count|
+-------+----------+-----+
|      0|       0.0|   11|
|      1|       1.0|   12|
|      0|       2.0|    1|
|      1|       2.0|    1|
|      2|       2.0|    5|
|      2|       1.0|    4|
+-------+----------+-----+

