In [1]:
!pip install scikit-learn



In [2]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [3]:
# iris datasets 로딩
iris = load_iris()

iris_data  = iris.data # feature
iris_label = iris.target # label

iris_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns)
iris_pdf['target'] = iris_label
iris_pdf

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [4]:
iris_pdf.to_csv("./data/iris.csv", index=False)

In [5]:
from sklearn.model selection import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0,
    random_state=42
)

SyntaxError: invalid syntax (4015346820.py, line 1)

# Spark ML 만들기

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tree_clf").getOrCreate()
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/29 17:54:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
iris_filepath = "/home/ubuntu/working/spark/data/iris.csv"

iris_sdf = spark.read.csv(f"file://{iris_filepath}", inferSchema=True, header=True)
iris_sdf.show(5)

                                                                                

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|     0|
|         4.9|        3.0|         1.4|        0.2|     0|
|         4.7|        3.2|         1.3|        0.2|     0|
|         4.6|        3.1|         1.5|        0.2|     0|
|         5.0|        3.6|         1.4|        0.2|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows



In [8]:
train_sdf, test_sdf = iris_sdf.randomSplit([0.8, 0.2], seed=42)

In [9]:
from pyspark.ml.feature import VectorAssembler

# 합쳐질 컬럼의 목록
iris_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# 열벡터를 행벡터로 합쳐주는 역할을 하는 Transformer
vec_assembler = VectorAssembler(inputCols=iris_columns, outputCol="features")

In [10]:
train_feature_vector_sdf = vec_assembler.transform(train_sdf)
train_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.3|        3.0|         1.1|        0.1|     0|[4.3,3.0,1.1,0.1]|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|
|         4.4|        3.2|         1.3|        0.2|     0|[4.4,3.2,1.3,0.2]|
|         4.5|        2.3|         1.3|        0.3|     0|[4.5,2.3,1.3,0.3]|
|         4.6|        3.1|         1.5|        0.2|     0|[4.6,3.1,1.5,0.2]|
+------------+-----------+------------+-----------+------+-----------------+
only showing top 5 rows



In [11]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    featuresCol = "features",
    labelCol = "target",
    maxDepth = 5
)

type(dt)

pyspark.ml.classification.DecisionTreeClassifier

In [12]:
dt_model = dt.fit(train_feature_vector_sdf)
type(dt_model)

pyspark.ml.classification.DecisionTreeClassificationModel

In [13]:
test_feature_vector_sdf = vec_assembler.transform(test_sdf)
test_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|
+------------+-----------+------------+-----------+------+-----------------+
only showing top 5 rows



In [14]:
predictions = dt_model.transform(test_feature_vector_sdf)
predictions.show(5)

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+------------+-----------+------------+-----------+------+------

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol='target',
    predictionCol='prediction',
    metricName='accuracy'
)

In [16]:
accuracy = evaluator_accuracy.evaluate(predictions)
accuracy

1.0

In [39]:
# 숙제 - LogisticRegression 사용하기

# 합쳐질 컬럼의 목록
iris_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# 열벡터를 행벡터로 합쳐주는 역할을 하는 Transformer
vec_assembler2 = VectorAssembler(inputCols=iris_columns, outputCol="features")

train_feature_vector_sdf2 = vec_assembler2.transform(train_sdf)

from pyspark.ml.classification import LogisticRegression
# 훈련 세트 변환
dt2 = LogisticRegression(
    featuresCol = "features",
    labelCol = "labels",
    maxIter=5
)
type(dt2)

pyspark.ml.classification.LogisticRegression

In [38]:
# 모델 훈련
dt2_model = dt2.fit(train_feature_vector_sdf2)
type(dt2_model)

IllegalArgumentException: labels does not exist. Available: sepal_length, sepal_width, petal_length, petal_width, target, features

In [None]:
# 테스트 데이터 예측
test_feature_vector_sdf2 = vec_assembler.transform(test_sdf)
test_feature_vector_sdf2.show(5)

In [None]:
# 테스트 세트 예측
predictions = dt_model.transform(test_feature_vector_sdf2)
predictions.show(5)

In [None]:
# 모델 평가
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol='target',
    predictionCol='prediction',
    metricName='accuracy'
)

In [None]:
accuracy = evaluator_accuracy.evaluate(predictions)
accuracy

In [40]:
spark.stop()