MLlib 패키지가 RDD 기반의 머신 러닝을 지원하는 패키지라면, ML 패키지는 데이터프레임 기반의 머신 러닝을 지원하는 패키지이다. Spark ML의 정식 명칭은 'MLlib DataFrame-based API'이다. 데이터프레임은 RDD에 비해 데이터 로딩, 실행 계획 최적화, 언어 간 API 통일성 면에서 장점이 있으므로, Spark 2 버전 기준으로는 데이터프레임 기반 API가 머신 러닝을 위한 주요 API이다.

즉, pyspark.mllib는 오직 pyspark.RDD에서만 사용할 수 있고, pyspark.ml 클래스는 오직 pyspark.sql.DataFrame에서만 사용할 수 있다.

In [10]:
import findspark

In [11]:
# Let findspark locate the Spark installation so that `import pyspark` works.
findspark.init()

In [12]:
import pyspark

In [13]:
from pyspark import SparkConf, SparkContext

# Configure a Spark application named 'appName' that runs on the local master.
conf = SparkConf().setAppName('appName').setMaster('local')
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple
# SparkContexts at once". In that case the existing context's settings are
# kept and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [14]:
from pyspark.mllib.classification import SVMModel, SVMWithSGD

In [15]:
from pyspark.mllib.regression import LabeledPoint

In [16]:
# Toy training set of (label, single feature) pairs: label 0.0 for feature
# 0.0 and label 1.0 for features 1.0, 2.0 and 3.0.
data = [
    LabeledPoint(label, [feature])
    for label, feature in ((0.0, 0.0), (1.0, 1.0), (1.0, 2.0), (1.0, 3.0))
]

# Train a linear SVM with 10 SGD iterations on the distributed training set,
# then classify one local feature vector.
svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)
svm.predict([1.0])

1

In [17]:
# predict() is given an RDD of feature vectors here; collect() brings the
# per-row predictions back to the driver as a list.
svm.predict(sc.parallelize([[1.0]])).collect()

[1]

In [34]:
from pyspark.sql.functions import array
import numpy as np

In [38]:
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel

In [41]:
# Four 2-D points, reshaped from a flat list into a (4, 2) ndarray.
# NOTE(review): numpy's array is used on purpose; the `array` imported from
# pyspark.sql.functions builds a DataFrame column, not an ndarray, so it
# cannot be reshaped - confirm against the pyspark.sql.functions docs.
data = np.array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
bskm = BisectingKMeans()
# The points are distributed over 2 partitions; k=4 asks for as many
# clusters as there are input points.
model = bskm.train(sc.parallelize(data, 2), k=4)

In [43]:
# Ask the bisecting k-means model which cluster index a single point gets.
p = np.array([0.0, 0.0])
model.predict(p)

0

In [44]:
# Number of clusters reported by the fitted model.
model.k

4

In [45]:
# Clustering cost of the single point p under the fitted model.
model.computeCost(p)

0.0

In [47]:
# Four 2-D points written as a nested literal, one row per point.
data = np.array([
    [0.0, 0.0],
    [1.0, 1.0],
    [9.0, 8.0],
    [8.0, 9.0],
])

In [50]:
from pyspark.mllib.clustering import KMeansModel, KMeans

In [51]:
# Fit k-means with two clusters on the distributed points. Initial centers
# are chosen with the "random" mode, and seed=50 pins that random choice.
model = KMeans.train(
    sc.parallelize(data),
    k=2,
    maxIterations=10,
    initializationMode="random",
    seed=50,
    initializationSteps=5,
    epsilon=1e-4,
)

In [53]:
# Check that (0, 0) and (1, 1) are assigned to the same cluster.
model.predict(np.array([0.0, 0.0])) == model.predict(np.array([1.0, 1.0]))

True

In [60]:
# Check that (2, 8), which is not one of the training points, is assigned to
# the same cluster as the training point (9, 8).
model.predict(np.array([2.0, 8.0])) == model.predict(np.array([9.0, 8.0]))

True

In [61]:
# Number of clusters reported by the fitted k-means model.
model.k

2

In [62]:
# K-means cost of the model evaluated over the whole training set.
model.computeCost(sc.parallelize(data))

2.0

In [68]:
from pyspark.mllib.clustering import StreamingKMeansModel

In [69]:
# Seed a streaming k-means model with two centers of weight 1.0 each.
initCenters = [[0.0, 0.0], [1.0, 1.0]]
initWeights = [1.0, 1.0]
stkm = StreamingKMeansModel(clusterCenters=initCenters,
                            clusterWeights=initWeights)

# First batch: two points around each of the initial centers.
data = sc.parallelize([
    [-0.1, -0.1],
    [0.1, 0.1],
    [0.9, 0.9],
    [1.1, 1.1],
])

In [70]:
# Fold the first batch into the model with a decay factor of 1.0, with the
# decay time unit set to "batches".
stkm = stkm.update(data, decayFactor=1.0, timeUnit="batches")

In [71]:
# Cluster centers after the first update.
stkm.centers

array([[0., 0.],
       [1., 1.]])

In [72]:
# Cluster index assigned to a point close to the first center.
stkm.predict([-0.1, -0.1])

0

In [73]:
# Cluster index assigned to a point close to the second center.
stkm.predict([0.9, 0.9])

1

In [74]:
# Per-cluster weights after the first update.
stkm.clusterWeights

[3.0, 3.0]

In [75]:
# Decay factor for the next streaming update.
# NOTE(review): 0.0 presumably makes the model forget its previous centers
# and weights entirely - confirm against the StreamingKMeansModel.update docs.
decayFactor = 0.0

In [79]:
from pyspark.ml.linalg import DenseVector
# Second batch, passed as plain Python lists.
# NOTE(review): the DenseVector variant was abandoned - presumably because
# pyspark.ml.linalg vectors are not accepted by pyspark.mllib models
# (pyspark.mllib.linalg would be the matching package); confirm.
data = sc.parallelize([[1.5, 1.5], [0.2, 0.2]])

In [80]:
# Fold the second batch into the model. The decay factor comes from the
# `decayFactor` variable set in the earlier cell instead of a repeated
# literal, so changing that variable actually changes this update.
stkm = stkm.update(data, decayFactor, "batches")

In [81]:
# Cluster centers after the second update.
stkm.centers

array([[0.2, 0.2],
       [1.5, 1.5]])

In [82]:
# Per-cluster weights after the second update.
stkm.clusterWeights

[1.0, 1.0]

In [83]:
# Cluster index assigned to the batch point (0.2, 0.2).
stkm.predict([0.2, 0.2])

0

In [84]:
# Cluster index assigned to the batch point (1.5, 1.5).
stkm.predict([1.5, 1.5])

1