In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession\
.builder.\
appName("python spark sql example")\
.config("spark.some.config.option","some-value")\
.getOrCreate()

# K-means

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [5]:
dataset=spark.read.format("libsvm")\
        .load("data/mllib/sample_kmeans_data.txt")

In [7]:
dataset.toPandas()

  return f(*args, **kwds)


Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0)"
1,1.0,"(0.1, 0.1, 0.1)"
2,2.0,"(0.2, 0.2, 0.2)"
3,3.0,"(9.0, 9.0, 9.0)"
4,4.0,"(9.1, 9.1, 9.1)"
5,5.0,"(9.2, 9.2, 9.2)"


In [8]:
kmeans=KMeans().setK(2).setSeed(1)

In [9]:
model=kmeans.fit(dataset)

In [10]:
predictions=model.transform(dataset)

In [12]:
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         0|
|  1.0|(3,[0,1,2],[0.1,0...|         0|
|  2.0|(3,[0,1,2],[0.2,0...|         0|
|  3.0|(3,[0,1,2],[9.0,9...|         1|
|  4.0|(3,[0,1,2],[9.1,9...|         1|
|  5.0|(3,[0,1,2],[9.2,9...|         1|
+-----+--------------------+----------+



In [13]:
evaluator=ClusteringEvaluator()

In [14]:
silhouette=evaluator.evaluate(predictions)

In [15]:
centers=model.clusterCenters()

In [16]:
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

# LDA

In [17]:
from pyspark.ml.clustering import LDA

In [18]:
dataset=spark.read.format("libsvm").load("data/mllib/sample_lda_libsvm_data.txt")

In [19]:
lda=LDA(k=10,maxIter=10)

In [20]:
model=lda.fit(dataset)

In [21]:
l1=model.logLikelihood(dataset)

In [22]:
lp=model.logPerplexity(dataset)

In [23]:
l1

-809.9491114420002

In [24]:
lp

3.1151887994351455

In [25]:
topics=model.describeTopics(3)

In [26]:
topics.show(truncate=False)

+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[1, 9, 6]  |[0.15898107962551689, 0.14332873809725274, 0.14164510760465782]|
|1    |[9, 6, 1]  |[0.0982169236914809, 0.0976720937820018, 0.09675132445473691]  |
|2    |[2, 8, 0]  |[0.10322075138399164, 0.09644968388549223, 0.09526382638806317]|
|3    |[5, 6, 1]  |[0.10216057971962017, 0.09998730023358514, 0.09736255104804063]|
|4    |[4, 7, 0]  |[0.10048650839197107, 0.09908577387207776, 0.0984356927632761] |
|5    |[0, 10, 6] |[0.10280187491855991, 0.09745310091390183, 0.09679441675785176]|
|6    |[5, 3, 7]  |[0.09815621701211966, 0.09556837776187276, 0.09441326891952787]|
|7    |[4, 9, 5]  |[0.14232236919457397, 0.1281871284393251, 0.12512021817654992] |
|8    |[2, 5, 1]  |[0.11096477841785449, 0.10076827076385143, 0.095482304607

In [27]:
transformed=model.transform(dataset)

In [28]:
transformed.show(truncate=False)

+-----+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                      |
+-----+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.9569438264303067,0.0047741953287092275,0.004774287996138813,0.

# Bisecting k-means

In [29]:
from pyspark.ml.clustering import BisectingKMeans

In [30]:
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")


In [31]:
bkm=BisectingKMeans().setK(2).setSeed(1)

In [32]:
model=bkm.fit(dataset)

In [33]:
cost = model.computeCost(dataset)

In [35]:
model.transform(dataset).show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         0|
|  1.0|(3,[0,1,2],[0.1,0...|         0|
|  2.0|(3,[0,1,2],[0.2,0...|         0|
|  3.0|(3,[0,1,2],[9.0,9...|         1|
|  4.0|(3,[0,1,2],[9.1,9...|         1|
|  5.0|(3,[0,1,2],[9.2,9...|         1|
+-----+--------------------+----------+



In [36]:
centers=model.clusterCenters()

In [37]:
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

# GMM

In [38]:
from pyspark.ml.clustering import GaussianMixture

In [39]:
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

In [40]:
gmm=GaussianMixture().setK(2).setSeed(787328323)

In [41]:
model=gmm.fit(dataset)

In [42]:
model.gaussiansDF.show(truncate=False)

+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean                                                         |cov                                                                                                                                                                                                     |
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.10000000000001552,0.10000000000001552,0.10000000000001552]|0.006666666666806454  0.006666666666806454  0.006666666666806454  
0.006666666666806454  0.006666666666806454  0.006666666666806454  
0.006666