In [1]:
# Import SparkSession explicitly: nothing else in this notebook brings the name
# into scope, so on a plain Python kernel this cell would raise NameError.
# NOTE(review): presumably the original kernel was launched through the pyspark
# shell, which pre-imports it - confirm.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that every cell below relies on.
spark = SparkSession.builder.appName("pysparkClustering").getOrCreate()

In [2]:
# Read the sales CSV; the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'vgsales.csv',
    header='true',
    inferSchema=True,
)

In [3]:
# Show the column names of the loaded DataFrame.
df.columns

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales']

In [3]:
from pyspark.ml.feature import VectorAssembler

# Combine the three regional sales columns into a single vector column named
# 'features', which the clustering estimators below read by default.
assembler = VectorAssembler(
    inputCols=['NA_Sales', 'EU_Sales', 'JP_Sales'],
    outputCol='features',
)
data = assembler.transform(df)

K-Means

In [5]:
from pyspark.ml.clustering import KMeans

# How many clusters? 4. The seed is fixed so that reruns are repeatable.
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(data)

In [6]:
# Apply the fitted model to the assembled data; the result carries a
# 'prediction' column (selected in the following cells) alongside the inputs.
predictions = model.transform(data)

In [8]:
# Show each feature vector next to the cluster it was assigned to.
(
    predictions
    .select('features', 'prediction')
    .toPandas()
)

Unnamed: 0,features,prediction
0,"[41.49, 29.02, 3.77]",1
1,"[29.08, 3.58, 6.81]",1
2,"[15.85, 12.88, 3.79]",1
3,"[15.75, 11.01, 3.28]",1
4,"[11.27, 8.89, 10.22]",1
...,...,...
16593,"[0.01, 0.0, 0.0]",0
16594,"[0.01, 0.0, 0.0]",0
16595,"(0.0, 0.0, 0.0)",0
16596,"[0.0, 0.01, 0.0]",0


In [9]:
# Summary statistics of the regional sales for rows assigned to cluster 1.
# Filter on 'prediction' BEFORE projecting: the original filtered after the
# select had already dropped that column, which only works because Spark
# re-resolves the missing reference behind the scenes.
(
    predictions
    .filter('prediction = 1')
    .select('NA_Sales', 'EU_Sales', 'JP_Sales')
    .summary()
    .toPandas()
)

Unnamed: 0,summary,NA_Sales,EU_Sales,JP_Sales
0,count,11.0,11.0,11.0
1,mean,19.867272727272727,8.972727272727273,4.24909090909091
2,stddev,9.372031893788122,7.649181774424099,2.87100489217784
3,min,11.27,0.63,0.24
4,25%,14.03,3.58,2.93
5,50%,15.75,8.89,3.79
6,75%,26.93,11.01,6.5
7,max,41.49,29.02,10.22


In [10]:
# Summary statistics of the regional sales for rows assigned to cluster 0.
# Filter on 'prediction' BEFORE projecting: the original filtered after the
# select had already dropped that column, which only works because Spark
# re-resolves the missing reference behind the scenes.
(
    predictions
    .filter('prediction = 0')
    .select('NA_Sales', 'EU_Sales', 'JP_Sales')
    .summary()
    .toPandas()
)

Unnamed: 0,summary,NA_Sales,EU_Sales,JP_Sales
0,count,15480.0,15480.0,15480.0
1,mean,0.1364864341085341,0.0735775193798376,0.0560129198966379
2,stddev,0.1853953051139372,0.1366523321175557,0.185761131490094
3,min,0.0,0.0,0.0
4,25%,0.0,0.0,0.0
5,50%,0.07,0.02,0.0
6,75%,0.19,0.08,0.03
7,max,1.1,1.58,4.87


In [11]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette evaluator; the arguments below spell out the library defaults
# so it is clear which columns are scored.
evaluator = ClusteringEvaluator(
    featuresCol='features',
    predictionCol='prediction',
    metricName='silhouette',
)

In [12]:
# Score the cluster assignments and report the silhouette value.
silhouette = evaluator.evaluate(predictions)
silhouette_label = "Silhouette with squared euclidean distance = "
print(silhouette_label + str(silhouette))

Silhouette with squared euclidean distance = 0.8986214827769562


In [13]:
# Print the coordinates of every cluster center, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[0.13637421 0.07355795 0.05602016]
[19.86727273  8.97272727  4.24909091]
[1.544      0.88545631 0.23396117]
[5.99101266 3.60607595 1.7243038 ]


Latent Dirichlet allocation (LDA)
Warning: this step is very slow.

In [14]:
from pyspark.ml.clustering import LDA

# Fit a 10-topic LDA model on the 'features' column.
# A fixed seed is supplied (as the KMeans and GaussianMixture cells do) so the
# figures printed below can be reproduced on a rerun.
lda = LDA(k=10, maxIter=10, seed=1)
model = lda.fit(data)

# Report the fit-quality figures computed by the model on the same data.
ll = model.logLikelihood(data)
lp = model.logPerplexity(data)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -24613.388494579278
The upper bound on perplexity: 3.03191491784746


In [15]:
# Describe each topic by its 3 top-weighted terms and print the table in full.
# NOTE(review): the term indices presumably refer to positions in the
# 'features' vector (NA/EU/JP sales) - confirm.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[0, 2, 1]  |[0.38152901494784214, 0.3366416106997798, 0.2818293743523781]  |
|1    |[0, 2, 1]  |[0.38248289673851804, 0.33005936947175796, 0.28745773378972395]|
|2    |[1, 0, 2]  |[0.733603168399181, 0.213379969998613, 0.053016861602205874]   |
|3    |[0, 2, 1]  |[0.4182424631439844, 0.2921478841032863, 0.2896096527527292]   |
|4    |[0, 1, 2]  |[0.719563813998215, 0.2337436577305689, 0.04669252827121626]   |
|5    |[0, 2, 1]  |[0.4237702089542662, 0.31866703317427625, 0.25756275787145755] |
|6    |[1, 0, 2]  |[0.3661606194485121, 0.33637183729159215, 0.2974675432598958]  |
|7    |[1, 0, 2]  |[0.4826063364547157, 0.48206889050717433, 0.03532477303810992] |
|8    |[2, 0, 1]  |[0.8426

In [16]:
# Apply the fitted model to the data; the result adds a 'topicDistribution'
# column next to the original columns. Display it without truncating cells.
transformed = model.transform(data)
transformed.show(truncate=False)

+----+--------------------------------------------+--------+----+------------+----------------------+--------+--------+--------+-----------+------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Rank|Name                                        |Platform|Year|Genre       |Publisher             |NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|features          |topicDistribution                                                                                                                                                                                                     |
+----+--------------------------------------------+--------+----+------------+----------------------+--------+--------+--------+-----------+------------+------------------+----------------------------------------------------

Gaussian Mixture Model (GMM)

In [12]:
from pyspark.ml.clustering import GaussianMixture

# Fit a two-component Gaussian mixture on the 'features' column with a fixed seed.
gmm = GaussianMixture(k=2, seed=538009335)
model = gmm.fit(data)

In [13]:
# Show the fitted components (one row each, with 'mean' and 'cov' columns)
# without truncating the cell contents.
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean                                                         |cov                                                                                                                                                                                                       |
+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.16734408828256747,0.08257419072082968,0.03442008954599476]|0.20390518066846927   0.08425647918484716   0.023909633075897767  \n0.08425647918484716   0.06230993349