In [10]:
from pyspark.sql import SparkSession

In [11]:
spark = SparkSession.builder.appName("pysparkClustering").getOrCreate()

In [12]:
df = spark.read.csv('vgsales.csv',inferSchema=True,header='true')

In [13]:
df.columns

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales']

In [14]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['NA_Sales','EU_Sales','JP_Sales'],
                            outputCol='features')
data = assembler.transform(df)

K-Means

In [15]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(data)

                                                                                

22/11/15 14:47:13 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [16]:
predictions = model.transform(data)

In [17]:
predictions.select('features','prediction').toPandas()

Unnamed: 0,features,prediction
0,"[41.49, 29.02, 3.77]",1
1,"[29.08, 3.58, 6.81]",1
2,"[15.85, 12.88, 3.79]",1
3,"[15.75, 11.01, 3.28]",1
4,"[11.27, 8.89, 10.22]",1
...,...,...
16593,"[0.01, 0.0, 0.0]",0
16594,"[0.01, 0.0, 0.0]",0
16595,"(0.0, 0.0, 0.0)",0
16596,"[0.0, 0.01, 0.0]",0


In [18]:
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()

In [19]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

[Stage 34:>                                                         (0 + 1) / 1]

Silhouette with squared euclidean distance = 0.9897711078246284


                                                                                

In [20]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[0.22725259 0.12562678 0.06796683]
[8.29233766 4.65779221 2.18363636]


Latent Dirichlet allocation (LDA)

In [21]:
from pyspark.ml.clustering import LDA
lda = LDA(k=10, maxIter=10)
model = lda.fit(data)

ll = model.logLikelihood(data)
lp = model.logPerplexity(data)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

22/11/15 14:49:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/11/15 14:49:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


[Stage 61:>                                                         (0 + 1) / 1]

The lower bound on the log likelihood of the entire corpus: -24622.30020124194
The upper bound on perplexity: 3.033012675532467


                                                                                

In [22]:
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[1, 0, 2]  |[0.41746819670526303, 0.3131474002822913, 0.2693844030124456]  |
|1    |[0, 1, 2]  |[0.5865550790726876, 0.32580128898367317, 0.08764363194363914] |
|2    |[0, 1, 2]  |[0.47301832196525845, 0.28837732540307354, 0.23860435263166802]|
|3    |[2, 0, 1]  |[0.7679153806742391, 0.16033685956518687, 0.07174775976057397] |
|4    |[0, 1, 2]  |[0.4450961169184822, 0.34219381815582833, 0.21271006492568945] |
|5    |[1, 0, 2]  |[0.3705681778685268, 0.3334520465714339, 0.2959797755600394]   |
|6    |[1, 0, 2]  |[0.592559576228003, 0.3333169520817513, 0.0741234716902456]    |
|7    |[0, 1, 2]  |[0.45155987461136604, 0.3238583221299792, 0.22458180325865476] |
|8    |[0, 1, 2]  |[0.7377

In [23]:
transformed = model.transform(data)
transformed.show(truncate=False)

+----+--------------------------------------------+--------+----+------------+----------------------+--------+--------+--------+-----------+------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Rank|Name                                        |Platform|Year|Genre       |Publisher             |NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|features          |topicDistribution                                                                                                                                                                                                       |
+----+--------------------------------------------+--------+----+------------+----------------------+--------+--------+--------+-----------+------------+------------------+------------------------------------------------

Gaussian Mixture Model (GMM)

In [24]:
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(data)

22/11/15 14:50:52 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/11/15 14:50:52 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


[Stage 65:>                                                         (0 + 1) / 1]

22/11/15 14:50:53 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/11/15 14:50:53 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

In [25]:
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean                                                         |cov                                                                                                                                                                                                       |
+-------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.16734408828256747,0.08257419072082968,0.03442008954599476]|0.20390518066846927   0.08425647918484716   0.023909633075897767  \n0.08425647918484716   0.06230993349