In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session for this notebook, with spark.driver.memory set to '2g'.
spark = (
    SparkSession.builder
    .config('spark.driver.memory', '2g')
    .getOrCreate()
)

In [9]:
# Read the Posts dataset, stored as Parquet under a file:// URI.
posts = spark.read.format('parquet').load('file:///sodata/Posts')

In [12]:
# Keep only the Tags column, restricted to posts carrying more than one tag.
tags = posts.select('Tags').filter(size('Tags') > 1)

In [19]:
from pyspark.ml.feature import CountVectorizer

# Fit a binary count vectorizer over the tag lists: "Tags" in, "features" out.
cv = CountVectorizer(binary=True, inputCol="Tags", outputCol="features")
cv_model = cv.fit(tags)

# Report the first few vocabulary entries and the vocabulary size.
vocab = cv_model.vocabulary
print("Vocab sample: {}".format(vocab[:10]))
print("Vocab size: {}".format(len(vocab)))

Vocab sample: ['javascript', 'java', 'c#', 'php', 'python', 'android', 'jquery', 'html', 'ios', 'css']
Vocab size: 55591


In [20]:
# Apply the fitted CountVectorizer model to the tag lists; the result is the
# input to the clustering step.
transformed = cv_model.transform(tags)

In [21]:
# Peek at a single row to sanity-check the encoding.
transformed.first()

Row(Tags=['svn', 'svn-externals'], features=SparseVector(55591, {287: 1.0, 10852: 1.0}))

In [25]:
from pyspark.ml.clustering import KMeans

# Fit k-means with 50 clusters on the vectorized tags, using a fixed seed (42).
kmeans = KMeans(k=50, seed=42)
km = kmeans.fit(transformed)

In [26]:
# Print, for every cluster centre, the 15 tags with the largest centre
# coordinates, in descending order.
# The vocabulary does not change inside the loop, so look it up once here
# rather than once per printed tag.
vocabulary = cv_model.vocabulary
for center in km.clusterCenters():
    # argsort() is ascending; reverse it and keep the 15 highest-weight indices.
    top_indices = center.argsort()[::-1][:15]
    top_tags = [vocabulary[index] for index in top_indices]
    print(" * %s" % ', '.join(top_tags))

 * apache, php, mod-rewrite, linux, tomcat, ubuntu, ssl, xampp, virtualhost, django, nginx, http, apache2, httpd.conf, wordpress
 * php, node.js, angular, laravel, angularjs, wordpress, .net, git, ruby, swift, xml, regex, scala, json, objective-c
 * firebase, firebase-realtime-database, android, javascript, firebase-authentication, google-cloud-firestore, java, firebase-cloud-messaging, swift, google-cloud-functions, angular, ios, node.js, firebase-storage, angularjs
 * pandas, python, dataframe, numpy, python-3.x, matplotlib, csv, python-2.7, datetime, pandas-groupby, group-by, excel, dictionary, scikit-learn, merge
 * android, android-layout, android-studio, android-intent, sqlite, android-activity, cordova, eclipse, xml, json, android-recyclerview, google-maps, ios, gradle, kotlin
 * google-app-engine, python, java, google-cloud-datastore, google-cloud-platform, python-2.7, php, google-cloud-endpoints, google-cloud-storage, app-engine-ndb, go, gwt, eclipse, objectify, android
 * jav