In [3]:
# Clustering Examples
## LDA - text-based topic clustering

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer

# Sample data
sentenceDataFrame = spark.createDataFrame([
    (0,'amazon is the best they rock and is awesome and amazing'),
    (1,'amazing amazon is so super great job amazon'),
    (2,'amazon rocks i love their awesome products'),
    (3,'greatness and innovative is amazon they are so incredible'),
    (4,'walmart is stupid dumb and slow'),
    (5,'unamazing walmart is so mediocre small and crappy'),
    (6,'walmart sucks i hate their service'),
    (7,'terrible service walmart ugly stores'),
    (4,'apple great products'),
    (5,'best apple easy technology'),
    (6,'apple own technology'),
    (7,'smart engineers at apple'),
], ["id", "sentence"])

# Vocabulary
print('Vocabulary size: %d' % 
      sentenceDataFrame.select(explode(split('sentence',' '))).distinct().count())

# Tokenize into separate words
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)

# Vectorize into word counts per sentence 
vectorizer = CountVectorizer(inputCol="words", outputCol="features")\
    .fit(tokenized)
vectorized = vectorizer.transform(tokenized)

# Train an LDA model - top k words, maxIter iterations .
lda = LDA(k=3, maxIter=100)
model = lda.fit(vectorized)

# Describe topics.
topics = model.describeTopics(2)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

Vocabulary size: 43
The topics described by their top-weighted terms:
+-----+-----------+------------------------------------------+
|topic|termIndices|termWeights                               |
+-----+-----------+------------------------------------------+
|0    |[4, 14]    |[0.0674216400984457, 0.06656800976050967] |
|1    |[3, 1]     |[0.08640859160557939, 0.05612226121382428]|
|2    |[2, 0]     |[0.09967193430945293, 0.07481528736292267]|
+-----+-----------+------------------------------------------+



In [18]:
## Naive Bayes Text Classification

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame([
    (0.0,'amazon is the best they rock and is awesome and amazing'),
    (0.0,'amazing amazon is so super great job amazon'),
    (0.0,'amazon rocks i love their awesome products'),
    (0.0,'greatness and innovative is amazon they are so incredible'),
    (1.0,'walmart is stupid dumb and slow'),
    (1.0,'unamazing walmart is so mediocre small and crappy'),
    (1.0,'walmart sucks i hate their service'),
    (1.0,'terrible service walmart ugly stores'),
    (0.0,'apple great products'),
    (0.0,'best apple easy technology'),
    (0.0,'apple own technology'),
    (0.0,'smart engineers at apple'),
    (1.0,'telus terrible service'),
    (1.0,'worst telco telus horrible'),
    (1.0,'telus crappy useless support'),
    (1.0,'horrible crappy hate stupid telus'),
    (0.0,'ebay is the best they rock and is awesome and amazing'),
    (0.0,'amazing shaw is so super great job ebat'),
    (0.0,'ebay rocks i love their awesome products'),
    (0.0,'greatness and innovative is ebay they are so incredible'),
    (1.0,'sears is stupid dumb and slow'),
    (1.0,'unamazing sears is so mediocre small and crappy'),
    (1.0,'sears sucks i hate their service'),
    (1.0,'terrible service sears ugly stores'),
    (0.0,'microsoft great products'),
    (0.0,'best microsoft easy technology'),
    (0.0,'microsoft own technology'),
    (0.0,'smart engineers at microsoft'),
    (1.0,'rogers terrible service'),
    (1.0,'worst telco rogers horrible'),
    (1.0,'rogers crappy useless support'),
    (1.0,'horrible crappy hate stupid rogers')
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Split the data into train and test
splits = rescaledData.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
predictions.select('label','sentence','rawPrediction','probability','prediction').show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+-----+--------------------+--------------------+--------------------+----------+
|label|            sentence|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|amazing amazon is...|[-23.551052908878...|[0.99874895357899...|       0.0|
|  0.0|amazon is the bes...|[-40.800320697211...|[0.88900837814058...|       0.0|
|  0.0|amazon rocks i lo...|[-18.896950601626...|[0.98392308104610...|       0.0|
|  1.0|terrible service ...|[-17.798587982976...|[0.11515651378268...|       1.0|
|  1.0|unamazing walmart...|[-21.775337362134...|[0.98353891148092...|       0.0|
|  1.0|walmart sucks i h...|[-16.913028428088...|[0.76343633360712...|       0.0|
|  1.0|telus terrible se...|[-14.803740064823...|[0.01781637196109...|       1.0|
|  0.0|ebay is the best ...|[-41.501302394572...|[0.97100314660092...|       0.0|
|  1.0|sears is stupid d...|[-14.668494931038...|[0.15329701633718...|       1.0|
|  1.0|unamazing

In [23]:
## K-Means Clustering
from pyspark.ml.clustering import KMeans

# Loads data.
dataset = spark.read.format("libsvm").load("/Users/samir/Tools/spark-2.0.2-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt")
dataset.show(truncate=False)

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

+-----+-------------------------+
|label|features                 |
+-----+-------------------------+
|0.0  |(3,[],[])                |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|
+-----+-------------------------+

Within Set Sum of Squared Errors = 0.12
Cluster Centers: 
[ 9.1  9.1  9.1]
[ 0.1  0.1  0.1]


In [22]:
## Word2Vec

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.0135332792997,-0.011096050078,0.0506678894162]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.0376478566655,0.0210807355387,0.0403019455927]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.0177810560912,-0.0559235086665,-0.0178805116564]



In [33]:
# Random Forest ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm")\
    .load("/Users/samir/Tools/spark-2.0.2-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")
data.show()

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
print(labelIndexer.labels)

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
    
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(20)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)  # summary only


100
[u'1.0', u'0.0']
+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[95,96,97,12...|
|           0.0|  0.0|(692,[121,122,123...|
|           0.0|  0.0|(692,[124,125,126...|
|           0.0|  0.0|(692,[126,127,128...|
|           0.0|  0.0|(692,[126,127,128...|
|           0.0|  0.0|(692,[127,128,129...|
|           0.0|  0.0|(692,[153,154,155...|
|           0.0|  0.0|(692,[154,155,156...|
|           0.0|  0.0|(692,[155,156,180...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[123,124,125...|
|           1.0|  1.0|(692,[124,125,126...|
|           1.0|  1.0|(692,[124,125,126...|
|           1.0|  1.0|(692,[124,125,126...|
|           1.0|  1.0|(692,[125,126,127...|
|           1.0|  1.0|(692,[126,127,128...|
|           1.0|  1.0|(692,[127,128,129...|
|           1.0|  1.0|(692,[127,128,129...|
|          