In [1]:
import glob
import os
import sys

# Locate the Spark installation; an unset SPARK_HOME raises KeyError here.
spark_path = os.environ['SPARK_HOME']

# Put Spark's launcher directory and bundled Python sources on sys.path.
for relative_entry in (
    "/bin",
    "/python",
    "/python/pyspark/",
    "/python/lib",
    "/python/lib/pyspark.zip",
):
    sys.path.append(spark_path + relative_entry)

# The py4j archive name embeds a version number that differs between Spark
# releases, so pick up whichever archive this installation ships instead of
# pinning one specific version.
for py4j_archive in sorted(glob.glob(glob.escape(spark_path) + "/python/lib/py4j-*-src.zip")):
    sys.path.append(py4j_archive)

import findspark
findspark.init()

import pyspark

In [2]:
# Local-mode Spark settings: number of local worker threads and driver memory (GB).
number_cores = 8
memory_gb = 8

conf = (
    pyspark.SparkConf()
    .setMaster('local[{}]'.format(number_cores))
    .set('spark.driver.memory', '{}g'.format(memory_gb))
)

# getOrCreate() returns the already-running context when there is one, so
# re-running this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once". In that case `conf` is not
# re-applied; restart the kernel to change the settings above.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [3]:
from pyspark.ml.linalg import Vectors

# Toy training set: each entry pairs a 2-D dense feature vector with a weight.
data = [
    (Vectors.dense(list(coordinates)), weight)
    for coordinates, weight in (
        ((0.0, 0.0), 2.0),
        ((1.0, 1.0), 1.0),
        ((9.0, 8.0), 2.0),
        ((8.0, 9.0), 2.0),
    )
]

In [4]:
from pyspark.sql import SQLContext

# NOTE(review): SQLContext is the legacy entry point; newer PySpark releases
# steer users towards SparkSession — confirm before upgrading Spark.
sqlContext = SQLContext(sc)

# The weight column is spelled "weighCol" (sic); the KMeans cell refers to it
# by exactly this name, so the two must be changed together.
df = sqlContext.createDataFrame(data, schema=["features", "weighCol"])

In [5]:
from pyspark.ml.clustering import KMeans

# Configure the estimator in a single call: two clusters, a fixed seed,
# per-row weights read from "weighCol", and an explicit iteration cap.
kmeans = KMeans(k=2, seed=1, weightCol="weighCol", maxIter=10)

# Echo the iteration cap to confirm it was set.
kmeans.getMaxIter()
10

10

In [6]:
# Drop the explicitly-set maxIter so the fit below uses the estimator's
# default value for it.
kmeans.clear(kmeans.getParam("maxIter"))

# Fit on the weighted toy data, then show the distance measure the model uses.
model = kmeans.fit(df)
model.getDistanceMeasure()

'euclidean'

In [7]:
# Name the output column that the model's transform() will produce.
model.setPredictionCol("newPrediction")

# Predict the cluster for one vector: the features of the DataFrame's first row.
model.predict(df.first()["features"])

0

In [8]:
# Fetch the fitted cluster centers and show how many there are.
centers = model.clusterCenters()
len(centers)

2

In [9]:
# Run the model over the DataFrame, keeping only the input vector and the
# "newPrediction" column.
transformed = model.transform(df).select("features", "newPrediction")
# Pull the selected rows back to the driver and display them.
rows = transformed.collect()
rows

[Row(features=DenseVector([0.0, 0.0]), newPrediction=0),
 Row(features=DenseVector([1.0, 1.0]), newPrediction=0),
 Row(features=DenseVector([9.0, 8.0]), newPrediction=1),
 Row(features=DenseVector([8.0, 9.0]), newPrediction=1)]

## Run the model for Bisecting KMeans based on the API

In [11]:
#data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
#        (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]

In [12]:
#df = sqlContext.createDataFrame(data, ["features", "weighCol"])

In [13]:
#bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)

In [14]:
#bkm.setMaxIter(10)

In [15]:
#bkm.getMaxIter()

In [16]:
#bkm.clear(bkm.maxIter)

In [17]:
#bkm.setSeed(1)

In [18]:
#bkm.setWeightCol("weighCol")

In [19]:
#bkm.getSeed()

In [20]:
#bkm.clear(bkm.seed)

In [21]:
#model = bkm.fit(df)

In [22]:
#model.getMaxIter()

In [23]:
#model.setPredictionCol("newPrediction")

In [24]:
#model.predict(df.head().features)

In [25]:
#centers = model.clusterCenters()

In [26]:
#len(centers)

In [27]:
#model.computeCost(df)

In [28]:
#model.hasSummary

In [29]:
#summary = model.summary

In [30]:
#summary.k

In [31]:
#summary.clusterSizes

In [32]:
#summary.trainingCost

In [33]:
#transformed = model.transform(df).select("features", "newPrediction")

In [34]:
#rows = transformed.collect()

In [35]:
#rows[0].newPrediction == rows[1].newPrediction

In [36]:
#rows[2].newPrediction == rows[3].newPrediction

In [37]:
#bkm_path = temp_path + "/bkm"

In [38]:
#bkm.save(bkm_path)

In [39]:
#bkm2 = BisectingKMeans.load(bkm_path)

In [40]:
#bkm2.getK()

In [41]:
#bkm2.getDistanceMeasure()

In [42]:
#model_path = temp_path + "/bkm_model"

In [43]:
#model.save(model_path)

In [44]:
#model2 = BisectingKMeansModel.load(model_path)

In [45]:
#model2.hasSummary

In [46]:
#model.clusterCenters()[0] == model2.clusterCenters()[0]

In [47]:
#model.clusterCenters()[1] == model2.clusterCenters()[1]

In [48]:
#model.transform(df).take(1) == model2.transform(df).take(1)

## Reddit Crypto Data

- Download the [Reddit Crypto data](https://www.cs.wcupa.edu/lngo/data2/reddit_crypto.zip)
- Unzip the crypto data.
- Review [the metadata](https://www.kaggle.com/pavellexyr/reddit-cryptocurrency-data-for-august-2021)
- Perform K-means clustering on the text of the posts and comments, addressing the following questions:
  - Which data points should be removed, and which should be kept?
  - How to clean up text data?
  - How to make the resulting cluster data more meaningful?

In [49]:
#spark = pyspark.sql.SparkSession(sc)
#df_posts = spark.read.csv("/users/trush/CSC496/Clustering/data/crypto-aug-2021-comments.csv", header=True, inferSchema=True)

In [50]:
#df_posts.printSchema()

root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit.id: string (nullable = true)
 |-- subreddit.name: string (nullable = true)
 |-- subreddit.nsfw: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- score: string (nullable = true)



In [52]:
#df_posts.take(1)

[Row(type='comment', id='hb4hdni', subreddit.id='9e4pv', subreddit.name='cryptomoonshots', subreddit.nsfw='false', created_utc='1630454394', permalink='https://old.reddit.com/r/CryptoMoonShots/comments/pfi1nw/next_generation_safe_token_with_cake_rewards_meth/hb4hdni/', body='Be sure to do your own diligence. Assume that every project posted is a scam/rug/honeypot until proven otherwise. Use tools such as http://www.bscheck.eu/ and https://tokensniffer.com to help you determine if this project is legitimate, but do not solely rely on these tools. Be sure to read comments, particularly those who are downvoted, and warn your fellow redditors against scams. Feel free to join in on our newly made Telegram at https://t.me/CryptoMoonShots to discover and submit projects as well.', sentiment=None, score=None)]

In [1]:
# Requires the SparkContext `sc` created in the setup cell near the top of the
# notebook; run that cell first (a fresh kernel has no `sc`).
#
# SparkContext's method is `textFile` (lower-case "t"); `sc.TextFile` does not
# exist. Despite the `df_` prefix, the result is an RDD of text lines rather
# than a DataFrame; cache() keeps it around for the actions that follow.
df_posts = sc.textFile("/users/trush/CSC496/Clustering/data/crypto-aug-2021-comments.csv").cache()

# Count the lines, which also materializes the cached RDD.
df_posts.count()

NameError: name 'sc' is not defined

In [None]:
# Show the first five elements of df_posts.
df_posts.take(5)