In [1]:
import os
import sys

import glob  # only needed by this cell, to discover the bundled py4j archive

# SPARK_HOME may be unset (e.g. a pip-installed PySpark). findspark.init() further
# down this cell can still locate Spark then, so do not abort with a KeyError here.
spark_path = os.environ.get('SPARK_HOME')
if spark_path:
    spark_python = os.path.join(spark_path, "python")
    spark_lib = os.path.join(spark_python, "lib")
    spark_sys_paths = [
        os.path.join(spark_path, "bin"),
        spark_python,
        os.path.join(spark_python, "pyspark"),
        spark_lib,
        os.path.join(spark_lib, "pyspark.zip"),
    ]
    # The py4j archive name carries a version that changes between Spark releases,
    # so discover it instead of hard-coding "py4j-0.10.9-src.zip".
    spark_sys_paths += sorted(glob.glob(os.path.join(spark_lib, "py4j-*-src.zip")))
    for candidate in spark_sys_paths:
        # Skip entries already present so re-running the cell does not grow sys.path.
        if candidate not in sys.path:
            sys.path.append(candidate)

import findspark
findspark.init()

import pyspark

In [2]:
# Local-mode sizing: number of worker threads and driver heap size in GB.
number_cores = 8
memory_gb = 8
conf = (
    pyspark.SparkConf()
    .setMaster('local[{}]'.format(number_cores))
    .set('spark.driver.memory', '{}g'.format(memory_gb))
)
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [3]:
from pyspark.ml.linalg import Vectors
# Toy dataset: four 2-D points (two near the origin, two near (9, 9)),
# each paired with a weight of 2.0.
data = [
    (Vectors.dense(xy), 2.0)
    for xy in ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])
]

In [4]:
from pyspark.sql import SQLContext
# Wrap the running SparkContext, then turn the toy (vector, weight) pairs into a
# two-column DataFrame.
sqlContext = SQLContext(sparkContext=sc)
df = sqlContext.createDataFrame(data, schema=["features", "weighCol"])

In [66]:
from pyspark.ml.clustering import KMeans
# Configure a 2-cluster K-Means estimator in one fluent chain (the setters return
# the estimator itself), then echo the iteration cap as the cell output.
kmeans = (
    KMeans(k=2)
    .setSeed(1)
    .setWeightCol("weighCol")
    .setMaxIter(10)
)
kmeans.getMaxIter()

10

In [6]:
# Remove the explicitly set maxIter from the estimator, fit on the toy DataFrame,
# and display the distance measure reported by the fitted model.
kmeans.clear(param=kmeans.maxIter)
model = kmeans.fit(dataset=df)
model.getDistanceMeasure()

'euclidean'

In [7]:
# Name the output column used by transform(), then classify the feature vector
# of the first row as a quick sanity check.
model.setPredictionCol("newPrediction")
model.predict(df.first()["features"])

0

In [8]:
# Fetch the fitted cluster centers and show how many there are
# (the estimator was built with k=2).
centers = model.clusterCenters()
len(centers)

2

In [9]:
# Score every toy row, keep only the input vector and its assigned cluster,
# and pull the (tiny) result back to the driver for display.
transformed = model.transform(df).select(["features", "newPrediction"])
rows = transformed.collect()
rows

[Row(features=DenseVector([0.0, 0.0]), newPrediction=0),
 Row(features=DenseVector([1.0, 1.0]), newPrediction=0),
 Row(features=DenseVector([9.0, 8.0]), newPrediction=1),
 Row(features=DenseVector([8.0, 9.0]), newPrediction=1)]

## Run the Bisecting KMeans model, following the example in the API documentation

In [10]:
#data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
#        (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]

In [11]:
#df = sqlContext.createDataFrame(data, ["features", "weighCol"])

In [12]:
#bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)

In [13]:
#bkm.setMaxIter(10)

In [14]:
#bkm.getMaxIter()

In [15]:
#bkm.clear(bkm.maxIter)

In [16]:
#bkm.setSeed(1)

In [17]:
#bkm.setWeightCol("weighCol")

In [18]:
#bkm.getSeed()

In [19]:
#bkm.clear(bkm.seed)

In [20]:
#model = bkm.fit(df)

In [21]:
#model.getMaxIter()

In [22]:
#model.setPredictionCol("newPrediction")

In [23]:
#model.predict(df.head().features)

In [24]:
#centers = model.clusterCenters()

In [25]:
#len(centers)

In [26]:
#model.computeCost(df)

In [27]:
#model.hasSummary

In [28]:
#summary = model.summary

In [29]:
#summary.k

In [30]:
#summary.clusterSizes

In [31]:
#summary.trainingCost

In [32]:
#transformed = model.transform(df).select("features", "newPrediction")

In [33]:
#rows = transformed.collect()

In [34]:
#rows[0].newPrediction == rows[1].newPrediction

In [35]:
#rows[2].newPrediction == rows[3].newPrediction

In [36]:
#bkm_path = temp_path + "/bkm"

In [37]:
#bkm.save(bkm_path)

In [38]:
#bkm2 = BisectingKMeans.load(bkm_path)

In [39]:
#bkm2.getK()

In [40]:
#bkm2.getDistanceMeasure()

In [41]:
#model_path = temp_path + "/bkm_model"

In [42]:
#model.save(model_path)

In [43]:
#model2 = BisectingKMeansModel.load(model_path)

In [44]:
#model2.hasSummary

In [45]:
#model.clusterCenters()[0] == model2.clusterCenters()[0]

In [46]:
#model.clusterCenters()[1] == model2.clusterCenters()[1]

In [47]:
#model.transform(df).take(1) == model2.transform(df).take(1)

## Reddit Crypto Data

- Download the [Reddit Crypto data](https://www.cs.wcupa.edu/lngo/data2/reddit_crypto.zip)
- Unzip the crypto data.
- Review [the metadata](https://www.kaggle.com/pavellexyr/reddit-cryptocurrency-data-for-august-2021)
- Perform K-means clustering on the text of posts and comments. Address the following:
  - Which data points should be removed, and which should be kept?
  - How should the text data be cleaned up?
  - How can the resulting clusters be made more meaningful?

In [51]:
# Reuse the session attached to the running SparkContext rather than constructing
# a fresh SparkSession object each time this cell is executed.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# The data directory can be overridden with the CRYPTO_DATA_DIR environment
# variable so the notebook is not tied to one user's filesystem; the default keeps
# the original location.
crypto_data_dir = os.environ.get("CRYPTO_DATA_DIR", "/users/trush/CSC496/Clustering/data")
df_posts = spark.read.csv(os.path.join(crypto_data_dir, "crypto-aug-2021-posts.csv"),
                          header=True,
                          inferSchema=True,
                          multiLine=True,  # post bodies (selftext) contain newlines
                          escape='"').cache()

In [49]:
# Number of rows read from the posts CSV.
df_posts.count()

250569

In [52]:
# Inspect the inferred column names and types.
df_posts.printSchema()

root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit.id: string (nullable = true)
 |-- subreddit.name: string (nullable = true)
 |-- subreddit.nsfw: boolean (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- url: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- title: string (nullable = true)
 |-- score: integer (nullable = true)



In [58]:
# Peek at one raw record. NOTE(review): this prints every column of the row,
# including the full post body, so the output is very long.
df_posts.take(1)

[Row(type='post', id='pfi1nw', subreddit.id='9e4pv', subreddit.name='cryptomoonshots', subreddit.nsfw=False, created_utc=1630454394, permalink='https://old.reddit.com/r/CryptoMoonShots/comments/pfi1nw/next_generation_safe_token_with_cake_rewards_meth/', domain='self.cryptomoonshots', url=None, selftext="⚜️ 𝓟𝓻𝓸𝓶𝓮𝓽𝓱𝓮𝓾𝓼 ⚜️\n\n&amp;#x200B;\n\n&amp;#x200B;\n\nThursday 8pm UTC #Prometheus will launch. The NFT titans will be combined to the most (in)famous people in the world. The first one who will be Zeus in the battle against others is Elon Musk - otherwise called: Elon Zuskus.\n\n&amp;#x200B;\n\nLet's see what he brings to the battlefield!\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n⚡️ [https://t.me/Prometheus\\_BSC](https://t.me/Prometheus_BSC)\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n&amp;#x200B;\n\n——— 🔥: 𝐈𝐧𝐟𝐨 :🔥 ———\n\n&amp;#x200B;\n\n🍰 Never been done before Cross-Chain NFT project with $CAKE rewards!\n\n🖼 NFT Giveaways to the community! (Custom and existing)\n\n♠️ N

In [None]:
# Feature used for clustering below: the post `score` column (a rough proxy for sentiment)

In [None]:
from pyspark.ml.linalg import Vectors
# Same four weighted toy points as the first example. `data` is rebound to the
# Reddit score RDD in the next cell, so this value is only a placeholder.
data = [
    (Vectors.dense(xy), 2.0)
    for xy in ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])
]

In [68]:
# Build (feature vector, weight) pairs from the post score.
# - Project to `score` first so the large text columns are not shipped to the
#   Python workers just to be ignored.
# - `score` is nullable in the inferred schema; a missing score cannot form a valid
#   feature, so those rows are dropped before vectorising.
data = (
    df_posts
    .select("score")
    .na.drop()
    .rdd
    .map(lambda row: (Vectors.dense([float(row["score"])]), 1.0))
)

In [69]:
# Sanity-check the first few (vector, weight) pairs.
data.take(5)

[(DenseVector([3.0]), 1.0),
 (DenseVector([1.0]), 1.0),
 (DenseVector([3.0]), 1.0),
 (DenseVector([11.0]), 1.0),
 (DenseVector([1.0]), 1.0)]

In [70]:
from pyspark.sql import SQLContext
# Build the clustering input with the existing SparkSession. Constructing another
# SQLContext is redundant (one was created earlier in the notebook) and SQLContext
# is deprecated in Spark 3 in favour of SparkSession.
df_kdata = spark.createDataFrame(data, ["features", "weighCol"])

In [71]:
from pyspark.ml.clustering import KMeans
# 7-cluster K-Means over the score feature, configured in a single fluent chain;
# the last line echoes the iteration cap as the cell output.
kmeans = (
    KMeans(k=7)
    .setSeed(1)
    .setWeightCol("weighCol")
    .setMaxIter(50)
)
kmeans.getMaxIter()

50

In [72]:
# Fit on the score DataFrame. This rebinds `model`, replacing the toy-data model
# fitted earlier in the notebook.
model = kmeans.fit(df_kdata)

In [73]:
# Name the output column, score every post, and keep only the feature and its
# cluster id. The former mid-cell `model.predict(...)` call was dropped: its result
# was never displayed or stored, so it only cost an extra Spark job.
model.setPredictionCol("newPrediction")
transformed = model.transform(df_kdata).select("features", "newPrediction")
# End on an expression so the cell actually displays a small sample of results.
transformed.take(5)

[Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([11.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0)]

In [76]:
# Show the cluster assignment of the first ten posts.
transformed.take(10)

[Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([3.0]), newPrediction=0),
 Row(features=DenseVector([11.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0),
 Row(features=DenseVector([1.0]), newPrediction=0)]