## Experiment-03
### Amirreza Fosoul and Bithiah Yuan

In [7]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import string
import csv
import re
import time
spark = SparkSession.builder.appName('ex3').getOrCreate()

In [8]:
from pyspark.sql.functions import *
from pyspark.sql.functions import split, udf, desc, concat, col, lit
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType, DoubleType, StructType, StructField
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.window import Window
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
import numpy as np
from pyspark.sql import SQLContext
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import math
import re

sc = spark.sparkContext
sqlContext = SQLContext(sc)

#### We reduced our dataset for both the user_libraries and papers due to out of memory issues

In [9]:
# Read user ratings into Dataframe
#user_df = spark.read.option("delimiter", ";").csv('./users_libraries.txt')
user_df = spark.read.option("delimiter", ";").csv('./example2.txt')
user_df = user_df.select(col("_c0").alias("userID"), col("_c1").alias("paperID"))
user_df = user_df.select("userID", f.split("paperID", ",").alias("papers"), f.explode(f.split("paperID", ",")).alias("paperID"))
user_df = user_df.drop("papers")

In [10]:
# Read in the stopwords as a list
with open('./stopwords_en.txt') as file:
    stopwordList = file.read().splitlines()

# Read in records of paper information
#w_df = spark.read.csv('./papers.csv')
w_df = spark.read.csv('./example_paper.csv')
w_df = w_df.select("_c0", "_c13", "_c14")
w_df = w_df.select(col("_c0").alias("paperID"), col("_c13").alias("title"), col("_c14").alias("abstract"))
w_df = w_df.na.fill({'title': '', 'abstract': ''}) # to replace null values with empty string
# Get text from title and abstract
w_df = w_df.select(col("paperID"), concat(col("title"), lit(" "), col("abstract")).alias("words"))
w_df.show()

+-------+--------------------+
|paperID|               words|
+-------+--------------------+
|  80546|the arbitrariness...|
|5842862|how to choose a g...|
|1242600|how to write cons...|
|3467077|defrosting the di...|
| 309395|why most publishe...|
| 305755|the structure of ...|
|6603134|how to build a mo...|
|     99|collective dynami...|
| 105595|linked: how every...|
| 212874|gene ontology: to...|
| 740681|usage patterns of...|
|    101|network motifs: s...|
|  99857|the strength of w...|
|3614773|{rna}-seq: a revo...|
| 873540|pattern recogniti...|
|6434100|a quick guide for...|
| 100088|basic local align...|
|1387765|power-law distrib...|
| 161814|the elements of s...|
| 117535|maximum likelihoo...|
+-------+--------------------+
only showing top 20 rows



## Exercise 3. 1 Vector representation for the papers

In [11]:
# Extracting words from the papers and keeping "-" and "_"
tokenizer = RegexTokenizer(inputCol="words", outputCol="tokens", pattern="[a-zA-Z-_]+", gaps=False) 
# Built-in tokenizer
tokenized = tokenizer.transform(w_df)
tokenized = tokenized.select("paperID", "tokens")

# udf to remove "-" and "_" from the tokens
remove_hyphen_udf = udf(lambda x: [re.sub('[-|_]', '', word) for word in x], ArrayType(StringType()))
# Apply udf to the tokens
df = tokenized.withColumn('tokens', remove_hyphen_udf(col('tokens')))

# udf to remove words less than 3 letters
remove_short_words = udf(lambda x: [item for item in x if len(item) >= 3], ArrayType(StringType()))
# Apply udf to the tokens
df = df.withColumn('tokens', remove_short_words(col('tokens')))

# Built-in function to remove stopwords from our custom list
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered" , stopWords=stopwordList)
df = remover.transform(df)
df = df.select("paperID", "filtered")

# Apply stemming with NLTK
# Built-in class from NLTK
ps = PorterStemmer()
# udf to apply stemming
stemming = udf(lambda x: [ps.stem(item) for item in x], ArrayType(StringType()))
# apply udf to tokens
df = df.withColumn('stemmed', stemming(col('filtered')))
df = df.select("paperID", "stemmed")

# Create a new df to store the paperID and stemmed tokens
paper_terms = df

# Explode/Split the tokens in the list for each paperID and get the distinct tokens
df = df.select("paperID", f.explode("stemmed").alias("tokens")).distinct().orderBy("paperID")

# Assign count of 1 to each token w.r.t. the paperID since the tokens are distinct
df = df.groupBy("paperID","tokens").count()

# Get the number of distinct papers
num_papers = w_df.select("paperID").distinct().count()

# Get the value of ten percent of the number of papers
ten_percent = math.ceil(num_papers*.1)

# Create a new df with the tokens and count (without paperID)
df2 = df.select("tokens", "count")
# Count the number of papers containing the tokens
df2 = df2.groupBy("tokens").agg(f.collect_list("tokens").alias("duplicated_values"), f.sum("count").alias("count"))
# Filter out tokens that appeared in more than 10 percent of the papers
df2 = df2.drop("duplicated_values").orderBy((col("count")).desc()).filter(col("count") < ten_percent)
# Filter out tokens that appeared in less than 20 papers
# Limit the df to 1000 tokens
df2 = df2.filter(col("count") >= 20).limit(1000)

# Create a new df with terms and count
important_words = df2.select(col("tokens").alias("terms"), col("count"))
# Output the set of important words, T
important_words.show(10)

+---------+-----+
|    terms|count|
+---------+-----+
|   compar|   29|
|   measur|   29|
|   featur|   28|
|  problem|   28|
|   requir|   28|
|     cell|   27|
|  pattern|   27|
|technolog|   27|
|      web|   27|
|      map|   26|
+---------+-----+
only showing top 10 rows



In [12]:
# Create a new df where each term is replaced by a unique index that takes a value from the range between 0 and |T| − 1
df = important_words.withColumn("row_num", row_number().over(Window.orderBy("count"))-1)

# Create a df to store the indices and the corresponding terms
terms_index_hash = df.select(col("row_num").alias("index"), "terms")

In [14]:
# Get the number of distinct terms
num_terms = terms_index_hash.select("terms").distinct().count()

# Split (explode) the list of words into a column of tokens and
# count the number of occurences of the tokens per paper
# p_terms is a df with paperID, terms, and count
# p_terms = paper_terms.select("paperID", f.explode("stemmed").alias("terms")).groupBy("paperID", "terms").count()
# p_terms = p_terms.orderBy("paperID", "count")

# # Join p_terms with the terms_index_hash to replace the terms with the indices
# joined_df = terms_index_hash.join(p_terms, ["terms"])
# # Drop the terms because now they are represented by the indices
# joined_df = joined_df.drop("terms")
# joined_df  = joined_df.orderBy("paperID")

# # Create a new df to compute the term frequency vectors
# tf_df = joined_df

# # Concatenate the indices and the count (occurences in papers)
# tf_df = tf_df.withColumn("index_count", f.concat(col("index"), lit(", "), col("count")))
# tf_df = tf_df.drop("index", "count")

# # Concatenate the index_counts per paperID ("un-explode")
# tf_df = tf_df.groupby("paperID").agg(f.concat_ws(", ", f.collect_list(tf_df.index_count)).alias("index_count"))

# tf_df.show()
# # Create a new column and casting the index_count into an array with integer type
# # The terms_count of the tf_df column is now a list where the odd positions are the terms indices and the even positions
# tf_df = tf_df.withColumn("terms_count", split(col("index_count"), ",\s*").cast(ArrayType(IntegerType())).alias("terms_count"))
# tf_df = tf_df.drop("index_count")



In [15]:
num_terms = terms_index_hash.select("terms").distinct().count()

print(num_terms)

64


In [30]:
p_terms = paper_terms.select("paperID", f.explode("stemmed").alias("terms"))
#.groupBy("paperID", "terms").count()
#p_terms = p_terms.orderBy("paperID", "count")

# Join p_terms with the terms_index_hash to replace the terms with the indices
joined_df = terms_index_hash.join(p_terms, ["terms"])
# Drop the terms because now they are represented by the indices
#joined_df = joined_df.drop("terms")
#joined_df  = joined_df.orderBy("paperID")
joined_df = joined_df.drop("index")

# Create a new df to compute the term frequency vectors
tf_df = joined_df

tf_df = tf_df.groupby("paperID").agg(f.concat_ws(", ", f.collect_list(tf_df.terms)).alias("terms"))
tf_df = tf_df.withColumn("terms_", split(col("terms"), ",\s*").cast(ArrayType(StringType())).alias("terms"))
tf_df = tf_df.drop("terms")
tf_df.show()


+-------+--------------------+
|paperID|              terms_|
+-------+--------------------+
| 159967|[scale, predict, ...|
|2212959|[map, cluster, cl...|
| 333353|[pattern, current...|
| 438129|  [collect, resourc]|
| 166220|[challeng, curren...|
|2883810|     [offer, featur]|
|1288940|[web, web, web, w...|
|5251453|[pattern, effect,...|
|7515828|[dna, offer, featur]|
|2739852|[web, emerg, scal...|
|5961524|[offer, addit, ev...|
|    272|[current, dynam, ...|
|6573750|       [cell, genet]|
|  77265|   [pattern, evolut]|
| 820297|[simpl, perform, ...|
|1042553|[power, power, po...|
|2883820|[technolog, curre...|
|5434882|[current, emerg, ...|
|1332540|             [simpl]|
|1777140|[map, challeng, p...|
+-------+--------------------+
only showing top 20 rows



In [33]:
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

# Get the number of distinct terms
num_terms = terms_index_hash.select("terms").distinct().count()

print(num_terms)

cv = CountVectorizer(inputCol="terms_", outputCol="vectors")
model = cv.fit(tf_df)
x = model.transform(tf_df).show(truncate=False)

64
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|paperID|terms_                                                                                                                                                                                                                               |vectors                                                                                                                            |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------

In [8]:
# udf to get the sparse vector of term frequency
def vector_list_map(x):
    # a list with size of number of terms and fill it with zeroes 

    mylist = [0] * num_terms
    for i in range(0, len(x), 2):
        mylist[int(x[i])] = x[i+1]
        
    # store the indices of non-zero elements
    nonzero_indices = np.nonzero(mylist)[0].tolist()
    # store the value of non-zero elements
    nonzero_counts = [num for num in mylist if num]
    # combine them to make a sparse vector
    sparse = SparseVector(num_terms, nonzero_indices, nonzero_counts)
    return sparse
           
vector_map = udf(lambda x: vector_list_map(x), VectorUDT())

# applying the udf to the terms_count column to create the term_frequency_sparse column
vector_df = tf_df.withColumn('term_frequency_sparse', vector_map(col('terms_count')))

vector_df = vector_df.drop("terms_count")

vector_df.show(10, truncate=False)

+-------+------------------------------------------------------------------------------------------------------------------------------------+
|paperID|term_frequency_sparse                                                                                                               |
+-------+------------------------------------------------------------------------------------------------------------------------------------+
|159967 |(64,[6,16,17,21,25,26,41,43,47,60,62],[1.0,4.0,1.0,1.0,1.0,3.0,12.0,1.0,1.0,1.0,1.0])                                               |
|2212959|(64,[6,26,48],[1.0,3.0,1.0])                                                                                                        |
|333353 |(64,[2,7,10,19,24,33,46,50,56,59,61],[5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                                 |
|438129 |(64,[4,30],[1.0,1.0])                                                                                                               |

## Exercise 3. 2 (TF-IDF representation for the papers)

In [9]:
### TF-IDF with built-in function ###

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

idf = IDF(inputCol="term_frequency_sparse", outputCol="features")
idfModel = idf.fit(vector_df)
rescaledData = idfModel.transform(vector_df)
tf_idf_built_in = rescaledData.select("paperID", "features")

tf_idf_built_in.show()

+-------+--------------------+
|paperID|            features|
+-------+--------------------+
| 159967|(64,[6,16,17,21,2...|
|2212959|(64,[6,26,48],[2....|
| 333353|(64,[2,7,10,19,24...|
| 438129|(64,[4,30],[2.542...|
| 166220|(64,[20,29,37,38,...|
|2883810|(64,[24,59],[2.45...|
|1288940|(64,[8,25,39,46,5...|
|5251453|(64,[5,15,22,29,3...|
|7515828|(64,[24,34,59],[2...|
|2739852|(64,[3,9,10,12,14...|
|5961524|(64,[1,2,7,8,24],...|
|    272|(64,[7,17,23,28,4...|
|6573750|(64,[18,55],[2.49...|
|  77265|(64,[2,56],[2.542...|
| 820297|(64,[11,19,35],[2...|
|1042553|(64,[17,21,27,35,...|
|2883820|(64,[8,50,57],[2....|
|5434882|(64,[2,4,29,37,45...|
|1332540|(64,[35],[2.40919...|
|1777140|(64,[16,41,48,49,...|
+-------+--------------------+
only showing top 20 rows



In [11]:
# a udf to compute the idf without using the built-in functions
def idf(x):
    return math.log((num_papers + 1)/(x+1))

idf_udf = udf(lambda x: idf(x), DoubleType())

# apply the udf to the dataframe
df = df.withColumn('idf', idf_udf(col('count')))

# select the term-index and the idf
idf_df = df.select(col("row_num").alias("index"), "idf")

In [12]:
#### TF-IDF without built-in function ####

# join dataframes to get the count of the terms for each paper
tf_idf_df = joined_df.join(idf_df, ["index"]).select("index", "paperID", (col("count") * col("idf")).alias("tf-idf"))

# concatenate the index and the tf-idf columns 
tf_idf_df = tf_idf_df.withColumn("index_tf_idf", f.concat(col("index"), lit(", "), col("tf-idf")))
tf_idf_df = tf_idf_df.drop("index", "tf-idf")

# Concatenate the index_tf-idf per paperID "un-explode"
tf_idf_df = tf_idf_df.groupby("paperID").agg(f.concat_ws(", ", f.collect_list(tf_idf_df.index_tf_idf)).alias("index_tf_idf"))

# cast the index_tf-idf into the array of doubles
tf_idf_df = tf_idf_df.withColumn("tf-idf", split(col("index_tf_idf"), ",\s*").cast(ArrayType(DoubleType())).alias("terms_tf-idf"))
tf_idf_df = tf_idf_df.drop("index_tf_idf")

#tf_idf_df.show(truncate=False)

# compute the sparse vector for tf-idf
tf_idf_vector = tf_idf_df.withColumn('tf-idf_sparse', vector_map(col('tf-idf')))

tf_idf_vector = tf_idf_vector.drop("tf-idf")

tf_idf_vector.show(10, truncate=False)

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|paperID|tf-idf_sparse                                                                                                                                                                                                                                                                                                                                                                                          |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Exercise 3. 3 (Clustering)

In [13]:
# create the user profile using the tf_idf dataframe and the users' library dataframe
user_profile = tf_idf_built_in.join(user_df, ["paperID"]).orderBy("userID").select('userId', 'features')

# convert the dataframe to RDD to sum up the tf_idf vector of each user and then convert back to dataframe
user_profile = user_profile.rdd.mapValues(lambda v: v.toArray()).reduceByKey(lambda x, y: x + y).mapValues(lambda x: DenseVector(x))\
.toDF(["userId", "features"])
user_profile.show()

+--------------------+--------------------+
|              userId|            features|
+--------------------+--------------------+
|ffa86c7343f5ce8a7...|[0.0,0.0,0.0,0.0,...|
|1006f001bb7ea1023...|[0.0,0.0,0.0,0.0,...|
|42b59f0ee03fa51f3...|[0.0,0.0,0.0,2.54...|
|6a96998090c72b724...|[0.0,0.0,2.542726...|
|6d35c61471e1803bc...|[0.0,0.0,2.542726...|
|b953afb03918cb8fa...|[0.0,0.0,7.628178...|
|d85f7d83f27b3f533...|[2.54272622067682...|
|a06e5e3519a8d90a8...|[5.08545244135365...|
|5fd7b7de422c5b11d...|[5.08545244135365...|
|f1e1cd4ff25018273...|[0.0,0.0,0.0,2.54...|
|b656009a6efdc8b1a...|[2.54272622067682...|
|3df36dc3cdc7e9086...|[0.0,0.0,0.0,0.0,...|
|32930342b4a8ca975...|[0.0,0.0,0.0,0.0,...|
|0530b8ed834fa63f2...|[0.0,0.0,0.0,0.0,...|
|7d378131aeffcab66...|[0.0,0.0,2.542726...|
|bd8345b325c5cf7a9...|[0.0,0.0,0.0,0.0,...|
|b36c3189bb1457cd0...|[0.0,0.0,0.0,0.0,...|
|bbcd9dae3160ddcb9...|[0.0,0.0,0.0,7.62...|
|ce250fe59a9a7d698...|[10.1709048827073...|
|5c7cfd33003a6a2c3...|[0.0,0.0,7

## Exercise 3. 3 (Clustering)

In [14]:
##### 50 Clusters #####

from pyspark.ml.clustering import KMeans
from scipy.spatial import distance

# Trains a k-means model.
kmeans_50 = KMeans().setK(50).setSeed(1)
model_50 = kmeans_50.fit(user_profile)

# Make predictions
predictions_50 = model_50.transform(user_profile)
predictions_50.show(10)

+--------------------+--------------------+----------+
|              userId|            features|prediction|
+--------------------+--------------------+----------+
|ffa86c7343f5ce8a7...|[0.0,0.0,0.0,0.0,...|        35|
|1006f001bb7ea1023...|[0.0,0.0,0.0,0.0,...|        29|
|42b59f0ee03fa51f3...|[0.0,0.0,0.0,2.54...|        45|
|6a96998090c72b724...|[0.0,0.0,2.542726...|        28|
|6d35c61471e1803bc...|[0.0,0.0,2.542726...|        13|
|b953afb03918cb8fa...|[0.0,0.0,7.628178...|        16|
|d85f7d83f27b3f533...|[2.54272622067682...|        41|
|a06e5e3519a8d90a8...|[5.08545244135365...|         1|
|5fd7b7de422c5b11d...|[5.08545244135365...|        11|
|f1e1cd4ff25018273...|[0.0,0.0,0.0,2.54...|        46|
+--------------------+--------------------+----------+
only showing top 10 rows



In [15]:
### DBI for 50 Clusters ###

# compute the centroid of the clusters
centers_50 = model_50.clusterCenters()

# add the column of centroids of the respective clusters of the features
user_profile_50 = predictions_50.rdd.map(lambda profile: (profile[0], profile[1], profile[2], DenseVector(centers_50[profile[2]]))).toDF(['userID', 'features', 'prediction', 'center'])

# compute the Euclidean distance of the users' features from their centroid
user_profile_50 = user_profile_50.rdd.map(lambda x: (x[2], x[1], float(distance.euclidean(x[1], x[3])))).toDF(['cluster', 'features', 'distance'])

# compute the average distance
average_50 = user_profile_50.select('*').groupBy('cluster').agg(mean('distance'))
#average_50.show()

# convert average distances to a list
variances_50 = list(average_50.select('avg(distance)').toPandas()['avg(distance)'])

# a function to compute the Davies-Bouldin-Index
def dbi(variances, centers, num_cluster):
    db = []
    max_result = []
    
    for i in range(0, num_cluster):
        for j in range(i+1, num_cluster):
            db.append((variances[i] + variances[j]) / distance.euclidean(centers[i], centers[j]))
        if len(db):
            # pick the maximum of the similarity measures for each cluster
            max_result.append(np.max(db))
        db = []

    result = np.sum(max_result) / num_cluster
    return result

print(dbi(variances_50, centers_50, 50))
        

0.00289257345792


In [18]:
### 10 Clusters ###

# Trains a k-means model.
kmeans_10 = KMeans().setK(10).setSeed(1)
model_10 = kmeans_10.fit(user_profile)

# Make predictions
predictions_10 = model_10.transform(user_profile)

centers_10 = model_10.clusterCenters()

user_profile_10 = predictions_10.rdd.map(lambda profile: (profile[0], profile[1], profile[2], DenseVector(centers_10[profile[2]]))).toDF(['userID', 'features', 'prediction', 'center'])
user_profile_10 = user_profile_10.rdd.map(lambda x: (x[2], x[1], float(distance.euclidean(x[1], x[3])))).toDF(['cluster', 'features', 'distance'])
average_10 = user_profile_10.select('*').groupBy('cluster').agg(mean('distance'))
average_10.show()

+-------+------------------+
|cluster|     avg(distance)|
+-------+------------------+
|      0|18.012801404202207|
|      7|               0.0|
|      6|               0.0|
|      9|12.680302167100209|
|      5| 8.953256295372702|
|      1| 16.65302558839553|
|      3|  29.4149877077502|
|      8|13.069494169768761|
|      2|               0.0|
|      4|               0.0|
+-------+------------------+



In [19]:
### DBI for 10 Clusters ###

variances_10 = list(average_10.select('avg(distance)').toPandas()['avg(distance)'])

print(dbi(variances_10, centers_10, 10))
        

0.0778643099259


#### In this case 50 clusters is better because it has a lower DBI of  0.00289257345792 compared to 0.0778643099259

## Exercise 3. 4 Latent Direchlet Allocation (LDA)

In [21]:
### LDA ###
from pyspark.ml.clustering import LDA

# just rename the column to use it with the built-in methods
termFrequencyVector = vector_df.select('paperId', col('term_frequency_sparse').alias('features'))
# Trains a LDA model
# set k=40 to have 40 different topics
lda = LDA(k=40)
model = lda.fit(termFrequencyVector)

# each topic is described by 5 terms
topics = model.describeTopics(5)

# print("The topics described by their top-weighted terms:")
#topics.show(truncate=False)

# Shows the result
# it shows the probabilty of each topic for each paper
transformed = model.transform(termFrequencyVector)
transformed.show()

+-------+--------------------+--------------------+
|paperId|            features|   topicDistribution|
+-------+--------------------+--------------------+
| 159967|(64,[6,16,17,21,2...|[8.62388518649721...|
|2212959|(64,[6,26,48],[1....|[0.00403231936109...|
| 333353|(64,[2,7,10,19,24...|[0.00150978031081...|
| 438129|(64,[4,30],[1.0,1...|[0.00808468265300...|
| 166220|(64,[20,29,37,38,...|[0.00219697108991...|
|2883810|(64,[24,59],[1.0,...|[0.00808468265300...|
|1288940|(64,[8,25,39,46,5...|[0.00219697108991...|
|5251453|(64,[5,15,22,29,3...|[0.00142089211694...|
|7515828|(64,[24,34,59],[1...|[0.00605598618653...|
|2739852|(64,[3,9,10,12,14...|[0.00100620885952...|
|5961524|(64,[1,2,7,8,24],...|[0.00241699468525...|
|    272|(64,[7,17,23,28,4...|[0.00161053199979...|
|6573750|(64,[18,55],[1.0,...|[0.00808468265300...|
|  77265|(64,[2,56],[1.0,1...|[0.00808468265300...|
| 820297|(64,[11,19,35],[1...|[0.00605598618653...|
|1042553|(64,[17,21,27,35,...|[0.00142089211694...|
|2883820|(64

In [22]:
# creating a user profile based on the LDA results

lda_user_profile = transformed.join(user_df, ["paperID"]).orderBy("userID").select('userId', col('topicDistribution').alias('features'))

lda_user_profile = lda_user_profile.rdd.mapValues(lambda v: v.toArray()).reduceByKey(lambda x, y: x + y).mapValues(lambda x: DenseVector(x))\
.toDF(["userId", "features"])
lda_user_profile.show()

+--------------------+--------------------+
|              userId|            features|
+--------------------+--------------------+
|ffa86c7343f5ce8a7...|[0.00302236622220...|
|1006f001bb7ea1023...|[0.00201366349526...|
|42b59f0ee03fa51f3...|[0.00808468265300...|
|6a96998090c72b724...|[0.02135010739927...|
|6d35c61471e1803bc...|[0.00172569205872...|
|b953afb03918cb8fa...|[0.00293329210930...|
|d85f7d83f27b3f533...|[0.00829623376657...|
|a06e5e3519a8d90a8...|[0.09146511925821...|
|5fd7b7de422c5b11d...|[0.04219386992193...|
|f1e1cd4ff25018273...|[0.02427424171435...|
|b656009a6efdc8b1a...|[0.06709806077504...|
|3df36dc3cdc7e9086...|[0.01215723970024...|
|32930342b4a8ca975...|[0.01215723970024...|
|0530b8ed834fa63f2...|[0.00605598618653...|
|7d378131aeffcab66...|[0.00172569205872...|
|bd8345b325c5cf7a9...|[0.00201366349526...|
|b36c3189bb1457cd0...|[0.01215723970024...|
|bbcd9dae3160ddcb9...|[0.01921192528354...|
|ce250fe59a9a7d698...|[0.06465767495926...|
|5c7cfd33003a6a2c3...|[0.0516525

In [23]:
# Trains a k-means model.
kmeans_lda = KMeans().setK(50).setSeed(1)
model_lda = kmeans_lda.fit(lda_user_profile)

# Make predictions
predictions_lda = model_lda.transform(lda_user_profile)

predictions_lda.show()

+--------------------+--------------------+----------+
|              userId|            features|prediction|
+--------------------+--------------------+----------+
|ffa86c7343f5ce8a7...|[0.00302236622220...|        35|
|1006f001bb7ea1023...|[0.00201366349526...|        19|
|42b59f0ee03fa51f3...|[0.00808468265300...|        47|
|6a96998090c72b724...|[0.02135010739927...|        29|
|6d35c61471e1803bc...|[0.00172569205872...|        13|
|b953afb03918cb8fa...|[0.00293329210930...|         8|
|d85f7d83f27b3f533...|[0.00829623376657...|         0|
|a06e5e3519a8d90a8...|[0.09146511925821...|        11|
|5fd7b7de422c5b11d...|[0.04219386992193...|        16|
|f1e1cd4ff25018273...|[0.02427424171435...|         1|
|b656009a6efdc8b1a...|[0.06709806077504...|         5|
|3df36dc3cdc7e9086...|[0.01215723970024...|        46|
|32930342b4a8ca975...|[0.01215723970024...|        41|
|0530b8ed834fa63f2...|[0.00605598618653...|        25|
|7d378131aeffcab66...|[0.00172569205872...|        13|
|bd8345b32

In [25]:
### DBI for LDA ###

centers_lda = model_lda.clusterCenters()

#print(len(centers_lda[0]))

user_profile_lda = predictions_lda.rdd.map(lambda profile: (profile[0], profile[1], profile[2], DenseVector(centers_lda[profile[2]]))).toDF(['userID', 'features', 'prediction', 'center'])
user_profile_lda = user_profile_lda.rdd.map(lambda x: (x[2], x[1], float(distance.euclidean(x[1], x[3])))).toDF(['cluster', 'features', 'distance'])
average_lda = user_profile_lda.select('*').groupBy('cluster').agg(mean('distance'))
average_lda.show()


+-------+--------------------+
|cluster|       avg(distance)|
+-------+--------------------+
|     29|                 0.0|
|     26| 0.14103847428113697|
|     19|8.338292235946308E-5|
|      0|                 0.0|
|     22|                 0.0|
|      7|                 0.0|
|     34|                 0.0|
|     43|                 0.0|
|     32|                 0.0|
|     31|                 0.0|
|     39|                 0.0|
|     25| 0.24649043624563108|
|      6|                 0.0|
|      9|                 0.0|
|     27|                 0.0|
|     17|                 0.0|
|     41|                 0.0|
|     28|                 0.0|
|     33| 0.13129754752687012|
|      5|                 0.0|
+-------+--------------------+
only showing top 20 rows



In [26]:
### DBI for LDA Clusters ###

variances_lda = list(average_lda.select('avg(distance)').toPandas()['avg(distance)'])

print(dbi(variances_lda, centers_lda, 50))
        

0.00293010711337


In [141]:
# needed if you want to install the nltk on your docker image

import sys
!conda install --yes --prefix {sys.prefix} nltk

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda:

The following NEW packages will be INSTALLED:

    blinker:           1.4-py_1             conda-forge
    boto:              2.49.0-py36_0        defaults   
    boto3:             1.9.47-py_0          conda-forge
    botocore:          1.12.48-py_0         conda-forge
    bz2file:           0.98-py_0            conda-forge
    docutils:          0.14-py36_1001       conda-forge
    gensim:            3.5.0-py36_0         conda-forge
    jmespath:          0.9.3-py_1           conda-forge
    nltk:              3.2.5-py_0           conda-forge
    oauthlib:          2.1.0-py_0           conda-forge
    pyjwt:             1.6.4-py_0           conda-forge
    python-crfsuite:   0.9.6-py36h2d50403_0 conda-forge
    requests-oauthlib: 1.0.0-py_1           conda-forge
    s3transfer:        0.1.13-py36_1001     conda-forge
    smart_open:        1.7.1-py_0   