In [1]:
#
# Copyright © 2019 Sunho Kim. All rights reserved.
#

In [2]:
cd ..

/gorani/gorani/backend


In [3]:
from pyspark.sql import SparkSession, DataFrame

# Obtain the Spark session for this notebook (created on first use, reused
# otherwise) and keep a shortcut to its SparkContext.
spark = (
    SparkSession.builder
    .appName('Cluster Books')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# parameters
# k is handed to PowerIterationClustering.train as its second argument
# (the number of clusters to produce).
k = 2
# iteration is handed to PowerIterationClustering.train as its third argument
# (the cap on power-iteration steps).
iteration = 100

In [5]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by a notebook runner
# (e.g. papermill) - confirm.
# NOTE(review): neither alpha nor ratio is referenced by any other cell visible
# in this notebook; verify whether they are still needed.
alpha = 0.6
ratio = 0.1


In [6]:
import pyspark.sql.functions as F
from pyspark.ml.feature import CountVectorizer, Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

from gorani.spark import read_data_all, write_data
from gorani.utils import sparse_to_array

# Name of the similarity measure; the following cell computes cosine similarity.
# NOTE(review): this constant is not read by any cell visible here.
SIMILARITY_TYPE = 'cosine'

# Load the two input tables through the project reader. Only `words` is
# requested with caching enabled.
words = read_data_all(
    spark,
    'words',
    cache=True,
)
books = read_data_all(spark, 'books')

In [7]:
# Term-frequency model over each book's `content` column; the vocabulary size
# is capped at the number of rows in `words`.
cv = (
    CountVectorizer()
    .setInputCol('content')
    .setOutputCol('tf')
    .setVocabSize(words.count())
)
cv = cv.fit(books)

# Normalizer is used with its default settings (p=2 in the PySpark API), so
# each `norm` vector is scaled to unit length.
nor = Normalizer().setInputCol('tf').setOutputCol('norm')

# normalized book-wordcount matrix
tf_mat = nor.transform(cv.transform(books))

# Build a distributed BlockMatrix with one row per book, using the book `id`
# as the row index.
# The vector is densified with toArray() before being wrapped in IndexedRow -
# presumably because IndexedRow does not accept pyspark.ml vectors directly;
# confirm before changing.
mat = IndexedRowMatrix(
    tf_mat.select('id', 'norm')
    .rdd.map(lambda row: IndexedRow(row['id'], row['norm'].toArray()))
).toBlockMatrix()

# cosine similarity: with unit-length rows, mat * mat^T holds the pairwise
# cosine of every pair of rows.
sim_mat_df = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.toDF()

# Reshape the similarity matrix into (id, other_id, value) triples:
#   - posexplode emits one (pos, col) pair per matrix column,
#   - `id < other_id` keeps each unordered pair once and drops the diagonal,
#   - `id != 0` drops row index 0 (presumably no book has id 0 - confirm).
# The RDD is cached because it is evaluated more than once: by collect() just
# below and again by PowerIterationClustering.train in the next cell. Without
# the cache every action would redo the whole matrix product.
sim_af = (
    sim_mat_df
    .select(F.col('index').alias('id'), F.posexplode(sparse_to_array('vector')))
    .select('id', F.col('pos').alias('other_id'), F.col('col').alias('value'))
    .where('id < other_id AND id != 0')
    .rdd.map(lambda x: (x['id'], x['other_id'], float(x['value'])))
    .cache()
)
print('sim mat:')
# NOTE(review): collect() pulls every pair to the driver; fine for a handful of
# books, but consider take(n) for large inputs.
print(sim_af.collect())

sim mat:


[(4, 5, 0.67757248878479), (4, 6, 0.7573773860931396), (1, 2, 0.6806613802909851), (1, 3, 0.6414517760276794), (1, 4, 0.7125809788703918), (1, 5, 0.84701007604599), (1, 6, 0.8728290796279907), (3, 4, 0.9638029932975769), (3, 5, 0.6091843247413635), (3, 6, 0.6786766052246094), (5, 6, 0.8231562972068787), (2, 3, 0.9711453318595886), (2, 4, 0.9739340543746948), (2, 5, 0.6422935128211975), (2, 6, 0.7229636907577515)]


In [8]:
from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
from pyspark.sql import Row

# Run power iteration clustering on the (id, other_id, similarity) triples,
# asking for `k` clusters with at most `iteration` iterations.
model = PowerIterationClustering.train(sim_af, k, iteration)
df = model.assignments().toDF()

# Persist the assignments.
# NOTE(review): the same frame is written under two different names - confirm
# that both targets are intended.
for target in ('cluster_books', 'book_cluster'):
    write_data(target, df)

# Bring the assignments to the driver and print one "(id cluster)" line each.
result = model.assignments().collect()
print('result:')
print('(id cluster)')
for assignment in result:
    print('({} {})'.format(assignment.id, assignment.cluster))

result:
(id cluster)
(4 1)
(1 0)
(6 1)
(3 1)
(5 1)
(2 0)
