In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
sc = pyspark.SparkContext()

from collections import Counter
from pyspark.ml.clustering import LDA, LDAModel, DistributedLDAModel
from pyspark.ml.linalg import Vectors

import pickle as pkl

spark = SparkSession.builder.getOrCreate()
sc.setCheckpointDir('checkpoint/')

STEMMIZER = "porter"
RDD_FILENAME = "stemstem.parquet"

In [2]:
def get_rdd(filename):
    df = spark.read.parquet("data/" + filename)
    rdd = df.rdd.zipWithIndex().map(lambda r: (r[1], r[0]["Words"]))
    return sc.parallelize(rdd.take(50000))

In [3]:
rdd = get_rdd(RDD_FILENAME)

In [4]:
def count_words(rdd):
    def word_occurrences(l):
        counter = dict(Counter(l))
        return list(zip(counter.keys(), counter.values()))
    
    return rdd.mapValues(word_occurrences)

In [5]:
def wordify(rdd):
    lower_case_set = lambda s: set(map(lambda x: x.lower(), s))
    words = list(rdd.map(lambda p: set(p[1])).reduce(lambda x, y: lower_case_set(x) | lower_case_set(y)))
    return dict([(words[i], i) for i in range(len(words))])

In [6]:
def LDAify(rdd, words):
    return rdd.mapValues(lambda l: [(words[w.lower()], occurrences) for (w, occurrences) in l]).mapValues(lambda l: Vectors.sparse(len(words), dict(l))).map(lambda p: list(p))

In [7]:
#words = wordify(rdd)
words = {}
with open('data/words_' + STEMMIZER + '_lda.pickle', 'rb') as handle:
    words = pkl.load(handle)

In [8]:
X = LDAify(count_words(rdd), words)

In [9]:
def format_terms(indices, weights):
    words_list = list(words)
    return [(words_list[index], weights[i]) for (i, index) in enumerate(indices)]

In [10]:
def test_coefs():
    res = []
    for alpha in [1.01, 1.5, 2, 3]:
        for beta in [1.01, 1.5, 2, 3]:
            print("alpha = " + str(alpha) + " / beta = " + str(beta))
            lda = LDA(k = 10).setFeaturesCol("feature").setMaxIter(100).setOptimizer("em").setDocConcentration([float(alpha)]).setTopicConcentration(float(beta))
            X_ = X.toDF(["id", "feature"])
            model = lda.fit(X_)
            res += [model.describeTopics().rdd.map(lambda r: (r[0], format_terms(r[1], r[2]))).take(10)]
    
    interesting_indices = []
    for i in range(4):
        for j in range(4):
            for k, (_, words) in enumerate(res[4*i + j]):
                if "kindl" in list(map(lambda p: p[0], words)):
                    interesting_indices.append((4*i+j, k))

In [11]:
# Chosen from above
alpha = 2
beta = 1.5

In [12]:
#lda = LDA(k = 10).setFeaturesCol("feature").setMaxIter(300).setOptimizer("em").setDocConcentration([float(alpha)]).setTopicConcentration(float(beta))
#best_model = lda.fit(X.toDF(["id", "feature"]))

best_model = DistributedLDAModel.load("data/" + STEMMIZER + "_kindle_lda.parquet")

In [13]:
best_topic = best_model.describeTopics(best_model.vocabSize()).rdd.filter(lambda r: r[0] == 5).map(lambda r: r[2]).collect()[0]

In [14]:
best_topic_vect = Vectors.dense(best_topic)

In [15]:
def sim(u, v):
    return 1 - u.dot(v)/(u.norm(2)*v.norm(2))

In [16]:
def normalize(v):
    return Vectors.sparse(v.size, dict(zip(v.indices, map(lambda x: x/len(words), v.values))))

In [17]:
best_reviews = X.map(lambda r: (r[0], sim(normalize(r[1]), best_topic_vect))).filter(lambda r: r[1] > 0.999998).collectAsMap()

In [18]:
best_asins = spark.read.parquet("data/" + RDD_FILENAME).rdd.zipWithIndex().filter(lambda r: r[1] in best_reviews.keys()).map(lambda r: r[0]["Asin"]).collect()

In [19]:
best_asins

['B00HMQVE4G',
 'B00GR4XFPU',
 'B00HUS1CCU',
 'B005FCVU02',
 'B0079UAT0A',
 'B00K9XXA32',
 'B005H4V4M2',
 'B009IQNMTS',
 'B00L19SVQ4',
 'B00C8EUHQS',
 'B00ASDAWUM',
 'B00FS32GZG',
 'B00BI4J0S0',
 'B00JVF40C4',
 'B008C9INNC',
 'B00B392XIE',
 'B004QGY35W',
 'B00AXBK8YY',
 'B00BPSJ0JI',
 'B005DA0LN8',
 'B005COO1X6',
 'B00AFEOBZ6',
 'B00B2K1AWO',
 'B00I214D70',
 'B00K31DCD8',
 'B00AYYOSS2',
 'B00B4ISWTS',
 'B00DMNDTR8',
 'B0073VIZB0',
 'B00BC6SVY8',
 'B00DR0B31U',
 'B00ILKTBSI',
 'B00BEMD8FW',
 'B005IGBHRG',
 'B00AV2FS36',
 'B00BNVGUPO',
 'B002RI9TIW',
 'B00A94QQCI']