### Load Data into Spark DataFrame

In [115]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.sql.types import ArrayType, FloatType, StringType, DoubleType, IntegerType

In [2]:
spark = SparkSession.builder.getOrCreate()

#Read from json file (single-line mode by records)
cleaned_text = spark.read.json("cleaning_test_output.json") 
cleaned_text.createOrReplaceTempView('cleaned')

#Drop cleaned text of null values(852 rows removed, 134145 rows left)
query = '''SELECT * FROM cleaned WHERE cleaned_text IS NOT NULL''' 
text = spark.sql(query)

#Split cleaned text into column of tokens, create index column
text = text.withColumn('cleaned_text', F.trim(text.cleaned_text))\
           .withColumn('tokens',F.split(text.cleaned_text," "))\
           .withColumn("index", F.monotonically_increasing_id())\
           .select("*")
text.createOrReplaceTempView('cleaned_indexed')

In [None]:
#Schema of the text dataframe
text.printSchema()

In [None]:
#Preview of the first entry
text.take(1)

### Create TF-IDF Feature Vectors

#### Tf-Idf Sparse Vector
In Spark Maching Learning Library, Tf-Idf is separated into two parts - TF and IDF to make them flexible. Therefore, CountVectorizer is used first to generate the term frequency vector. IDF then takes feature vectors created from CountVectorizer and scales each column (token), down-weighting columns (tokens) which appear frequently in a corpus.





In [None]:
#Check vocabulary size for setting vocabSize parameters
unfold_df = text.withColumn('word',F.explode(F.split(text.cleaned_text, "\s"))).where('word != ""')
vocab_df = unfold_df.groupBy("word").agg(F.count("tokens").alias('df')) #84084

In [7]:
#Create tfidf vectors (with all words)
# TF
query = '''SELECT tokens, index FROM cleaned_indexed''' 
df_text = spark.sql(query)
cv = CountVectorizer(inputCol="tokens", outputCol="raw_features", vocabSize=84084)
cvModel = cv.fit(df_text)
result_cv = cvModel.transform(df_text)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

#Check DocSize and VocabSize
tfidf = result_tfidf.select('index','features')
print(tfidf.count(), len(cvModel.vocabulary))

### Train LDA Model on Complete Dataset and Evaluate
By training a base LDA model with complete dataset, a couple of topics would be discovered initially and evaluated by human judgement through visulization, which could be further improved by enriching stopwords and model parameter tunning.
#### Train LDA model with All Tokens

In [11]:
num_topics = 20
max_iterations = 100

lda = LDA(featuresCol="features",k=num_topics, seed=1, optimizer="online", maxIterations = max_iterations)
ldaModel=lda.fit(tfidf)
lda_df=ldaModel.transform(tfidf)

#### Identify Top Terms 

In [26]:
#Describe top 20 topics(10 top words per topic)
ldatopics = ldaModel.describeTopics()
numTopics = ldatopics.count()

ListOfIndexToWords = F.udf(lambda wl: list([vocabulary[w] for w in wl]), ArrayType(StringType()))
FormatNumbers = F.udf(lambda nl: [float("{:1.4f}".format(x)) for x in nl], ArrayType(FloatType()))

toptopics = ldatopics.select((ldatopics.topic + 1).alias('topic'),
                          ListOfIndexToWords(ldatopics.termIndices).alias('words'),
                          FormatNumbers(ldatopics.termWeights).alias('weights'))

toptopics.show(truncate=False, n=numTopics)
print('Topics:', numTopics, 'Vocabulary:', len(vocabulary))

+-----+-------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|topic|words                                                                                                  |weights                                                                         |
+-----+-------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|1    |[thyroid, cancer, thyca, support, group, tsh, conference, research, patients, page]                    |[0.019, 0.0133, 0.0103, 0.0083, 0.0071, 0.0058, 0.0053, 0.0052, 0.0044, 0.0043] |
|2    |[bladder, bcg, cancer, urologist, cystoscopy, grade, mum, lymphedema, treatments, urine]               |[0.0229, 0.0209, 0.0073, 0.0071, 0.0061, 0.0061, 0.0053, 0.0052, 0.005, 0.0047] |
|3    |[diet, food, water, foods, s

#### Identify Top Documents for Top Topics

In [156]:
result_cv

DataFrame[tokens: array<string>, index: bigint, raw_features: vector]

In [157]:
countVectors = (result_cv.select("index", "raw_features").cache())
df = ldaModel.transform(countVectors)

topWords = ldatopics.select(ListOfIndexToWords(ldatopics.termIndices).alias('words')).take(numTopics)

#Show single top topic
nTopDoc = 1  

for i in range(0, numTopics):
    ntopic = i  # which topic 
    print('Topic ' + str(ntopic) + '\n')  

    df_sliced = df.select("index", "topicDistribution") \
        .rdd.map(lambda r: Row(ID=int(r[0]), weight=float(r[1][ntopic]))).toDF()

    DocIDs = df_sliced.sort(df_sliced.weight.desc()).take(nTopDoc)
    print('Top Document(s):',DocIDs)
    for d_id in DocIDs:
        df_text.filter(df_text.index == d_id[0]) \
            .select('title', 'body') \
            .show(truncate=False)

    print('Top terms:')
    print(topWords[ntopic][0], '\n')
    print('===================================================')

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: index, raw_features'

In [None]:
#Count documents for each topic
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_style("whitegrid")

countVectors = (result_cv.select("index", "features").cache())

countTopDocs = (ldaModel
                .transform(countVectors)
                .select("topicDistribution")
                .rdd.map(lambda r: Row( nTopTopic = int(np.argmax(r)))).toDF()
                .groupBy("nTopTopic").count().sort("nTopTopic"))

pdf = countTopDocs.toPandas()
# pdfLess = pdf.drop(pdf.index[[1,3,7,8,10,11,14,15]]).reset_index()

# pdfLess.plot(color = '#44D3A5', legend = False,
#                            kind = 'bar', use_index = True, y = 'count', grid = False)
# plt.xlabel('topic')
# plt.ylabel('counts')

# plt.show()

#### Generate Evaluation Metrics

In [109]:
#Check Log Likelihood of base LDA model 
ll = ldaModel.logLikelihood(tfidf)
ll

-190290427.02601972

In [154]:
#Check Log Perplexity of base LDA model
lp = ldaModel.logPerplexity(tfidf)
lp

8.434250933211537

#### Save Base LDA Model

In [None]:
#Check if the model is distributed and save the model
print(ldaModel.isDistributed())
path = getcwd()
model_number = '0'
cvModel.save(path + 'CVModel_stem'+ model_number)
ldaModel.save(path + 'LDAModel_stem'+ model_number)
lda.save(path + 'LDA_'+ model_number)

### Model Improvement (more meaningful topics)

#### Generate TF-IDF-Term List

In [None]:
vocabulary = cvModel.vocabulary
ListOfIndexToWords = F.udf(lambda wl: list([vocabulary[w] for w in wl]), ArrayType(StringType()))
ExtractValues = F.udf(lambda vec: vec.values.tolist(), ArrayType(DoubleType()))
ExtractIndex = F.udf(lambda vec: vec.indices.tolist(), ArrayType(IntegerType()))
result_tfidf = result_tfidf.withColumn('feature_list',ExtractValues(result_tfidf.features))\
                           .withColumn('term_index', ExtractIndex(result_tfidf.features))
result_tfidf = result_tfidf.withColumn('term_list', ListOfIndexToWords(result_tfidf.term_index))

tfidf_termlist = result_tfidf.select('term_list','index','feature_list')\
                           .withColumn("tmp", F.arrays_zip('term_list','feature_list'))\
                           .withColumn('tmp', F.explode('tmp'))\
                           .select('index',F.col('tmp.term_list'),F.col('tmp.feature_list'))\
                           .orderBy('index','feature_list')

#Preview the first entry of TF-IDF-Term list
tfidf_termlist.take(1)

In [149]:
#Get terms with tfidf <= 2
tfidf_filter_2 = tfidf_termlist.filter(tfidf_termlist.feature_list <= 2) #146126 tfidf <= 2#
tfidf_filter_2.toPandas()['term_list'].unique()

array(['months', 'years', 'day', 'cancer', 'year', 'thanks', 'help',
       'time', 'pain'], dtype=object)

In [151]:
#Get terms with tfidf <= 3
tfidf_filter_3 = tfidf_termlist.filter(tfidf_termlist.feature_list <= 3)
tfidf_filter_3.toPandas()['term_list'].unique()

array(['results', 'months', 'symptoms', 'years', 'day', 'cancer', 'year',
       'things', 'blood', 'disease', 'body', 'family', 'diagnosis',
       'problems', 'thanks', 'treatment', 'husband', 'stage', 'lung',
       'doctor', 'days', 'chemo', 'help', 'way', 'need', 'hope', 'post',
       'doctors', 'care', 'love', 'share', 'test', 'hospital',
       'information', 'time', 'health', 'weeks', 'advice', 'thing',
       'heart', 'week', 'work', 'area', 'dr', 'question', 'surgery',
       'people', 'pain', 'lot', 'experience', 'times', 'morning', 'life',
       'night', 'support', 'today', 'past', 'month', 'home', 'end',
       'problem', 'effects', 'use', 'issues'], dtype=object)

#### Add Stopwords

In [None]:
#Time-scale words
noun_stpwd = \
            ['day','month','year','days','months','years','today','everyday',
             'time', 'weeks','dr']

#Words more than 1 letters (not common for medical abbreviations)
one_charachters = [word for word in cv_tmp_model.vocabulary if len(word) = 1]

swRemover = StopWordsRemover(inputCol='tokens', outputCol="filtered")
swRemover.setStopWords(swRemover.getStopWords() + noun_stpwd +one_characers)

df_text_new = swRemover.transform(df_text)

#### Parameter Tuning

In [2]:
#check document length
import json
import numpy as np

def load_R_model(filename):
    with open(filename, 'r') as j:
        data_input = json.load(j)
    data = {'topic_term_dists': data_input['phi'], 
            'doc_topic_dists': data_input['theta'],
            'doc_lengths': data_input['doc.length'],
            'vocab': data_input['vocab'],
            'term_frequency': data_input['term.frequency']}
    return data

movies_model_data = load_R_model('movie_reviews_input.json')

In [12]:
import pandas as pd
movies_model_data['doc_lengths']

[312,
 288,
 170,
 435,
 291,
 377,
 328,
 276,
 101,
 226,
 391,
 271,
 140,
 156,
 437,
 248,
 167,
 362,
 210,
 252,
 155,
 267,
 188,
 333,
 366,
 357,
 190,
 565,
 394,
 427,
 367,
 538,
 321,
 474,
 267,
 210,
 155,
 344,
 123,
 230,
 269,
 474,
 291,
 228,
 323,
 349,
 331,
 244,
 228,
 133,
 158,
 278,
 132,
 332,
 196,
 277,
 223,
 60,
 108,
 399,
 348,
 298,
 448,
 131,
 284,
 303,
 247,
 212,
 264,
 282,
 245,
 360,
 196,
 334,
 573,
 96,
 490,
 219,
 249,
 144,
 183,
 251,
 119,
 323,
 311,
 274,
 435,
 330,
 274,
 169,
 147,
 175,
 291,
 133,
 264,
 245,
 332,
 295,
 166,
 380,
 239,
 332,
 259,
 271,
 214,
 474,
 231,
 213,
 141,
 281,
 374,
 123,
 352,
 323,
 82,
 542,
 337,
 301,
 287,
 128,
 402,
 321,
 581,
 498,
 263,
 190,
 324,
 240,
 160,
 194,
 154,
 342,
 105,
 228,
 262,
 214,
 369,
 165,
 225,
 190,
 366,
 288,
 279,
 453,
 167,
 109,
 673,
 336,
 236,
 209,
 217,
 244,
 163,
 305,
 300,
 273,
 182,
 122,
 393,
 317,
 258,
 144,
 701,
 602,
 251,
 373,
 349,
 