# Introduction
This notebook contains a `Python 2` / `PySpark` script to find the top $N$ words for the positive and negative reviews in a given cluster.

# Notebook Setup

## Initialise modules

In [1]:
import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pymongo
import pandas as pd
import gzip # To parse gzip file
import re # Regex for text processing
import os # For setting up Mongo-Spark connector
import csv # To read/write CSV files

## Initialise PySpark session

Load `MongoDB-Spark` connector when starting up `PySpark`.

In [2]:
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'
dedicated_memory = '4g'

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} --driver-memory {} pyspark-shell' \
    .format(packages, dedicated_memory)

In [3]:
# Find SPARK_HOME
findspark.init()

# Create SparkSession
spark = (pyspark.sql.SparkSession
         .builder.appName('WordProcessing')
         .getOrCreate())

## Configure Pandas HTML display

In [4]:
pd.set_option('display.max_colwidth', -1)

## Define helper methods

In [5]:
def parse(path):
    '''
    Unzip a json.gz at `path` and returns a generator.
    '''
    g = gzip.open(path, 'rb')
    for line in g:
        yield eval(line)

def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Unzip and import json.gz file from `path` and loads it into mongo server.
    Create database index if `create_index` is True. 
    '''
    # Obtain handle to Mongo database and collection
    client = pymongo.MongoClient()
    collection = client[db][coll]
    
    # Return prematurely if database.collection already exists
    if (collection.count() != 0):
        print '{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll)
        return
    
    # Insert datapoints into Mongo database
    try:
        collection.insert_many((datapoint for datapoint in parse(path)))
        print 'JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll)
    except Exception as e:
        print 'Error loading data.\n{}'.format(e)
        client.close()
        return
    
    if not create_index:
        client.close()
        return
    
    # Create database index for improved searching
    # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Load the Mongo database to a Spark Session and returns the Spark DataFrame
    '''
    try:
        return (spark
                .read
                .format('com.mongodb.spark.sql.DefaultSource')
                .option('uri', 'mongodb://127.0.0.1/{}.{}'.format(db, coll))
                .load())
    except Exception as e:
        print 'Failed to create Spark dataframe.\n{}'.format(e)

def displayDF(sparkDF, n=10):
    '''
    Interactively displays the first n rows of a sparkDF as a pandas dataframe
    '''
    return (sparkDF
            .limit(n)
            .drop('_id', 'unixReviewTime')
            .toPandas())

In [12]:
import_to_mongo('../../Datasets/reviews_Baby.json.gz', coll='baby')

hackon.baby already exists on MongoDisk server. Exiting without loading JSON data.


# Word Processing Script

The function below takes in a Spark DataFrame containing reviews of a particular cluster. It returns a new DataFrame with two appended columns listing the top $N$ words based on *tf-idf* scores for good and bad reviews. 

> By default, *good reviews* are defined as reviews with **4-star** ratings and above, with the rest defined as *bad reviews*.

## Load Mongo toy dataset

In [6]:
# def append_toy_clusterID(DF):
#     '''
#     Append a toy 'clusterID' column to `DF` by assigning each row the clusterID corresponding to its '_id' field's last digit.
#     '''
#     def _create_toy_clusterID(_id):
#         return int(int(_id, 16) % 10)
    
#     udf = F.udf(lambda _id: _create_toy_clusterID(_id[0]), T.IntegerType())
    
#     return DF.withColumn('clusterID', udf(F.col('_id')))

# rawDF = append_toy_clusterID(load_mongo_to_spark('video_games'))

In [340]:
raw1DF = (spark
         .read
         .format('com.databricks.spark.csv')
         .options(header='true', inferschema='true')
         .load('../../Datasets/baby_product_summary.csv')
         .dropna()
         )
#          .withColumn('int', check_for_int('clusterID'))
#          .filter(F.col('int') == True)
#          .select('asin', 'clusterID'))
displayDF(raw1DF)

Unnamed: 0,asin,title,clusterId
0,188399313,Lifefactory 4oz BPA Free Glass Baby Bottles - 4-pack-raspberry and Lilac,523
1,188399518,Planetwise Flannel Wipes,975
2,188399399,Planetwise Wipe Pouch,802
3,316967297,Annas Dream Full Quilt with 2 Shams,281
4,615447279,Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book,1130
5,670062049,5 Pink Gumdrops + One Pacifier Clip,2372
6,705391752,A Tale of Baby's Days with Peter Rabbit,1014
7,097293751X,"Baby Tracker&reg; - Daily Childcare Journal, Schedule Log",3044
8,974671517,Wee Gallery Twins Board Book,2231
9,980027519,Nature's Lullabies First and Second Year Calendars,11


In [10]:
raw2DF = load_mongo_to_spark('baby')
displayDF(raw2DF, 2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,188399313,"[0, 0]",5.0,"They work very well. Easy to clean, we wash them in the dishwasher every day. Our LO loves to hold on to the bottle and the plastic covering makes it easy for her to hold on to.","05 27, 2013",A28O3NP6WR5517,Jennifer gymer,These bottles are great!
1,188399399,"[1, 1]",5.0,it came early and was not disappointed. i love planet wise bags and now my wipe holder. it keps my osocozy wipes moist and does not leak. highly recommend it.,"04 9, 2013",AX0M1Z6ZWO52J,Ash M.,perfect


In [11]:
rawDF = raw1DF.join(raw2DF, on='asin')
displayDF(rawDF, 3)

Unnamed: 0,asin,clusterID,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,B000JJY13Y,3210,"[0, 0]",5.0,"A very well-made toy. Mine had been in the store for months but was still in good shape. I'm sure he'll stand up to much good-natured rough-housing. They've even made a few adjustments to make him more cuddly -- the real Grover on TV has plastic eyes, whereas this Grover's eyes are made of cloth. Good thinking -- I'm sure this Grover will be accompanying lots of kids to Dreamland.","12 23, 2008",A1BSO69KR49GKF,Bradley D. Hall,Top-notch!
1,B000JJY13Y,3210,"[0, 0]",5.0,This was a great gift for my 5-year old grandson. He likes Grover and the puppet is almost as big as he is. Very cute and still cuddly even at 36 inches. Also came very quickly after ordering--very satisfied.,"01 15, 2007",A2PMST8J2GYPPH,J. Barclay,Grover puppet SO cute!
2,B000JJY13Y,3210,"[0, 0]",5.0,"We love our Grover, with this muppet it is easy to feel like Grover yourself as you move his mouth and arms while doing his voice. The hand hole for the mouth is plenty big enough for an adult but also allows operation by small hands as well. The hands have holes for all of his fingers and are a little tight for my hands (as a 34 yo male) but they still work ok.Grover seems to be well stitched and durable, though he is certainly not machine washable because of his inner puppet parts. Maybe the best part is that at with Grover's height, he is right at or close to eye level with your 4-6 yo child, which makes him seem more real.","12 30, 2008",A3FAE1PANJQ4RR,Muppet Man,This Grover is awesome and fun


## Load stopwords
A list of stopwords is loaded as a Python list and broadcasted in PySpark.

In [12]:
# Load stopwords into list
with open('stopwords.csv', 'r') as csvFile:
    fileReader = csv.reader(csvFile)
    stopwords = []
    for word in fileReader:
        stopwords.extend(word)
        
# Add '' to stopwords
stopwords.append('')

# Broadcast stopwords
stopwords_broadcast = spark.sparkContext.broadcast(stopwords)
        
print 'List of stopwords:\n\n{}'.format(stopwords)

List of stopwords:

['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'with', 'had', 'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him', 'nor', 'did', 'these', 't', 'each', 'where', 'because', 'doing', 'theirs', 'some', 'are', 'our', 'ourselves', 'out', 'what', 'for', 'below', 'does', 'above', 'between', 'she', 'be', 'we', 'after', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 'down', 'your', 'from', 'her', 'whom', 'there', 'been', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', 'that', 'but', 'off', 'herself', 'than', 'those', 'he', 'me', 'myself', 'this', 'up', 'will', 'while', 'can', 'were', 'my', 'and', 'then', 'is', 'in', 'am', 'it', 'an', 'as', 'itself', 'at', 'have', 'further', 'their', 'if', 'again', 'no', 'when', 'same', 'any', 'how', 'other', 'which', 'you', 'who', 'most', 'such', 'why', 'a', 'don', 'i', 'having', 'so', 

## Preprocess `rawDF`

In [13]:
def filter_helpful_reviews(sparkDF, pct_helpful, min_votes):
    '''
    Return a dataframe filtered by comments that are at least `pct_helpful`% helpful 
    and contain a minimum number of votes.
    '''
    def _filter_helpful_reviews(votes, pct_helpful, min_votes):
        '''
        Return bool if comment are at least `pct_helpful`% helpful and have at least `min_votes` vote.
        
        Inputs:
            votes: A list of votes by [helpful, total_votes]
            pct_helpful: A float
            min_votes: A float
        '''
        return (votes[1] >= min_votes and (float(votes[0]) / votes[1] >= pct_helpful))
    
    udf = F.udf(lambda votes: _filter_helpful_reviews(votes, pct_helpful, min_votes), T.BooleanType())
    
    return sparkDF.filter(udf(F.col('helpful')))

def preprocess_DF(rawDF):
    '''
    Preprocess text of dataframe by:
        - selecting relevant columns ('clusterID', 'overall', 'review')
        - concatenating 'summary' and 'reviewText' columns
        - removing punctuations
        - filtering out stopwords
    '''
    
    def _filter_stopwords(text):
        '''
        Input:
            text: A string concatenation of the columns 'summary' and 'reviewText'.
        Returns:
            A list of strings with stopwords filtered out.
        '''
        string_list = re.split(r'\W+', text.lower())
        
        return [word for word in string_list if word not in stopwords_broadcast.value]
    
    udf = F.udf(lambda text: _filter_stopwords(text), T.ArrayType(T.StringType()))
    
    return (rawDF
            .select(F.concat_ws('-', 
                                F.col('asin'), 
                                F.col('reviewerID')).alias('reviewID'),
                    F.col('clusterID'),
                    F.col('overall'), 
                    udf(F.concat_ws(' ', 
                                    F.col('summary'), 
                                    F.col('reviewText'))).alias('reviews')))

In [14]:
preprocessedDF = preprocess_DF(filter_helpful_reviews(rawDF, pct_helpful=.5, min_votes=3))
displayDF(preprocessedDF, 5)

Unnamed: 0,reviewID,clusterID,overall,reviews
0,B000JK2LB2-A3PPEED8SHD8R7,291,5.0,"[perfect, travelling, toddler, live, africa, travel, us, africa, regularly, bought, last, journey, love, takes, little, room, much, easier, using, stroller, folds, nicely, fit, overhead, son, loved, riding, thought, little, tray, great, gate, one, pilots, stopped, take, picture, could, get, one, child, use, also, came, handy, crowded, gate, weren, many, seats, baggage, claim, travel, toddler, know, someone, buy, product]"
1,B000JK2LB2-A3AC5BCN2AA2CZ,291,5.0,"[best, money, ever, spent, ride, carry, saved, sanity, making, cross, country, trip, two, layovers, three, children, age, five, attached, one, two, carry, ons, wore, newborn, sling, sailed, experience, everywhere, went, travelers, complimented, asked, bought, re, versatile, tray, head, rest, making, airport, snacking, easy, sturdy, sleek, cute, easy, use, re, planning, trip, promise, won, sorry, bought]"
2,B000JK2LB2-A15J3MCQ5GV6AJ,291,5.0,"[awesome, daughter, hyper, hard, keep, running, place, traveling, bit, hassle, got, awesome, ride, carry, saved, seats, enjoying, ride, watching, everything, painting, playing, eating, moving, really, awesome, product, thank, think, twice, getting, make, traveling, toddlers, pleasurable, experience]"
3,B000JK2LB2-A2PSQ6VL6NXENW,291,5.0,"[travel, without, one, 2, years, first, thing, pack, happier, travellers, since, got, one, easy, airports, days, extra, hand, free, sturdy, stays, clean, mine, hardly, looks, used, son, enjoyed, daughter, says, chair, sees, pull, seat, sit, great, know, child, safely, strapped, behind, checking, desk, looking, boarding, pass, show, yet, another, person, getting, dressed, security, otherwise, distracted, run, make, flight, connection, kids, think, great, fun, strap, around, middle, carry, bag, around, handle, times, fast, headrest, used, traytable, useful, gate, snacks, activities, even, bring, kids, airplane, aisle, fold, seat, store, onboard, quite, flat, folded, love, couldn, without]"
4,B000JK2LB2-A1V14KOGUJAC5T,291,5.0,"[genius, invented, bought, product, help, move, two, kids, one, stroller, year, half, older, three, half, happy, sit, watch, people, around, able, move, fast, maneuver, even, stroller, hand, good, quality, folds, easy, fits, easily, board, hand, bag, give, five, stars, could, buy, hesitate, worths, every, penny]"


### Check for illegal `clusterID`s

In [15]:
@F.udf(returnType=T.BooleanType())
def check_int_instance(clusterID):
    return True if isinstance(clusterID, int) else False
displayDF(preprocessedDF.filter(check_int_instance(F.col('clusterID')) == False))

Unnamed: 0,reviewID,clusterID,overall,reviews


## Label ratings for `preprocessedDF`

In [16]:
@F.udf(returnType=T.BooleanType())
def label_review_quality(rating, min_good_review_rating=4):
    '''
    UDF to append a column label of 'isPositiveReview' to the DataFrame.
    
    Inputs:
        rating: Rating from 1.0 to 5.0 (int expressed as float)
    Outputs:
        True if review has a rating of at least `min_good_review_rating`, otherwise False.
    '''
    return True if rating >= min_good_review_rating else False

In [17]:
polarizedDF = (preprocessedDF
               .withColumn('positiveReview', label_review_quality('overall')))
displayDF(polarizedDF, 5)

Unnamed: 0,reviewID,clusterID,overall,reviews,positiveReview
0,B000JK2LB2-A3PPEED8SHD8R7,291,5.0,"[perfect, travelling, toddler, live, africa, travel, us, africa, regularly, bought, last, journey, love, takes, little, room, much, easier, using, stroller, folds, nicely, fit, overhead, son, loved, riding, thought, little, tray, great, gate, one, pilots, stopped, take, picture, could, get, one, child, use, also, came, handy, crowded, gate, weren, many, seats, baggage, claim, travel, toddler, know, someone, buy, product]",True
1,B000JK2LB2-A3AC5BCN2AA2CZ,291,5.0,"[best, money, ever, spent, ride, carry, saved, sanity, making, cross, country, trip, two, layovers, three, children, age, five, attached, one, two, carry, ons, wore, newborn, sling, sailed, experience, everywhere, went, travelers, complimented, asked, bought, re, versatile, tray, head, rest, making, airport, snacking, easy, sturdy, sleek, cute, easy, use, re, planning, trip, promise, won, sorry, bought]",True
2,B000JK2LB2-A15J3MCQ5GV6AJ,291,5.0,"[awesome, daughter, hyper, hard, keep, running, place, traveling, bit, hassle, got, awesome, ride, carry, saved, seats, enjoying, ride, watching, everything, painting, playing, eating, moving, really, awesome, product, thank, think, twice, getting, make, traveling, toddlers, pleasurable, experience]",True
3,B000JK2LB2-A2PSQ6VL6NXENW,291,5.0,"[travel, without, one, 2, years, first, thing, pack, happier, travellers, since, got, one, easy, airports, days, extra, hand, free, sturdy, stays, clean, mine, hardly, looks, used, son, enjoyed, daughter, says, chair, sees, pull, seat, sit, great, know, child, safely, strapped, behind, checking, desk, looking, boarding, pass, show, yet, another, person, getting, dressed, security, otherwise, distracted, run, make, flight, connection, kids, think, great, fun, strap, around, middle, carry, bag, around, handle, times, fast, headrest, used, traytable, useful, gate, snacks, activities, even, bring, kids, airplane, aisle, fold, seat, store, onboard, quite, flat, folded, love, couldn, without]",True
4,B000JK2LB2-A1V14KOGUJAC5T,291,5.0,"[genius, invented, bought, product, help, move, two, kids, one, stroller, year, half, older, three, half, happy, sit, watch, people, around, able, move, fast, maneuver, even, stroller, hand, good, quality, folds, easy, fits, easily, board, hand, bag, give, five, stars, could, buy, hesitate, worths, every, penny]",True


In [18]:
polarizedDF.count()

82591

## Aggregate ratings by `clusterID` and `positiveReview`

In [19]:
@F.udf(returnType=T.ArrayType(T.StringType()))
def flatten(nested_list):
    '''
    Flatten a list of list to a one dimensional list.
    '''
    final_list = []
    for list_ in nested_list:
        final_list.extend(list_)
        
    return final_list

@F.udf(returnType=T.BooleanType())
def bool_invert(boolean):
    '''
    Invert the value of a boolean
    '''
    return not boolean

In [20]:
aggregatedDF = (polarizedDF
                .groupBy('clusterID', 'positiveReview')
                .agg(F.collect_list('reviews').alias('collectedReviews'))
                .withColumn('tokens', flatten('collectedReviews'))
                .drop('collectedReviews')
                .sort('clusterID', bool_invert('positiveReview'))
                .cache())
displayDF(aggregatedDF, 6)

Unnamed: 0,clusterID,positiveReview,tokens
0,0,True,"[great, nightlight, replacement, nightlight, died, 4, 1, 2, years, use, terrific, design, soft, design, ease, use, make, perfect, nightlight, son, 4, 1, 2, loves, sharp, tooth, dinosaur, changing, colors, even, perfected, selecting, color, likes, night, highly, recommend, product, one, child, great, gift, sent, rex, gift, boy, mother, wrote, love, tyrannosaurus, light, anthony, favorite, bedtime, buddy, never, forgets, get, charger, take, bed, book, choice, night, love, gift, child, keeps, playing, great, died, company, rocks, great, product, son, loved, died, base, seem, charge, little, dino, properly, 6, months, died, warranty, 90, days, ordering, another, one, hopes, lasts, longer, really, great, light, subsequent, ...]"
1,0,False,"[dont, waste, money, 4, different, animals, posession, last, year, kinderglo, great, customer, service, lights, crap, one, lasted, week, another, month, third, one, lasted, day, final, one, lasted, 3, months, either, charger, stops, charging, light, light, stops, taking, charge, whatever, wasted, 70, total, included, cost, first, light, shipping, light, back, stopped, working, get, replacement, note, nobody, touched, lights, husband, charged, various, kitchen, outlets, reach, son, set, dresser, reach, bedtime, please, waste, money, like, going, ask, money, back, lets, hope, get, back, wonderful, charging, station, really, fragile, purchased, animals, trex, brontosaurus, owl, three, sons, opinion, based, three, items, merely, one, foremost, boys, ...]"
2,1,True,"[replaces, standard, baggies, ages, another, reviewer, said, perfect, ages, love, fact, use, snack, cup, pack, daughter, orange, color, one, vibrant, misplaced, perfect, put, cheerios, little, ones, put, snacks, bigger, cereal, frost, mini, wheats, holds, handful, overeat, feared, lid, might, hard, open, easy, open, without, much, effort, yet, holds, wonderful, seal, hand, washed, cup, like, fact, top, shelf, dishwasher, safe, bonuses, see, cup, surface, know, getting, bottom, fits, perfectly, cars, cup, holder, grab, bite, without, look, away, since, lid, opening, generous, fact, bpa, pvc, free, feel, lot, better, using, daughter, also, someone, else, stated, replaces, dreadful, baggies, anything, put, baggie, ends, ...]"
3,1,False,"[oxo, tot, snack, cup, handy, limited, oxo, tot, flip, top, snack, cup, semi, handy, snack, container, small, children, adults, well, snack, cup, holds, small, portion, favorite, snack, takes, little, space, sized, right, fit, small, hands, two, little, girls, like, youngsters, often, need, snack, giving, large, box, favorite, chip, cracker, isn, wise, move, discovered, one, occasion, small, container, like, young, ones, carry, snack, around, serve, since, container, small, also, limits, amount, eat, adults, also, use, snack, container, many, like, compact, size, fit, purses, small, spaces, without, problem, handy, storage, small, bite, size, portions, however, small, size, also, liability, perspective, adult, amount, ...]"
4,2,True,"[fantastic, say, enough, good, things, toy, perfect, flashing, lights, bright, colors, catchy, music, plays, certain, actions, performed, continuously, like, many, toys, peek, blocks, awesome, rounded, corners, edges, interesting, moving, parts, inside, etc, place, blocks, top, incrediblock, lady, speaks, name, object, inside, block, cracks, daughter, every, single, time, spent, countless, hours, moving, side, side, incrediblock, investigating, fascinating, things, also, perfect, height, new, walker, stander, play, probably, coolest, part, perspective, mom, incrediblock, built, block, storage, playtime, put, blocks, away, behind, little, door, avoid, stepping, blocks, house, highly, recommend, buying, set, extra, blocks, got, alphabet, set, re, great, supplement, included, blocks, think, comes, ...]"
5,2,False,"[peek, blocks, work, toy, bought, toy, thought, peek, blocks, would, work, toy, thought, toy, would, say, letter, name, item, block, placed, top, 8, blocks, come, wouldn, purchased, toy, known, 8, blocks, would, really, work, alphabet, blocks, actually, say, wrong, item, rather, playing, song, confusing, buy, big, piece, plastic, junk, thought, d, get, lot, bang, buck, educational, potential, alphabet, blocks, totally, lost, toy, bad, couldn, taken, pretty, cool, technology, bit, include, alphabet, blocks]"


In [183]:
aggregatedDF.count()

5873

## Select top $N$ words ranked by TF-IDF for positive and negative reviews

In [None]:
@F.udf(returnType=T.ArrayType(T.StringType()))
def bin_pos_top_words(words, positiveReview):
    return words if positiveReview == True else None

@F.udf(returnType=T.ArrayType(T.StringType()))
def bin_neg_top_words(words, positiveReview):
    return words if positiveReview == False else None

In [174]:
def top_N_words(DF, N=10):
    '''
    Obtains the top N words of positive and negative reviews respectively, ranked by tf-idf of entries in a DF.
    Inputs:
        DF: A Spark DataFrame containing columns 'clusterID', 'positiveReview' and 'tokens'.
        N:  Number of top-ranking words to keep
    Outputs:
        A DataFrame of schema (_clusterID_, _positiveReview_, top_N_pos, top_N_neg)
    '''
    
    def tf(tokens):
        '''
        Calculate the token frequency (TF) for each review.

        Inputs:
            tokens: A list of token strings.
        Outputs:
            A dictionary of (token, tf).
        '''
        N = len(tokens)

        return {token: float(tokens.count(token)) / N for token in tokens}

    def idf(corpusDF):
        '''
        Calculate the inverse document frequency for the corpusDF.

        Inputs:
            corpusDF: A Spark DataFrame containing columns 'tokens'.
        Outputs:
            A dictionary of (token, idf).
        '''
        # Calculate the number of reviews
        N = corpusDF.count()

        # Create an RDD with entries (uniq_tokens_in_each_review, 1)
        checkpointRDD = (corpusDF
                         .select('tokens')
                         .rdd
                         .flatMap(lambda tokens: list(set(tokens[0])))
                         .map(lambda token: (token, 1)))

        # Produce a dict containing (token, idf)
        return (checkpointRDD
                .reduceByKey(lambda a, b: a + b)
                .mapValues(lambda count: float(count) / N)
                .collectAsMap())
    
    # Calculate tf's as an RDD
    tfRDD = (DF
           .select('clusterID', 'positiveReview', 'tokens')
           .rdd
           .map(lambda (clusterID, positiveReview, tokens): ((clusterID, positiveReview), tf(tokens))))
    
    # Calculate idf's as a dict and return a broadcasted instance
    idfs_broadcast = spark.sparkContext.broadcast(idf(DF.select('tokens')))
    
    # Calculate tfidf as an rdd of (clusterID, positiveReview, top_N_tokens)
    topNwordsRDD = (tfRDD
                    .mapValues(lambda tf_dict: [(token, float(tf_dict[token]) / idfs_broadcast.value[token]) for token in tf_dict.keys()])
                    .sortBy(lambda ((clusterID, positiveReview), tfidfs): ((clusterID, not positiveReview, sorted(tfidfs, key=lambda (token, tfidf): -tfidf))))
                    .mapValues(lambda tfidfs: map(lambda (token, tfidf): token, tfidfs)[:N])
                    .map(lambda ((clusterID, positiveReview), top_N_tokens): (clusterID, positiveReview, top_N_tokens))
                    .cache())
    
#     return (tfRDD, idf(DF.select('tokens')), topNwordsRDD)
#     return topNwordsRDD
    
    # Transform topNwordsRDD to an RDD of form (clusterID, top_N_positive, top_N_negative)
    temp_schema = T.StructType([
        (T.StructField('clusterID', T.IntegerType())),
        (T.StructField('positiveReview', T.BooleanType())),
        (T.StructField('topWords', T.ArrayType(T.StringType())))
    ])
    
    return (spark
            .createDataFrame(topNwordsRDD, temp_schema)
            .withColumn('posWords', bin_pos_top_words('topWords', 'positiveReview'))
            .withColumn('negWords', bin_neg_top_words('topWords', 'positiveReview'))
            .drop('positiveReview', 'topWords')
            .groupBy('clusterID')
            .agg(F.collect_list('posWords').alias('posWords'), F.collect_list('negWords').alias('negWords'))
            .cache()
#             .select('clusterID', F.explode('posWords').alias('posWords'), 'negWords')
#             .select('clusterID', 'posWords', F.explode('negWords').alias('negWords')))
#                 .groupBy('clusterID')
#                 .agg(F.collect_list('topWords').alias('posAndNegWords'))
#                 .rdd
#                 .map(lambda (clusterID, words): (clusterID, words[0], words[1])))
           )
    
#     return finalRDD
    
    # Return finalRDD as a dataframe
    final_schema = T.StructType([
        (T.StructField('clusterID', T.IntegerType())),
        (T.StructField('posWords', T.ArrayType(T.StringType()))),
        (T.StructField('negWords', T.ArrayType(T.StringType())))
    ])
    
    return (spark
            .createDataFrame(finalRDD, final_schema)
            .sort('clusterID')
            .cache())

In [173]:
displayDF(finalRDD
 .withColumn('posWords', bin_pos_top_words('topWords', 'positiveReview'))
 .withColumn('negWords', bin_neg_top_words('topWords', 'positiveReview'))
 .drop('positiveReview', 'topWords')
 .groupBy('clusterID')
 .agg(F.collect_list('posWords').alias('posWords'), F.collect_list('negWords').alias('negWords'))
 .select('clusterID', F.explode('posWords').alias('posWords'), 'negWords')
 .select('clusterID', 'posWords', F.explode('negWords').alias('negWords'))
 , 3)

Unnamed: 0,clusterID,posWords,negWords
0,148,"[managed, lack, sleek, sleep, go, bank, compact, outgrowing, jogger, concerned]","[mini, right, thankfully, anyway, one, single, hassle, purchased, wonderful, actually]"
1,463,"[son, summer, since, super, mom, switched, money, saver, anyway, wintee]","[refund, child, cheap, years, looks, synthetic, giant, clasps, fit, ment]"
2,471,"[sage, course, colors, looks, imagined, perfect, based, patches, better, theme]","[demand, month, shortcomings, hate, causes, improperly, jogger, description, send, finally]"


In [144]:
def column_sort_decorator(positiveReview):
    def 
    return F.udf()

@F.udf(returnType=T.ArrayType(T.StringType()))
def bin_top_words(words, positiveReview, binOnPositiveCol):
    return words if positiveReview == binOnPositiveCol else None

displayDF(penulRDD
 .groupBy('clusterID')
 .agg(F.collect_list('topWords').alias('posAndNegWords'))
                  .filter(F.udf(lambda x: len(x) < 2, T.BooleanType())('posAndNegWords'))
#  .rdd
 .map(lambda (clusterID, words): (clusterID, words[0], words[1]))
#  .collect()
)

Unnamed: 0,clusterID,posAndNegWords
0,2366,"[[individually, love, sweet, 80, find, best, even, 00, compared, girls]]"
1,2866,"[[help, less, scratches, damage, nursery, advertised, go, tone, assembled, roomy]]"
2,3175,"[[son, old, figure, snacks, able, wasn, idea, month, ended, good]]"
3,3749,"[[hooks, move, rest, month, padded, using, years, yes, bit, make]]"
4,1483,"[[even, like, sheet, fabric, would, thin, disposable, cheap, completely, hemmed]]"
5,1507,"[[givers, lack, sooo, four, catch, paper, sleep, go, love, yes]]"
6,2580,"[[washcloth, didn, ordered, money, results, one, newborn, baby, something, want]]"
7,3475,"[[cute, stores, limited, september, money, course, colors, sleep, saving, still]]"
8,1975,"[[life, size, perfect, cut, aren, stuck, 7, white, really, nice]]"
9,2443,"[[full, old, ve, pointing, snacks, hands, sign, doesn, one, used]]"


In [128]:
displayDF(penulRDD, 3)

Unnamed: 0,clusterID,posAndNegWords
0,148,"[[managed, lack, sleek, sleep, go, bank, compact, outgrowing, jogger, concerned], [mini, right, thankfully, anyway, one, single, hassle, purchased, wonderful, actually]]"
1,463,"[[son, summer, since, super, mom, switched, money, saver, anyway, wintee], [refund, child, cheap, years, looks, synthetic, giant, clasps, fit, ment]]"
2,471,"[[sage, course, colors, looks, imagined, perfect, based, patches, better, theme], [demand, month, shortcomings, hate, causes, improperly, jogger, description, send, finally]]"


In [140]:
# Return finalRDD as a dataframe
final_schema = T.StructType([
    (T.StructField('clusterID', T.IntegerType())),
    (T.StructField('posWords', T.ArrayType(T.StringType()))),
    (T.StructField('negWords', T.ArrayType(T.StringType())))
])

actualFinalDF = (spark
        .createDataFrame(actualFinalRDD, final_schema)
        .sort('clusterID')
        .cache())

Py4JJavaError: An error occurred while calling o2428.cache.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 409.0 failed 1 times, most recent failure: Lost task 7.0 in stage 409.0 (TID 13789, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-139-4b7f6bf48a7d>", line 6, in <lambda>
IndexError: list index out of range

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
	at org.apache.spark.RangePartitioner$$anonfun$9.apply(Partitioner.scala:263)
	at org.apache.spark.RangePartitioner$$anonfun$9.apply(Partitioner.scala:261)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:844)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:844)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:266)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange$.prepareShuffleDependency(ShuffleExchange.scala:221)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:87)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:124)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:115)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:115)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:252)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:386)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.columnar.InMemoryRelation.buildBuffers(InMemoryRelation.scala:91)
	at org.apache.spark.sql.execution.columnar.InMemoryRelation.<init>(InMemoryRelation.scala:86)
	at org.apache.spark.sql.execution.columnar.InMemoryRelation$.apply(InMemoryRelation.scala:42)
	at org.apache.spark.sql.execution.CacheManager$$anonfun$cacheQuery$1.apply(CacheManager.scala:100)
	at org.apache.spark.sql.execution.CacheManager.writeLock(CacheManager.scala:68)
	at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:92)
	at org.apache.spark.sql.Dataset.persist(Dataset.scala:2513)
	at org.apache.spark.sql.Dataset.cache(Dataset.scala:2523)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-139-4b7f6bf48a7d>", line 6, in <lambda>
IndexError: list index out of range

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.random.SamplingUtils$.reservoirSampleAndCount(SamplingUtils.scala:41)
	at org.apache.spark.RangePartitioner$$anonfun$9.apply(Partitioner.scala:263)
	at org.apache.spark.RangePartitioner$$anonfun$9.apply(Partitioner.scala:261)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:844)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:844)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
displayDF(actualFinalDF)

In [175]:
finalDF = top_N_words(aggregatedDF)
displayDF(finalDF)

Unnamed: 0,clusterID,posWords,negWords
0,148,"[managed, lack, sleek, sleep, go, bank, compact, outgrowing, jogger, concerned]","[mini, right, thankfully, anyway, one, single, hassle, purchased, wonderful, actually]"
1,463,"[son, summer, since, super, mom, switched, money, saver, anyway, wintee]","[refund, child, cheap, years, looks, synthetic, giant, clasps, fit, ment]"
2,471,"[sage, course, colors, looks, imagined, perfect, based, patches, better, theme]","[demand, month, shortcomings, hate, causes, improperly, jogger, description, send, finally]"
3,496,"[consider, chinese, month, hanging, go, row, hold, contours, worth, brown]","[chinese, money, clothdiapers, sent, cheap, month, colors, poopy, advertised, go]"
4,1088,"[cute, set, rating, colors, buy, want, shake, impossible, still, perfect]","[feed, didn, metal, half, tended, son, utensils, fine, size, even]"
5,1238,"[limited, since, reviewers, knotted, relieve, rod, results, month, ladders, known]","[concept, month, chewable, sleep, hanging, go, saved, causes, illegible, finally]"
6,1342,"[hooks, month, mp3, children, carpet, segments, young, stable, whack, putting]","[concept, overpriced, chinese, money, cheap, month, shape, mp3, interpreter, ripoff]"
7,1580,"[valve, switched, personally, people, resistance, month, four, known, supplement, sleep]","[wrestle, lack, month, themes, straws, go, hate, assembled, issues, seemed]"
8,1591,"[cute, infant, holds, less, reviewers, stays, displayed, damage, leads, held]","[managed, gp, beware, dollar, scuffed, four, go, children, compact, overpricd]"
9,1645,"[scratch, beware, scratches, month, cheerful, mild, softer, go, saved, carpet]","[saying, dyes, consider, contacted, scrubbing, move, ewonderworld, results, month, discovered]"


In [182]:
finalDF.count()

2626

In [194]:
displayDF(finalDF.filter(F.col('clusterID') == 317))

Unnamed: 0,clusterID,posWords,negWords
0,317,"[cute, salon, show, cheap, soon, decals, looks, go, glue, perfect]","[since, less, money, move, soon, decals, lovebirds, looks, bedroom, still]"


### TODO:
- Remove digits (maybe except years)
- Filter out common top words that appear in both posWords and negWords

## Cluster Analyzer Script
The following code analyzes how frequent words appear in product titles of a given cluster. This is used as a tentative proxy for clustering effectiveness.

In [247]:
from collections import Counter
import random

In [337]:
# Load stopwords into list
with open('stopwords.csv', 'r') as csvFile:
    fileReader = csv.reader(csvFile)
    stopwords = []
    for word in fileReader:
        stopwords.extend(word)
        
# Add '' to stopwords
stopwords.append('')

# Broadcast stopwords
stopwords_broadcast = spark.sparkContext.broadcast(stopwords)

@F.udf(returnType=T.ArrayType(T.StringType()))
def tokenize_set_and_filter_stopwords(text):
    '''
    Tokenizes a list of words, before filtering for stopwords. Return a setted list of words.
    
    Input:
        text: A string.
    Returns:
        A list of setted words with stopwords removed.
    '''
    string_set = set(re.split(r'\W+', text.lower()))

    return [word for word in string_set if word not in stopwords_broadcast.value]

@F.udf(returnType=T.IntegerType())
def assign_clusterID():
    return random.randint(0, 10)

@F.udf(returnType=T.ArrayType(T.StructType([
    T.StructField('token', T.StringType()),
    T.StructField('frequency', T.FloatType())
])))
# @F.udf
def analyze_word_frequency(tokens, N=10):
    '''
    Calculate the frequency of words appearing in titles of a given cluster.
    
    Inputs:
        tokens: A list of list of words.
    Returns:
        A list of tuple (token, frequency) of the top N words, sorted in decreasing frequency.
    '''
    counter = Counter(word for words in tokens for word in words)
    L = sum(counter.values())
    return map(lambda (k, v): (k, float(v) / L), 
               counter.most_common()[:N])

In [342]:
babyMetaDF = raw1DF
displayDF(babyMetaDF)

Unnamed: 0,asin,title,clusterId
0,188399313,Lifefactory 4oz BPA Free Glass Baby Bottles - 4-pack-raspberry and Lilac,523
1,188399518,Planetwise Flannel Wipes,975
2,188399399,Planetwise Wipe Pouch,802
3,316967297,Annas Dream Full Quilt with 2 Shams,281
4,615447279,Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book,1130
5,670062049,5 Pink Gumdrops + One Pacifier Clip,2372
6,705391752,A Tale of Baby's Days with Peter Rabbit,1014
7,097293751X,"Baby Tracker&reg; - Daily Childcare Journal, Schedule Log",3044
8,974671517,Wee Gallery Twins Board Book,2231
9,980027519,Nature's Lullabies First and Second Year Calendars,11


In [343]:
testDF = (babyMetaDF
          .select('clusterID', tokenize_set_and_filter_stopwords('title').alias('tokens'))
          .groupBy('clusterID')
          .agg(F.collect_list('tokens').alias('tokens'))
          .select('clusterID', analyze_word_frequency('tokens')).alias('tokenFrequency').alias('mostCommonWords')
         )
displayDF(testDF)

Unnamed: 0,clusterID,analyze_word_frequency(tokens)
0,148,"[(baby, 0.0666666701436), (mini, 0.0555555559695), (jogger, 0.0555555559695), (city, 0.0444444455206), (stroller, 0.0444444455206), (bassinet, 0.0222222227603), (co, 0.0222222227603), (arc, 0.0222222227603), (arm, 0.0222222227603), (reach, 0.0222222227603)]"
1,463,"[(cover, 0.136363640428), (strap, 0.0757575780153), (little, 0.0757575780153), (carr, 0.0757575780153), (pink, 0.0454545468092), (blue, 0.0454545468092), (classic, 0.0303030312061), (eddie, 0.0303030312061), (cozy, 0.0303030312061), (reversible, 0.0303030312061)]"
2,471,"[(tote, 0.0759493634105), (diaper, 0.0632911399007), (bag, 0.0632911399007), (trend, 0.0506329126656), (pink, 0.0506329126656), (jogger, 0.0506329126656), (banana, 0.0506329126656), (baby, 0.0506329126656), (charlie, 0.0506329126656), (black, 0.0379746817052)]"
3,1088,"[(munchkin, 0.0549450553954), (spoon, 0.0549450553954), (set, 0.0329670347273), (6, 0.0329670347273), (fork, 0.0329670347273), (green, 0.0329670347273), (piece, 0.0329670347273), (pack, 0.0329670347273), (trainer, 0.0219780225307), (toddler, 0.0219780225307)]"
4,1238,"[(chewbeads, 0.0523415990174), (bag, 0.0330578498542), (baby, 0.0330578498542), (lassig, 0.0303030312061), (wrap, 0.0192837473005), (charles, 0.0165289249271), (bracelet, 0.0165289249271), (cornelia, 0.0165289249271), (bangle, 0.0165289249271), (diaper, 0.0165289249271)]"
5,1342,"[(toys, 0.0294117648154), (baby, 0.0294117648154), (8gb, 0.0147058824077), (mini, 0.0147058824077), (infant, 0.0147058824077), (hhz102, 0.0147058824077), (todays, 0.0147058824077), (apple, 0.0147058824077), (clip, 0.0147058824077), (cute, 0.0147058824077)]"
6,1580,"[(mam, 0.0954356864095), (bottle, 0.0954356864095), (ounce, 0.0705394223332), (pack, 0.0580912865698), (anti, 0.0456431545317), (colic, 0.0456431545317), (5, 0.0414937771857), (3, 0.0373443998396), (baby, 0.0373443998396), (2, 0.0331950224936)]"
7,1591,"[(baby, 0.0568181835115), (jogger, 0.0568181835115), (carry, 0.0454545468092), (bag, 0.0454545468092), (stroller, 0.0454545468092), (city, 0.0340909101069), (mini, 0.0227272734046), (rain, 0.0227272734046), (micro, 0.0227272734046), (cover, 0.0227272734046)]"
8,1645,"[(quot, 0.0725806429982), (thick, 0.0645161271095), (24, 0.0645161271095), (x, 0.0645161271095), (mats, 0.0564516112208), (9, 0.0564516112208), (16, 0.0564516112208), (set, 0.0483870953321), (48, 0.0403225794435), (square, 0.0322580635548)]"
9,1829,"[(light, 0.12195122242), (night, 0.12195122242), (crystal, 0.113821141422), (swarovski, 0.113821141422), (gold, 0.0894308909774), (24k, 0.0894308909774), (clear, 0.0650406479836), (heart, 0.0243902429938), (chrome, 0.0243902429938), (fairy, 0.0243902429938)]"
