# Introduction
This notebook contains a `Python 2` / `PySpark` script to find the top $N$ words for the positive and negative reviews in a given cluster.

# Notebook Setup

## Initialise modules

In [34]:
import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pymongo
import pandas as pd
import gzip # To parse gzip file
import re # Regex for text processing
import os # For setting up Mongo-Spark connector
import csv # To read/write CSV files

## Initialise PySpark session

Load `MongoDB-Spark` connector when starting up `PySpark`.

In [35]:
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'
dedicated_memory = '4g'

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} --driver-memory {} pyspark-shell' \
    .format(packages, dedicated_memory)

In [36]:
# Find SPARK_HOME
findspark.init()

# Create SparkSession
spark = (pyspark.sql.SparkSession
         .builder.appName('WordProcessing')
         .getOrCreate())

## Configure Pandas HTML display

In [37]:
pd.set_option('display.max_colwidth', -1)

## Define helper methods

In [63]:
def parse(path):
    '''
    Unzip a json.gz at `path` and returns a generator.
    '''
    g = gzip.open(path, 'rb')
    for line in g:
        yield eval(line)

def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Unzip and import json.gz file from `path` and loads it into mongo server.
    Create database index if `create_index` is True. 
    '''
    # Obtain handle to Mongo database and collection
    client = pymongo.MongoClient()
    collection = client[db][coll]
    
    # Return prematurely if database.collection already exists
    if (collection.count() != 0):
        print '{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll)
        return
    
    # Insert datapoints into Mongo database
    try:
        collection.insert_many((datapoint for datapoint in parse(path)))
        print 'JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll)
    except Exception as e:
        print 'Error loading data.\n{}'.format(e)
        client.close()
        return
    
    if not create_index:
        client.close()
        return
    
    # Create database index for improved searching
    # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Load the Mongo database to a Spark Session and returns the Spark DataFrame
    '''
    try:
        return (spark
                .read
                .format('com.mongodb.spark.sql.DefaultSource')
                .option('uri', 'mongodb://127.0.0.1/{}.{}'.format(db, coll))
                .load())
    except Exception as e:
        print 'Failed to create Spark dataframe.\n{}'.format(e)

def displayDF(sparkDF, n=10):
    '''
    Interactively displays the first n rows of a sparkDF as a pandas dataframe
    '''
    print 'Count: {}'.format(sparkDF.count())
        
    return (sparkDF
            .limit(n)
            .drop('_id', 'unixReviewTime')
            .toPandas())

In [400]:
import_to_mongo('../../Datasets/reviews_Baby.json.gz', coll='baby')

hackon.baby already exists on MongoDisk server. Exiting without loading JSON data.


# Word Processing Script

The function below takes in a Spark DataFrame containing reviews of a particular cluster. It returns a new DataFrame with two appended columns listing the top $N$ words based on *tf-idf* scores for good and bad reviews. 

> By default, *good reviews* are defined as reviews with **4-star** ratings and above, with the rest defined as *bad reviews*.

## Load Mongo toy dataset

In [39]:
clusteredProductMetaDF = (spark
         .read
         .format('com.databricks.spark.csv')
         .options(header='true', inferschema='true')
         .load('../../Datasets/baby_2000_2_cluster_df.csv')
         .drop('_c0')
         .dropna()
         )
displayDF(clusteredProductMetaDF)

Count: 18860


Unnamed: 0,asin,clusterId
0,0615447279,1720
1,097293751X,802
2,0980027500,1032
3,0980027586,308
4,0980027594,887
5,1586637304,1169
6,1592922929,658
7,1592923216,1057
8,1592920527,845
9,1592924409,1720


In [40]:
reviewDF = load_mongo_to_spark('baby')
displayDF(reviewDF, 2)

Count: 915446


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,188399313,"[0, 0]",5.0,"They work very well. Easy to clean, we wash them in the dishwasher every day. Our LO loves to hold on to the bottle and the plastic covering makes it easy for her to hold on to.","05 27, 2013",A28O3NP6WR5517,Jennifer gymer,These bottles are great!
1,188399399,"[1, 1]",5.0,it came early and was not disappointed. i love planet wise bags and now my wipe holder. it keps my osocozy wipes moist and does not leak. highly recommend it.,"04 9, 2013",AX0M1Z6ZWO52J,Ash M.,perfect


In [60]:
@F.udf(returnType=T.BooleanType())
def pick_clusters(clusterID):
    targetList = ['6', '91', '716', '1607', '1037', '935', '1042', '1677', '1141', '1652', '1040', '1578']
    return True if str(clusterID) in targetList else False

In [64]:
rawDF = clusteredProductMetaDF.join(reviewDF, on='asin').filter(pick_clusters('clusterID'))
displayDF(rawDF, 3)

Count: 6352


Unnamed: 0,asin,clusterId,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,B000056JAK,1141,"[0, 0]",4.0,I bought the two pack to replace several worn valves. The price was right so I just purchased the entire sipster cups rather than the valves alone.,"07 21, 2014",A24B2CZOCLSXGU,,value and spill-proof
1,B000056JAK,1141,"[0, 0]",5.0,"easy to take apart and clean so they don't get funky like some other brands that don't come apart, dishwasher safe, never leak, the stopper never falls out unlike with other brand sippy cups. I am on baby #3 and am completely a playtex sipster fan.","09 3, 2013",AD78WIJ7S0GR8,adrianne,best sippy cups ever!
2,B000056JAK,1141,"[0, 0]",5.0,Have been on a quest to find a sippy cup that does not leak and I think these might be the ones. I have tried a lot and these are by far my favorite in the leaking dept.,"12 29, 2010",A22D2QWCYN7BAD,"A. Jessup ""aj""",sippy cups


## Load stopwords
A list of stopwords is loaded as a Python list and broadcasted in PySpark.

In [65]:
# TODO: Get brands from rawDF instead of a new file here.
# TODO: Replace &amp and &#39
# Find distinct brands in the dataset
uniqueBrands = (load_mongo_to_spark('baby_meta')
                .select('brand')
                .distinct()
                .rdd
                .map(lambda x: x[0])
                .collect())

In [66]:
#TODO: Add directly using sets instead of stopwords here.

In [67]:
# Load stopwords into list
with open('stopwords.csv', 'r') as csvFile:
    fileReader = csv.reader(csvFile)
    stopwords = []
    for word in fileReader:
        stopwords.extend(word)
        
# Add '' to stopwords
stopwords.append('')

# Add brands into stopwords
stopwords.extend(uniqueBrands)

# Broadcast stopwords
stopwords_broadcast = spark.sparkContext.broadcast(stopwords)
        
print 'Top 100 stopwords:\n\n{}'.format(stopwords[:100])

Top 100 stopwords:

['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'with', 'had', 'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him', 'nor', 'did', 'these', 't', 'each', 'where', 'because', 'doing', 'theirs', 'some', 'are', 'our', 'ourselves', 'out', 'what', 'for', 'below', 'does', 'above', 'between', 'she', 'be', 'we', 'after', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 'down', 'your', 'from', 'her', 'whom', 'there', 'been', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', 'that', 'but', 'off', 'herself', 'than', 'those', 'he', 'me', 'myself', 'this', 'up', 'will', 'while', 'can', 'were', 'my', 'and', 'then', 'is', 'in', 'am', 'it', 'an', 'as']


## Preprocess `rawDF`

In [68]:
def filter_helpful_reviews(sparkDF, pct_helpful, min_votes):
    '''
    Return a dataframe filtered by comments that are at least `pct_helpful`% helpful 
    and contain a minimum number of votes.
    '''
    def _filter_helpful_reviews(votes, pct_helpful, min_votes):
        '''
        Return bool if comment are at least `pct_helpful`% helpful and have at least `min_votes` vote.
        
        Inputs:
            votes: A list of votes by [helpful, total_votes]
            pct_helpful: A float
            min_votes: A float
        '''
        return (votes[1] >= min_votes and (float(votes[0]) / votes[1] >= pct_helpful))
    
    udf = F.udf(lambda votes: _filter_helpful_reviews(votes, pct_helpful, min_votes), T.BooleanType())
    
    return sparkDF.filter(udf(F.col('helpful')))

def preprocess_DF(rawDF):
    '''
    Preprocess text of dataframe by:
        - selecting relevant columns ('clusterID', 'overall', 'review')
        - concatenating 'summary' and 'reviewText' columns
        - removing punctuations
        - filtering out stopwords
    '''
    
    def _filter_stopwords(text):
        '''
        Input:
            text: A string concatenation of the columns 'summary' and 'reviewText'.
        Returns:
            A list of strings with stopwords filtered out.
        '''
        string_list = re.split(r'\W+', text.lower())
        
        return [word for word in string_list if word not in stopwords_broadcast.value]
    
    udf = F.udf(lambda text: _filter_stopwords(text), T.ArrayType(T.StringType()))
    
    return (rawDF
            .select(F.concat_ws('-', 
                                F.col('asin'), 
                                F.col('reviewerID')).alias('reviewID'),
                    F.col('clusterID'),
                    F.col('overall'), 
                    udf(F.concat_ws(' ', 
                                    F.col('summary'), 
                                    F.col('reviewText'))).alias('reviews'))
           .cache())

In [69]:
preprocessedDF = preprocess_DF(filter_helpful_reviews(rawDF, pct_helpful=.5, min_votes=3))
displayDF(preprocessedDF, 5)

Count: 1132


Unnamed: 0,reviewID,clusterID,overall,reviews
0,B000056JAK-AQYYUN3T7B0WM,1141,4.0,"[leaks, drink, last, year, pregnant, asked, moms, cups, bottles, use, told, playtex, told, avent, bottles, sippy, cups, way, go, well, decided, go, avent, writing, review, playtex, sippy, cup, avent, sippy, cups, hard, hold, difficult, drink, cup, filled, capacity, pain, clean, worst, leak, spout, leaks, nothing, magic, cups, collars, leak, finally, decided, 5, months, using, avent, magic, cups, little, success, give, quot, hard, use, quot, one, mom, classified, playtex, cup, try, difference, astounding, daughter, could, easily, drink, cup, first, try, cup, easy, clean, 3, pieces, opposed, 5, 6, w, handles, pieces, avent, cup, assemble, use, m, sorry, didn, least, register, pack, ...]"
1,B000056JAK-A2ETOH1CN5XTH6,1141,5.0,"[great, leak, relatively, easy, clean, nothing, chew, trough, amazing, replacement, part, soft, spout, may, cost, much, new, sippy, yes, bottle, finally, gone]"
2,B000056JAK-A1Q4TQ06340F3U,1141,5.0,"[best, sippy, cup, hands, would, consider, sippy, cup, pro, tried, sooo, many, different, brands, kinds, best, durable, leak, easy, clean, waste, time, money, sippy]"
3,B000056JAK-A3JBVGKIVMS9X0,1141,5.0,"[nice, spill, proof, cups, kids, love, cups, frankly, easy, use, valve, allows, kids, easily, drink, easy, clean, perfect, travel, may, beginning, cup, user, daughter, preferred, soft, spout, started, using, cup, hard, plastic]"
4,B000056JAK-A2JZPEW49WB9KN,1141,5.0,"[leaks, choice, colors, wife, purchased, another, store, twice, price, decided, bring, cost, average, found, leak, small, rubber, piece, fits, spout, control, speed, liquid, comes, child, gets, old, enough, eliminate, rubber, piece, think, great, use, soon, stop, using, bottles, stop, using, rubber, flow, control, piece, use, child, grows, sippy, cups, lose, great, choice, highly, recommended]"


### Check for illegal `clusterID`s

In [70]:
@F.udf(returnType=T.BooleanType())
def check_int_instance(clusterID):
    return True if isinstance(clusterID, int) else False

displayDF(preprocessedDF
          .filter(check_int_instance(F.col('clusterID')) == False))

Count: 0


Unnamed: 0,reviewID,clusterID,overall,reviews


## Label ratings for `preprocessedDF`

In [71]:
@F.udf(returnType=T.BooleanType())
def label_review_quality(rating, min_good_review_rating=4):
    '''
    UDF to append a column label of 'isPositiveReview' to the DataFrame.
    
    Inputs:
        rating: Rating from 1.0 to 5.0 (int expressed as float)
    Outputs:
        True if review has a rating of at least `min_good_review_rating`, otherwise False.
    '''
    return True if rating >= min_good_review_rating else False

In [72]:
polarizedDF = (preprocessedDF
               .withColumn('positiveReview', label_review_quality('overall')))
displayDF(polarizedDF, 5)

Count: 1132


Unnamed: 0,reviewID,clusterID,overall,reviews,positiveReview
0,B000056JAK-AQYYUN3T7B0WM,1141,4.0,"[leaks, drink, last, year, pregnant, asked, moms, cups, bottles, use, told, playtex, told, avent, bottles, sippy, cups, way, go, well, decided, go, avent, writing, review, playtex, sippy, cup, avent, sippy, cups, hard, hold, difficult, drink, cup, filled, capacity, pain, clean, worst, leak, spout, leaks, nothing, magic, cups, collars, leak, finally, decided, 5, months, using, avent, magic, cups, little, success, give, quot, hard, use, quot, one, mom, classified, playtex, cup, try, difference, astounding, daughter, could, easily, drink, cup, first, try, cup, easy, clean, 3, pieces, opposed, 5, 6, w, handles, pieces, avent, cup, assemble, use, m, sorry, didn, least, register, pack, ...]",True
1,B000056JAK-A2ETOH1CN5XTH6,1141,5.0,"[great, leak, relatively, easy, clean, nothing, chew, trough, amazing, replacement, part, soft, spout, may, cost, much, new, sippy, yes, bottle, finally, gone]",True
2,B000056JAK-A1Q4TQ06340F3U,1141,5.0,"[best, sippy, cup, hands, would, consider, sippy, cup, pro, tried, sooo, many, different, brands, kinds, best, durable, leak, easy, clean, waste, time, money, sippy]",True
3,B000056JAK-A3JBVGKIVMS9X0,1141,5.0,"[nice, spill, proof, cups, kids, love, cups, frankly, easy, use, valve, allows, kids, easily, drink, easy, clean, perfect, travel, may, beginning, cup, user, daughter, preferred, soft, spout, started, using, cup, hard, plastic]",True
4,B000056JAK-A2JZPEW49WB9KN,1141,5.0,"[leaks, choice, colors, wife, purchased, another, store, twice, price, decided, bring, cost, average, found, leak, small, rubber, piece, fits, spout, control, speed, liquid, comes, child, gets, old, enough, eliminate, rubber, piece, think, great, use, soon, stop, using, bottles, stop, using, rubber, flow, control, piece, use, child, grows, sippy, cups, lose, great, choice, highly, recommended]",True


## Aggregate ratings by `clusterID` and `positiveReview`

In [73]:
@F.udf(returnType=T.ArrayType(T.StringType()))
def flatten(nested_list):
    '''
    Flatten a list of list to a one dimensional list.
    '''
    final_list = []
    for list_ in nested_list:
        final_list.extend(list_)
        
    return final_list

@F.udf(returnType=T.BooleanType())
def bool_invert(boolean):
    '''
    Invert the value of a boolean
    '''
    return not boolean

In [74]:
aggregatedDF = (polarizedDF
                .groupBy('clusterID', 'positiveReview')
                .agg(F.collect_list('reviews').alias('collectedReviews'))
                .withColumn('tokens', flatten('collectedReviews'))
                .drop('collectedReviews')
                .sort('clusterID', bool_invert('positiveReview'))
                .cache())
displayDF(aggregatedDF, 6)

Count: 24


Unnamed: 0,clusterID,positiveReview,tokens
0,6,True,"[great, product, fabulous, customer, service, charging, issues, appear, fixed, purchased, tykelight, february, 2009, huge, hit, 16, month, old, used, every, night, sometimes, day, weeks, ago, charger, failed, work, finally, got, around, calling, mobi, last, week, extremely, friendly, apologetic, helpful, today, received, brand, new, tykelight, mail, request, send, back, defective, one, charging, mechanism, new, tykelight, reversed, used, bottom, person, base, vice, versa, makes, lot, sense, pieces, used, base, person, quite, fragile, old, base, movable, parts, also, parts, used, time, bit, substantive, wish, could, attach, pics, reviews, trust, guess, fewer, people, charging, issues, mentioned, reviews, give, company, newly, redesigned, tykelight, two, huge, ...]"
1,6,False,"[good, concept, poor, design, thought, great, idea, since, never, seem, enough, light, clipping, sleeping, baby, nails, unfortunately, way, clipper, fastened, housing, light, unable, see, exactly, positioning, clipper, baby, nails, several, tries, either, missed, nail, altogether, clipped, skin, great, idea, way, bright, light, way, bright, bedtime, use, harsh, led, like, brightness, kids, really, wanted, like, complained, way, bright, ultimately, returned, htem, shame, owned, original, mobi, tykelight, glomate, loved, product, discontinued, think, far, superior, total, pos, bought, son, long, enough, get, attached, stopped, working, thing, literally, functioned, maybe, month, recommendation, find, something, else, please, buy, please, buy, item, purchased, 2, regret, purchases, ...]"
2,91,True,"[great, even, apartment, second, monitor, tried, live, downstairs, apartment, picked, first, monitor, upstairs, neighbor, monitor, child, ve, found, much, better, results, monitor, range, adequate, entire, apartment, even, watching, tv, listening, radio, volume, loud, enough, hear, shower, sensitive, enough, pick, slight, wimper, re, pleased, aunt, loves, em, 1, bought, brand, new, baby, cousin, arrival, aunt, says, work, wonderful, double, receivers, convenient, pleased, baby, monitors, one, best, one, found, reasonable, price, range, amazing, go, side, yard, work, doesn, fuzz, really, best, money, two, kids, often, grabbed, antenna, swinging, around, usually, ruins, monitors, still, survived, problems, think, purchase, another, set, two, sets, hand, ...]"
3,91,False,"[static, loud, monitor, static, interference, loud, drives, crazy, hear, actual, sounds, bought, static, useless, monitor, worth, raving, bought, monitor, second, child, definately, pros, cons, pros, 1, cost, good, price, monitor, 2, receivers, 2, sensitivity, sensitive, picking, sound, hear, baby, movements, bassinet, stirring, begins, cry, cons, 1, static, monitor, prone, static, comes, throughout, day, even, sound, sensitive, lights, receivers, acknowledge, static, coming, base, 2, reliable, portable, mean, receivers, operating, battery, power, alone, 50, 50, chance, actually, work, even, new, batteries, constantly, problems, getting, either, receiver, turn, owned, one, monitor, one, didn, problems, first, first, safety, 1st, monitor, price, 16, 99, could, ...]"
4,716,True,"[great, keepsake, husband, bought, loved, idea, putting, baby, picture, next, foot, hand, print, thought, would, really, neat, look, back, picture, 6, weeks, see, small, tiny, little, feet, hands, easy, material, really, easy, work, easy, get, baby, foot, hand, isn, goopy, bring, material, baby, instead, trying, balancing, act, getting, baby, positioned, material, trying, hold, still, put, stuff, package, grabbed, appendage, wanted, imprint, put, material, foot, hand, pressed, gently, frame, easy, put, together, caste, dry, set, background, pink, one, side, blue, really, doesn, get, easier, really, attractive, put, together, problem, caste, cracked, drying, much, enough, noticeable, husband, didn, lay, exactly, flat, dry, ...]"
5,716,False,"[bought, thing, year, ago, item, good, bought, item, year, ago, son, first, born, thought, would, neat, one, first, birthday, place, side, side, like, one, much, one, received, last, year, one, bigger, ordered, one, black, frame, supposed, come, black, background, instead, came, baby, blue, one, side, pink, disappointed, need, rig, black, poster, board, something, else, actually, match, one, even, though, picture, website, shows, black, frame, black, background, frustrated, opened, way, return, without, cost, like, item, like, tweek, make, supposed, first, place, buy, zacardi, writing, warn, bought, frame, zicardi, came, damaged, several, discussions, customer, service, instead, return, like, said, sent, another, frame, ...]"


## Select top $N$ words ranked by TF-IDF for positive and negative reviews

In [75]:
# UDFs to bin group word categories in new DF columns
@F.udf(returnType=T.ArrayType(T.StringType()))
def bin_pos_top_words(words, positiveReview):
    return words if positiveReview == True else None

@F.udf(returnType=T.ArrayType(T.StringType()))
def bin_neg_top_words(words, positiveReview):
    return words if positiveReview == False else None

In [139]:
def top_N_words(DF, N=10):
    '''
    Obtains the top N words of positive and negative reviews respectively, ranked by tf-idf of entries in a DF.
    Inputs:
        DF: A Spark DataFrame containing columns 'clusterID', 'positiveReview' and 'tokens'.
        N:  Number of top-ranking words to keep
    Outputs:
        A DataFrame of schema (_clusterID_, _positiveReview_, top_N_pos, top_N_neg)
    '''
    
    def tf(tokens):
        '''
        Calculate the token frequency (TF) for each review.

        Inputs:
            tokens: A list of token strings.
        Outputs:
            A dictionary of (token, tf).
        '''
        N = len(tokens)

        return {token: float(tokens.count(token)) / N for token in tokens}

    def idf(corpusDF):
        '''
        Calculate the inverse document frequency for the corpusDF.

        Inputs:
            corpusDF: A Spark DataFrame containing columns 'tokens'.
        Outputs:
            A dictionary of (token, idf).
        '''
        # Calculate the number of reviews
        N = corpusDF.count()

        # Create an RDD with entries (uniq_tokens_in_each_review, 1)
        checkpointRDD = (corpusDF
                         .select('tokens')
                         .rdd
                         .flatMap(lambda tokens: list(set(tokens[0])))
                         .map(lambda token: (token, 1)))

        # Produce a dict containing (token, idf)
        return (checkpointRDD
                .reduceByKey(lambda a, b: a + b)
                .mapValues(lambda count: float(count) / N)
                .collectAsMap())
    
    # Calculate tf's as an RDD
    tfRDD = (DF
           .select('clusterID', 'positiveReview', 'tokens')
           .rdd
           .map(lambda (clusterID, positiveReview, tokens): ((clusterID, positiveReview), tf(tokens))))
    
    # Calculate idf's as a dict and return a broadcasted instance
    idfs_broadcast = spark.sparkContext.broadcast(idf(DF.select('tokens')))
    
    # Calculate tfidf as an rdd of (clusterID, positiveReview, top_N_tokens)
    topNwordsRDD = (tfRDD
                    .mapValues(lambda tf_dict: [(token, float(tf_dict[token]) / idfs_broadcast.value[token]) for token in tf_dict.keys()])
                    .mapValues(lambda tfidfs: sorted(tfidfs, key=lambda (token, tfidf): -tfidf)) # Sort words in descending order
#                     .sortBy(lambda ((clusterID, positiveReview), tfidfs): ((clusterID, not positiveReview, sorted(tfidfs, key=lambda (token, tfidf): -tfidf))))
                    .mapValues(lambda tfidfs: map(lambda (token, tfidf): token, tfidfs)[:N]) # Map (word, score) tuple to top N words
                    .map(lambda ((clusterID, positiveReview), top_N_tokens): (clusterID, positiveReview, top_N_tokens)) # Flatten tuple
                    .cache())
    
    # Transform topNwordsRDD to an RDD of form (clusterID, top_N_positive, top_N_negative)
    temp_schema = T.StructType([
        (T.StructField('clusterID', T.IntegerType())),
        (T.StructField('positiveReview', T.BooleanType())),
        (T.StructField('topWords', T.ArrayType(T.StringType())))
    ])
    
    return (spark
            .createDataFrame(topNwordsRDD, temp_schema)
            .withColumn('posWords', bin_pos_top_words('topWords', 'positiveReview'))
            .withColumn('negWords', bin_neg_top_words('topWords', 'positiveReview'))
            .drop('positiveReview', 'topWords')
            .groupBy('clusterID')
            .agg(F.collect_list('posWords').alias('posWords'), F.collect_list('negWords').alias('negWords'))
            .cache())

In [140]:
finalDF = top_N_words(aggregatedDF, N=20)
displayDF(finalDF, 20)

Count: 12


Unnamed: 0,clusterID,posWords,negWords
0,91,"[[monitor, static, sony, monitors, receiver, interference, hear, noise, range, video, fisher, sound, channels, camera, reception, activated, battery, wireless, 27, audio]]","[[monitor, static, receiver, beeps, sony, interference, beep, beeping, range, digital, monitors, camera, video, fisher, signal, reception, transmitter, noise, volume, sound]]"
1,935,"[[ductwork, kushions, scabs, ignores, curious, rv, cushioning, colliisions, profanity, grandkids, heater, reduction, vouch, alittle, urethane, corner, adhesive, protect, guards, head]]","[[jumbo, prince, lionheart, kidkusion, tape, dimensions, disguised, 2x1x1, protection, corners, cusion, conclusions, uploaded, matters, fireplace, workthe, bumpers, existence, boooooo, budged]]"
2,6,"[[jr, guardian, charging, nightlight, mobi, tykelight, nightlights, companion, contacts, base, light, glows, fascinated, grandson, red, vessel, night, toy, dim, charger]]","[[tykelight, contacts, kinderglo, charger, light, bright, charging, red, hypnotic, glomate, 30pm, clipper, base, charged, mobi, green, one, childs, presents, connects]]"
3,1037,"[[pee, teepees, plunk, hosed, weiner, cones, washable, fly, laughs, guarantee, shirt, changing, subtle, joke, lesson, fire, wiping, soak, lined, funny]]","[[pee, peed, teepee, tee, squirming, squirmy, moves, washcloth, squirted, uninator, wiggled, realistic, novelty, joy, advantages, slipped, starters, absorbs, peepee, novel]]"
4,1677,"[[pins, assistant, hee, counted, padded, absolutly, ethan, colorful, 52, decides, carriying, colourful, colours, higly, aquarium, mailer, concerning, neighbour, seattle, walker]]","[[crash, illegal, fmvss, trap, enforce, united, pins, restraint, death, seat, 213, geez, systematically, leaned, edged, legal, cart, vehicle, injury, chip]]"
5,1607,"[[handlebar, starry, bike, perfection, joovy, stroll, glove, grandmothers, ultralight, zip, caboose, gashes, gashed, bicycling, ties, cushioned, sleeve, city, rubber, jackpot]]","[[volume, music, washcloth, towel, ice, totes, lining, blanket, specs, section, 33, projection, wallet, santa, displayed, 12lbs, resort, superglueing, kleenx, operated]]"
6,1578,"[[pump, electric, isis, milk, medela, pumping, avent, pumps, breast, suction, manual, hospital, bottle, madela, sore, breasts, breastfeeding, bottles, one, tubing]]","[[pump, isis, electric, pumping, milk, medela, suction, breast, pumps, harmony, women, avent, bottle, engorged, bottles, planned, squeaked, get, one, would]]"
7,1042,"[[decal, tree, salon, decals, bedding, dirt, textured, posting, waxy, removes, stickier, wall, spray, everybody, grime, gorgeous, delivery, curl, walls, seller]]","[[branch, decal, trees, fingerprints, wall, xacto, misleading, glue, false, sticker, knife, unroll, overpaying, superglue, mural, dicovered, textured, advertising, wil, marker]]"
8,1040,"[[food, beaba, babycook, steamer, blender, mold, veggies, basket, foods, puree, steaming, steam, reservoir, cook, bowl, meals, carrots, steamed, processor, batches]]","[[food, steamer, beaba, mold, babycook, steaming, cook, basket, puree, steam, cooking, blender, processor, reservoir, stainless, stove, water, bitter, blend, batches]]"
9,1141,"[[straw, cups, sippy, straws, cup, playtex, gripper, spout, leak, insulated, spill, trainer, insulator, nuby, valve, proof, sippies, milk, drinks, designs]]","[[straw, cups, straws, vary, cup, princesses, playtex, sippy, girly, flowers, slits, unisex, milk, valve, disney, pink, princess, contigo, butterflies, cupholder]]"


## Obtain non-commons words shared between `posWords` and `negWords`

In [143]:
# UDFs to obtain exclusive words in posWords and negWords
@F.udf(returnType=T.ArrayType(T.StringType()))
def exclusive_words_pos(pos, neg):
    return list(set(pos[0]) - (set(pos[0]) & set(neg[0])))

@F.udf(returnType=T.ArrayType(T.StringType()))
def exclusive_words_neg(pos, neg):
    return list(set(neg[0]) - (set(pos[0]) & set(neg[0])))

In [146]:
exclusiveWordsDF = (finalDF
                    .select('clusterID',
                            exclusive_words_pos('posWords', 'negWords').alias('exclusivePosWords'), 
                            exclusive_words_neg('posWords', 'negWords').alias('exclusiveNegWords')))
displayDF(exclusiveWordsDF)

Count: 12


Unnamed: 0,clusterID,exclusivePosWords,exclusiveNegWords
0,91,"[wireless, 27, battery, activated, channels, hear, audio]","[beep, transmitter, signal, volume, beeping, digital, beeps]"
1,935,"[grandkids, heater, rv, protect, guards, adhesive, curious, profanity, alittle, head, cushioning, urethane, kushions, reduction, ductwork, ignores, colliisions, corner, scabs, vouch]","[uploaded, kidkusion, workthe, dimensions, disguised, conclusions, lionheart, corners, prince, bumpers, 2x1x1, matters, protection, tape, cusion, jumbo, existence, boooooo, fireplace, budged]"
2,6,"[dim, toy, nightlight, grandson, glows, guardian, fascinated, jr, companion, nightlights, night, vessel]","[clipper, childs, 30pm, connects, hypnotic, one, presents, bright, glomate, kinderglo, green, charged]"
3,1037,"[fly, washable, cones, soak, funny, fire, plunk, laughs, guarantee, joke, wiping, hosed, changing, teepees, subtle, weiner, lined, lesson, shirt]","[teepee, novel, uninator, slipped, wiggled, squirmy, joy, novelty, tee, peed, starters, realistic, advantages, washcloth, absorbs, squirming, moves, peepee, squirted]"
4,1677,"[padded, neighbour, mailer, decides, colorful, assistant, absolutly, carriying, higly, counted, 52, colours, hee, ethan, walker, seattle, colourful, concerning, aquarium]","[fmvss, geez, restraint, united, crash, 213, systematically, illegal, chip, cart, seat, death, legal, vehicle, edged, injury, leaned, enforce, trap]"
5,1607,"[city, gashes, sleeve, zip, caboose, starry, bicycling, stroll, glove, cushioned, rubber, bike, perfection, handlebar, ties, jackpot, grandmothers, gashed, joovy, ultralight]","[volume, washcloth, towel, projection, wallet, 33, lining, blanket, kleenx, displayed, superglueing, ice, specs, totes, resort, music, santa, operated, 12lbs, section]"
6,1578,"[breastfeeding, manual, hospital, madela, breasts, sore, tubing]","[would, get, harmony, squeaked, planned, engorged, women]"
7,1042,"[everybody, waxy, salon, posting, dirt, delivery, tree, bedding, grime, removes, walls, seller, decals, spray, stickier, gorgeous, curl]","[fingerprints, false, wil, sticker, superglue, trees, xacto, unroll, misleading, dicovered, knife, branch, marker, glue, mural, overpaying, advertising]"
8,1040,"[steamed, carrots, bowl, foods, veggies, meals]","[stainless, stove, cooking, blend, water, bitter]"
9,1141,"[trainer, nuby, insulated, gripper, spill, leak, spout, designs, insulator, proof, sippies, drinks]","[pink, girly, unisex, cupholder, vary, butterflies, slits, princesses, flowers, contigo, princess, disney]"


### Output `finalDF` to csv

In [149]:
exclusiveWordsDF.toPandas().to_csv('../../Results/exclusive_top_words.csv')