# Introduction
This `Python 2` / `PySpark` script analyzes the effectiveness of our clustering algorithm by finding the most common words in the titles of all products within each cluster.

# Notebook Setup

## Initialise modules

In [1]:
import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pymongo
import pandas as pd

import gzip # To parse gzip file
import re # Regex for text processing
import os # For setting up Mongo-Spark connector
import csv # To read/write CSV files

from collections import Counter # To count the word frequencies

import plotly
import plotly.plotly as py
import plotly.offline as pyo
import plotly.graph_objs as go
import colorlover as cl

## Initialise PySpark session

Load `MongoDB-Spark` connector when starting up `PySpark`.

In [2]:
# Connector package handed to spark-submit through its --packages flag
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'
# Value handed to spark-submit through its --driver-memory flag
dedicated_memory = '4g'

# Export the assembled submit arguments through the PYSPARK_SUBMIT_ARGS environment variable
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages {} --driver-memory {} pyspark-shell'.format(packages, dedicated_memory)
)

In [3]:
# Locate the Spark installation (SPARK_HOME) through findspark
findspark.init()

# Create the SparkSession used by the rest of the notebook, or reuse an existing one
# (getOrCreate), under the application name 'ClusterAssessment'
spark = (pyspark.sql.SparkSession
         .builder.appName('ClusterAssessment')
         .getOrCreate())

## Initiate Plotly Offline notebook mode

In [4]:
# Switch plotly's offline module to notebook mode; the figure below is rendered with pyo.iplot
pyo.init_notebook_mode(connected=True)

## Configure Pandas HTML display

In [5]:
# Adjust pandas' `display.max_colwidth` option. -1 is presumably the "no limit" value so that
# long product titles are shown in full -- TODO confirm for the installed pandas version
pd.set_option('display.max_colwidth', -1)

## Define helper methods

In [6]:
def parse(path):
    '''
    Lazily read the gzip-compressed file at `path` and yield one record per line.

    Input:
        path: Path to a gzip file holding one Python/JSON-style literal per line.
    Returns:
        A generator yielding the evaluated value of every non-blank line.
    '''
    # The `with` block closes the file handle once the generator is exhausted or discarded
    with gzip.open(path, 'rb') as g:
        for line in g:
            # Blank lines carry no record and would make eval() raise a SyntaxError
            if not line.strip():
                continue
            # NOTE(security): eval() executes whatever expression the line contains.
            # Only use this on trusted dataset files; for untrusted input switch to a
            # literal-only parser such as ast.literal_eval or json.loads.
            yield eval(line)

def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Unzip the json.gz file at `path` and load its records into the Mongo collection `db.coll`.

    Nothing is imported when the collection already contains at least one document.
    Errors raised while inserting are reported with a printed message instead of being raised.

    Inputs:
        path: Path to the gzip file; it is read with `parse`.
        coll: Name of the target collection.
        db: Name of the target database.
        create_index: Intended to trigger index creation after the import. The index call
            is currently commented out, so this flag has no effect.
    Returns:
        None.
    '''
    # Obtain handle to Mongo database and collection
    client = pymongo.MongoClient()
    try:
        collection = client[db][coll]

        # Return prematurely if database.collection already holds data. Fetching a single
        # document is enough to know that and avoids counting the whole collection.
        if collection.find_one() is not None:
            print('{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll))
            return

        # Insert datapoints into Mongo database
        try:
            collection.insert_many(parse(path))
            print('JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll))
        except Exception as e:
            # Best effort: report the failure and give up instead of propagating it
            print('Error loading data.\n{}'.format(e))
            return

        if create_index:
            # Create database index for improved searching
            # NOTE(review): index creation is disabled; re-enable the line below if needed.
            # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])
            pass
    finally:
        # Release the connection on every exit path, including the early returns above
        client.close()

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Read the Mongo collection `db.coll` from the local server (127.0.0.1) into Spark.

    Inputs:
        coll: Name of the Mongo collection.
        db: Name of the Mongo database.
    Returns:
        The resulting Spark DataFrame. If anything raises while reading, the error is
        printed and None is returned instead.
    '''
    try:
        mongo_uri = 'mongodb://127.0.0.1/{}.{}'.format(db, coll)
        reader = spark.read.format('com.mongodb.spark.sql.DefaultSource')
        return reader.option('uri', mongo_uri).load()
    except Exception as e:
        print('Failed to create Spark dataframe.\n{}'.format(e))

def displayDF(sparkDF, n=10):
    '''
    Return the first `n` rows of `sparkDF` as a pandas DataFrame for interactive display.

    The `_id` and `unixReviewTime` columns are dropped before the conversion.
    '''
    # Limit first so that only `n` rows are brought over to pandas
    head_rows = sparkDF.limit(n)
    trimmed = head_rows.drop('_id', 'unixReviewTime')
    return trimmed.toPandas()

In [7]:
# Load the gzipped Baby reviews file into the `hackon.baby` Mongo collection
# (`hackon` is import_to_mongo's default database); the import is skipped when
# the collection already holds data
import_to_mongo('../../Datasets/reviews_Baby.json.gz', coll='baby')

hackon.baby already exists on MongoDisk server. Exiting without loading JSON data.


## Load stopwords
A list of stopwords is loaded into a Python list and broadcast to the PySpark workers.

In [23]:
# TODO: Get brands from rawDF instead of a new file here.
# TODO: Replace &amp and &#39
# Gather every distinct value of the `brand` column of `baby_meta` into a Python list
uniqueBrands = list(
    brandRow[0]
    for brandRow in load_mongo_to_spark('baby_meta').select('brand').distinct().collect()
)

In [9]:
# Load stopwords into list (every cell of every CSV row is one stopword)
with open('stopwords.csv', 'r') as csvFile:
    fileReader = csv.reader(csvFile)
    stopwords = []
    for word in fileReader:
        stopwords.extend(word)

# Add '' to stopwords: splitting a title on non-word characters can produce empty strings
stopwords.append('')

# Add brands into stopwords.
# Title tokens are lower-cased and split on non-word characters before they are compared
# with the stopwords, so each brand is normalised the same way; otherwise capitalised or
# multi-word brands could never match a token. Missing brands (None / '') are skipped.
for brand in uniqueBrands:
    if brand:
        stopwords.extend(re.split(r'\W+', brand.lower()))

# Broadcast stopwords as a frozenset so that membership tests on the workers are
# constant-time instead of a linear scan of the list
stopwords_broadcast = spark.sparkContext.broadcast(frozenset(stopwords))

print('First 100 stopwords:\n\n{}'.format(stopwords[:100]))

First 100 stopwords:

['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'with', 'had', 'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him', 'nor', 'did', 'these', 't', 'each', 'where', 'because', 'doing', 'theirs', 'some', 'are', 'our', 'ourselves', 'out', 'what', 'for', 'below', 'does', 'above', 'between', 'she', 'be', 'we', 'after', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 'down', 'your', 'from', 'her', 'whom', 'there', 'been', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', 'that', 'but', 'off', 'herself', 'than', 'those', 'he', 'me', 'myself', 'this', 'up', 'will', 'while', 'can', 'were', 'my', 'and', 'then', 'is', 'in', 'am', 'it', 'an', 'as']


# Cluster Assessment Script
The following code analyzes how frequently words appear in the product titles of a given cluster. This is used as a tentative proxy for clustering effectiveness.

In [24]:
# Load stopwords into list (every cell of every CSV row is one stopword)
# NOTE(review): this rebinds `stopwords` and `stopwords_broadcast` from the CSV alone --
# no brand names are added here; confirm that this is intended.
with open('stopwords.csv', 'r') as csvFile:
    fileReader = csv.reader(csvFile)
    stopwords = []
    for word in fileReader:
        stopwords.extend(word)

# Add '' to stopwords: splitting a title on non-word characters can produce empty strings
stopwords.append('')

# Broadcast stopwords as a frozenset so that the per-token membership test in the
# tokenizer UDF is constant-time instead of a linear scan of the list
stopwords_broadcast = spark.sparkContext.broadcast(frozenset(stopwords))

@F.udf(returnType=T.ArrayType(T.StringType()))
def tokenize_set_and_filter_stopwords(text):
    '''
    Tokenize a string into its distinct lower-cased words and remove the stopwords.

    Input:
        text: A string (e.g. a product title), or None.
    Returns:
        A list of the distinct words of `text`, in no particular order, without any word
        contained in `stopwords_broadcast`. An empty list when `text` is None.
    '''
    # A missing title reaches the UDF as None; treat it as a title without words
    # instead of failing on `None.lower()`
    if text is None:
        return []

    # Look the broadcast value up once rather than once per candidate word
    excluded = stopwords_broadcast.value

    # Split on runs of non-word characters; the set removes repeated words
    unique_words = set(re.split(r'\W+', text.lower()))

    return [word for word in unique_words if word not in excluded]

@F.udf(returnType=T.IntegerType())
def assign_clusterID():
    '''
    Return a random integer cluster ID between 0 and 10, both inclusive.
    '''
    # `random` is not among the notebook's top-level imports; without this import
    # the call below raises a NameError
    import random
    return random.randint(0, 10)

@F.udf(returnType=T.ArrayType(T.StructType([
    T.StructField('token', T.StringType()),
    T.StructField('frequency', T.FloatType())
])))
def analyze_word_frequency(tokens, N=10):
    '''
    Calculate how often each word appears across the token lists of one cluster.

    A word's frequency is its number of occurrences divided by the number of token lists.
    When every list holds distinct words (as produced by the tokenizer UDF), this is the
    fraction of titles containing the word.

    Inputs:
        tokens: A list of lists of words, one inner list per product title.
        N: Maximum number of words to return.
    Returns:
        A list of (token, frequency) tuples for the N most frequent words, sorted by
        decreasing frequency. An empty list when `tokens` is empty or None.
    '''
    # Without any token list there is nothing to count, and the division below would fail
    if not tokens:
        return []

    counter = Counter(word for words in tokens for word in words)
    total = float(len(tokens))

    # A list comprehension (instead of map + tuple-unpacking lambda) returns a real list
    # on both Python 2 and Python 3
    return [(word, count / total) for word, count in counter.most_common()[:N]]

## Import clustered products metadata

In [25]:
# Keep only the `asin` and `title` columns of the `baby_meta` collection
titlesDF = load_mongo_to_spark('baby_meta').select('asin', 'title')
# Preview the first rows as a pandas DataFrame
displayDF(titlesDF)

Unnamed: 0,asin,title
0,0188399313,Lifefactory 4oz BPA Free Glass Baby Bottles - 4-pack-raspberry and Lilac
1,0188399518,Planetwise Flannel Wipes
2,0188399399,Planetwise Wipe Pouch
3,0316967297,Annas Dream Full Quilt with 2 Shams
4,0615447279,Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book
5,0670062049,5 Pink Gumdrops + One Pacifier Clip
6,0705391752,A Tale of Baby's Days with Peter Rabbit
7,097293751X,"Baby Tracker&reg; - Daily Childcare Journal, Schedule Log"
8,0974671517,Wee Gallery Twins Board Book
9,0980027519,Nature's Lullabies First and Second Year Calendars


In [44]:
# Read the clustering result CSV (header row, inferred column types), drop its `_c0`
# column and any row with missing values, then attach the titles by joining on `asin`.
# NOTE(review): dropna() runs before the join, so it does not remove rows whose joined
# `title` turns out to be null.
clusteredProductMetaDF = (spark
                          .read
                          .format('com.databricks.spark.csv')
                          .options(header='true', inferschema='true')
                          .load('../../Datasets/baby_2000_2_cluster_df.csv')
                          .drop('_c0')
                          .dropna()
                          .join(titlesDF, 'asin')
                         )
# Preview the first rows as a pandas DataFrame
displayDF(clusteredProductMetaDF)

Unnamed: 0,asin,clusterId,title
0,B00029TCNU,48,"Stork Craft Beatrice 3 Drawer Chest, White"
1,B00029TCS0,48,"Stork Craft Beatrice Combo Tower Chest, White"
2,B00029TCQM,48,"Stork Craft Beatrice 5 Drawer Chest, White"
3,B00029TCP8,247,"Stork Craft Beatrice 4 Drawer Chest, White"
4,B0002AASH8,527,Comfort U Pillow -For Total Body Support
5,B0002ADQOU,908,Graco Seatbelt Locking Clip
6,B0002AZ05S,376,Child to Cherish ''A Block To Grow On'' in Pink
7,B0002AZ080,1629,Child to Cherish Baby To Bride Bracelet
8,B0002C6EXI,1756,"FunToSee Undersea Adventure Nursery and Bedroom Make-Over Decal Kit, Underwater"
9,B0002DF4EC,593,Kettler Baby Swing Seat - Kettler 8355-000


In [45]:
# Build one row per cluster holding its product count and its most frequent title words
topWordsDF = (clusteredProductMetaDF
              # One row per product: its cluster and the distinct, stopword-free title tokens
              .select('clusterID', tokenize_set_and_filter_stopwords('title').alias('tokens'))
              .groupBy('clusterID')
              # Per cluster: number of products and the list of all their token lists
              .agg(F.count('clusterID').alias('productCount'), F.collect_list('tokens').alias('tokens'))
              # Reduce the token lists to (token, frequency) pairs of the most frequent words
              .select('clusterID', 'productCount', analyze_word_frequency('tokens').alias('topWordsAndFreq'))
             )
# Preview the first rows as a pandas DataFrame
displayDF(topWordsDF)

Unnamed: 0,clusterID,productCount,topWordsAndFreq
0,148,1,"[(maclaren, 1.0), (stroller, 1.0), (techno, 1.0), (black, 1.0), (champagne, 1.0), (xlr, 1.0)]"
1,471,1,"[(play, 1.0), (snack, 1.0), (star, 1.0), (travel, 1.0), (kids, 1.0), (tray, 1.0)]"
2,496,1,"[(stretch, 1.0), (single, 1.0), (2, 1.0), (fitted, 1.0), (diaper, 1.0), (bummis, 1.0), (bamboozle, 1.0), (size, 1.0)]"
3,1342,6,"[(crib, 0.5), (sheet, 0.5), (baby, 0.333333343267), (summer, 0.166666671634), (infant, 0.166666671634), (set, 0.166666671634), (sweet, 0.166666671634), (quickzip, 0.166666671634), (bedtime, 0.166666671634), (pacifier, 0.166666671634)]"
4,1959,1,"[(mobile, 1.0), (price, 1.0), (friendly, 1.0), (musical, 1.0), (fisher, 1.0), (firsts, 1.0), (response, 1.0), (smart, 1.0)]"
5,392,34,"[(bag, 0.558823525906), (baby, 0.147058829665), (travel, 0.147058829665), (black, 0.117647059262), (diaper, 0.117647059262), (davinci, 0.117647059262), (pink, 0.088235296309), (jogger, 0.0588235296309), (rose, 0.0588235296309), (snoopy, 0.0588235296309)]"
6,623,1,"[(charcoal, 1.0), (sip, 1.0), (extra, 1.0), (perego, 1.0), (30, 1.0), (base, 1.0), (viaggio, 1.0), (peg, 1.0), (primo, 1.0)]"
7,1084,1,"[(brown, 1.0), (set, 1.0), (natural, 1.0), (flow, 1.0), (newborn, 1.0), (feeding, 1.0), (dr, 1.0)]"
8,1127,2,"[(nature, 1.0), (sleepy, 1.0), (safari, 1.0), (purest, 1.0), (set, 0.5), (crib, 0.5), (stacker, 0.5), (diaper, 0.5), (piece, 0.5), (4, 0.5)]"
9,1460,2,"[(blue, 0.5), (advance, 0.5), (car, 0.5), (comfort, 0.5), (lx, 0.5), (bouncer, 0.5), (boppy, 0.5), (cradle, 0.5), (triumph, 0.5), (convertible, 0.5)]"


### Clusters that have only 1 product

In [46]:
# Number of clusters holding exactly one product, out of the total number of clusters
print('{} / {}'.format(
    topWordsDF.filter(F.col('productCount') == 1).count(),
    topWordsDF.count()
))

461 / 1500


In [51]:
# Preview the clusters that contain more than two products
displayDF(topWordsDF.filter(F.col('productCount') > 2))

Unnamed: 0,clusterID,productCount,topWordsAndFreq
0,148,11,"[(set, 0.818181812763), (tadpoles, 0.727272748947), (playmat, 0.727272748947), (ft, 0.54545456171), (sq, 0.54545456171), (16, 0.363636374474), (pink, 0.272727280855), (maclaren, 0.181818187237), (black, 0.181818187237), (champagne, 0.181818187237)]"
1,463,4,"[(may, 1.0), (vary, 1.0), (colors, 1.0), (2, 1.0), (pacifier, 1.0), (mam, 1.0), (months, 0.75), (silicone, 0.75), (pack, 0.75), (count, 0.25)]"
2,1238,3,"[(pack, 1.0), (instant, 0.666666686535), (travel, 0.666666686535), (heat, 0.666666686535), (warmer, 0.666666686535), (bottle, 0.666666686535), (bambinoz, 0.666666686535), (closer, 0.333333343267), (tommee, 0.333333343267), (feeding, 0.333333343267)]"
3,1342,11,"[(crib, 0.54545456171), (sheet, 0.54545456171), (baby, 0.272727280855), (summer, 0.181818187237), (infant, 0.181818187237), (set, 0.181818187237), (lambs, 0.181818187237), (bedding, 0.181818187237), (ivy, 0.181818187237), (kidsline, 0.0909090936184)]"
4,1959,4,"[(inflatable, 0.5), (bathtub, 0.5), (street, 0.25), (firsts, 0.25), (blue, 0.25), (ergobaby, 0.25), (performance, 0.25), (white, 0.25), (friendly, 0.25), (disney, 0.25)]"
5,392,80,"[(bag, 0.612500011921), (diaper, 0.33750000596), (black, 0.1875), (baby, 0.13750000298), (travel, 0.10000000149), (quinny, 0.0874999985099), (large, 0.0750000029802), (blue, 0.0750000029802), (pink, 0.0750000029802), (gerber, 0.0625)]"
6,540,3,"[(jojo, 1.0), (designs, 1.0), (set, 1.0), (wall, 1.0), (sweet, 1.0), (decal, 1.0), (4, 1.0), (sheets, 1.0), (stickers, 1.0), (ladybug, 0.333333343267)]"
7,1127,6,"[(purest, 1.0), (nature, 1.0), (sleepy, 0.666666686535), (safari, 0.666666686535), (set, 0.333333343267), (comfort, 0.333333343267), (4, 0.333333343267), (hug, 0.333333343267), (complete, 0.333333343267), (piece, 0.333333343267)]"
8,1721,14,"[(nursery, 0.928571403027), (mobiles, 0.928571403027), (flensted, 0.928571403027), (mobile, 0.357142865658), (5, 0.214285716414), (olephant, 0.0714285746217), (scandinavian, 0.0714285746217), (panda, 0.0714285746217), (sheep, 0.0714285746217), (monkey, 0.0714285746217)]"
9,31,5,"[(cosco, 1.0), (seat, 1.0), (booster, 1.0), (juvenile, 0.800000011921), (pronto, 0.800000011921), (positioning, 0.800000011921), (belt, 0.800000011921), (car, 0.600000023842), (el, 0.20000000298), (highrise, 0.20000000298)]"


## Visualizations

In [32]:
def plot_2D_histogram(DF):
    '''
    Plot, per cluster, the frequency of its most common title word against its product count.

    The figure combines a scatter of the individual clusters, a 2D histogram contour of the
    same points and one marginal histogram along each axis. It is rendered inline with
    plotly's offline `iplot`.

    Input:
        DF: A Spark DataFrame with a `topWordsAndFreq` column (a list of structs exposing a
            `frequency` field, most frequent word first) and a `productCount` column.
    Returns:
        Whatever `pyo.iplot` returns for the rendered figure.
    '''
    def create_colourscale():
        '''
        Create a custom colorscale: the 9 sequential 'Blues' colours at steps of 0.125.
        '''
        blues = cl.scales['9']['seq']['Blues']

        return [[0.125 * i, blues[i]] for i in range(9)]

    # Collect both columns in a single pass so that every x value is paired with the y
    # value of the same row; two independent collects do not guarantee matching row order.
    rows = DF.select('topWordsAndFreq', 'productCount').collect()

    # Clusters without any remaining word have no top frequency to plot; skip them
    # rather than failing on the [0] lookup.
    points = [(row[0][0]['frequency'], row[1]) for row in rows if row[0]]
    x_val = [frequency for frequency, _ in points]
    y_val = [product_count for _, product_count in points]

    # One marker per cluster
    scatterTrace = go.Scatter(
        x = x_val,
        y = y_val,
        mode = 'markers',
        name = 'points',
        marker = dict(
            color = 'rgb(60,121,214)',
            size = 2
        )
    )

    # Density of the same points as a filled contour
    hist2DTrace = go.Histogram2dcontour(
        x = x_val,
        y = y_val,
        ncontours = 20,
        colorscale = create_colourscale(),
        reversescale = False,
        showscale = True,
        histnorm = 'probability'
    )

    # Marginal histograms, drawn on the secondary axes along the top and right edges
    xHistTrace = go.Histogram(
        x = x_val,
        marker = dict(color='rgb(60,121,214)'),
        yaxis = 'y2'
    )
    yHistTrace = go.Histogram(
        y = y_val,
        marker = dict(color='rgb(60,121,214)'),
        xaxis = 'x2'
    )
    data = [scatterTrace, hist2DTrace, xHistTrace, yHistTrace]

    layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        # Main plot occupies the lower-left 85% of the figure
        xaxis=dict(
            title='Frequency of most common word',
            domain=[0, 0.85],
            showgrid=False,
            zeroline=False
        ),
        yaxis=dict(
            title='Products in cluster',
            domain=[0, 0.85],
            showgrid=False,
            zeroline=False
        ),
        margin=dict(
            t=50
        ),
        hovermode='closest',
        bargap=0,
        # Remaining 15% along each edge holds the marginal histograms
        xaxis2=dict(
            domain=[0.85, 1],
            showgrid=False,
            zeroline=False
        ),
        yaxis2=dict(
            domain=[0.85, 1],
            showgrid=False,
            zeroline=False
        )
    )

    figure = go.Figure(data=data, layout=layout)
    return pyo.iplot(figure)

In [33]:
# Render the top-word frequency versus product count figure for all clusters
plot_2D_histogram(topWordsDF)

In [47]:
# NOTE(review): same call as the previous cell; consider keeping only one of the two
plot_2D_histogram(topWordsDF)