# Introduction
This `Python 2` / `PySpark` script analyzes the effectiveness of our clustering algorithm by finding the most common words in the titles of all products within each cluster.

# Notebook Setup

## Initialise modules

In [76]:
import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pymongo
import pandas as pd

import gzip # To parse gzip file
import re # Regex for text processing
import os # For setting up Mongo-Spark connector
import csv # To read/write CSV files

from collections import Counter # To count the word frequencies

import plotly
import plotly.offline as pyo
import plotly.graph_objs as go

## Initialise PySpark session

Load `MongoDB-Spark` connector when starting up `PySpark`.

In [2]:
# Connector package to load and the driver memory to request when PySpark starts
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'
dedicated_memory = '4g'

# Assemble the submit arguments; this has to be in the environment before the
# SparkSession is created further down.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages {} --driver-memory {} pyspark-shell'
    .format(packages, dedicated_memory)
)

In [3]:
# Find SPARK_HOME
# Let findspark locate the Spark installation (SPARK_HOME)
findspark.init()

# Create the notebook's SparkSession, or reuse the one that already exists
spark = pyspark.sql.SparkSession.builder.appName('WordProcessing').getOrCreate()

## Initiate Plotly Offline notebook mode

In [60]:
# Switch Plotly to offline notebook mode; the figure drawn later with `pyo.iplot` relies on it.
pyo.init_notebook_mode(connected=True)

## Configure Pandas HTML display

In [4]:
# NOTE(review): -1 presumably lifts the column-width cap so long cell values
# (e.g. product titles) are shown in full - confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

## Define helper methods

In [5]:
def parse(path):
    '''
    Lazily read a gzip-compressed file that holds one record per line.

    Input:
        path: Path to the `.json.gz` file.
    Yields:
        The object obtained by evaluating each line, in file order.
    '''
    # Use a context manager so the archive is closed once iteration finishes
    # (or the generator is closed early) instead of leaking the file handle.
    with gzip.open(path, 'rb') as gz_file:
        for line in gz_file:
            # SECURITY NOTE(review): `eval` executes whatever expression the
            # line contains. Only feed this trusted dumps; consider a stricter
            # parser if every line is known to be a plain literal.
            yield eval(line)

def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Unzip the json.gz file at `path` and load its records into a Mongo collection.

    Inputs:
        path: Path to the json.gz file, read through `parse`.
        coll: Name of the target collection.
        db: Name of the target database.
        create_index: If False, return right after the import. If True, fall
            through to the index-creation step (currently commented out).
    Returns:
        None. Progress and errors are reported by printing.
    '''
    # Obtain handle to Mongo database and collection
    client = pymongo.MongoClient()

    # Every exit path below goes through `finally`, so the client connection
    # is always released (previously it leaked on the "already exists" return
    # and on the success path with create_index=True).
    try:
        collection = client[db][coll]

        # Return prematurely if database.collection already exists
        # NOTE(review): `Collection.count()` is deprecated in newer pymongo
        # releases - confirm the installed version before changing it.
        if collection.count() != 0:
            print('{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll))
            return

        # Insert datapoints into Mongo database
        try:
            collection.insert_many(parse(path))
            print('JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll))
        except Exception as e:
            print('Error loading data.\n{}'.format(e))
            return

        if not create_index:
            return

        # Create database index for improved searching
        # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])
    finally:
        client.close()

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Read the Mongo collection `db.coll` into the Spark session.

    Inputs:
        coll: Name of the Mongo collection.
        db: Name of the Mongo database.
    Returns:
        The Spark DataFrame, or None (after printing the error) if loading fails.
    '''
    try:
        mongo_uri = 'mongodb://127.0.0.1/{}.{}'.format(db, coll)
        reader = spark.read.format('com.mongodb.spark.sql.DefaultSource')
        return reader.option('uri', mongo_uri).load()
    except Exception as err:
        print('Failed to create Spark dataframe.\n{}'.format(err))
        return None

def displayDF(sparkDF, n=10):
    '''
    Preview a Spark DataFrame as a pandas DataFrame.

    Inputs:
        sparkDF: The Spark DataFrame to preview.
        n: Number of leading rows to keep.
    Returns:
        A pandas DataFrame of the first `n` rows, without the `_id` and
        `unixReviewTime` columns.
    '''
    preview = sparkDF.limit(n)
    preview = preview.drop('_id', 'unixReviewTime')
    return preview.toPandas()

In [6]:
# Load the Baby reviews archive into the `hackon.baby` Mongo collection;
# `import_to_mongo` returns early when that collection already holds documents.
import_to_mongo('../../Datasets/reviews_Baby.json.gz', coll='baby')

hackon.baby already exists on MongoDisk server. Exiting without loading JSON data.


## Load stopwords
A list of stopwords is loaded as a Python list and broadcast in PySpark.

In [11]:
# Load stopwords into a flat list (every cell of every CSV row is one stopword)
with open('stopwords.csv', 'r') as csvFile:
    stopwords = [word for row in csv.reader(csvFile) for word in row]

# Add '' to stopwords
stopwords.append('')

# Broadcast stopwords as a frozenset: consumers only test membership, and a
# hash lookup avoids scanning the whole list for every candidate word.
stopwords_broadcast = spark.sparkContext.broadcast(frozenset(stopwords))

print('List of stopwords:\n\n{}'.format(stopwords))

List of stopwords:

['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'with', 'had', 'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 'now', 'him', 'nor', 'did', 'these', 't', 'each', 'where', 'because', 'doing', 'theirs', 'some', 'are', 'our', 'ourselves', 'out', 'what', 'for', 'below', 'does', 'above', 'between', 'she', 'be', 'we', 'after', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 'down', 'your', 'from', 'her', 'whom', 'there', 'been', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', 'that', 'but', 'off', 'herself', 'than', 'those', 'he', 'me', 'myself', 'this', 'up', 'will', 'while', 'can', 'were', 'my', 'and', 'then', 'is', 'in', 'am', 'it', 'an', 'as', 'itself', 'at', 'have', 'further', 'their', 'if', 'again', 'no', 'when', 'same', 'any', 'how', 'other', 'which', 'you', 'who', 'most', 'such', 'why', 'a', 'don', 'i', 'having', 'so', 

# Cluster Assessment Script
The following code analyzes how frequently words appear in the product titles of a given cluster. This is used as a tentative proxy for clustering effectiveness.

In [104]:
# Load stopwords into a flat list (every cell of every CSV row is one stopword)
# NOTE(review): this repeats the "Load stopwords" cell earlier in the notebook;
# presumably kept so this cell can be run on its own - confirm before removing.
with open('stopwords.csv', 'r') as csvFile:
    stopwords = [word for row in csv.reader(csvFile) for word in row]

# Add '' to stopwords, because splitting on `\W+` below can produce empty tokens
stopwords.append('')

# Broadcast stopwords as a frozenset: the UDF below only tests membership, and
# a hash lookup avoids scanning the whole list for every candidate word.
stopwords_broadcast = spark.sparkContext.broadcast(frozenset(stopwords))

@F.udf(returnType=T.ArrayType(T.StringType()))
def tokenize_set_and_filter_stopwords(text):
    '''
    Tokenize a string into unique lower-case words and remove the stopwords.

    Input:
        text: A string, or None for a null column value.
    Returns:
        A list of the distinct words in `text` (split on runs of non-word
        characters) that are not in the broadcast stopwords. The list order
        follows set iteration order. Returns an empty list for None.
    '''
    # A null value reaches the UDF as None; treat it as having no tokens
    # instead of failing the whole Spark job on `None.lower()`.
    if text is None:
        return []

    # Resolve the broadcast value once rather than once per candidate word
    stopword_lookup = stopwords_broadcast.value
    unique_words = set(re.split(r'\W+', text.lower()))

    return [word for word in unique_words if word not in stopword_lookup]

@F.udf(returnType=T.IntegerType())
def assign_clusterID():
    '''
    Assign a pseudo-random cluster ID.

    Returns:
        An integer between 0 and 10, both ends inclusive.
    '''
    # `random` is not among the notebook's top-level imports, so calling this
    # UDF used to raise NameError; import it where it is used.
    import random

    return random.randint(0, 10)

@F.udf(returnType=T.ArrayType(T.StructType([
    T.StructField('token', T.StringType()),
    T.StructField('frequency', T.FloatType())
])))
def analyze_word_frequency(tokens, N=10):
    '''
    Calculate how often each word occurs across the product titles of a cluster.

    Inputs:
        tokens: A list of lists of words, one inner list per product title.
        N: Maximum number of words to return.
    Returns:
        A list of (token, frequency) tuples for the top N words, sorted by
        decreasing frequency. `frequency` is the word's total count divided by
        the number of inner lists. Returns an empty list when `tokens` is
        empty or None.
    '''
    # Nothing to count; this also avoids dividing by zero below
    if not tokens:
        return []

    counter = Counter(word for words in tokens for word in words)
    # Float denominator so the division is never integer division
    product_count = float(len(tokens))

    return [(token, count / product_count)
            for token, count in counter.most_common()[:N]]

## Import clustered products metadata

In [13]:
# Read the clustered product metadata CSV (with header row and inferred column types)
clusteredProductMetaDF = (
    spark.read
    .format('com.databricks.spark.csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .load('../../Datasets/baby_4000_cluster_df.csv')
    .drop('_c0')   # presumably the unnamed index column of the CSV - TODO confirm
    .dropna()      # discard rows that contain a null value
)

# Preview the first rows
displayDF(clusteredProductMetaDF)

Unnamed: 0,asin,clusterID,title
0,B000056JES,93,LITTLE SUZY'S ZOO Plush Crib MUSICAL MOBILE
1,B00005BTBH,2610,Sassy Rocking Horse Suction Cup Toy
2,B00006JZ7J,961,Koala Baby - Cotton Rib Blanket - Sage
3,B0000D8SIC,3240,Prince lionheart Ever-Fresh Replacement Pillow
4,B0000DEWH8,3078,"Badger Basket Natural Moses Basket with Hood, Sage Waffle Bedding"
5,B0000TQF9G,1629,LG navy CANVAS soft STORAGE Chest trunk collapsible NU
6,B0006HBS1M,104,Medela Harmony Manual Breast Pump
7,B00078ZHPS,1338,"Summer Infant Ultra Plush Change Pad Cover, Blue"
8,B0009Y8JNW,431,Kidco Y Spindle
9,B000BU4CHQ,130,Jeep Wrangler Twin Sport All-Weather Umbrella Stroller


In [105]:
topWordsDF = (
    clusteredProductMetaDF
    # One row per product: its cluster and the filtered tokens of its title
    .select('clusterID', tokenize_set_and_filter_stopwords('title').alias('tokens'))
    # One row per cluster: number of products and the list of their token lists
    .groupBy('clusterID')
    .agg(
        F.count('clusterID').alias('productCount'),
        F.collect_list('tokens').alias('tokens')
    )
    # Reduce each cluster's token lists to its top words and their frequencies
    .select('clusterID', 'productCount', analyze_word_frequency('tokens').alias('topWordsAndFreq'))
)

# Preview the first rows
displayDF(topWordsDF)

Unnamed: 0,clusterID,productCount,topWordsAndFreq
0,148,2,"[(plug, 1.0), (set, 1.0), (gift, 1.0), (baby, 1.0), (handmade, 1.0), (free, 1.0), (cell, 1.0), (phone, 1.0), (plus, 1.0), (dust, 1.0)]"
1,463,2,"[(color, 1.0), (video, 1.0), (monitor, 1.0), (interference, 1.0), (2, 1.0), (4, 1.0), (digital, 1.0), (free, 1.0), (alert, 1.0), (baby, 1.0)]"
2,471,15,"[(smart, 0.40000000596), (living, 0.333333343267), (textiles, 0.266666680574), (swaddle, 0.266666680574), (muslin, 0.266666680574), (baby, 0.266666680574), (style, 0.20000000298), (white, 0.20000000298), (waffle, 0.133333340287), (wrap, 0.133333340287)]"
3,833,5,"[(style, 1.0), (trend, 1.0), (bag, 1.0), (lab, 1.0), (deluxe, 1.0), (diaper, 1.0), (duffle, 0.800000011921), (blue, 0.20000000298), (gray, 0.20000000298), (floral, 0.20000000298)]"
4,1238,2,"[(baby, 1.0), (goo, 1.0), (ah, 1.0), (kneekers, 1.0), (leg, 0.5), (lean, 0.5), (frog, 0.5), (lime, 0.5), (hoppy, 0.5), (chunkalicious, 0.5)]"
5,1580,14,"[(crib, 1.0), (sheet, 1.0), (anais, 0.857142865658), (muslin, 0.857142865658), (aden, 0.857142865658), (classic, 0.5), (solid, 0.428571432829), (blue, 0.214285716414), (pink, 0.142857149243), (ultimate, 0.142857149243)]"
6,1591,24,"[(baby, 0.25), (toy, 0.166666671634), (rattle, 0.166666671634), (center, 0.166666671634), (rocker, 0.125), (go, 0.125), (mamaroo, 0.125), (3, 0.125), (pillow, 0.125), (einstein, 0.125)]"
7,1645,34,"[(pack, 0.235294118524), (pacifier, 0.205882355571), (pad, 0.205882355571), (cover, 0.205882355571), (baby, 0.176470592618), (changing, 0.176470592618), (soft, 0.147058829665), (pink, 0.147058829665), (essentials, 0.117647059262), (muslin, 0.117647059262)]"
8,2122,4,"[(kiwi, 0.25), (carabiner, 0.25), (clip, 0.25), (ah, 0.25), (moto, 0.25), (locks, 0.25), (hassle, 0.25), (go, 0.25), (lifetime, 0.25), (investment, 0.25)]"
9,2366,2,"[(blue, 1.0), (nasal, 1.0), (babycomfynose, 1.0), (colors, 1.0), (choose, 1.0), (aspirator, 1.0), (magenta, 1.0)]"


In [108]:
def plot_2D_histogram(DF):
    '''
    Plot the joint distribution of top-word frequency and cluster size.

    The main panel is a 2D histogram contour with the frequency of each
    cluster's first `topWordsAndFreq` entry on x and its `productCount` on y.
    Marginal histograms of x and y are drawn along the top and right edges.

    Input:
        DF: A Spark DataFrame with a `productCount` column and a
            `topWordsAndFreq` column (array of structs with a `frequency` field).
    Returns:
        The result of `pyo.iplot` for the assembled figure.
    '''
    # Collect both columns in a single action so every x value stays paired
    # with the y value of the same row. Two separate collects give no such
    # guarantee and evaluate the pipeline twice.
    rows = DF.select('topWordsAndFreq', 'productCount').collect()

    x_val = []
    y_val = []
    for row in rows:
        top_words = row['topWordsAndFreq']
        # Skip clusters with no top word (e.g. every title token was a stopword)
        if not top_words:
            continue
        x_val.append(top_words[0]['frequency'])
        y_val.append(row['productCount'])

    # Marginal histogram of x, drawn in the strip above the main panel
    horHist = go.Histogram(
        x=x_val,
        yaxis='y2'
    )

    # Marginal histogram of y, drawn in the strip right of the main panel.
    # It must bin along y: passing the values as `x` would plot them against
    # the main panel's y axis instead of alongside it.
    vertHist = go.Histogram(
        y=y_val,
        xaxis='x2'
    )

    # Define 2D Histogram Contour trace
    contourHist = go.Histogram2dContour(
        x=x_val,
        y=y_val,
        histnorm='probability'
    )

    # A plain list of traces is accepted by `go.Figure`; `go.Data` is deprecated
    data = [horHist, vertHist, contourHist]

    # Main panel occupies the lower-left 85% of the figure; the secondary axes
    # (x2 / y2) take the remaining strips for the marginal histograms.
    layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        xaxis=dict(
            title='Frequency of most common word',
            domain=[0, 0.85],
            showgrid=False,
            zeroline=False
        ),
        yaxis=dict(
            title='Products in cluster',
            domain=[0, 0.85],
            showgrid=False,
            zeroline=False
        ),
        margin=dict(
            t=50
        ),
        hovermode='closest',
        bargap=0,
        xaxis2=dict(
            domain=[0.85, 1],
            showgrid=False,
            zeroline=False
        ),
        yaxis2=dict(
            domain=[0.85, 1],
            showgrid=False,
            zeroline=False
        )
    )

    figure = go.Figure(data=data, layout=layout)

    return pyo.iplot(figure)

In [109]:
# Draw the top-word-frequency vs. product-count figure for the per-cluster summary built above.
plot_2D_histogram(topWordsDF)