# Introduction
This `Python 2` / `PySpark` notebook creates a Plotly animation of how product clusters move over five recent years (2010 - 2014).

# Notebook Setup

## Initialise modules

In [159]:
import findspark
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pymongo
import pandas as pd

import gzip # To parse gzip file
import os # For setting up Mongo-Spark connector

import plotly
import plotly.offline as pyo
import plotly.graph_objs as go

## Initialise PySpark session

Load `MongoDB-Spark` connector when starting up `PySpark`.

In [2]:
# Connector package handed to `--packages`, and the memory handed to `--driver-memory`
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'
dedicated_memory = '4g'

# Assemble the submit arguments that are exported for PySpark's start-up
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--packages', packages,
    '--driver-memory', dedicated_memory,
    'pyspark-shell',
])

In [3]:
# Locate SPARK_HOME before touching the pyspark API
findspark.init()

# Obtain the SparkSession used by the rest of the notebook
spark = pyspark.sql.SparkSession.builder.appName('ClusterAnimation').getOrCreate()

## Initiate Plotly Offline notebook mode

In [4]:
# Put Plotly's offline module into notebook mode so that `pyo.iplot` can draw inside the notebook.
# NOTE(review): `connected=True` presumably loads plotly.js from the web instead of embedding it - confirm.
pyo.init_notebook_mode(connected=True)

## Configure Pandas HTML display

In [5]:
# Configure how wide a column pandas renders before truncating cell contents.
# NOTE(review): -1 looks like the legacy "no limit" value; recent pandas releases
# expect None instead - confirm before upgrading pandas.
pd.set_option('display.max_colwidth', -1)

## Define helper methods

In [16]:
def parse(path):
    '''
    Unzip a json.gz at `path` and return a generator over its records.

    Inputs:
        path: Location of a gzip-compressed file holding one record per line.

    Yields:
        The Python object obtained by evaluating each line of the file.

    The underlying file is closed as soon as the generator is exhausted or closed.
    '''
    with gzip.open(path, 'rb') as g:
        for line in g:
            # WARNING (security): `eval` executes whatever Python expression the line
            # contains, so this must only be pointed at trusted files. It is kept
            # because callers depend on lines being parsed as Python literals.
            yield eval(line)

def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Unzip the json.gz file at `path` and load its records into the Mongo collection `db.coll`.

    Inputs:
        path: Location of the json.gz file, read through `parse`.
        coll: Name of the target collection.
        db: Name of the target database.
        create_index: Reserved for creating a search index after the import.
            Index creation is currently disabled (see the commented-out call below),
            so this flag has no effect yet.

    Nothing is imported when the collection already contains documents.
    Import errors are printed rather than raised.
    The Mongo client is closed on every exit path.
    '''
    # Obtain handle to Mongo database and collection
    client = pymongo.MongoClient()
    try:
        collection = client[db][coll]

        # Return prematurely if database.collection already exists
        if collection.count() != 0:
            print('{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll))
            return

        # Insert datapoints into Mongo database
        try:
            collection.insert_many(parse(path))
            print('JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll))
        except Exception as e:
            print('Error loading data.\n{}'.format(e))
            return

        if not create_index:
            return

        # Create database index for improved searching
        # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])
    finally:
        # Previously the client leaked on the "already exists" and success paths
        client.close()

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Read the Mongo collection `db.coll` from the local Mongo server through the
    notebook's `spark` session.

    Returns the resulting Spark DataFrame. If loading fails, the error is printed
    and None is returned.
    '''
    try:
        reader = spark.read.format('com.mongodb.spark.sql.DefaultSource')
        mongo_uri = 'mongodb://127.0.0.1/{}.{}'.format(db, coll)
        return reader.option('uri', mongo_uri).load()
    except Exception as e:
        print('Failed to create Spark dataframe.\n{}'.format(e))
        return None

def displayDF(sparkDF, n=10):
    '''
    Print the row count of `sparkDF` and return its first `n` rows as a pandas DataFrame.

    The '_id' and 'unixReviewTime' columns are left out of the returned preview.
    '''
    row_count = sparkDF.count()
    print('Count: {}'.format(row_count))

    preview = sparkDF.limit(n).drop('_id', 'unixReviewTime')
    return preview.toPandas()

# Cluster Animation Script
The following code creates a Plotly animation of how each cluster's position (as given by our `MF` and `UD` indices) moves over time.

## Import clustered products metadata

In [160]:
# Define schema: one row per cluster, with one MF and one UD score column per year
schema_list = [
    T.StructField('clusterID', T.StringType()),
    T.StructField('productCount', T.IntegerType()),
]
schema_list.extend([T.StructField('MF{}'.format(year), T.FloatType()) for year in range(2010, 2015)])
schema_list.extend([T.StructField('UD{}'.format(year), T.FloatType()) for year in range(2010, 2015)])

# StructType takes the list of fields directly; wrapping it in a second StructType
# left the outer schema's `fields` attribute as a StructType instead of a list.
schema = T.StructType(schema_list)

In [157]:
# Toy clusters, one tuple per cluster in schema order:
# (clusterID, productCount, MF2010..MF2014, UD2010..UD2014)
# clusterID is written as a string because the schema declares it as StringType.
toy_data = [
    ('31', 2, 0.1, 0.2, 0.4, 0.2, 0.5, 0.8, 0.9, 0.4, 0.6, 0.5),
    ('1', 10, 0.3, 0.8, 0.3, 0.3, 0.3, 0.5, 0.9, 0.3, 0.6, 0.5),
    ('53', 29, 0.2, 0.7, 0.8, 0.2, 0.5, 0.1, 0.2, 0.1, 0.9, 0.5),
    ('19', 16, 0.4, 0.2, 0.1, 0.2, 0.8, 0.8, 0.7, 0.4, 0.3, 0.5),
    ('24', 24, 0.7, 0.7, 0.3, 0.6, 0.1, 0.8, 0.9, 0.9, 0.1, 0.5)
]
metricDF = spark.createDataFrame(toy_data, schema=schema)
displayDF(metricDF)

Count: 5


Unnamed: 0,clusterID,productCount,MF2010,MF2011,MF2012,MF2013,MF2014,UD2010,UD2011,UD2012,UD2013,UD2014
0,31,2,0.1,0.2,0.4,0.2,0.5,0.8,0.9,0.4,0.6,0.5
1,1,10,0.3,0.8,0.3,0.3,0.3,0.5,0.9,0.3,0.6,0.5
2,53,29,0.2,0.7,0.8,0.2,0.5,0.1,0.2,0.1,0.9,0.5
3,19,16,0.4,0.2,0.1,0.2,0.8,0.8,0.7,0.4,0.3,0.5
4,24,24,0.7,0.7,0.3,0.6,0.1,0.8,0.9,0.9,0.1,0.5


## Animate clusters

In [161]:
def _cluster_animation_layout():
    '''
    Return the static part of the figure layout: fixed axes, title and Play / Pause buttons.
    '''
    play_button = {
        'label': 'Play', # Button label
        'method': 'animate', # Method name
        'args': [None,
                 { # Args determines which frames to animate
                     'frame': {'duration': 500, 'redraw': False},
                     'fromcurrent': True,
                     'transition': {'duration': 300, 'easing': 'quadratic-in-out'}
                 }]
    }
    pause_button = {
        'label': 'Pause',
        'method': 'animate',
        'args': [[None],
                 { # '[None]' ensures proper 'pause' functionality
                     'frame': {'duration': 0, 'redraw': False},
                     'mode': 'immediate',
                     'transition': {'duration': 0}
                 }]
    }
    return {
        'width': 900,
        'height': 900,
        # x carries the MF score and y the UD score of each cluster
        'xaxis': {'range': [0, 1.2], 'autorange': False, 'title': 'MF'},
        'yaxis': {'range': [0, 1.2], 'autorange': False, 'title': 'UD'},
        'title': 'Cluster Performance',
        'showlegend': True,
        'updatemenus': [{
            'type': 'buttons',
            'direction': 'left', # Arrange placement of buttons
            'pad': {'r': 10, 't': 87}, # Right and top padding
            'showactive': False, # Removes highlight from active button
            'x': 0, # Button positions
            'y': 0, # Button positions
            'xanchor': 'right',
            'yanchor': 'top',
            'buttons': [play_button, pause_button]
        }]
    }


def _year_slider(year_labels):
    '''
    Return the slider definition with one step per entry of `year_labels`.
    Each step animates to the frame whose name equals its label.
    '''
    slider_steps = []
    for label in year_labels:
        slider_steps.append({
            'args': [
                [label],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': label,
            'method': 'animate'
        })

    return {
        'active': 0, # Slider knob's relative starting location
        'pad': {'b': 10, 't': 50}, # Bottom and top padding
        'len': 0.9, # Slider length
        'x': 0.1, # Slider x-position
        'y': 0, # Slider y-position
        'yanchor': 'top',
        'xanchor': 'left',
        'currentvalue': { # Displays current value selected by slider
            'font': {'size': 20},
            'prefix': 'Year ',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'steps': slider_steps
    }


def animate_clusters(DF, filename=''):
    '''
    Create an animated plot of product clusters moving along the MF-UD space over each year.

    Inputs:
        DF: A Spark DataFrame containing 'clusterID', 'productCount' and, for every year,
            one 'MF<year>' and one 'UD<year>' score column (e.g. 'MF2010' and 'UD2010').
        filename: Currently unused; kept so that existing calls keep working.

    Returns:
        The result of `pyo.iplot`, which draws the figure in the notebook.

    Raises:
        ValueError: If the MF and UD columns do not cover the same years.
    '''
    # Infer the years to plot from the score columns of the DataFrame that was passed in
    column_names = DF.schema.names
    year_labels = sorted(name[len('MF'):] for name in column_names if name.startswith('MF'))
    ud_labels = sorted(name[len('UD'):] for name in column_names if name.startswith('UD'))

    # Plotting with mismatched score columns would fail later or silently drop years
    if year_labels != ud_labels:
        raise ValueError('MF and UD columns do not cover the same years: {} vs {}'
                         .format(year_labels, ud_labels))

    # Convert Spark DF to pandas
    pandasDF = DF.sort(F.col('clusterID')).toPandas()

    # Obtain number of clusters, C, to plot
    C = pandasDF.shape[0]

    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Create one frame per year, holding one single-point trace per cluster
    frames = []
    for label in year_labels:
        mf_scores = pandasDF['MF' + label]
        ud_scores = pandasDF['UD' + label]

        # Without name, slider will not interact with graph
        frame = {'data': [], 'name': label}

        for c in range(C):
            cluster_id = pandasDF['clusterID'].iloc[c]
            product_count = pandasDF['productCount'].iloc[c]
            mf_score = mf_scores.iloc[c]
            ud_score = ud_scores.iloc[c]

            frame['data'].append({
                'x': [mf_score],
                'y': [ud_score],
                'mode': 'markers',
                'hoverinfo': 'text',
                'name': 'Cluster {}'.format(cluster_id),
                'text': ['ID: {0}<br>Size: {1}<br>MF: {2:.2f}<br>UD: {3:.2f}'
                         .format(cluster_id, product_count, mf_score, ud_score)],
                'marker': {
                    'size': [10 * product_count],
                    # Cycle through the palette so more clusters than colours still plot
                    'color': colour_list[c % len(colour_list)],
                }
            })

        frames.append(frame)

    # Add sliders to layout
    layout = _cluster_animation_layout()
    layout['sliders'] = [_year_slider(year_labels)]

    figure = {
        # The initial view shows the traces of the first year
        'data': list(frames[0]['data']) if frames else [],
        'layout': layout,
        'frames': frames,
    }

    return pyo.iplot(figure)
    

In [162]:
# Draw the animation for the toy clusters held in `metricDF`
animate_clusters(metricDF)