## Packages Import

In [29]:
import pandas as pd # To convert Spark DataFrames to pandas dataframes
import gzip # To unzip GNU-zip files
import findspark # To find SPARK HOME
import pyspark # Spark API 
import pymongo # Mongo driver
import os # To load external PySpark packages
import pyspark.sql.functions as F
import pyspark.sql.types as T
import datetime # Date manipulations

In [2]:
# Maven coordinates of the MongoDB connector that Spark should fetch at start-up
packages = 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.0'

# Hand the coordinates to PySpark through its submit-arguments environment variable
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(['--packages', packages, 'pyspark-shell'])

In [3]:
# Locate SPARK HOME so the pyspark package can be used from this kernel
findspark.init()

# Build the session named 'MyTestSession', or reuse it if one is already running
session_builder = pyspark.sql.SparkSession.builder.appName('MyTestSession')
spark = session_builder.getOrCreate()

## Method definitions

In [5]:
def parse(path):
    '''
    Lazily read the gzip-compressed file at `path`, one record per line.

    Inputs:
        path: Path to a gzip file holding one Python-literal/JSON-like
              record per line.

    Yields:
        The object obtained by evaluating each non-blank line.
    '''
    # The `with` block closes the file handle once the generator is exhausted
    # or discarded; a bare gzip.open() would leave it open.
    with gzip.open(path, 'rb') as g:
        for line in g:
            # Blank lines (e.g. a trailing newline) carry no record and would
            # make eval() raise a SyntaxError, so skip them.
            if not line.strip():
                continue
            # SECURITY: eval() executes whatever expression the line contains.
            # Only feed this function dataset files from a trusted source.
            yield eval(line)
        
def import_to_mongo(path, coll, db='hackon', create_index=True):
    '''
    Load the json.gz file at `path` into the Mongo collection `db`.`coll`.

    Nothing is inserted when the collection already holds documents. Any
    error raised while inserting is printed rather than propagated.

    Inputs:
        path: Path to the gzip-compressed records file (read via `parse`).
        coll: Name of the target collection.
        db: Name of the target database.
        create_index: Intended to toggle index creation. The index statement
                      is currently commented out, so this flag has no effect.

    Returns:
        None
    '''
    # Connects to the default MongoClient() host/port
    client = pymongo.MongoClient()
    try:
        collection = client[db][coll]

        # Return prematurely if database.collection already exists.
        # NOTE: Collection.count() is deprecated since PyMongo 3.7 and removed
        # in 4.0; switch to count_documents({}) when upgrading the driver.
        if collection.count() != 0:
            print('{}.{} already exists on MongoDisk server. Exiting without loading JSON data.'.format(db, coll))
            return

        # Insert datapoints into Mongo database; the generator from parse()
        # is consumed directly by insert_many().
        try:
            collection.insert_many(parse(path))
            print('JSON data successfully imported to Mongo at \'{}.{}.\''.format(db, coll))
        except Exception as e:
            print('Error loading data.\n{}'.format(e))
            return

        if not create_index:
            return

        # Create database index for improved searching (currently disabled)
        # collection.create_index([('asin', pymongo.ASCENDING), ('reviewerID', pymongo.DESCENDING)])
    finally:
        # Release the connection on every exit path, including the early
        # returns above and unexpected exceptions.
        client.close()

def load_mongo_to_spark(coll, db='hackon'):
    '''
    Read the Mongo collection `db`.`coll` into a Spark DataFrame.

    Uses the module-level `spark` session and the Mongo server at 127.0.0.1.

    Inputs:
        coll: Name of the collection to read.
        db: Name of the database holding the collection.

    Returns:
        The Spark DataFrame, or None if loading failed (the error is printed).
    '''
    try:
        mongo_uri = 'mongodb://127.0.0.1/{}.{}'.format(db, coll)
        return (spark
                .read
                .format('com.mongodb.spark.sql.DefaultSource')
                .option('uri', mongo_uri)
                .load())
    except Exception as e:
        print('Failed to create Spark dataframe.\n{}'.format(e))
        # Callers receive None on failure and must check before using the result
        return None
        
def printDF(sparkDF, n=10):
    '''
    Return the first `n` rows of `sparkDF` as a pandas dataframe,
    without the '_id' and 'unixReviewTime' columns.
    '''
    preview = sparkDF.limit(n)
    preview = preview.drop('_id', 'unixReviewTime')
    return preview.toPandas()

In [7]:
# Import the video-game reviews archive into the 'video_games' collection
import_to_mongo(path='./Datasets/reviews_Video_Games_5.json.gz', coll='video_games')

hackon.video_games already exists on MongoDisk server. Exiting without loading JSON data.


In [8]:
# Spark DataFrame read from the 'video_games' Mongo collection
rawDF = load_mongo_to_spark(coll='video_games')

In [27]:
# Preview the reviews ordered by 'helpful', then 'overall', both descending
by_helpfulness = rawDF.sort(['helpful', 'overall'], ascending=[False, False])
printDF(by_helpfulness)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,B007FTE2VW,"[10279, 10533]",1.0,"Fundamentally, SimCity has always been a 'soft...","03 6, 2013",A1DQOJ8PLXVPCO,Malor,What a lousy toy
1,B000FKBCX4,"[8606, 9403]",1.0,See those older 5-star reviews from 2006 (two ...,"09 7, 2008",A3284KYDZ00BZA,Erich Maria Remarque,Dumbed down experience and draconian DRM
2,B000FKBCX4,"[2828, 3249]",1.0,I just got through a massive headache dealing ...,"09 7, 2008",A32G3VY37J3H2C,Tbear,"No Way, No How, No DRM"
3,B000ZKA0J6,"[2445, 2841]",3.0,One can only respect BLIZZARD for not setting ...,"07 27, 2010",A74TA8X5YQ7NE,NeuroSplicer,"WHAT'S THE FREQUENCY, KERRIGAN?"
4,B0009VXAM0,"[2392, 2749]",5.0,"We got our hands on an XBox 360, a PS3, and a ...","11 18, 2006",A3V6Z4RCDGRC44,"Lisa Shea ""be the change you wish to see in t...",Unbiased Review from an Owner of All Three
5,B002I0J4VQ,"[2139, 2255]",5.0,"UPDATE (November, 2010):Before you decide to b...","08 25, 2009",ADLVFFE4VBT8,"A. Dent ""Aragorn""",The PS3 Reloaded [a newer model is now available]
6,B001COU9I6,"[2047, 2083]",5.0,A lot of people have written about the pros an...,"08 21, 2008",A3SX5MZTCZRQLF,Bradford Schmidt,"Great Configuration, Great Console"
7,B000FKBCX4,"[1916, 2271]",1.0,Currently I rate this game bad been following ...,"09 7, 2008",AJG877XPY8A5F,Christian Sandovall,A shame
8,B000MK694E,"[1904, 2169]",1.0,"Let's keep this sweet, organized and fair.UNDI...","08 24, 2007",A74TA8X5YQ7NE,NeuroSplicer,THIS IS THE REVIEW 2K Games DOES NOT WANT YOU...
9,B000FKBCX4,"[1289, 1440]",1.0,I played with the creature creator a bit and h...,"09 7, 2008",A2K9BFRX8DL0K5,"SCF ""Software Developer""",Returning the game...


## PySpark SQL User-Defined Function (UDF) Definitions

In [49]:
def filter_helpful_reviews(sparkDF, pct_helpful, min_votes):
    '''
    Return the rows of `sparkDF` whose review was voted sufficiently helpful.

    A row is kept when its 'helpful' column, a [helpful_votes, total_votes]
    pair, has at least `min_votes` total votes and a helpful share
    (helpful_votes / total_votes) of at least `pct_helpful`.

    Inputs:
        sparkDF: Spark DataFrame with a 'helpful' column.
        pct_helpful: Minimum helpful share as a fraction, e.g. .8 for 80%.
        min_votes: Minimum number of total votes.

    Returns:
        The filtered Spark DataFrame.
    '''
    def _is_helpful(votes):
        '''
        Return True if `votes` meets both thresholds of the enclosing call.

        Inputs:
            votes: A list of votes by [helpful, total_votes]
        '''
        # A missing/short vote pair or zero total votes has no defined helpful
        # share; rejecting it here also avoids a division by zero when
        # min_votes <= 0.
        if not votes or len(votes) < 2 or not votes[1]:
            return False
        helpful_votes, total_votes = votes[0], votes[1]
        if total_votes < min_votes:
            return False
        # float() forces true division under Python 2
        return float(helpful_votes) / total_votes >= pct_helpful

    helpful_udf = F.udf(_is_helpful, T.BooleanType())

    return sparkDF.filter(helpful_udf(F.col('helpful')))

# TODO: a second helper was started here as a bare `def` and never written;
# the stub was a syntax error that kept this cell from running.

In [56]:
# Check which Python type Spark returns for a 'reviewTime' value
first_row = rawDF.select('reviewTime').first()
type(first_row[0])

unicode

In [57]:
import datetime

In [69]:
# Sample date strings in the 'month day, year' layout for trying out parsing
test_dates = ['04 17, 2008', '01 16, 2009', '12 28, 2003', '08 14, 2011', '06 13, 2001']

In [85]:
# Parse each 'month day, year' string and convert it to its proleptic
# Gregorian ordinal. A list comprehension is used so the result is a real
# list under both Python 2 and 3 (map() is lazy in Python 3).
datetime_obj = [datetime.datetime.strptime(datestr, '%m %d, %Y').toordinal()
                for datestr in test_dates]
print(datetime_obj)

[733149, 733423, 731577, 734363, 730649]


In [86]:
# Reviews with at least 5 votes and a helpful share of at least .8,
# ordered by 'unixReviewTime' descending
helpful_reviews = filter_helpful_reviews(rawDF, .8, 5)
printDF(helpful_reviews.sort('unixReviewTime', ascending=False))

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary
0,B00G0OYHCW,"[7, 8]",5.0,So here was my predicament: I didn't really wa...,"07 12, 2014",A3VVMIMMTYQV5F,Johnny Saigon,How I Solved My Mario Kart Bundle Problem
1,B00I9UVY30,"[9, 10]",2.0,I own every EA FIFA World Cup title since Worl...,"07 11, 2014",A2Q2A6JKY95RTP,R. Garrelts,EA Ships Another Game in Beta State - 2014 Wor...
2,B004RMK4BC,"[26, 26]",5.0,It's $50 for the PlayStation Store. The code i...,"07 4, 2014",A17WJ14DHCVPIY,Oland T. Whitecotton,Code works flawlessly
3,B00DC7G2W8,"[4, 5]",5.0,"My 5 yr old son loves to play this game. Okay,...","07 4, 2014",A17WJ14DHCVPIY,Oland T. Whitecotton,Kids and Parents will love this game
4,B00G6MW5CC,"[6, 7]",5.0,I bought the xbox 360 preorder that included g...,"07 2, 2014",AYRQRVL0ZFK5X,Gregory Hessig,"Great for WW2 fans! Much improved, fun"
5,B00GG4BBUM,"[4, 5]",2.0,Just more of the same. I had read that despite...,"06 29, 2014",AH6LK8GMDVH31,"L. Allen ""MIX_MASTER_ICE""",Just more of the same. Nothing impressive that...
6,B00D3RBYM0,"[5, 6]",4.0,I REALLY like this game and I usually don't li...,"06 28, 2014",A3V7F58M4ZXHIF,Night Lord 40K,"Great game "" IF IT WOULD SAVE YOUR PROGRESS "" !!!"
7,B00DHF39KS,"[5, 6]",4.0,"I love the action, story, weapons and overall ...","06 19, 2014",A2WB7LZ595CR50,J. Harley,Love/Hate
8,B00CX7FQ4I,"[21, 26]",2.0,I'm just going to get right to the point. EA U...,"06 18, 2014",A23ZP8NTLCMZQ4,DapperDave,Its the best MMA game out right now for the ne...
9,B00GHJ6X8G,"[5, 6]",2.0,"I'm going to get slammed for this, since there...","06 15, 2014",A1Y200FSB7XBP3,"Ian ""Ian""","Wow, this really sucks!"


In [26]:
# Collect the text of every review whose reviewerName is 'Malor'. Selecting
# the column on the DataFrame avoids routing each row through a Python
# lambda on the RDD.
malor_rows = (rawDF
              .filter(F.col('reviewerName') == 'Malor')
              .select('reviewText')
              .collect())
print(['{}'.format(row['reviewText']) for row in malor_rows])

