In [1]:
from pyspark.sql import SQLContext

# Create a SparkSQL context object.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the PySpark shell/kernel -- confirm before running
# on a fresh kernel.
sqlCtx = SQLContext(sc)

In [2]:
import json

# Load the schema (.avsc) file that describes the records, one list item per line
schemaRdd = sc.textFile('/Users/kdunn/Google Drive/TWC/DDL/schema_34.avsc', 1).collect()

# Merge the list of lines back into a single readable string
mergedString = ' \n'.join(schemaRdd)

# Parse the schema as JSON. The previous approach eval()-ed the text after
# textually replacing null/true/false, which executes whatever the file
# contains and also rewrites those words inside field names and doc strings.
schemaDict = json.loads(mergedString)

# Take the name of every top-level field and put it in a list.
# str() keeps the names as plain (byte) strings on Python 2, where json.loads
# returns unicode, so the template built from them keeps its original type.
topLevelFields = [str(k['name']) for k in schemaDict["fields"]]

In [3]:
# Load the Parquet data
# NOTE(review): this file is named schema_30_* while the schema file loaded in
# the previous cell is schema_34.avsc -- confirm the two versions share the
# same top-level fields, since those fields define the output columns.
data = sqlCtx.parquetFile("/Users/kdunn/Desktop/TWC/schema_30_b6db110d-881c-434a-9746-360ce49c0af7.parquet")

In [55]:
from time import gmtime, strftime
import gzip

# Create a formattable template string (this also defines the db table columns):
# one "%(fieldName)s" placeholder per top-level schema field, tab separated
templateString = "%(" + ")s\t%(".join(topLevelFields) + ")s"

# Default every top-level field in the schema to the 'Null' string.
# This is a concrete list of (field, 'Null') pairs because parquetToTSV builds
# a fresh dict from it for every record; a bare zip() would be a one-shot
# iterator on Python 3. The former f.strip('') call stripped nothing and was
# dropped so the keys visibly match the names used in templateString.
fieldsAndNulls = [(f, 'Null') for f in topLevelFields]

# Initialize the path to write the output to
targetFlatFile = "/Users/kdunn/Desktop/TWC/schemaFilteredFlat.tsv.gz"

def _flattenContentRow(contentRow):
    """Render one content Row as a list of 'key=value' strings.

    The nested 'contentmetadata_genres' array is joined with '%^%'. Any value
    that has no .encode() method (None, an empty genre array, numbers, ...)
    is written as 'Null', exactly as the original inline loops did for
    non-string values.
    """
    flattened = list()
    for (k, v) in contentRow.asDict().items():
        theVal = v
        # Flatten the nested array; an empty array becomes None -> 'Null' below
        if k == 'contentmetadata_genres' and isinstance(v, (list, tuple)):
            theVal = '%^%'.join(v) if len(v) > 0 else None

        try:
            flattened.append("{k}={v}".format(k=k, v=theVal.encode('utf-8', 'ignore')))
        except AttributeError:
            # NOTE(review): this also maps non-text values (e.g. numbers) to
            # 'Null' -- confirm that dropping them is intended.
            flattened.append("{k}={v}".format(k=k, v='Null'))
    return flattened


def parquetToTSV(data):
    """Flatten one record and append it as a TSV row to targetFlatFile.

    Intended to be passed to <RDD>.foreach(). The record is skipped unless it
    belongs to division 'STX' and city 'austin' (records whose division or
    city value is None are kept, as is a record without a city field).

    Args:
        data: a record object exposing asDict(); nested groups are expected to
            expose asDict() as well.

    Returns:
        None. Side effect: one gzip-compressed, tab-delimited line is appended
        to the file named by the module-level targetFlatFile. Columns follow
        the module-level templateString; missing values are written as 'Null'.

    Notes:
        * Written for Python 2 string semantics (encoded byte strings are
          interpolated with str.format and decoded again before writing).
        * The path is read from the module-level targetFlatFile because
          <RDD>.foreach(myFunction) doesn't allow passing extra arguments.
        * NOTE(review): '%^%' separates both the genres inside a field and the
          fields of one upcomingContent entry, so that column cannot be split
          unambiguously -- confirm with the consumer of the file.
        * NOTE(review): the file is re-opened in append mode for every record
          and nothing here synchronises concurrent writers.
    """
    # Convert the PySpark record object into a Python dict
    tempDict = data.asDict()

    #####
    #####
    ##### BEGIN: customer data-specific section
    #####
    #####

    # Pre-filter conditions for Austin area only
    if 'customeraccount_divisionid' not in tempDict:
        return

    division = tempDict['customeraccount_divisionid']
    if division is not None and division.upper() != 'STX':
        return

    # No defined city in the record means we can't explicitly exclude it.
    # (This used to re-read the division id, which rejected every record
    # that had a non-null division.)
    city = tempDict.get('customeraccount_city')
    if city is not None and city.lower() != "austin":
        return

    # Start from 'Null' for every schema column, then overlay this record's data
    finalDict = dict(fieldsAndNulls)
    finalDict.update(tempDict)

    # Flatten the key/val[,val] in the annotations section
    annotationsSection = list()
    annotations = tempDict.get('annotations')
    if annotations is not None:
        for key, val in annotations.items():
            if val is None:
                continue
            for param, op in val.asDict().items():
                try:
                    annotationsSection.append("{k}:".format(k=key) + '='.join([param, op]))
                except TypeError:
                    # op is not a string (e.g. None)
                    annotationsSection.append("{k}:".format(k=key) + '='.join([param, 'Null']))

    annotationsSectionString = 'Null'
    if len(annotationsSection) > 0:
        annotationsSectionString = "|^|".join(annotationsSection)

    # Flatten content group fields (including the nested genre array)
    contentSection = list()
    content = tempDict.get('content')
    if content is not None:
        contentSection = _flattenContentRow(content)

    contentSectionString = 'Null'
    if len(contentSection) > 0:
        contentSectionString = "|^|".join(contentSection)

    # Flatten the array of upcomingContent group fields (including the nested genre array)
    upcomingContentSection = list()
    for contentEntry in tempDict.get('upcomingContent') or []:
        thisContent = list()
        if contentEntry is not None:
            thisContent = _flattenContentRow(contentEntry)

        if len(thisContent) > 0:
            upcomingContentSection.append("%^%".join(thisContent))
        else:
            # Keep the entry's position but as text: appending the empty list
            # itself made the join below raise TypeError
            upcomingContentSection.append('Null')

    upcomingContentSectionString = 'Null'
    if len(upcomingContentSection) > 0:
        upcomingContentSectionString = "|^|".join(upcomingContentSection)

    timeshiftformatsString = 'Null'
    timeshiftformats = tempDict.get('contentrestriction_timeshiftformats')
    if timeshiftformats is not None and len(timeshiftformats) > 0:
        timeshiftformatsString = "|^|".join(timeshiftformats)

    demographicsString = 'Null'
    demographics = tempDict.get('demographics')
    if demographics is not None and len(demographics) > 0:
        demographicsString = "|^|".join(
            ["=".join([key, val if val is not None else 'Null'])
             for key, val in demographics.items()])

    # Replace the record's nested fields with the flattened versions generated above
    finalDict['annotations'] = annotationsSectionString
    finalDict['content'] = contentSectionString
    finalDict['upcomingContent'] = upcomingContentSectionString
    finalDict['contentrestriction_timeshiftformats'] = timeshiftformatsString
    finalDict['demographics'] = demographicsString

    # Convert millisecond epoch to UTC string timestamp; a missing timestamp
    # is left alone and becomes 'Null' in the final pass below
    received = finalDict.get('timestamp_received')
    if received is not None and received != 'Null':
        finalDict['timestamp_received'] = strftime('%Y-%m-%d %H:%M:%S', gmtime(received/1000))

    #####
    #####
    ##### END customer data-specific section
    #####
    #####

    # Make a final pass on the record,
    # force our 'Null' key where necessary
    # and ensure encoding is correct
    for k in list(finalDict.keys()):
        v = finalDict[k]
        if v is None:
            finalDict[k] = 'Null'
        elif isinstance(v, bytes):
            # On Python 2 `bytes` is `str`: decode byte strings to unicode
            finalDict[k] = v.decode('utf-8', 'ignore')

    # Place the values into consistent columns, tab delimited
    theRow = templateString % finalDict

    # gzip compress the output file; the with-block closes it
    with gzip.open(targetFlatFile, 'ab') as theFile:
        theFile.write(theRow.encode('utf-8', 'ignore') + b"\n")

In [38]:
# Create the output file, or empty it if it already exists
with open(targetFlatFile, 'w'):
    pass

#%time data.foreach(parquetToTSV)

In [157]:
#%time parquetToTSV(data.take(7)[-1])

CPU times: user 20 ms, sys: 8.85 ms, total: 28.9 ms
Wall time: 170 ms


In [35]:
# This is supposed to parallelize the operation
# but seems to no work on local/single node context
distData = sc.parallelize(data, 4)
%time doIt = distData.foreach(parquetToTSV)

CPU times: user 4.49 ms, sys: 801 µs, total: 5.29 ms
Wall time: 34 ms


In [36]:
# doIt is the return value of the foreach() call in the previous cell;
# the captured output below shows it is None
%time print(doIt)

None
CPU times: user 41 µs, sys: 15 µs, total: 56 µs
Wall time: 44.1 µs


In [40]:
import fnmatch
import os

# Recursively collect the full path of every file named *.parquet
# found anywhere beneath the root directory
parquetFiles = [
    os.path.join(dirPath, matchedName)
    for dirPath, _subDirs, fileNames in os.walk('/Users/kdunn/Desktop/')
    for matchedName in fnmatch.filter(fileNames, '*.parquet')
]

#print parquetFiles

In [56]:
%%time

for fileIndex, parquetPath in enumerate(parquetFiles):
    # Read this Parquet file
    moreData = sqlCtx.parquetFile(parquetPath)

    # Point the global output path (read by parquetToTSV) at a per-file target
    targetFlatFile = "/Users/kdunn/Desktop/TWC/schemaFilteredFlat-{}.tsv.gz".format(fileIndex)

    # Start from an empty output file
    with open(targetFlatFile, 'w'):
        pass

    # Flatten every record of this file into the target
    moreData.foreach(parquetToTSV)

CPU times: user 71.7 ms, sys: 32.9 ms, total: 105 ms
Wall time: 1min 16s
