# In [None]:
import sys, os
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
 
'''
Join the batting and salaries data for Barry Bonds per year.
 
The output should be the combined CSV string of batting and salaries data (one per year).
 
Use 'join' as the key for the final output in the reducer.
 
E.g:
"join"  "bondsba01,1986,1,PIT,NL,113,413,72,92,26,3,16,48,36,7,65,102,2,2,2,2,4,1986,PIT,NL,bondsba01,60000"
 
Schema:
Salaries: yearID  teamID  lgID  playerID  salary
Batting: playerID yearID  stint teamID  lgID  G AB  R H 2B  3B  HR  RBI SB  CS  BB  SO
 
Hints: 
Use split to split the CSV lines (e.g. s = line.split(','))
Both files are sent to the mapper. Use the length of the lines to determine which is which.
'''
if __name__ == '__main__':
    # Spark configuration for this job. validateOutputSpecs is switched off,
    # presumably so that saving to an output path that already exists does not
    # fail (the same path is written again for every batch further down).
    conf = SparkConf()
    conf.setAppName("4_join")
    conf.set("spark.streaming.concurrentJobs", "2")
    conf.set('spark.hadoop.validateOutputSpecs', False)

    # Spark context, with log output reduced to warnings and above
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Streaming context with a batch duration of 1, checkpointing to /root/tmp
    ssc = StreamingContext(sparkContext=sc, batchDuration=1)
    ssc.checkpoint("/root/tmp")

    # Direct Kafka stream subscribed to both the batting and salaries topics.
    # The broker address is a placeholder that must be filled in before running.
    kafkaParams = {"metadata.broker.list": '<your ec2 instance public IP>:9092'}
    topics = ['4_join_batting', '4_join_salaries']
    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc,
                                                kafkaParams=kafkaParams,
                                                topics=topics)
    
    '''
    Steps of joining samples from Salaries and Batting tables:
    1. filter samples by playerID: bondsba01
    2. determine whether a sample is from Salaries table or Batting table
    3. generate (key, value) pairs where yearID is the key
 
    ----- Once the above steps are done, we can start joining samples...
    
    What does updateStateByKey do?
    Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key.
 
    ---------------------------------------------------------------------
    4. Since the order in which Salaries and Batting samples arrive is nondeterministic, we need to use the updateStateByKey function to maintain and update the state for each key (yearID).
    5. Apply a sort function to every RDD within the DStream, so that we can get the output sorted by yearID.
    '''
 
    # filter playerID and distinguish salaries and batting samples
    def categorize(fields):
      """Tag one split CSV row as Salaries or Batting, keeping only bondsba01.

      fields: list of column strings from a single CSV line.

      Returns (yearID, (tag, csv_line)):
        - tag 'S' when the row has exactly 5 columns and column 3 is the
          player id (Salaries layout; yearID is column 0),
        - tag 'B' when column 0 is the player id (Batting layout; yearID is
          column 1),
        - (None, None) for every other row.
      """
      target = 'bondsba01'

      if len(fields) == 5 and fields[3] == target:
        tag, year_idx = 'S', 0
      elif fields[0] == target:
        tag, year_idx = 'B', 1
      else:
        return None, None

      # the original line is rebuilt from its fields and carried as the value
      return fields[year_idx], (tag, ','.join(fields))
 
    def updateState(new, old):
      """Fold this batch's rows for one yearID into that year's join state.

      new: list of (tag, csv_line) values that arrived for the key in the
           current batch; tag is 'S' (Salaries) or 'B' (Batting).
      old: previous state for the key: None, a list of not-yet-joined
           (tag, csv_line) values, or an already-joined ('join', csv) tuple.

      Returns:
        - ('join', salaries_csv + ',' + batting_csv) once both a Salaries
          and a Batting row are known for the year,
        - the list of pending (tag, csv_line) values while one side is
          still missing,
        - None when there is nothing to remember for the key.

      NOTE(review): the join order follows the in-code TODO
      ("salaries + batting"); the example in the module docstring shows the
      batting columns first -- confirm which order is expected.
      """
      # A non-list state is an already-joined result. Keep it as is, so rows
      # that arrive later are not spliced into the finished record.
      if old is not None and not isinstance(old, list):
        return old

      # Build a fresh list rather than extending `new` in place, so the
      # list handed in by the caller is never mutated.
      pending = list(new or []) + list(old or [])
      if not pending:
        return None

      # Index by tag; arrival order does not matter. If a tag occurs more
      # than once, the entry that appears last in `pending` is used.
      rows = dict(pending)
      if 'S' in rows and 'B' in rows:
        return 'join', rows['S'] + ',' + rows['B']

      # Only one side seen so far: keep waiting for the other one.
      return pending
 
    # Pipeline:
    #   1. split each message (line[1] is used as the CSV text) and tag it
    #      via categorize(), giving (yearID, (tag, csv_line)) pairs
    #   2. drop the (None, None) pairs categorize() returns for rows that are
    #      not bondsba01 Salaries/Batting rows, i.e. keep pairs with a yearID
    #   3. maintain the per-year join state across batches
    #   4. sort every RDD of the state stream by yearID, ascending
    kstream = kafkaStream.map(lambda line: categorize(line[1].strip().split(','))) \
                         .filter(lambda pair: pair[0] is not None) \
                         .updateStateByKey(updateState) \
                         .transform(lambda rdd: rdd.sortByKey(ascending=True))

    # Local alternative to the S3 sink below:
    # kstream.repartition(1).saveAsTextFiles('./4_join/output')
    #@todo change with your s3 location
    # coalesce(1) merges the sorted partitions without a shuffle, so the
    # yearID ordering produced above is kept in the single output file;
    # repartition(1) would shuffle and is not guaranteed to keep that order.
    kstream.foreachRDD(lambda rdd: rdd.coalesce(1).saveAsTextFile('s3://vandy-bigdata-kzw/hw6/4_join.out'))


    # Start the streaming context, let it run for at most 120 seconds, then stop
    ssc.start()
    ssc.awaitTerminationOrTimeout(120)
    ssc.stop()