Beijing House

In [ ]:
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming._

// Streaming context with a 2-second batch interval. Stateful operations such as
// mapWithState require a checkpoint directory; here it is the current working directory.
val ssc = new StreamingContext(sparkContext, Seconds(2))
ssc.checkpoint(".")

// Initial state RDD for the mapWithState operation: seeds the running counts
// of "hello" and "world" with 1 each.
val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))

// Create a ReceiverInputDStream on localhost:22222 and count the words in the
// input stream of \n-delimited text (e.g. generated by 'nc').
val lines = ssc.socketTextStream("localhost", 22222)

// Split on runs of whitespace and drop empty tokens, so that repeated or leading
// spaces are not counted as occurrences of the empty word "".
val words = lines.flatMap(line => line.split("\\s+").filter(_.nonEmpty))
val wordDstream = words.map(word => (word, 1))

// Update the cumulative count using mapWithState.
// For each (word, 1) pair the function adds the new occurrence to the count kept
// in `state`, stores the new total back, and emits (word, total).
val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
  val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(sum)
  (word, sum)
}

// This gives a DStream made of state (the cumulative count of the words).
val stateDstream = wordDstream.mapWithState(
  StateSpec.function(mappingFunc).initialState(initialRDD))
stateDstream.print()

ssc.start()
// Blocks this cell until the streaming context is stopped (or the cell is cancelled).
ssc.awaitTermination()


The cell was cancelled.


## Spark streaming

### Create a context with a 2s batch interval

In [ ]:
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._

// Streaming context built on the notebook's `sparkContext`, with 2-second batches.
val ssc: StreamingContext = new StreamingContext(sparkContext, Seconds(2))

### Listen to the Twitter stream

#### We're going to **filter** the tweets to only those containing the following words.

In [ ]:
// Keywords passed to the Twitter stream as its filter terms.
val filters: Array[String] = Array("spark", "scala", "music")

#### Create the twitter listeners

In [ ]:
// Input DStream of tweets restricted to the `filters` keywords.
// NOTE(review): passing None for the authorization presumably falls back to the
// default twitter4j OAuth configuration — confirm credentials are set up.
val twitterStream = TwitterUtils.createStream(ssc, twitterAuth = None, filters = filters)

#### Count by hashtag and sort  

##### The windows are `60s` long

In [ ]:
import StreamingContext._

// Extract the hashtags of every tweet. The text is split on any run of whitespace
// (not just single spaces) so that hashtags following a newline or tab are found too.
val hashTags = twitterStream.flatMap { status =>
  status.getText.split("\\s+").filter(_.startsWith("#"))
}

// Count each hashtag over a 60-second window, flip the pairs to (count, topic)
// and sort every resulting RDD by count in descending order.
val topCounts60 = hashTags
  .map(tag => (tag, 1))
  .reduceByKeyAndWindow(_ + _, Seconds(60))
  .map { case (topic, count) => (count, topic) }
  .transform(_.sortByKey(ascending = false))

##### Creating the text output to be updated by the stream of results

In [ ]:
// Output widget that receives the hashtag ranking from the streaming computation.
// NOTE(review): `ul` is presumably the notebook's list-widget helper and 10 the
// number of entries it holds — confirm against the notebook API.
val result = ul(10)

##### Let's show the first 10 hashtags every window

In [ ]:
// For every batch, take the ten highest-ranked (count, tag) pairs, render each
// as "tag: count" and hand the list to the `result` widget.
topCounts60.foreachRDD { rdd =>
  val entries = rdd.take(10).toList.map { case (count, tag) => s"$tag: $count" }
  result(entries)
}

### Show the Geolocations

In [ ]:
// Geo chart widget, seeded with a single placeholder point at (0, 0) labelled "init".
val geo = widgets.GeoPointsChart(Seq((0.0, 0.0, "init")))

In [ ]:
// Over a 60-second window sliding every 6 seconds: keep only tweets that carry a
// geolocation, turn them into (latitude, longitude, text) triples and feed up to
// 100 of them per batch to the `geo` chart.
twitterStream
  .window(Seconds(60), Seconds(6))
  .filter(_.getGeoLocation() != null)
  .map { tweet =>
    val location = tweet.getGeoLocation()
    (location.getLatitude(), location.getLongitude(), tweet.getText())
  }
  .foreachRDD { rdd => geo.applyOn(rdd.take(100)) }

### Start listening to Twitter

This will listen to the Twitter stream, and the computation above will update the `result` every `2s` using the last `60s` of values.

In [ ]:
// Start the streaming computation registered on `ssc`. No awaitTermination() is
// called here, so this cell does not wait for the stream to finish.
ssc.start()

### Stop listening to Twitter

In [ ]:
// commented out to allow 'run all' :-D
//ssc.stop()