In [15]:
# Display the `spark` entry point to confirm it is available.
# NOTE(review): `spark` is not created in any visible cell — presumably it is
# injected by the kernel / pyspark shell; confirm before running standalone.
spark

In [20]:
# Read the namespace named in the URI through the MongoDB Spark connector
# source, then register the resulting DataFrame as the SQL view `tweet`.
# The source format and the `uri` option are passed as keywords to load().
df = spark.read.load(
    format="com.mongodb.spark.sql.DefaultSource",
    uri="mongodb://localhost/tweet.sample",
)
df.createOrReplaceTempView('tweet')

In [22]:
# Count rows per `lang`, most frequent first, keeping only rows whose
# `delete` column is NULL.
# NOTE(review): presumably `delete` marks stream deletion notices — confirm.
# The adjacent literals concatenate to the exact single-line SQL text.
query = (
    'SELECT lang, count(*) count '
    'FROM tweet '
    'WHERE delete IS NULL '
    'GROUP BY 1 '
    'ORDER BY 2 DESC'
)
spark.sql(query).show(n=3)
        

+----+-----+
|lang|count|
+----+-----+
|  en| 1816|
|  ja| 1764|
|  ko|  641|
+----+-----+
only showing top 3 rows



In [29]:
# Select the English-language rows, converting the millisecond epoch column
# `timestamp_ms` to a formatted `time` value alongside the tweet `text`.
# The adjacent literals concatenate to the exact original SQL text
# (including its trailing space).
query = (
    "SELECT from_unixtime(timestamp_ms / 1000) time, text "
    "from tweet "
    "WHERE lang = 'en' "
)
en_tweets = spark.sql(query)

In [30]:
# Preview the first three (time, text) rows of the English tweets.
en_tweets.show(3)

+-------------------+--------------------+
|               time|                text|
+-------------------+--------------------+
|2017-11-26 16:41:59|RT @Paula_White: ...|
|2017-11-26 16:41:59|          anywayyyyy|
|2017-11-26 16:41:59|RT @ENGlobalME: T...|
+-------------------+--------------------+
only showing top 3 rows



In [32]:
from pyspark.sql import Row

def text_split(row):
    """Yield one ``Row(time, word)`` per whitespace-separated token of ``row.text``.

    Written to be passed to ``RDD.flatMap`` so that each input row expands
    into zero or more per-word rows.

    Args:
        row: An object exposing ``time`` and ``text`` attributes.

    Yields:
        Row: ``time`` copied unchanged from ``row`` and ``word`` set to one
        token of ``row.text`` (split on runs of whitespace).

    A row whose ``text`` is ``None`` or empty yields nothing instead of
    raising ``AttributeError`` and failing the whole job.
    """
    text = row.text
    if not text:
        # A null text has no .split(); an empty text has no tokens.
        return
    for word in text.split():
        yield Row(time=row.time, word=word)

In [33]:
# Fetch a single Row from the underlying RDD to inspect the attributes
# (time, text) that text_split will read.
en_tweets.rdd.take(1)

[Row(time=u'2017-11-26 16:41:59', text=u'RT @Paula_White: Your stutter won\u2019t stop your anointing!')]

In [34]:
# Try text_split through flatMap and fetch the first two per-word Rows.
en_tweets.rdd.flatMap(text_split).take(2)

[Row(time=u'2017-11-26 16:41:59', word=u'RT'),
 Row(time=u'2017-11-26 16:41:59', word=u'@Paula_White:')]

In [35]:
# Convert the flat-mapped RDD back into a DataFrame and preview two rows.
en_tweets.rdd.flatMap(text_split).toDF().show(2)

+-------------------+-------------+
|               time|         word|
+-------------------+-------------+
|2017-11-26 16:41:59|           RT|
|2017-11-26 16:41:59|@Paula_White:|
+-------------------+-------------+
only showing top 2 rows



In [36]:
# Keep the per-word (time, word) DataFrame under the name `words` for the
# cells that follow.
words = en_tweets.rdd.flatMap(text_split).toDF()

In [38]:
# Register `words` as the temp view 'words' so it can be queried via spark.sql.
words.createOrReplaceTempView('words')

In [40]:
# Count occurrences of each word, most frequent first, and show the top three.
# The adjacent literals concatenate to the exact single-line SQL text.
query = (
    'SELECT word, count(*) count '
    'FROM words '
    'GROUP BY 1 '
    'ORDER BY 2 DESC'
)
spark.sql(query).show(n=3)
        

+----+-----+
|word|count|
+----+-----+
|  RT| 1130|
| the|  524|
|  to|  462|
+----+-----+
only showing top 3 rows



In [41]:
# Print up to 100 rows of the word-frequency query.
# NOTE: this relies on `query` still holding the word-count SQL; `query` is
# reassigned by several cells, so the result depends on execution order.
spark.sql(query).show(100)

+-------+-----+
|   word|count|
+-------+-----+
|     RT| 1130|
|    the|  524|
|     to|  462|
|      a|  326|
|      I|  310|
|     is|  277|
|    and|  276|
|     of|  255|
|    you|  247|
|    for|  229|
|     in|  227|
|     my|  150|
|   that|  142|
|   this|  138|
|     on|  134|
|     it|  110|
|     be|  110|
|     me|  104|
|     so|   97|
|   with|   89|
|    The|   88|
|   have|   87|
|    are|   85|
|    was|   84|
|      i|   80|
|     at|   78|
|   just|   74|
|    all|   73|
|      -|   72|
|   your|   71|
|     if|   66|
|  &amp;|   65|
|    but|   64|
|   like|   64|
|     by|   61|
|    not|   59|
|     he|   58|
|    has|   57|
|    who|   55|
|   from|   54|
|     we|   54|
|     up|   53|
|    get|   52|
|   when|   51|
|    You|   51|
|     as|   49|
|   they|   49|
|   will|   48|
|     or|   47|
|    can|   45|
|    out|   43|
|    one|   42|
|   love|   42|
|     an|   42|
|   more|   42|
|    now|   39|
|    how|   38|
|     do|   38|
|   good|   37|
| really

In [42]:
# Persist the per-word rows as the table 'twitter_sample_words'.
# The writer's default save mode raises when the table already exists, which
# made this cell fail on every re-run; 'overwrite' replaces the previous copy
# so the cell can be executed repeatedly.
words.write.mode('overwrite').saveAsTable('twitter_sample_words')

In [43]:
# Recursively list the files written for the saved table. The path is
# relative, so it is resolved against the notebook's working directory.
!ls -R spark-warehouse

twitter_sample_words

spark-warehouse/twitter_sample_words:
_SUCCESS
part-00000-356bb5fb-bf9f-4227-9349-f3b2b6a4d41b-c000.snappy.parquet


In [44]:
# Sanity check: read the saved table back by name and count its rows.
spark.table('twitter_sample_words').count()

26057