Title: Identifying the most popular country in tweets

Author: Ivan Zheng

Date: 10/06

In [1]:
# Import and create a new SQLContext 
import pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext
# getOrCreate() hands back the already-running SparkContext when this cell is
# re-executed; calling SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

NameError: name 'sc' is not defined

### Step 1: Transforming the country data to PySpark SQL format

In [None]:
# Read the country CSV file into an RDD with one element per line of text.
# The path is relative to the notebook's working directory.
country_lines = sc.textFile('data/country-list.csv')

In [None]:
# Preview the first five lines read from the country file.
country_lines.take(5)

In [None]:
# Split every line on the ", " separator, giving one list of fields per line.
country_words = country_lines.map(lambda row: row.split(", "))

In [None]:
# Preview the first five split lines.
country_words.take(5)

In [None]:
# Keep the first two fields of each split line as a 2-tuple
# (later labelled "country" and "code" when the DataFrame is built).
country_tuples = country_words.map(lambda fields: (fields[0], fields[1]))

In [None]:
# Preview the first five tuples.
country_tuples.take(5)

In [None]:
# Build the country DataFrame from the tuples, naming the two columns,
# then inspect its schema and the first three rows.
countryDF = sqlContext.createDataFrame(country_tuples, schema=["country", "code"])
countryDF.printSchema()
countryDF.take(3)

###### The country data is now in PySpark SQL format

### Step 2: Calculating the most popular country in tweet text

In [None]:
# Read the tweets CSV file into an RDD with one element per line of text.
# The path is relative to the notebook's working directory.
tweet_lines = sc.textFile('data/tweet_output.csv')

In [None]:
# Preview the first three tweet lines.
tweet_lines.take(3)

In [None]:
tweet_lines.count()  # number of tweet lines before cleaning

In [None]:
# Clean the data: some tweets are empty. Remove them using filter().
# The test must compare by value. The previous `x is not ' '` compared object
# identity against a one-space literal, which is implementation-dependent
# (a SyntaxWarning on Python 3.8+) and never removes a truly empty line ('').
# Stripping first drops both empty and whitespace-only lines.
tweet_lines = tweet_lines.filter(lambda line: line.strip() != '')

In [None]:
tweet_lines.count()  # number of tweet lines after the empty-tweet filter

##### Perform WordCount on the cleaned tweet texts.

In [None]:
# Break every tweet into space-separated words and flatten into one RDD of words.
tweet_words = tweet_lines.flatMap(lambda tweet: tweet.split(" "))

In [None]:
# Pair each word with an initial count of 1.
tweet_tuples = tweet_words.map(lambda token: (token, 1))

In [None]:
# Add up the 1s per word to get the number of occurrences of each word.
tweet_counts = tweet_tuples.reduceByKey(lambda left, right: left + right)

In [None]:
# Preview five (word, count) pairs.
tweet_counts.take(5)

In [None]:
# Build a DataFrame from the (word, count) pairs. The word column is named
# "country" so that it matches the "country" column of countryDF.
tweetDF = sqlContext.createDataFrame(tweet_counts, schema=["country", "counts"])
tweetDF.printSchema()
tweetDF.take(3)

In [None]:
# Before joining the country and tweet data frames, re-check countryDF's
# columns to pick the appropriate join column.
countryDF.printSchema()

In [None]:
# Display the first ten rows of the tweet word counts.
tweetDF.show(10)

In [None]:
# Display the first ten rows of the country table.
countryDF.show(10)

In [None]:
# Join the two frames on their shared "country" column (default inner join),
# so only tweet words that match a country name in countryDF are kept.
mergeDF = countryDF.join(tweetDF, on='country')

In [None]:
# Display the first five rows of the joined result.
mergeDF.show(5)

In [None]:
# Number of distinct countries mentioned: count the joined rows whose
# mention count is positive.
mergeDF.where(mergeDF.counts > 0).count()

In [None]:
# Total number of country mentions in the tweets (sum of the per-country counts).
# The functions module is imported under an alias: `from ... import sum` would
# rebind the built-in sum() for every later cell run in this kernel.
from pyspark.sql import functions as F
mergeDF.select(F.sum('counts')).show()

In [None]:
# Table 1: countries ranked by mention count, highest first. Five rows are
# displayed; the top three countries are the first three rows.
from pyspark.sql.functions import desc
mergeDF.orderBy(desc('counts')).show(5)

In [None]:
# Table 2: counts for Wales, Iceland, and Japan, shown together as one table.
# isin() selects all three rows in a single filter; a country that never
# appears in the tweets has no row in mergeDF and is simply absent.
mergeDF.filter(mergeDF["country"].isin('Wales', 'Iceland', 'Japan')).show()

In [None]:
# Mention count for Kenya (empty table if Kenya never appears in the tweets).
mergeDF.where(mergeDF.country == 'Kenya').show()

In [None]:
# Mention count for Netherlands (empty table if it never appears in the tweets).
mergeDF.where(mergeDF.country == 'Netherlands').show()

In [None]:
# Mention count for Iceland (empty table if it never appears in the tweets).
mergeDF.where(mergeDF.country == 'Iceland').show()

In [None]:
# Mention count for Japan (empty table if it never appears in the tweets).
mergeDF.where(mergeDF.country == 'Japan').show()

Summary: a short Spark exercise using PySpark SQL, with output generated using MongoDB.