In [1]:
import findspark
# Locate the Spark installation and put pyspark on sys.path; this has to run
# before the `from pyspark import ...` in the next cell.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Use getOrCreate() so that re-running this cell in a live kernel reuses the
# existing context instead of failing with
# "Cannot run multiple SparkContexts at once".
# Master "local" runs Spark in-process; "AllUSAStates" is the application name.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("AllUSAStates")
)
# Keep Spark's console logging down to warnings and errors.
sc.setLogLevel("WARN")

In [4]:
# textFile() only declares the RDD (one element per line of the CSV);
# nothing is read from HDFS until an action is executed.
fileRdd = sc.textFile(
    "hdfs://192.168.93.128:9000/input/all_us_states.csv"
)
# count() is an action: it triggers the read and returns the number of lines,
# header row included.
print("Count ", fileRdd.count())

Count  52


In [5]:
# first() is an action returning the first line of the file (the CSV header).
print(fileRdd.first())

abbr,name


In [6]:
# first() returns the first line of the file, which is the CSV header row.
header = fileRdd.first()

# filter() keeps a record when the predicate is True and drops it when the
# predicate is False; comparing each line against the header therefore
# removes the header row and leaves only the state records.
statesRdd = fileRdd.filter(lambda record: record != header)
print(statesRdd.collect())

['AL,Alabama', 'AK,Alaska', 'AZ,Arizona', 'AR,Arkansas', 'CA,California', 'CO,Colorado', 'CT,Connecticut', 'DE,Delaware', 'DC,District of Columbia', 'FL,Florida', 'GA,Georgia', 'HI,Hawaii', 'ID,Idaho', 'IL,Illinois', 'IN,Indiana', 'IA,Iowa', 'KS,Kansas', 'KY,Kentucky', 'LA,Louisiana', 'ME,Maine', 'MD,Maryland', 'MA,Massachusetts', 'MI,Michigan', 'MN,Minnesota', 'MS,Mississippi', 'MO,Missouri', 'MT,Montana', 'NE,Nebraska', 'NV,Nevada', 'NH,New Hampshire', 'NJ,New Jersey', 'NM,New Mexico', 'NY,New York', 'NC,North Carolina', 'ND,North Dakota', 'OH,Ohio', 'OK,Oklahoma', 'OR,Oregon', 'PA,Pennsylvania', 'RI,Rhode Island', 'SC,South Carolina', 'SD,South Dakota', 'TN,Tennessee', 'TX,Texas', 'UT,Utah', 'VT,Vermont', 'VA,Virginia', 'WA,Washington', 'WV,West Virginia', 'WI,Wisconsin', 'WY,Wyoming']


In [7]:
# take(5) is an action that returns only the first five records to the driver,
# a cheaper way to peek at the data than collect().
print(statesRdd.take(5))

['AL,Alabama', 'AK,Alaska', 'AZ,Arizona', 'AR,Arkansas', 'CA,California']


In [8]:
# map() transforms every record, e.g. split a line, convert Celsius to
# Fahrenheit, convert units, clean data...
# Here each "code,name" line becomes a [code, name] list.
splitRdd = statesRdd.map(lambda record: record.split(","))

print(splitRdd.take(5))

[['AL', 'Alabama'], ['AK', 'Alaska'], ['AZ', 'Arizona'], ['AR', 'Arkansas'], ['CA', 'California']]


In [9]:
# Convert each [state code, state name] list into a tuple.
#
# A record such as ['AL', 'Alabama'] becomes ('AL', 'Alabama').
# tupleRdd can also be called a keyed RDD: the first element (the state code,
# e.g. 'AL') is the key and the second (the state name, e.g. 'Alabama') is
# the value.
tupleRdd = splitRdd.map(lambda fields: (fields[0], fields[1]))
print(tupleRdd.take(5))

[('AL', 'Alabama'), ('AK', 'Alaska'), ('AZ', 'Arizona'), ('AR', 'Arkansas'), ('CA', 'California')]


In [10]:
# Sort the records in ascending order of their key (the state code).
sortedRdd = tupleRdd.sortByKey(ascending=True)
print("asc ", sortedRdd.take(5))

asc  [('AK', 'Alaska'), ('AL', 'Alabama'), ('AR', 'Arkansas'), ('AZ', 'Arizona'), ('CA', 'California')]


In [12]:
# Sort the records in descending order of their key (the state code);
# ascending=False selects descending order.
sortedDescRdd = tupleRdd.sortByKey(ascending=False)
print(sortedDescRdd.take(5))

[('WY', 'Wyoming'), ('WV', 'West Virginia'), ('WI', 'Wisconsin'), ('WA', 'Washington'), ('VT', 'Vermont')]


In [13]:
# Number of partitions backing the sorted RDD; saveAsTextFile writes one
# part-NNNNN file per partition.
sortedDescRdd.getNumPartitions()

1

In [14]:
# Write the results into Hadoop /output/states-sorted
# NOTE(review): this call is expected to fail if /output/states-sorted already
# exists, so remove the directory before re-running this cell:
#   hdfs dfs -rm -r /output/states-sorted
sortedDescRdd.saveAsTextFile("hdfs://192.168.93.128:9000/output/states-sorted")
# Inspect the written file from a shell with:
#  hdfs dfs -cat /output/states-sorted/part-00000

In [17]:
# countByKey() works on a keyed (key, value) RDD.
# It is an action: it counts the occurrences of each key and returns the
# result to the driver as a dict-like {key: count} object (a defaultdict),
# not as another RDD.
# Every state code appears exactly once here, so every count is 1.
wordCount = tupleRdd.countByKey()

print(wordCount)

defaultdict(<class 'int'>, {'AL': 1, 'AK': 1, 'AZ': 1, 'AR': 1, 'CA': 1, 'CO': 1, 'CT': 1, 'DE': 1, 'DC': 1, 'FL': 1, 'GA': 1, 'HI': 1, 'ID': 1, 'IL': 1, 'IN': 1, 'IA': 1, 'KS': 1, 'KY': 1, 'LA': 1, 'ME': 1, 'MD': 1, 'MA': 1, 'MI': 1, 'MN': 1, 'MS': 1, 'MO': 1, 'MT': 1, 'NE': 1, 'NV': 1, 'NH': 1, 'NJ': 1, 'NM': 1, 'NY': 1, 'NC': 1, 'ND': 1, 'OH': 1, 'OK': 1, 'OR': 1, 'PA': 1, 'RI': 1, 'SC': 1, 'SD': 1, 'TN': 1, 'TX': 1, 'UT': 1, 'VT': 1, 'VA': 1, 'WA': 1, 'WV': 1, 'WI': 1, 'WY': 1})
