-
Notifications
You must be signed in to change notification settings - Fork 12
/
wordcount.py
29 lines (21 loc) · 983 Bytes
/
wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from __future__ import print_function
from pyspark import SparkContext, SparkConf
import sys
if __name__ == "__main__":
# create Spark context with Spark configuration
conf = SparkConf().setAppName("Spark Count")
sc = SparkContext(conf=conf)
# get threshold
threshold = int(sys.argv[2])
# read in text file and split each document into words
tokenized = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split(" "))
# count the occurrence of each word
wordCounts = tokenized.map(lambda word: (word, 1)).reduceByKey(lambda v1,v2:v1 +v2)
# filter out words with fewer than threshold occurrences
filtered = wordCounts.filter(lambda pair:pair[1] >= threshold)
# count characters
charCounts = filtered.flatMap(lambda pair:pair[0]).map(lambda c: c).map(lambda c: (c, 1)).reduceByKey(lambda v1,v2:v1 +v2)
list = charCounts.collect()
f1=open('/home/wce/clsadmin/output.txt', 'w+')
f1.write(repr(list)[1:-1])
f1.close()