In [3]:
import os
# Point PySpark at the `python` executable found on PATH; this is set
# before the SparkSession below is created.
os.environ["PYSPARK_PYTHON"] = "python"

In [5]:
from pyspark.sql import SparkSession
import re

In [6]:
# Create the session named "Word_Count" (or reuse one that already exists),
# and keep a handle on its SparkContext for the RDD-based method further down.
spark = (
    SparkSession.builder
    .appName("Word_Count")
    .getOrCreate()
)
sc = spark.sparkContext

In [27]:
# Sample passage used as input by both word-count methods.
# It begins with a newline and ends with a newline, one sentence fragment per line.
text = (
    "\n"
    "Mathematics plays an important role in competitive programming, and it is\n"
    "not possible to become a successful competitive programmer without having\n"
    "good mathematical skills. This section discusses some important mathematical\n"
    "concepts and formulas that are needed later in the book.\n"
    "Short code is ideal in competitive programming, because programs should be\n"
    "written as fast as possible. Because of this, competitive programmers often define\n"
    "shorter names for datatypes and other parts of code.\n"
)

In [28]:
from pyspark.sql.functions import explode, split, lower, col, count

In [29]:
# Wrap the whole passage in a DataFrame with one row and one column named "text".
df = spark.createDataFrame(data=[(text,)], schema=["text"])

In [30]:
# Preview the input DataFrame; long cell values are truncated in the display.
df.show(n=20, truncate=True)

+--------------------+
|                text|
+--------------------+
|\nMathematics pla...|
+--------------------+



In [31]:
# Method 1 (DataFrame API): lower-case the passage, split it on runs of
# whitespace / non-word characters, emit one row per token, and drop the
# empty strings the split can leave behind (e.g. from the leading newline).
words_df = (
    df
    .select(explode(split(lower(col("text")), r"[\s\W]+")).alias("word"))
    .where(col("word") != "")
)

In [32]:
# Preview the first 20 extracted words.
words_df.show(n=20, truncate=True)

+-----------+
|       word|
+-----------+
|mathematics|
|      plays|
|         an|
|  important|
|       role|
|         in|
|competitive|
|programming|
|        and|
|         it|
|         is|
|        not|
|   possible|
|         to|
|     become|
|          a|
| successful|
|competitive|
| programmer|
|    without|
+-----------+
only showing top 20 rows


In [34]:
# Count occurrences of each word and list the most frequent first.
# Words with equal counts have no explicit tie-breaker, so their relative
# order is whatever Spark happens to produce.
word_count_df = (
    words_df
    .groupBy("word")
    .count()
    .orderBy(col("count").desc())
)
word_count_df.show(n=20, truncate=True)

+------------+-----+
|        word|count|
+------------+-----+
| competitive|    4|
|          in|    3|
|         and|    3|
|        code|    2|
|     because|    2|
| programming|    2|
|mathematical|    2|
|          is|    2|
|    possible|    2|
|   important|    2|
|          of|    2|
|          as|    2|
|        this|    2|
|        some|    1|
|       often|    1|
|   datatypes|    1|
|       parts|    1|
|         not|    1|
|    programs|    1|
|     shorter|    1|
+------------+-----+
only showing top 20 rows


In [35]:
# Method 2: word count using RDDs (Resilient Distributed Datasets)

# Distribute the passage as an RDD built from a one-element list.
rdd = sc.parallelize([text])

In [40]:
# Lower-case each record and emit every run of word characters as its own element.
words_rdd = rdd.flatMap(
    lambda passage: re.findall(r'\b\w+\b', passage.lower())
)

In [45]:
# Pair every word with an initial count of 1, ready for a per-key reduction.
pairs_rdd = words_rdd.map(lambda token: (token, 1))

In [47]:
# Sum the per-word 1s to get the total number of occurrences of each word.
word_counts_rdd = pairs_rdd.reduceByKey(
    lambda running_total, increment: running_total + increment
)

In [48]:
# Order the (word, count) pairs by count, largest first.
sorted_counts = word_counts_rdd.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
)

In [50]:
# Bring the sorted (word, count) pairs back to the driver and print one per line.
# The loop variable is deliberately not named `count`: a top-level loop in a
# notebook cell binds its variables in the shared namespace, which would
# overwrite the `count` function imported from pyspark.sql.functions.
for word, freq in sorted_counts.collect():
    print(f"{word} : {freq}")

competitive : 4
and : 3
in : 3
of : 2
mathematical : 2
this : 2
important : 2
code : 2
programming : 2
possible : 2
as : 2
because : 2
is : 2
mathematics : 1
become : 1
concepts : 1
are : 1
book : 1
programmers : 1
to : 1
having : 1
that : 1
later : 1
be : 1
define : 1
shorter : 1
it : 1
not : 1
formulas : 1
for : 1
parts : 1
good : 1
short : 1
a : 1
without : 1
section : 1
should : 1
written : 1
plays : 1
programmer : 1
needed : 1
ideal : 1
successful : 1
discusses : 1
some : 1
fast : 1
often : 1
an : 1
other : 1
role : 1
skills : 1
the : 1
datatypes : 1
programs : 1
names : 1
