In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    split, 
    col, 
    explode, 
    lower, 
    regexp_extract,
)

ModuleNotFoundError: No module named 'pyspark'

In [34]:
# Obtain the Spark session for this notebook (getOrCreate hands back an
# already-running session if there is one) under the app name "canvas".
spark = (
    SparkSession.builder
    .appName("canvas")
    .getOrCreate()
)
# Restrict Spark's own log output to warnings and above.
spark.sparkContext.setLogLevel("WARN")

In [48]:
# Load the novel as a DataFrame of text rows; later cells read its "value" column.
# NOTE(review): the filename is spelled "predudice" -- confirm it matches the file on disk.
book = spark.read.text(
    "./datasets/pride_and_predudice.txt"
)

In [65]:
# Tokenise every row of text on a single space. Consecutive spaces and blank
# rows yield empty-string tokens, which a later cell filters out.
lines = book.select(
    split(col("value"), " ").alias("line")
)
lines.show(truncate=False)

+---------------------------------------------------------------------------------------+
|line                                                                                   |
+---------------------------------------------------------------------------------------+
|[The, Project, Gutenberg, eBook, of, Pride, and, Prejudice]                            |
|[, , , , ]                                                                             |
|[This, ebook, is, for, the, use, of, anyone, anywhere, in, the, United, States, and]   |
|[most, other, parts, of, the, world, at, no, cost, and, with, almost, no, restrictions]|
|[whatsoever., You, may, copy, it,, give, it, away, or, re-use, it, under, the, terms]  |
|[of, the, Project, Gutenberg, License, included, with, this, ebook, or, online]        |
|[at, www.gutenberg.org., If, you, are, not, located, in, the, United, States,]         |
|[you, will, have, to, check, the, laws, of, the, country, where, you, are, located]    |
|[before, 

In [75]:
# Flatten the per-row token arrays: one output row per token, in column "word".
words = lines.select(
    explode(col("line")).alias("word")
)
words.show()

+---------+
|     word|
+---------+
|      The|
|  Project|
|Gutenberg|
|    eBook|
|       of|
|    Pride|
|      and|
|Prejudice|
|         |
|         |
|         |
|         |
|         |
|     This|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
+---------+
only showing top 20 rows



In [78]:
# Lower-case every token so that "The" and "the" are counted together.
words_lower = words.select(
    lower(col("word")).alias("word_lower")
)
words_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
| prejudice|
|          |
|          |
|          |
|          |
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
+----------+
only showing top 20 rows



In [81]:
# Keep only the first run of lowercase ASCII letters found in each token
# (group 0 is the whole match). Punctuation and digits are dropped, and
# anything after the first non-letter is discarded (e.g. "re-use" -> "re").
# Tokens containing no letters come out as "".
cleaned_words = words_lower.select(
    regexp_extract(
        col("word_lower"),
        "[a-z]+",
        0,
    ).alias("word")
)
cleaned_words.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|         |
|         |
|         |
|         |
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
+---------+
only showing top 20 rows



In [83]:
# Discard the empty-string tokens left behind by splitting and cleaning.
# Despite the variable name, this filters out "" values, not nulls.
words_notnull = cleaned_words.where(
    col("word") != ""
)
words_notnull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
|       in|
|      the|
|   united|
+---------+
only showing top 20 rows



In [95]:
# Count occurrences of each distinct word; the result has columns "word" and "count".
groups = (
    words_notnull
    .groupBy("word")
    .count()
)
groups.show()

+------------+-----+
|        word|count|
+------------+-----+
|      online|    5|
|       those|   65|
|        some|  212|
|     insipid|    2|
|       still|   77|
|         art|    7|
|        hope|  126|
|        earl|    3|
|         few|   73|
|   destitute|    2|
|  palpitated|    1|
|   connected|   15|
|    cautious|    4|
|   imitation|    1|
|     solaced|    1|
|      poetry|    2|
|   arguments|    5|
|premeditated|    1|
|     elevate|    1|
|      doubts|    2|
+------------+-----+
only showing top 20 rows



In [92]:
# Rank the words from most to least frequent.
result = groups.orderBy("count", ascending=False)
result.show()

+----+-----+
|word|count|
+----+-----+
| the| 4842|
|  to| 4399|
|  of| 3957|
| and| 3785|
| her| 2281|
|   i| 2105|
|   a| 2080|
|  in| 2039|
| was| 1877|
| she| 1744|
|that| 1639|
|  it| 1597|
| not| 1526|
| you| 1444|
|  he| 1359|
| his| 1302|
|  be| 1280|
|  as| 1240|
| had| 1186|
|with| 1148|
+----+-----+
only showing top 20 rows



In [100]:
# Persist the ranked word counts as CSV.
# - coalesce(1) collapses the result to a single partition so Spark emits one
#   part file; the path below is created as a directory holding that file.
# - mode="overwrite" keeps the notebook re-runnable: with the default save mode
#   the write raises an error when the target path already exists, which breaks
#   every run after the first.
result.coalesce(1).write.csv(
    "./datasets/word_count.csv",
    mode="overwrite",
)