### WordCount program using PySpark

In [None]:
!pip install pyspark



In [None]:
# Build the Spark session object that the later cells use through `spark`.
from pyspark.sql import SparkSession

APP_NAME = "WC_Example"  # application name handed to the session builder

spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)
spark  # last expression of the cell: display the session

In [None]:
#import library for sql functions
import pyspark.sql.functions as F

In [None]:
# Mount Google Drive into the Colab file system
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'  # directory under which Drive is mounted
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Specify the text file whose words will be counted

In [None]:
# Download the Project Gutenberg text and save it as pride_and_prejudice.txt
import requests

response = requests.get(
    'https://www.gutenberg.org/files/1342/1342-0.txt',
    timeout=30,  # give up instead of hanging if the server does not answer
)
# Stop on an HTTP error: silently skipping the write would let the later cells
# fail on a missing file or, worse, reuse a stale copy from an earlier run.
response.raise_for_status()
with open('pride_and_prejudice.txt', 'w', encoding='utf-8') as file:
    # write() stores the whole text in one call; writelines() would iterate
    # over the string and write it one character at a time.
    file.write(response.text)

In [None]:
# Read the downloaded pride_and_prejudice.txt into the DataFrame wordDF
wordDF = spark.read.text(paths='pride_and_prejudice.txt')

In [None]:
# WordCount with the DataFrame API (tokens are separated on a single space)
#   F.split   -> turns the `value` column of each row into an array of tokens
#   F.explode -> emits one output row per element of that array
#   F.trim    -> strips leading and trailing spaces from each token
# Empty tokens are dropped, the remaining ones are counted per word, and the
# result is displayed in descending count order.
wc = (
    wordDF
    .select(F.explode(F.split('value', ' ')).alias('word'))  # one token per row
    .select(F.trim('word').alias('word'))                    # left and right trim
    .filter(F.col('word') != '')                             # drop the empty strings
    .groupBy('word')                                         # one group per distinct word
    .agg(F.count('word'))                                    # result column: count(word)
    .orderBy(F.col('count(word)').desc())                    # most frequent first
)

wc.show()

+----+-----------+
|word|count(word)|
+----+-----------+
| the|       4216|
|  to|       4123|
|  of|       3667|
| and|       3309|
|   a|       1944|
| her|       1856|
|  in|       1817|
| was|       1796|
|   I|       1725|
|that|       1417|
| not|       1363|
| she|       1303|
|  be|       1209|
| his|       1166|
| had|       1125|
|  as|       1119|
|with|       1040|
|  he|       1039|
| for|       1003|
| you|        992|
+----+-----------+
only showing top 20 rows



# WordCount program using RDD

In [None]:
# Read pride_and_prejudice.txt into the RDD wRDD through the SparkContext
wRDD = spark.sparkContext.textFile(name='pride_and_prejudice.txt')

In [None]:
# Word Count program using the RDD API.
# Each line is split on a single space. Empty tokens (produced by blank lines
# and by runs of consecutive spaces) are dropped before counting, so that ''
# is not reported as the most frequent "word". This matches the DataFrame
# version of the count, which also filters out the empty strings.
wcRDD = (wRDD.flatMap(lambda line: line.split(' '))      # one token per element
            .filter(lambda word: word != '')             # drop the empty strings
            .map(lambda word: (word, 1))                 # pair every word with a count of 1
            .reduceByKey(lambda v1, v2: v1 + v2)         # sum the counts per word
            .sortBy(lambda pair: pair[1], ascending=False))  # most frequent first
wcRDD.take(20)

[('', 73700),
 ('the', 4216),
 ('to', 4123),
 ('of', 3667),
 ('and', 3309),
 ('a', 1944),
 ('her', 1856),
 ('in', 1817),
 ('was', 1796),
 ('I', 1725),
 ('that', 1417),
 ('not', 1363),
 ('she', 1303),
 ('be', 1209),
 ('his', 1166),
 ('had', 1125),
 ('as', 1119),
 ('with', 1040),
 ('he', 1039),
 ('for', 1003)]

In [None]:
# List the entries of the ch11_data folder in gdrive
import os

# NOTE(review): relative path; presumably resolves to the Drive mount only
# when the working directory is /content. Confirm.
ch11_dir = './gdrive/My Drive/Asllani/ch11_data'
os.listdir(ch11_dir)

['myBlog.txt',
 'SalesRecords.csv',
 'EmployeePT.tsv',
 'customerFiles.json',
 '.DS_Store',
 'items']

In [None]:
# Read myBlog.txt from gdrive into wordRDD, then display its full contents
wordRDD = spark.sparkContext.textFile(
    name="./gdrive/My Drive/Asllani/ch11_data/myBlog.txt"
)
wordRDD.collect()

['Big Data techniques offer several advantages over traditional techniques.',
 'Hadoop is one Big Data analytics platform.  Hadoop has several ecosystems to perform Big Data Analytics.',
 'Spark is one of such ecosystems.  Spark can also run on Hadoop for Big Data Analitycs.']

In [None]:
# Word Count program using RDD (DF has no flatMap operation)
# Each line is split on a single space. Empty tokens (produced by consecutive
# spaces, e.g. two spaces after a full stop) are dropped before counting so
# that '' does not show up as a word.
# NOTE(review): `wc` was bound to a DataFrame in the DataFrame word-count cell;
# this cell rebinds the same name to an RDD.
wc = (wordRDD.flatMap(lambda line: line.split(' '))     # one token per element
            .filter(lambda word: word != '')            # drop the empty strings
            .map(lambda word: (word, 1))                # pair every word with a count of 1
            .reduceByKey(lambda v1, v2: v1 + v2)        # sum the counts per word
            .sortBy(lambda pair: pair[1], ascending=False))  # most frequent first
wc.collect()

[('Big', 4),
 ('Data', 4),
 ('Hadoop', 3),
 ('several', 2),
 ('is', 2),
 ('', 2),
 ('Spark', 2),
 ('one', 2),
 ('techniques', 1),
 ('advantages', 1),
 ('traditional', 1),
 ('analytics', 1),
 ('platform.', 1),
 ('ecosystems', 1),
 ('perform', 1),
 ('Analytics.', 1),
 ('of', 1),
 ('run', 1),
 ('offer', 1),
 ('over', 1),
 ('techniques.', 1),
 ('has', 1),
 ('to', 1),
 ('such', 1),
 ('ecosystems.', 1),
 ('can', 1),
 ('also', 1),
 ('on', 1),
 ('for', 1),
 ('Analitycs.', 1)]