In [1]:
from pyspark import SparkContext
# Reuse the running SparkContext when one already exists: calling the bare
# SparkContext() constructor a second time in the same kernel (e.g. when this
# cell is re-executed) fails because only one context may be active at a time.
sc = SparkContext.getOrCreate()

In [2]:
# Load the corpus (a text file with ~400k lines) as an RDD of lines,
# requesting at least 4 partitions.
text = sc.textFile('All.txt', minPartitions=4)
# Peek at the first five lines to sanity-check the load.
text.take(5)

['%A Abdou, I.E.',
 '%A Wong, K.Y.',
 '%D 1982',
 '%T Analysis of linear interpolation schemes for bi-level image applications',
 '%J IBM J Research and Development']

### Word count

In [3]:
#%%writefile wc.py
import re

def word_split(row):
    """Split one line of text into lowercase word tokens.

    A token is a maximal run of regex "word" characters (letters, digits and
    underscore); whitespace and punctuation act as separators and are dropped.

    Args:
        row: A single line of text (str).

    Returns:
        list of str: The lowercased tokens in order of appearance, or an
        empty list when the line contains no word characters.
    """
    # The pattern is a raw string: a plain "\W+" / "\w+" literal relies on an
    # invalid string escape, which newer Python versions warn about.
    # findall(r"\w+") yields exactly the non-empty pieces that splitting on
    # r"\W+" would produce, so no empty-string filter or strip() is needed.
    return [token.lower() for token in re.findall(r"\w+", row)]

#text.map(word_split).take(2)

# Using Spark to count the word frequencies in the file
def word_count(thetext):
    """Count how often each word occurs in an RDD of text lines.

    Args:
        thetext: RDD of strings, one line of text per element.

    Returns:
        RDD of (word, count) pairs ordered by descending count.
    """
    # One element per lowercased word token.
    tokens = thetext.flatMap(word_split)
    # Pair each token with a count of 1, then sum the counts per token.
    pairs = tokens.map(lambda token: (token, 1))
    totals = pairs.reduceByKey(lambda left, right: (left + right))
    # Negating the count makes the default ascending sort run high-to-low.
    return totals.sortBy(lambda pair: -pair[1])

# Build the (word, count) RDD for the whole corpus ...
it = word_count(text)
# ... and display its first 20 entries, i.e. the 20 most frequent words
# (word_count sorts by descending count).
it.take(20)

[('the', 192153),
 ('and', 130177),
 ('of', 107542),
 ('to', 70870),
 ('a', 54480),
 ('in', 53619),
 ('i', 42233),
 ('that', 39658),
 ('is', 31691),
 ('he', 30562),
 ('for', 28383),
 ('it', 27587),
 ('with', 27279),
 ('his', 26246),
 ('you', 23833),
 ('not', 23567),
 ('be', 22997),
 ('was', 20423),
 ('as', 20126),
 ('my', 19492)]

### Punctuation count: runs of non-word characters, e.g. ":;'/.,<>?)(*

In [6]:
#%%writefile cc.py
import re
def char_split(row):
    """Extract the runs of non-word characters (punctuation) from one line.

    The line is split on runs of regex "word" characters; what remains are
    the separators between words. Trailing whitespace is removed from each
    separator and separators that end up empty (pure whitespace) are dropped.
    Leading whitespace is kept, so ' (' and '(' are distinct results, and a
    run of several punctuation marks such as '),' is returned as one item.

    Args:
        row: A single line of text (str).

    Returns:
        list of str: The non-empty, right-stripped separator runs in order
        of appearance; an empty list when there are none.
    """
    # The pattern is a raw string: a plain "\w+" literal relies on an invalid
    # string escape, which newer Python versions warn about.
    # rstrip() is applied once per piece instead of twice.
    stripped = (piece.rstrip() for piece in re.split(r"\w+", row))
    return [piece for piece in stripped if piece != ""]

#text.map(char_split).take(5)

# Using Spark to count the character frequencies in the file
def char_count(thetext):
    """Count how often each punctuation run occurs in an RDD of text lines.

    Args:
        thetext: RDD of strings, one line of text per element.

    Returns:
        RDD of (punctuation_run, count) pairs ordered by descending count.
    """
    # One element per non-word character run produced by char_split.
    symbols = thetext.flatMap(char_split)
    # Pair each run with a count of 1, then sum the counts per run.
    pairs = symbols.map(lambda symbol: (symbol, 1))
    totals = pairs.reduceByKey(lambda left, right: (left + right))
    # Negating the count makes the default ascending sort run high-to-low.
    return totals.sortBy(lambda pair: -pair[1])

# Build the (punctuation_run, count) RDD for the whole corpus ...
a = char_count(text)
# ... and display its first 20 entries, i.e. the 20 most frequent runs
# (char_count sorts by descending count).
a.take(20)

[(',', 281996),
 ('.', 171010),
 ("'", 41520),
 (';', 40720),
 (':', 37684),
 ('-', 22948),
 ('?', 15276),
 (' (', 14632),
 ('"', 10710),
 ('!', 10699),
 (')', 9697),
 ('%', 8090),
 (' -', 3679),
 ('--', 3612),
 ('),', 3541),
 ('."', 3541),
 (',"', 3446),
 ('%,', 3173),
 ('. "', 2555),
 (' "', 2295)]

### Positive word search

In [7]:
#%%writefile ps.py

import re

# List of positive English words
# One entry per line of the file, with trailing whitespace/newline removed.
with open('positive-words.txt','r') as f:
    positive_words = [line.rstrip() for line in f]

# A selection of the top 50 most common English words, read the same way;
# positive_split uses it as an exclusion list.
with open('50-english.txt','r') as f:
    english_common = [line.rstrip() for line in f]

def positive_split(row):
    """Return the positive words found in one line of text.

    The line is tokenised into lowercase runs of regex "word" characters; a
    token is kept when it appears in the module-level ``positive_words`` list
    and does not appear in the module-level ``english_common`` list.

    Args:
        row: A single line of text (str).

    Returns:
        list of str: The matching lowercased tokens in order of appearance
        (repeats are kept); an empty list when nothing matches.
    """
    # The pattern is a raw string: a plain "\W+" / "\w+" literal relies on an
    # invalid string escape, which newer Python versions warn about.
    # findall(r"\w+") yields exactly the non-empty pieces that splitting on
    # r"\W+" would, so the empty-string check and strip() are unnecessary.
    # Each token is normalised once instead of up to four times.
    tokens = (word.lower() for word in re.findall(r"\w+", row))
    # NOTE(review): both lookups scan Python lists; if this becomes a
    # bottleneck, consider building sets where the word lists are loaded.
    return [
        token for token in tokens
        if token in positive_words and token not in english_common
    ]

#text.map(positive_split).take(2)

# Using Spark to count all the positive words in the file
def positive_count(thetext):
    """Count how often each positive word occurs in an RDD of text lines.

    Args:
        thetext: RDD of strings, one line of text per element.

    Returns:
        RDD of (word, count) pairs ordered by descending count.
    """
    # One element per positive-word hit returned by positive_split.
    hits = thetext.flatMap(positive_split)
    # Pair each hit with a count of 1, then sum the counts per word.
    pairs = hits.map(lambda hit: (hit, 1))
    totals = pairs.reduceByKey(lambda left, right: (left + right))
    # Negating the count makes the default ascending sort run high-to-low.
    return totals.sortBy(lambda pair: -pair[1])

# Build the (positive word, count) RDD for the whole corpus ...
it = positive_count(text)
# ... and display its first 20 entries, i.e. the 20 most frequent positive
# words (positive_count sorts by descending count).
it.take(20)

[('good', 4559),
 ('well', 4137),
 ('like', 3991),
 ('love', 3117),
 ('great', 2843),
 ('right', 1588),
 ('heaven', 1322),
 ('peace', 1218),
 ('master', 1167),
 ('work', 1151),
 ('better', 1094),
 ('sweet', 981),
 ('fair', 948),
 ('holy', 874),
 ('gold', 866),
 ('best', 826),
 ('grace', 794),
 ('faith', 735),
 ('strong', 683),
 ('noble', 674)]

### Negative word search

In [8]:
#%%writefile ns.py
import re

# List of negative English words
# One entry per line of the file, with trailing whitespace/newline removed.
with open('negative-words.txt','r') as f:
    negative_words = [line.rstrip() for line in f]

# A selection of the top 50 most common English words, read the same way;
# negative_split uses it as an exclusion list.
with open('50-english.txt','r') as f:
    english_common = [line.rstrip() for line in f]

def negative_split(row):
    """Return the negative words found in one line of text.

    The line is tokenised into lowercase runs of regex "word" characters; a
    token is kept when it appears in the module-level ``negative_words`` list
    and does not appear in the module-level ``english_common`` list.

    Args:
        row: A single line of text (str).

    Returns:
        list of str: The matching lowercased tokens in order of appearance
        (repeats are kept); an empty list when nothing matches.
    """
    # The pattern is a raw string: a plain "\W+" / "\w+" literal relies on an
    # invalid string escape, which newer Python versions warn about.
    # findall(r"\w+") yields exactly the non-empty pieces that splitting on
    # r"\W+" would, so the empty-string check and strip() are unnecessary.
    # Each token is normalised once instead of up to four times.
    tokens = (word.lower() for word in re.findall(r"\w+", row))
    # NOTE(review): both lookups scan Python lists; if this becomes a
    # bottleneck, consider building sets where the word lists are loaded.
    return [
        token for token in tokens
        if token in negative_words and token not in english_common
    ]

#text.map(negative_split).take(2)

# Using Spark to count all negative words in the file
def negative_count(thetext):
    """Count how often each negative word occurs in an RDD of text lines.

    Args:
        thetext: RDD of strings, one line of text per element.

    Returns:
        RDD of (word, count) pairs ordered by descending count.
    """
    # One element per negative-word hit returned by negative_split.
    hits = thetext.flatMap(negative_split)
    # Pair each hit with a count of 1, then sum the counts per word.
    pairs = hits.map(lambda hit: (hit, 1))
    totals = pairs.reduceByKey(lambda left, right: (left + right))
    # Negating the count makes the default ascending sort run high-to-low.
    return totals.sortBy(lambda pair: -pair[1])

# Build the (negative word, count) RDD for the whole corpus ...
it = negative_count(text)
# ... and display its first 20 entries, i.e. the 20 most frequent negative
# words (negative_count sorts by descending count).
it.take(20)

[('death', 1872),
 ('fear', 1249),
 ('poor', 1115),
 ('dead', 1114),
 ('die', 888),
 ('fall', 773),
 ('evil', 715),
 ('sin', 632),
 ('fool', 623),
 ('lie', 593),
 ('fell', 592),
 ('strange', 591),
 ('lost', 571),
 ('enemy', 571),
 ('mistress', 556),
 ('cry', 554),
 ('wilt', 547),
 ('cold', 528),
 ('break', 518),
 ('hard', 511)]