### Bigrams

In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
sc = pyspark.SparkContext(appName = 'Bigram')

In [3]:
rdd = sc.textFile("ebook2.txt")

In [4]:
rdd.take(3)

[u'The Project Gutenberg EBook of Democracy In America, Volume 1 (of 2), by ',
 u'Alexis de Toqueville',
 u'']

In [5]:
def nonempty(x):
    if len(x)<1:
        return False
    else:
        return True

In [6]:
bookfiltered = rdd.filter(nonempty)

In [7]:
remove_quotes_and_lowercase = bookfiltered.map(lambda x:x.replace('"',' ').lower())

In [8]:
split_words = remove_quotes_and_lowercase.map(lambda x:x.split(" "))

In [9]:
def nonblank(word):
    temp = word
    for w in word:
        if len(w)<1:
            word.remove(w)
    return temp

In [10]:
nonblanks = split_words.map(nonblank)

In [11]:
def specialchars(rec):   
    mylist = []
    for w in rec:
        a = w.rstrip(',.;:?')
        b = a.replace("'s","")
        mylist.append(b)
    return mylist

In [12]:
remove_chars = nonblanks.map(specialchars)

In [13]:
def clean(word):
    temp = word
    for w in word:
        if w.startswith("*"):
            word.remove(w)
    return temp

In [14]:
clean_words = remove_chars.map(clean)

In [15]:
bigrams = clean_words.flatMap(lambda x: [((x[i]+" "+x[i+1]),1) for i in range (0, len(x)-1)])

In [16]:
bigrams.take(3)

[(u'the project', 1), (u'project gutenberg', 1), (u'gutenberg ebook', 1)]

In [17]:
freq = bigrams.reduceByKey(lambda x,y: x+y)

In [18]:
freq.take(3)

[(u'accepting office', 1), (u'or trample', 1), (u'defensive posture', 1)]

In [19]:
freq.count()

71090

In [20]:
import pandas as pd

In [21]:
df=freq.collect()

In [22]:
bigrams_df = pd.DataFrame(data = df)

In [23]:
bigrams_df = bigrams_df.rename(columns = {0:'bigram', 1:'count'})

In [24]:
bigrams_df.head(5)

Unnamed: 0,bigram,count
0,accepting office,1
1,or trample,1
2,defensive posture,1
3,probably have,1
4,so novel,1


#### 1. How many unique bigrams are there?

In [25]:
len(bigrams_df)

71090

There are **71,090** unique bigrams.

#### 2. List the top ten most frequent bigrams and their counts

In [26]:
top_ten_bigrams = bigrams_df.sort_values(by = 'count', ascending = False).head(10)
top_ten_bigrams

Unnamed: 0,bigram,count
3188,of the,3916
25498,in the,1540
16200,to the,1035
57177,and the,783
28783,it is,608
14250,by the,520
3607,the united,478
28193,of a,459
25555,united states,454
56993,to be,440


#### 3. What fraction of all bigrams occurrences does the top ten bigrams account for? That is, what is the cumulative frequency of the top ten bigrams?

In [27]:
tot_num_bigrams = bigrams_df['count'].sum()
tot_num_bigrams

176110

In [28]:
tot_count_top_ten = top_ten_bigrams['count'].sum()
tot_count_top_ten

10233

In [29]:
tot_count_top_ten/float(tot_num_bigrams)

0.0581057293736869

The total occurences for the top ten bigrams account for **5.81%** of the total occurences for all bigrams.

#### 4. How many bigrams appear only once?

In [30]:
len(bigrams_df[bigrams_df['count'] == 1])

51985

The number of bigrams that only appear once is **51,985**.