In [24]:
import nltk
import numpy
import codecs
from bs4 import BeautifulSoup
from __future__ import division, unicode_literals 
from urllib import request
from tabulate import tabulate
import pandas as pd
from nltk.tokenize import regexp_tokenize

In [48]:
## define reusable functions

def read_html(url):
    """Download an HTML page and return its visible text as an ``nltk.Text``.

    Parameters
    ----------
    url : str
        Address of the HTML document. The body is decoded as UTF-8.

    Returns
    -------
    nltk.Text
        The tokens produced by ``nltk.word_tokenize`` on the page text
        after BeautifulSoup has stripped the markup.
    """
    # Release the HTTP connection as soon as the body has been read.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')
    raw = BeautifulSoup(html, 'html.parser').get_text()
    # Qualified call: the import cell brings in `nltk` but no bare
    # `word_tokenize`, so the unqualified name raised NameError.
    tokens = nltk.word_tokenize(raw)
    return nltk.Text(tokens)

def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Parameters
    ----------
    text : sized iterable of hashable tokens
        For example a list of words or an ``nltk.Text``.

    Returns
    -------
    float
        ``len(set(text)) / len(text)``, or ``0.0`` when ``text`` is empty
        (an empty text has no vocabulary, and the ratio would otherwise
        raise ZeroDivisionError).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total

def percentage(word, text):
    """Return how often ``word`` occurs in ``text``, as a percentage of all tokens.

    Parameters
    ----------
    word : object
        The token to count.
    text : sized sequence with a ``count`` method
        For example a list of words or an ``nltk.Text``.

    Returns
    -------
    float
        ``100 * text.count(word) / len(text)``, or ``0.0`` when ``text`` is
        empty (instead of raising ZeroDivisionError).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return 100 * text.count(word) / total

def read_utf8(url):
    """Download a UTF-8 text document and return it as an ``nltk.Text``.

    Parameters
    ----------
    url : str
        Address of the plain-text document.

    Returns
    -------
    nltk.Text
        The tokens produced by ``nltk.word_tokenize`` on the whole document.
    """
    # Release the HTTP connection as soon as the body has been read.
    with request.urlopen(url) as response:
        # 'utf-8-sig' is UTF-8 that also drops a leading byte-order mark;
        # with plain 'utf8' the BOM stays attached to the first token.
        raw = response.read().decode('utf-8-sig')
    tokens = nltk.word_tokenize(raw)
    return nltk.Text(tokens)

def vocab(tokens):
    """Collapse ``tokens`` into the set of distinct items it contains."""
    distinct = set()
    distinct.update(tokens)
    return distinct

def read_utf8_contentonly(url):
    """Download a UTF-8 text document and tokenize only its main content.

    The content is the slice of the document that starts at the first
    occurrence of "CONTENTS" (or at the beginning when that word is absent)
    and stops at the last occurrence of a Project Gutenberg end-of-book
    marker (or at the end of the document when no marker is present).

    Parameters
    ----------
    url : str
        Address of the plain-text document.

    Returns
    -------
    nltk.Text
        The tokens produced by ``nltk.word_tokenize`` on the content slice.
    """
    # Release the HTTP connection as soon as the body has been read.
    with request.urlopen(url) as response:
        # 'utf-8-sig' is UTF-8 that also drops a leading byte-order mark.
        raw = response.read().decode('utf-8-sig')

    start = raw.find("CONTENTS")
    if start == -1:
        start = 0

    # Try the footer markers in priority order. `end` is initialised up
    # front so a document without either marker falls back to the full
    # length instead of hitting an unbound local variable.
    end = -1
    for marker in ("End of the Project Gutenberg", "End of Project Gutenberg's"):
        end = raw.rfind(marker)
        if end != -1:
            break
    if end == -1:
        end = len(raw)

    tokens = nltk.word_tokenize(raw[start:end])
    return nltk.Text(tokens)


def text_norm(text):
    """Return the purely alphabetic tokens of ``text``, lower-cased.

    Tokens for which ``str.isalpha()`` is false (numbers, punctuation,
    mixed tokens) are dropped; lower-casing lets differently-cased
    spellings of a word collapse when duplicates are removed later.
    """
    words = []
    for token in text:
        if not token.isalpha():
            continue
        words.append(token.lower())
    return words
    

In [299]:
# pre-analyze ebook list

url = "http://www.gutenberg.org/cache/epub/9078/pg9078.txt"
# A good response reports 200, OK
# (check with: print(response.status, response.reason)).
# The `with` block releases the HTTP connection once the body has been read.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')
# Qualified call: the import cell brings in `nltk` but no bare
# `word_tokenize`, so the unqualified name raised NameError.
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)


In [313]:
# Number of tokens vs. number of characters in the downloaded text.
# Only the value of the last expression, len(raw), is echoed as the cell
# output; len(tokens) is evaluated and discarded.
len(tokens)
len(raw)

684852

In [316]:
# Re-read the same book through read_utf8_contentonly and show how many
# tokens it returns.
tk=read_utf8_contentonly("http://www.gutenberg.org/cache/epub/9078/pg9078.txt")
len(tk)

5542
671148


140591

In [226]:
# Preview the first ten tokens.
# `tokens` is a plain list of strings; `text` is the nltk.Text built from it.
for idx in range(10):
    print(text[idx])

# Length and type of the token list, then of the nltk.Text wrapper.
for obj in (tokens, text):
    print(len(obj))
    print(type(obj))

﻿Project
Gutenberg
's
Sanders
'
Union
Fourth
Reader
,
by
144160
<class 'list'>
144160
<class 'nltk.text.Text'>


In [215]:
## Create vocabulary
# Bound to `vocab_set`, not `vocab`: assigning to `vocab` would shadow the
# vocab() helper function defined earlier in this notebook.
vocab_set = set(tokens)
print(len(vocab_set))
print(type(vocab_set))
# print set objects
# Show a small sample: up to 11 entries, labelled 100..110.
start = 100
for count, item in enumerate(vocab_set, start):
    print(count, item)
    if count == start + 10:
        break


17120
<class 'set'>
100 lamp
101 ILS
102 licks
103 eventide
104 HOLLAND
105 cur_v'd_
106 concave
107 When
108 _his_
109 recovery
110 _sp_


In [219]:
#second reader (The Beacon Second Reader BookIcon.png Fassett, James H.)
#text[0]=read_html("http://www.gutenberg.org/files/15659/15659-h/15659-h.htm")
#fifth reader (De La Salle Fifth Reader BookIcon.png Schools, Brothers of the Christian)
#text[1]=read_html("http://www.gutenberg.org/files/10811/10811-h/10811-h.htm")
#fourth reader (Sanders' Union Fourth Reader BookIcon.png Sanders, Charles W.)
#text[2]=read_html("http://www.gutenberg.org/files/9078/9078-h/9078-h.htm")

In [33]:
# define reusable functions for list of books

def createlist(booklist, norm):
    """Read every book in ``booklist`` and return its names and texts.

    Parameters
    ----------
    booklist : dict
        Maps a document name to the URL of its text.
    norm : bool
        When true, each book is read with ``read_utf8_contentonly`` and
        passed through ``text_norm``; otherwise it is read with ``read_utf8``.

    Returns
    -------
    tuple
        ``(docnames, textread)``: two lists in the iteration order of
        ``booklist``.
    """
    docnames = []
    textread = []
    for name, url in booklist.items():
        docnames.append(name)
        if norm:
            book = text_norm(read_utf8_contentonly(url))
        else:
            book = read_utf8(url)
        textread.append(book)

    return docnames, textread

# find lexical diversity of the books
def lexdiv_list(textread):
    """Return the lexical diversity of each text in ``textread``, in order."""
    return [lexical_diversity(book) for book in textread]

# find vocab of books
def vocabsize_list(textread):
    """Return the number of distinct tokens of each text in ``textread``.

    Parameters
    ----------
    textread : iterable of iterables of hashable tokens
        One entry per book.

    Returns
    -------
    list of int
        Vocabulary size of each book, in the order given.
    """
    # Count with set() directly rather than through the global name `vocab`:
    # that name is easy to rebind to a plain set in an interactive cell, which
    # makes a call through it fail with TypeError.
    return [len(set(book)) for book in textread]
    
# print tabular output
def tab_bookstats(book_pd):
    """Print ``book_pd`` as a plain-text table, with its keys as headers and no index column."""
    table = tabulate(book_pd, headers='keys', tablefmt='simple', showindex=False)
    print(table)

### Analyze Lexical Diversity and Vocabulary Size of set of 3 books at a time from different grades

#### Lexical diversity and Vocabulary size on text that is not normalized/cleaned and from books of different authors

In [55]:
# An earlier selection of books, kept below as an unused string literal.
'''
diff_author_docs={"second reader :The Beacon Second Reader : Fassett, James H." : 'http://www.gutenberg.org/cache/epub/15659/pg15659.txt',
                  "fourth reader :Sanders' Union Fourth Reader : Sanders, Charles W." : 'http://www.gutenberg.org/cache/epub/9078/pg9078.txt',
                  "fifth reader :De La Salle Fifth Reader : Schools, Brothers of the Christian" :'http://www.gutenberg.org/cache/epub/10811/pg10811.txt'}
'''

# Three readers by different authors: "title : author" -> plain-text URL.
diff_author_docs = {
    "New National First Reader : Barnes, Charles J.":
        'http://www.gutenberg.org/files/13853/13853-0.txt',
    "The Ontario Readers: Third Book : Ontario Ministry of Education":
        'http://www.gutenberg.org/cache/epub/18561/pg18561.txt',
    "fifth reader :De La Salle Fifth Reader : Schools, Brothers of the Christian":
        'http://www.gutenberg.org/cache/epub/10811/pg10811.txt',
}

# Read the books without normalization, then compute both statistics per book.
da_dcname, da_textr = createlist(diff_author_docs, norm=False)
da_ld = lexdiv_list(da_textr)
da_vc = vocabsize_list(da_textr)
da_df = pd.DataFrame(dict(docnames=da_dcname,
                          lexical_diversity=da_ld,
                          vocabulary_size=da_vc))

tab_bookstats(da_df)

docnames                                                                       lexical_diversity    vocabulary_size
---------------------------------------------------------------------------  -------------------  -----------------
New National First Reader : Barnes, Charles J.                                          0.165179               2202
The Ontario Readers: Third Book : Ontario Ministry of Education                         0.120272               9877
fifth reader :De La Salle Fifth Reader : Schools, Brothers of the Christian             0.129179               9196


_Note_: These inferences are based purely on the samples chosen for this analysis and could differ with more samples.

First, 3 books of different grades from different authors were selected at random. It is assumed that different authors have different writing styles and may also use different vocabulary. In the sample above, vocabulary size and lexical diversity are inversely related, i.e. lexical diversity decreases as vocabulary_size increases, although not at a constant rate. However, it cannot be said that vocabulary grows with grade: in these samples the vocabulary size of the 3rd grade book is greater than that of the 5th grade book.

#### Lexical diversity and Vocabulary size on text that is not normalized/cleaned and from books of same author

In [35]:
# Three readers by the same author: "title : author" -> plain-text URL.
same_author_docs = {
    "McGuffey's First Eclectic Reader, Revised Edition : McGuffey, William Holmes":
        'http://www.gutenberg.org/cache/epub/14640/pg14640.txt',
    "McGuffey's Third Eclectic Reader : McGuffey, William Holmes":
        'http://www.gutenberg.org/cache/epub/14766/pg14766.txt',
    "McGuffey's Fifth Eclectic Reader : McGuffey, William Holmes":
        'http://www.gutenberg.org/cache/epub/15040/pg15040.txt',
}

# Read the books without normalization, then compute both statistics per book.
sa_dcname, sa_textr = createlist(same_author_docs, norm=False)
sa_ld = lexdiv_list(sa_textr)
sa_vc = vocabsize_list(sa_textr)
sa_df = pd.DataFrame(dict(docnames=sa_dcname,
                          lexical_diversity=sa_ld,
                          vocabulary_size=sa_vc))
tab_bookstats(sa_df)

docnames                                                                        lexical_diversity    vocabulary_size
----------------------------------------------------------------------------  -------------------  -----------------
McGuffey's First Eclectic Reader, Revised Edition : McGuffey, William Holmes             0.167199               2147
McGuffey's Third Eclectic Reader : McGuffey, William Holmes                              0.124138               4714
McGuffey's Fifth Eclectic Reader : McGuffey, William Holmes                              0.112887              14292


_Note_: These inferences are based purely on the samples chosen for this analysis and could differ with more samples.

The first sample used 3 randomly selected books of different grades from different authors, on the assumption that different authors have different writing styles and may use different vocabulary. So here we have picked books from different grades but from the same author, to see whether an author changes the vocabulary pattern with rising grade. In the chosen samples, vocabulary size is again inversely related to lexical diversity. However, here the vocabulary size increases with rising grade, although the rate of increase differs between successive books.

**The previous 2 samples showed an inverse relation between lexical diversity and vocabulary size. However, the data chosen also contained a lot of non-text information, and it would be interesting to see whether a cleaner text gives different answers. For this purpose, some cleaning is performed on the text read from the books.** 

**1) Different books and different authors may structure their books differently. On manual inspection of the ebooks on the Project Gutenberg site, it was found that most of these books have header and footer sections containing information on the author, licenses, publication and Project Gutenberg. Each book has a different title at its actual start, so "CONTENTS", which is followed by the table of contents, is taken as the start of the book. This repeats some information, but it removes a lot of other material such as prefaces, acknowledgements, etc., which contain more data.**  
**2) Similarly, for the end of the book, most books contain a phrase such as "End of the Project Gutenberg", after which a lot of information about the project follows.**  
**3) Also, to count the vocabulary, punctuation and numerical data have been removed.**

#### Lexical diversity and Vocabulary size on text that is normalized/cleaned and from books of different authors

In [50]:
              
# Same books as the different-author sample, this time read content-only and
# normalized before computing both statistics.
dan_dcname, dan_textr = createlist(diff_author_docs, norm=True)
dan_ld = lexdiv_list(dan_textr)
dan_vc = vocabsize_list(dan_textr)
dan_df = pd.DataFrame(dict(docnames=dan_dcname,
                           lexical_diversity=dan_ld,
                           vocabulary_size=dan_vc))
tab_bookstats(dan_df)

docnames                                                                       lexical_diversity    vocabulary_size
---------------------------------------------------------------------------  -------------------  -----------------
New National First Reader : Barnes, Charles J.                                          0.114483                796
The Ontario Readers: Third Book : Ontario Ministry of Education                         0.117208               7397
fifth reader :De La Salle Fifth Reader : Schools, Brothers of the Christian             0.127714               6701


_Note_: These inferences are based purely on the samples chosen for this analysis and could differ with more samples.

This normalized data clearly shows that the relation between lexical diversity and vocabulary size has changed for the same set of books from different authors. We no longer see an inverse relationship. It is also interesting that although vocabulary size increased significantly from Grade 1 to Grade 3 (796 to 7397), lexical diversity changed only slightly (0.1145 to 0.1172). From Grade 3 to Grade 5 the vocabulary size changed by a comparatively small amount, decreasing from 7397 to 6701, while lexical diversity increased by a larger proportion (0.1172 to 0.1277).

#### Lexical diversity and Vocabulary size on text that is normalized/cleaned and from books of same author

In [53]:
# Same books as the same-author sample, this time read content-only and
# normalized before computing both statistics.
san_dcname, san_textr = createlist(same_author_docs, norm=True)
san_ld = lexdiv_list(san_textr)
san_vc = vocabsize_list(san_textr)
san_df = pd.DataFrame(dict(docnames=san_dcname,
                           lexical_diversity=san_ld,
                           vocabulary_size=san_vc))
tab_bookstats(san_df)

docnames                                                                        lexical_diversity    vocabulary_size
----------------------------------------------------------------------------  -------------------  -----------------
McGuffey's First Eclectic Reader, Revised Edition : McGuffey, William Holmes             0.151628               1136
McGuffey's Third Eclectic Reader : McGuffey, William Holmes                              0.123868               3200
McGuffey's Fifth Eclectic Reader : McGuffey, William Holmes                              0.111936              10571


_Note_: These inferences are based purely on the samples chosen for this analysis and could differ with more samples.

This normalized data clearly shows that the relation between lexical diversity and vocabulary size has remained unchanged for the same set of books from the same author. We see a similar pattern of increasing vocabulary size with rising grade, and an inverse relation between lexical diversity and vocabulary size. It is interesting to see that although vocabulary size decreased significantly once text normalization/cleanup was applied, lexical diversity did not change significantly. 

#### Final Inference

From the above text analysis on different sets of data, it seems more appropriate to rely on the statistics from the cleaned-up data. This is not because they fail to confirm a pattern, but because, even though the cleanup was basic, the cleaned text should more closely represent the actual contents of the book. Moreover, not only is there no fixed relation between the 2 statistics, but each statistic also changes at a different rate when considered individually. So text complexity should be analyzed using both scores jointly.