In [1]:
import nltk, re, pprint
import pandas as pd
from urllib import request
from nltk import word_tokenize

In [2]:
# Download the Project Gutenberg plain-text copy of McGuffey's Third Eclectic Reader.
url_grade3_text = "http://www.gutenberg.org/cache/epub/14766/pg14766.txt"
# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read.
with request.urlopen(url_grade3_text) as grade3_text_url_open:
    # 'utf-8-sig' drops a leading byte-order mark if one is present.
    text_grade3_raw = grade3_text_url_open.read().decode('utf-8-sig')

# Keep only the span between the title marker and the closing Gutenberg line.
# NOTE(review): str.find / str.rfind return -1 when a marker is absent, which
# silently yields a wrong slice -- confirm both markers still exist upstream.
text_grade3_raw_start = text_grade3_raw.find("MCGUFFEY\'S\r\n\r\nTHIRD READER")
text_grade3_raw_end = text_grade3_raw.rfind("End of the Project Gutenberg EBook of McGuffey\'s Third Eclectic Reader")
text_grade3_raw_sub = text_grade3_raw[text_grade3_raw_start:text_grade3_raw_end]

# Lowercase before tokenizing so differently-cased spellings count as one token.
text_grade3_raw_clean = text_grade3_raw_sub.lower()
text_grade3_tokenize = word_tokenize(text_grade3_raw_clean)

In [3]:
# Download the Project Gutenberg plain-text copy of McGuffey's Fourth Eclectic Reader.
url_grade4_text = "http://www.gutenberg.org/cache/epub/14880/pg14880.txt"
# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read.
with request.urlopen(url_grade4_text) as grade4_text_url_open:
    # 'utf-8-sig' drops a leading byte-order mark if one is present.
    text_grade4_raw = grade4_text_url_open.read().decode('utf-8-sig')

# Keep only the span between the title marker and the closing Gutenberg line.
# NOTE(review): str.find / str.rfind return -1 when a marker is absent, which
# silently yields a wrong slice -- confirm both markers still exist upstream.
text_grade4_raw_start = text_grade4_raw.find("MCGUFFEY\'S FOURTH READER")
text_grade4_raw_end = text_grade4_raw.rfind("End of the Project Gutenberg EBook of McGuffey\'s Fourth Eclectic Reader")
text_grade4_raw_sub = text_grade4_raw[text_grade4_raw_start:text_grade4_raw_end]

# Lowercase before tokenizing so differently-cased spellings count as one token.
text_grade4_raw_clean = text_grade4_raw_sub.lower()
text_grade4_tokenize = word_tokenize(text_grade4_raw_clean)

In [4]:
# Download the Project Gutenberg plain-text copy of McGuffey's Fifth Eclectic Reader.
url_grade5_text = "http://www.gutenberg.org/cache/epub/15040/pg15040.txt"
# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read.
with request.urlopen(url_grade5_text) as grade5_text_url_open:
    # 'utf-8-sig' drops a leading byte-order mark if one is present.
    text_grade5_raw = grade5_text_url_open.read().decode('utf-8-sig')

# Keep only the span between the title marker and the closing Gutenberg line.
# NOTE(review): str.find / str.rfind return -1 when a marker is absent, which
# silently yields a wrong slice -- confirm both markers still exist upstream.
text_grade5_raw_start = text_grade5_raw.find("McGuffey\'s Fifth Reader")
text_grade5_raw_end = text_grade5_raw.rfind("End of the Project Gutenberg EBook of McGuffey\'s Fifth Eclectic Reader")
text_grade5_raw_sub = text_grade5_raw[text_grade5_raw_start:text_grade5_raw_end]

# Lowercase before tokenizing so differently-cased spellings count as one token.
text_grade5_raw_clean = text_grade5_raw_sub.lower()
text_grade5_tokenize = word_tokenize(text_grade5_raw_clean)

In [5]:
# Empty results table; one row per grade level is added by later cells.
results_df = pd.DataFrame(
    columns=['Grade_Level', 'Lexical_Diversity_Score', 'Vocabulary_Count']
)
results_df

Unnamed: 0,Grade_Level,Lexical_Diversity_Score,Vocabulary_Count


In [6]:
def lexical_diversity(text):
    """Measure how varied the vocabulary of a token sequence is.

    Args:
        text: A sized collection of hashable tokens (e.g. a list of words).

    Returns:
        A ``(lex_div, vocab_cnt)`` tuple, where ``vocab_cnt`` is the number of
        distinct tokens and ``lex_div`` is ``vocab_cnt / len(text)``.
        Empty input returns ``(0.0, 0)`` instead of raising ZeroDivisionError.
    """
    total_cnt = len(text)
    if total_cnt == 0:
        # No tokens at all: the ratio is undefined, so report zero diversity.
        return (0.0, 0)
    vocab_cnt = len(set(text))
    lex_div = vocab_cnt / total_cnt
    return (lex_div, vocab_cnt)

In [7]:
# Score the Grade 3 tokens and record the result as a new row.
(lex_div, vocab_cnt) = lexical_diversity(text_grade3_tokenize)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenating a one-row frame is the supported replacement.
results_df = pd.concat(
    [results_df,
     pd.DataFrame([{'Grade_Level': 'Grade 3',
                    'Lexical_Diversity_Score': lex_div,
                    'Vocabulary_Count': vocab_cnt}])],
    ignore_index=True,
)

In [8]:
# Score the Grade 4 tokens and record the result as a new row.
(lex_div, vocab_cnt) = lexical_diversity(text_grade4_tokenize)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenating a one-row frame is the supported replacement.
results_df = pd.concat(
    [results_df,
     pd.DataFrame([{'Grade_Level': 'Grade 4',
                    'Lexical_Diversity_Score': lex_div,
                    'Vocabulary_Count': vocab_cnt}])],
    ignore_index=True,
)

In [9]:
# Score the Grade 5 tokens and record the result as a new row.
(lex_div, vocab_cnt) = lexical_diversity(text_grade5_tokenize)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenating a one-row frame is the supported replacement.
results_df = pd.concat(
    [results_df,
     pd.DataFrame([{'Grade_Level': 'Grade 5',
                    'Lexical_Diversity_Score': lex_div,
                    'Vocabulary_Count': vocab_cnt}])],
    ignore_index=True,
)

In [10]:
# Lexical diversity per grade. Columns are selected by label rather than by
# position so the view stays correct if the column order ever changes.
results_df[['Grade_Level', 'Lexical_Diversity_Score']]

Unnamed: 0,Grade_Level,Lexical_Diversity_Score
0,Grade 3,0.10671
1,Grade 4,0.101938
2,Grade 5,0.103179


In [11]:
# Vocabulary size per grade. Columns are selected by label rather than by
# position so the view stays correct if the column order ever changes.
results_df[['Grade_Level', 'Vocabulary_Count']]

Unnamed: 0,Grade_Level,Vocabulary_Count
0,Grade 3,3421
1,Grade 4,7436
2,Grade 5,11435


In [12]:
# Display the full results table (grade level, lexical diversity score, vocabulary count).
results_df

Unnamed: 0,Grade_Level,Lexical_Diversity_Score,Vocabulary_Count
0,Grade 3,0.10671,3421
1,Grade 4,0.101938,7436
2,Grade 5,0.103179,11435


# Normalizing the Scores

In [13]:
# Corpus for TF-IDF: the lowercased text of each reader, ordered Grade 3, 4, 5.
docs = [
    text_grade3_raw_clean,
    text_grade4_raw_clean,
    text_grade5_raw_clean,
]

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configuration lives here; inverse-document-frequency weighting is enabled.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on the whole corpus and transform it in one step.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

In [15]:
# Take the TF-IDF vector of the first document (Grade 3).
text_grade3_raw_clean_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Put the TF-IDF values in a data frame indexed by vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. toarray() is used instead of todense()
# so pandas receives a plain ndarray rather than a numpy.matrix.
text_grade3_raw_clean_tfidfvectorizer_df = pd.DataFrame(
    text_grade3_raw_clean_tfidfvectorizer.T.toarray(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
text_grade3_raw_clean_tfidfvectorizer_df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
the,0.591307
and,0.402794
to,0.310120
he,0.188061
of,0.169074
...,...
ger,0.000000
geranium,0.000000
germantown,0.000000
germinating,0.000000


In [16]:
# Take the TF-IDF vector of the second document (Grade 4).
text_grade4_raw_clean_tfidfvectorizer = tfidf_vectorizer_vectors[1]

# Put the TF-IDF values in a data frame indexed by vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. toarray() is used instead of todense()
# so pandas receives a plain ndarray rather than a numpy.matrix.
text_grade4_raw_clean_tfidfvectorizer_df = pd.DataFrame(
    text_grade4_raw_clean_tfidfvectorizer.T.toarray(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
text_grade4_raw_clean_tfidfvectorizer_df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
the,0.665598
and,0.374853
of,0.275452
to,0.267433
in,0.182018
...,...
illustrated,0.000000
illustrate,0.000000
illusion,0.000000
illumined,0.000000


In [17]:
# Take the TF-IDF vector of the third document (Grade 5).
text_grade5_raw_clean_tfidfvectorizer = tfidf_vectorizer_vectors[2]

# Put the TF-IDF values in a data frame indexed by vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. toarray() is used instead of todense()
# so pandas receives a plain ndarray rather than a numpy.matrix.
text_grade5_raw_clean_tfidfvectorizer_df = pd.DataFrame(
    text_grade5_raw_clean_tfidfvectorizer.T.toarray(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
text_grade5_raw_clean_tfidfvectorizer_df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
the,0.679837
and,0.373114
of,0.338798
to,0.250472
in,0.202713
...,...
edward,0.000000
edges,0.000000
eddie,0.000000
rustling,0.000000


In [18]:
# Count how often each token occurs in the Grade 3 reader.
fdist_grade3 = nltk.FreqDist(text_grade3_tokenize)

# Walk the distinct tokens in sorted order; any token longer than
# 10 characters is collected and printed with its occurrence count.
grade3_complex_words = []
for word in sorted(fdist_grade3):
    if len(word) <= 10:
        continue
    grade3_complex_words.append(word)
    print(word, '->', fdist_grade3[word])

beggar-like -> 1
blackberries -> 1
broad-brimmed -> 1
castle-building -> 3
chestnut-colored -> 1
chick-a-de-dee -> 8
coarse-looking -> 1
commandment -> 2
commandments -> 2
constructed -> 1
deliverance -> 1
discouraged -> 1
disobedient -> 1
encountered -> 1
ever-wakeful -> 1
ever-watchful -> 1
experienced -> 1
forgiveness -> 1
gingerbread -> 2
grandfather -> 1
grandmother -> 12
hard-working -> 1
horse-chestnuts -> 1
ill-behaved -> 1
ill-looking -> 1
immediately -> 2
neighborhood -> 1
newfoundland -> 1
overflowing -> 1
pocketknife -> 1
poorly-dressed -> 1
satisfaction -> 2
schoolhouse -> 2
schoolmates -> 2
snow-flakes -> 1
snowy-white -> 1
sorrowfully -> 1
strawberries -> 4
streamer-like -> 1
sure-footed -> 1
transgressions -> 1
well-behaved -> 1


In [19]:
# Count how often each token occurs in the Grade 4 reader.
fdist_grade4 = nltk.FreqDist(text_grade4_tokenize)

# Walk the distinct tokens in sorted order; any token longer than
# 10 characters is collected and printed with its occurrence count.
grade4_complex_words = []
for word in sorted(fdist_grade4):
    if len(word) <= 10:
        continue
    grade4_complex_words.append(word)
    print(word, '->', fdist_grade4[word])

a-lac'ri-ty -> 1
ab'so-lute-ly -> 1
abbreviation -> 1
aberbrothok -> 4
ac-cus'tomed -> 1
ac-knowledged -> 1
accidentally -> 2
accompanied -> 2
accomplishing -> 1
accordingly -> 2
acquaintance -> 1
ad-van'ta-ges -> 1
advancement -> 1
affectionate -> 1
al'pen-stock -> 1
antic-ipation -> 1
ap-pall'ing -> 1
ap-pearance -> 1
ap-pre'ci-at-ed -> 1
ap-pre'shi-at-ed -> 1
ap-pro-ba'tion -> 1
application -> 1
appreciated -> 1
approaching -> 2
approbation -> 1
appropriate -> 1
ar-o-mat'ic -> 1
ar-til'er-y -> 1
arrangements -> 1
as-cer-tained -> 1
as-sist'-ance -> 1
ascertained -> 1
astonishment -> 2
at-tract'ed -> 1
at-trib'ut-ed -> 1
bar'ba-rous -> 1
beautifully -> 2
blithe'some -> 1
blue-veined -> 1
blus'ter-ing -> 1
broad-brimmed -> 1
buf'fet-ing -> 1
capabilities -> 1
chanticleer -> 1
char'ac-ter -> 1
cheerfulness -> 1
chlo're-form -> 1
chop'sticks -> 1
christopher -> 1
churchgoing -> 1
circumstance -> 2
circumstances -> 1
clam'or-ous -> 1
clean-swept -> 1
close'reefed -> 1
close-reefed -> 1
c

In [20]:
# Count how often each token occurs in the Grade 5 reader.
fdist_grade5 = nltk.FreqDist(text_grade5_tokenize)

# Walk the distinct tokens in sorted order; any token longer than
# 10 characters is collected and printed with its occurrence count.
grade5_complex_words = []
for word in sorted(fdist_grade5):
    if len(word) <= 10:
        continue
    grade5_complex_words.append(word)
    print(word, '->', fdist_grade5[word])

a-chieve'ment -> 1
ab-sorb'ing -> 1
ab-stract'-ed-ly -> 1
abandonment -> 1
abbreviation -> 1
absent-minded -> 2
abstractedly -> 1
ac-quaintance -> 1
accidentally -> 1
accountableness -> 1
accumulated -> 1
achievement -> 1
achievements -> 1
acknowledge -> 2
acknowledged -> 2
acknowledgment -> 1
acquaintance -> 3
ad-min'is-tered -> 1
ad-mo'nish'un -> 1
ad-mo-ni'tion -> 1
administered -> 1
administering -> 1
advancement -> 1
adventurous -> 1
af-flict'ed -> 1
affectionate -> 2
ag'gra-vat-ing -> 1
ag-gress'ors -> 1
aggravating -> 2
agricultural -> 1
agriculture -> 1
all-absorbing -> 1
alleghanies -> 1
ambrosianae -> 1
an-ni'hi-lates -> 1
an-te-di-lu'-vi-an -> 1
annihilates -> 1
anonymously -> 1
antediluvian -> 1
anticipation -> 1
antislavery -> 1
ap-pa-ra'tus -> 1
ap-peal'ing-ly -> 1
ap-pli-ca'tion -> 1
ap-point'ed -> 1
ap-pre-hen'sion -> 1
ap-pre-hend -> 1
appealingly -> 1
application -> 3
applications -> 1
appointment -> 2
appreciated -> 1
apprehension -> 2
apprenticed -> 3
apprenticeship

reprimanded -> 1
republished -> 2
resemblance -> 1
resignation -> 2
respectfully -> 1
responsibilitie -> 1
restoration -> 1
retaliating -> 1
retributory -> 1
reverberated -> 1
reverberating -> 2
reverential -> 1
revolutionary -> 1
righteousness -> 2
rough-looking -> 1
roundabouts -> 1
ru-mi-na'tion -> 1
sabbath-school -> 1
satisfaction -> 5
saun'ter-ing -> 1
saunterings -> 1
scatter-brain -> 1
scholarship -> 1
schoolfellow -> 1
schoolfellows -> 1
schoolhouse -> 2
schoolmaster -> 3
scru'pu-lous-ly -> 1
scrupulously -> 2
scur'ril-ous -> 1
second-rate -> 1
secretaries -> 2
seed-bearing -> 1
self'-pos-sessed -> 1
self-conceit -> 1
self-educated -> 1
self-evident -> 1
self-instruction -> 1
self-interest -> 1
self-possessed -> 2
self-respect -> 1
selfishness -> 1
sen-ti-ment'al -> 1
sensibility -> 2
sentimental -> 1
separat-ing -> 1
settlements -> 1
shakespeare -> 7
shamelessly -> 1
si-mul-ta'ne-ous -> 1
sig'na-ture -> 1
significant -> 1
significantly -> 1
signification -> 1
simultaneous -> 

# References

- https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.XsvgA8BOluU