In [19]:
import nltk
from nltk.corpus import gutenberg
import text_normalizer as tn
from operator import itemgetter

In [20]:
# Read the corpus inside a context manager so the file handle is always closed.
# Plain read mode is enough here: the file is only read, never written to.
with open('elephants.txt', 'r') as corpus_file:
    data = corpus_file.readlines()

# Only the first line of the file (data[0]) is split into sentences.
sentences = nltk.sent_tokenize(data[0])
len(sentences)

29

In [21]:
# Preview the first three tokenized sentences
sentences[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea.',
 'Three species are currently recognised: the African bush elephant (Loxodonta africana), the African forest elephant (L. cyclotis), and the Asian elephant (Elephas maximus).',
 'Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia.']

In [23]:
# Run the sentences through the text_normalizer module with lowercasing,
# stemming, lemmatization and stopword removal all explicitly switched off,
# so only the module's remaining default preprocessing is applied.
norm_senteces = tn.normalize_corpus(
    sentences,
    text_lower_case=False,
    text_stemming=False,
    text_lemmatization=False,
    stopword_removal=False,
)
norm_senteces[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea',
 'Three species are currently recognised the African bush elephant Loxodonta africana the African forest elephant L cyclotis and the Asian elephant Elephas maximus',
 'Elephants are scattered throughout subSaharan Africa South Asia and Southeast Asia']

Next, we extract all possible noun phrases from our corpus of documents/sentences.

In [26]:
import itertools
# English stopword list loaded from NLTK's stopwords corpus
stopwords = nltk.corpus.stopwords.words('english')

def get_chunks(sentences, grammar = r'NP: {<DT>? <JJ>* <NN.*>+}', stopword_list=stopwords):
    """Extract lowercased, stopword-free chunk phrases from each sentence.

    Each sentence is word-tokenized, POS-tagged and parsed with a
    ``RegexpParser`` built from ``grammar``. The parse tree is converted to
    (word, tag, chunk_tag) triples, and every maximal run of triples whose
    chunk tag is not ``'O'`` becomes one phrase.

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar passed to ``nltk.chunk.regexp.RegexpParser``.
        stopword_list: Iterable of lowercase words to drop from the phrases.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        phrase strings found in that sentence, in order of appearance.
        Phrases that become empty after stopword removal are omitted.
    """
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    # Build the lookup set once instead of scanning a list for every word.
    stopword_set = set(stopword_list)

    all_chunks = []
    for sentence in sentences:
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
        # Flatten the parse tree into (word, pos_tag, chunk_tag) triples.
        wtc_sent = nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))

        sentence_chunks = []
        # Group consecutive triples by "inside a chunk" vs. "outside" ('O').
        # NOTE: two chunks that sit directly next to each other, with no 'O'
        # token between them, fall into the same group and form one phrase.
        for in_chunk, wtc_group in itertools.groupby(
                wtc_sent, lambda word_pos_chunk: word_pos_chunk[2] != 'O'):
            if not in_chunk:
                continue
            phrase = ' '.join(word.lower()
                              for word, tag, chunk in wtc_group
                              if word.lower() not in stopword_set)
            # A chunk made up only of stopwords would yield '', which is not
            # a meaningful keyphrase, so it is skipped.
            if phrase:
                sentence_chunks.append(phrase)

        all_chunks.append(sentence_chunks)

    return all_chunks

In [27]:
# Extract chunk phrases from the normalized sentences using get_chunks' default grammar and stopword list
chunks = get_chunks(norm_senteces)
# Preview the phrases found in the first three sentences
chunks[:3]

[['elephants', 'large mammals', 'family elephantidae', 'order proboscidea'],
 ['species',
  'african bush elephant loxodonta',
  'african forest elephant l cyclotis',
  'asian elephant elephas maximus'],
 ['elephants', 'subsaharan africa south asia', 'southeast asia']]

We now build on top of our get_chunks() function by implementing the necessary logic
for Step 2, where we will build a TF-IDF-based model on our keyphrases using Gensim
and then compute TF-IDF-based weights for each keyphrase based on its occurrence in
the corpus.

In [28]:
from gensim import corpora, models

def get_tfidf_weighted_keyphrases(sentences,  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    """Rank the chunk phrases of ``sentences`` by TF-IDF weight.

    Phrases are extracted per sentence with ``get_chunks``; each sentence's
    phrase list is treated as one document of a gensim dictionary / TF-IDF
    model.

    Args:
        sentences: Iterable of sentence strings.
        grammar: Chunk grammar forwarded to ``get_chunks``.
        top_n: Number of highest-weighted phrases to return.

    Returns:
        A list of ``(phrase, weight)`` tuples sorted by weight in descending
        order, with weights rounded to three decimals, cut to ``top_n`` items.
    """
    phrase_docs = get_chunks(sentences, grammar=grammar)
    vocab = corpora.Dictionary(phrase_docs)
    bow_docs = [vocab.doc2bow(doc) for doc in phrase_docs]
    tfidf_model = models.TfidfModel(bow_docs)

    # When a phrase occurs in several sentences, the weight from the last
    # sentence processed is the one that is kept.
    phrase_weights = {}
    for weighted_doc in tfidf_model[bow_docs]:
        for token_id, weight in weighted_doc:
            phrase_weights[vocab.get(token_id)] = weight

    ranked = sorted(phrase_weights.items(), key=itemgetter(1), reverse=True)
    return [(phrase, round(weight, 3)) for phrase, weight in ranked[:top_n]]

Gensim also provides a summarization module, which has a keywords
function that extracts keywords from the text. This uses a variation of the TextRank
algorithm.

In [29]:
# NOTE(review): the gensim.summarization module is not available in gensim 4.x;
# this cell presumably requires gensim < 4.0 — confirm the installed version.
from gensim.summarization import keywords

# Extract keywords with their scores from the first line of the corpus file
# (ratio=1.0, lemmatize=True).
key_words = keywords(data[0], ratio=1.0, scores=True, lemmatize=True)
# Show the first 25 (keyword, score) pairs with scores rounded to three decimals
[(item, round(score, 3)) for item, score in key_words][:25]

[('african bush elephant', 0.261),
 ('including', 0.141),
 ('family', 0.137),
 ('cow', 0.124),
 ('forests', 0.108),
 ('female', 0.103),
 ('asia', 0.102),
 ('objects', 0.098),
 ('ivory', 0.098),
 ('sight', 0.098),
 ('tigers', 0.098),
 ('males', 0.088),
 ('religion', 0.087),
 ('folklore', 0.087),
 ('known', 0.087),
 ('larger ears', 0.085),
 ('water', 0.075),
 ('highly recognisable', 0.075),
 ('breathing lifting', 0.074),
 ('flaps', 0.073),
 ('africa', 0.072),
 ('gomphotheres', 0.072),
 ('animals tend', 0.071),
 ('success', 0.071),
 ('south', 0.07)]