In [17]:
import editdistance
import io
import itertools
import networkx as nx
import nltk
import os

In [18]:
def setup_environment():
    """Download the NLTK resources this notebook relies on.

    Fetches the 'punkt' and 'averaged_perceptron_tagger' packages via
    ``nltk.download`` (in that order) and prints a confirmation line
    once both calls have returned.
    """
    for resource in ('punkt', 'averaged_perceptron_tagger'):
        nltk.download(resource)
    print('Completed resource downloads.')

def filter_for_tags(tagged,tags=['NN','JJ','NNP']):
    """Keep only the tagged items whose tag is listed in ``tags``.

    ``tagged`` is a sequence of items whose element at index 1 is the
    part-of-speech tag. The default selection is NN (noun), NNP (proper
    noun) and JJ (adjective). Matching items are returned unchanged, in
    their original order, as a new list.
    """
    kept = []
    for entry in tagged:
        if entry[1] in tags:
            kept.append(entry)
    return kept

def normalize(tagged):
    """Strip every '.' character from the word of each tagged item.

    Returns a new list of 2-tuples ``(word_without_dots, tag)`` built from
    the elements at index 0 and 1 of each input item; the tag is passed
    through untouched and the input order is preserved.
    """
    cleaned = []
    for entry in tagged:
        word_without_dots = entry[0].replace('.', '')
        cleaned.append((word_without_dots, entry[1]))
    return cleaned

def unique_everseen(iterable, key=None):
    """List unique elements in order of appearance.

    Examples: unique_everseen('AAAABBBCCDAABBB') --> A B C D
              unique_everseen('ABBCcAD', str.lower) --> A B C D

    Args:
        iterable: Any iterable of hashable elements (or of elements whose
            ``key(element)`` is hashable).
        key: Optional one-argument callable; when given, two elements are
            considered duplicates if their keys compare equal.

    Yields:
        Each element the first time it (or its key) is encountered. The
        input is consumed lazily, one element at a time.
    """
    seen = set()
    for element in iterable:
        marker = element if key is None else key(element)
        # The membership check and the insertion must happen in the same
        # iteration; pre-filtering the whole input against ``seen`` before
        # anything has been recorded would let every duplicate through.
        if marker not in seen:
            seen.add(marker)
            yield element

def build_graph(nodes):
    """Build an undirected graph connecting every pair of ``nodes``.

    Each item of ``nodes`` becomes a graph node. Every 2-combination of
    the items is joined by an edge whose ``weight`` attribute is the
    value returned by ``editdistance.eval`` for the two strings.

    Returns the populated ``networkx.Graph``.
    """
    graph = nx.Graph()
    graph.add_nodes_from(nodes)

    for left, right in itertools.combinations(nodes, 2):
        graph.add_edge(left, right, weight=editdistance.eval(left, right))

    return graph

In [27]:
def extract_key_phrases(text):
    """Extract key phrases (single words or adjacent word pairs) from ``text``.

    Pipeline:
      1. Tokenize and POS-tag the text with NLTK.
      2. Keep the tokens that pass ``filter_for_tags``, strip dots with
         ``normalize`` and de-duplicate them in order of appearance.
      3. Build a graph over those words with ``build_graph`` and rank the
         nodes with weighted PageRank.
      4. Take the top ``len(words) // 3 + 1`` ranked words as candidates.
      5. Walk the original token sequence: two adjacent candidates are
         emitted as one "first second" phrase; a candidate that has not
         already been used in a pair is emitted on its own.

    Args:
        text: The document to analyse, as a string.

    Returns:
        A set of key-phrase strings.
    """
    word_tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(word_tokens)
    textlist = [x[0] for x in tagged]
    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)
    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)
    graph = build_graph(word_set_list)
    calculated_page_rank = nx.pagerank(graph, weight='weight')
    ranked_words = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    one_third = len(word_set_list) // 3
    # A set gives constant-time membership tests in the token walk below;
    # the ranking order is no longer needed once the top slice is taken.
    keyphrases = set(ranked_words[0:one_third + 1])

    modified_key_phrases = set()
    dealt_with = set()  # words already emitted as part of a two-word phrase

    # A one-token text has no adjacent pair to walk, so handle its only
    # token here; otherwise it could never be reported.
    if len(textlist) == 1 and textlist[0] in keyphrases:
        modified_key_phrases.add(textlist[0])

    last_index = len(textlist) - 1
    for j in range(1, len(textlist)):
        first = textlist[j - 1]
        second = textlist[j]
        if first in keyphrases and second in keyphrases:
            modified_key_phrases.add(first + ' ' + second)
            dealt_with.add(first)
            dealt_with.add(second)
        else:
            if first in keyphrases and first not in dealt_with:
                modified_key_phrases.add(first)
            # ``second`` is only examined as a standalone word on the final
            # step, because on every other step it becomes the next ``first``.
            if j == last_index and second in keyphrases and second not in dealt_with:
                modified_key_phrases.add(second)

    return modified_key_phrases

def extract_sentences(text, summary_length=100, clean_sentences=False, language='english'):
    """Produce an extractive summary of ``text`` limited to ``summary_length`` words.

    The text is split into sentences with the NLTK punkt tokenizer loaded
    for ``language``. The sentences are placed in a graph by
    ``build_graph`` and ordered by weighted PageRank score, highest
    first. The ranked sentences are joined, split on whitespace and cut
    to the first ``summary_length`` words.

    When ``clean_sentences`` is true and at least one kept word contains
    a '.', the summary is further trimmed so that it ends on the last
    such word.

    Returns the summary as a single space-joined string.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/'+language+'.pickle')
    sentences = tokenizer.tokenize(text.strip())
    scores = nx.pagerank(build_graph(sentences), weight='weight')
    ranked = sorted(scores, key=scores.get, reverse=True)

    words = ' '.join(ranked).split()[0:summary_length]

    if clean_sentences:
        dotted = [position for position, word in enumerate(words) if '.' in word]
        if dotted:
            # Positions are collected in ascending order, so the last one
            # is the right-most word containing a dot.
            words = words[0:dotted[-1] + 1]

    return ' '.join(words)

def write_files(summary, key_phrases, filename):
    """Write the key phrases and the summary for one article to disk.

    The key phrases go to ``keywords/<filename>``, one per line, and the
    summary goes to ``summaries/<filename>``. Both files are written as
    UTF-8 and overwrite any existing file of the same name. The two
    output directories are created if they do not exist yet.

    Args:
        summary: Summary text to store.
        key_phrases: Iterable of key-phrase strings.
        filename: Name of the output file inside each directory.
    """
    # Create the target directories up front so a first run in a fresh
    # working directory does not fail when opening the files.
    os.makedirs('keywords', exist_ok=True)
    os.makedirs('summaries', exist_ok=True)

    print("Generating output to " + 'keywords/' + filename)
    # ``with`` guarantees the handle is closed even if a write raises.
    with io.open('keywords/' + filename, 'w',encoding="utf8") as key_phrase_file:
        for key_phrase in key_phrases:
            key_phrase_file.write(key_phrase + '\n')

    print("Generating output to " + 'summaries/' + filename)
    with io.open('summaries/' + filename, 'w',encoding="utf8") as summary_file:
        summary_file.write(summary)

    print("-")

In [28]:
# Process every file in the "articles" directory: extract its key phrases
# and summary, then write both out under the same file name.
# os.listdir returns entries in arbitrary order, so sort for a reproducible run.
articles = sorted(os.listdir("articles"))
for article in articles:
    # Read inside a ``with`` block so each input file is closed before moving on.
    with io.open('articles/'+article,'r',encoding="utf8") as article_file:
        text = article_file.read()
    keyphrases = extract_key_phrases(text)
    summary = extract_sentences(text)
    write_files(summary, keyphrases, article)

Generating output to keywords/1.txt
Generating output to summaries/1.txt
-
Generating output to keywords/10.txt
Generating output to summaries/10.txt
-
Generating output to keywords/2.txt
Generating output to summaries/2.txt
-
Generating output to keywords/3.txt
Generating output to summaries/3.txt
-
Generating output to keywords/4.txt
Generating output to summaries/4.txt
-
Generating output to keywords/5.txt
Generating output to summaries/5.txt
-
Generating output to keywords/6.txt
Generating output to summaries/6.txt
-
Generating output to keywords/7.txt
Generating output to summaries/7.txt
-
Generating output to keywords/8.txt
Generating output to summaries/8.txt
-
Generating output to keywords/9.txt
Generating output to summaries/9.txt
-
