In [1]:
from __future__ import division
import re

# This is a naive text summarization algorithm
# Created by Shlomi Babluki 
# April, 2013

In [4]:
class SummaryTool(object):

    # Naive method for splitting a text into sentences
    def split_content_to_sentences(self, content):
        content = content.replace("\n", ". ")
        return content.split(". ")

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        return content.split("\n\n")

    # Caculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):

        # split the sentence into words/tokens
        s1 = set(sent1.split(" "))
        s2 = set(sent2.split(" "))

        # If there is not intersection, just return 0
        if (len(s1) + len(s2)) == 0:
            return 0

        # We normalize the result by the average number of words
        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)

    # Format a sentence - remove all non-alphbetic chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        sentence = re.sub(r'\W+', '', sentence)
        return sentence

    # Convert the content into a dictionary <K, V>
    # k = The formatted sentence
    # V = The rank of the sentence
    def get_sentences_ranks(self, content):

        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)

        # Calculate the intersection of every two sentences
        n = len(sentences)
        values = [[0 for x in xrange(n)] for x in xrange(n)]
        for i in range(0, n):
            for j in range(0, n):
                values[i][j] = self.sentences_intersection(sentences[i], sentences[j])

        # Build the sentences dictionary
        # The score of a sentences is the sum of all its intersection
        sentences_dic = {}
        for i in range(0, n):
            score = 0
            for j in range(0, n):
                if i == j:
                    continue
                score += values[i][j]
            sentences_dic[self.format_sentence(sentences[i])] = score
        return sentences_dic

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):

        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if strip_s:
                if sentences_dic[strip_s] > max_value:
                    max_value = sentences_dic[strip_s]
                    best_sentence = s

        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):

        # Split the content into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = []
        summary.append(title.strip())
        summary.append("")

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return ("\n").join(summary)


# Main method, just run "python summary_tool.py"
def main():

    # Demo
    # Content from: "http://thenextweb.com/apps/2013/03/21/swayy-discover-curate-content/"

    title = """
    Obama Criticizes F.B.I. Director: ‘We Don’t Operate on Incomplete Information'
    """

    content = """
    WASHINGTON — President Obama sharply criticized the decision by his F.B.I. director to alert Congress on Friday about the discovery of new emails related to the Hillary Clinton server case, implying that it violated investigative guidelines and trafficked in innuendo.

“We don’t operate on incomplete information,” Mr. Obama said in an interview with NowThis News, broadcast Wednesday. “We don’t operate on leaks. We operate based on concrete decisions that are made.”

“When this was investigated thoroughly the last time, the conclusion of the F.B.I., the conclusion of the Justice Department, the conclusion of repeated congressional investigations was that she had made some mistakes but that there wasn’t anything there that was prosecutable,” Mr. Obama said.

 Follow
 The president did not mention the F.B.I. director, James B. Comey, but it was clear Mr. Obama was referring to him.

Declaring that he had “made a very deliberate effort to make sure that I don’t look like I’m meddling in what are supposed to be independent processes for making these assessments,” Mr. Obama nonetheless expressed confidence in Mrs. Clinton.

“I trust her,’’ he said. “I know her. And I wouldn’t be supporting her if I didn’t have absolute confidence in her integrity and her interest in making sure that young people have a better future.’’

White House officials later downplayed Mr. Obama’s remarks about the F.B.I. and insisted he had not meant to criticize Mr. Comey.

“The president went out of his way to say he wouldn’t comment on any particular investigations,” Eric Schultz, a White House spokesman, told reporters on Air Force One while Mr. Obama was en route to North Carolina to campaign for Mrs. Clinton. Mr. Schultz characterized Mr. Obama’s remarks as mirroring those made in recent days by the White House press secretary, Josh Earnest, who had said that while the White House would not criticize Mr. Comey’s decision to update Congress on the status of an ongoing investigation, Mr. Obama believed that rules intended to keep such investigations confidential were good ones and should be followed.

For the last several days, the F.B.I. has been analyzing emails belonging to Huma Abedin, a top adviser to Mrs. Clinton. Agents discovered the emails last month in an unrelated investigation into Ms. Abedin’s estranged husband, the disgraced former congressman Anthony D. Weiner.

In a letter to Congress, Mr. Comey said those emails might be pertinent to the investigation into Mrs. Clinton’s use of a private email server. Authorities concluded that case in July with no charges. But the letter, sent over the objection of the Justice Department, led to controversy because it deviated from longstanding guidelines.

Mr. Obama took a more pointed tone than his press secretary, Josh Earnest, who said on Tuesday that the White House did not have an official position on Mr. Comey’s decision. Mr. Earnest referred to the Justice Department guidelines, however, and said, “The president believes that it’s important for those guidelines and norms to be followed.”

It is increasingly unlikely that agents will finish their work on the emails by Election Day, F.B.I. officials said. They said there was a chance they could offer updates before next Tuesday.

The renewed interest in Mrs. Clinton’s emails — a matter she believed she had put behind her months ago — has exploded in the final days of the presidential campaign, with recent polls showing that the race is tightening. But Mrs. Clinton remains ahead of her Republican challenger, Donald J. Trump, in most national polls to date.

Much is unknown about the new emails, including why they were on Mr. Weiner’s laptop in the first place. Ms. Abedin, through her lawyers, has adamantly denied using that laptop, which people with knowledge of the matter have said was identified in court papers as a Dell model. People with knowledge of the matter have said that the emails may have ended up on the laptop because they were inadvertently backed up or downloaded onto an older computer and then transferred from the older computer to the laptop’s hard drive when the older computer was replaced.

Mr. Obama first commented publicly on the investigation last year before the F.B.I. had determined that neither Mrs. Clinton nor her aides would face charges for mishandling classified information found on the secretary of state’s private email server. The president’s remarks angered F.B.I. agents who said he was prejudging the investigation.

“I don’t think it posed a national security problem,” Mr. Obama said on “60 Minutes” on CBS in October 2015. He said it had been a mistake for Mrs. Clinton to use a private email account when she was secretary of state, but his conclusion was unmistakable: “This is not a situation in which America’s national security was endangered.”


    """

    # Create a SummaryTool object
    st = SummaryTool()

    # Build the sentences dictionary
    sentences_dic = st.get_sentences_ranks(content)

    # Build the summary with the sentences dictionary
    summary = st.get_summary(title, content, sentences_dic)

    # Print the summary
    print summary

    # Print the ratio between the summary length and the original length
    print ""
    print "Original Length %s" % (len(title) + len(content))
    print "Summary Length %s" % len(summary)
    print "Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content)))))


if __name__ == '__main__':
    main()

Obama Criticizes F.B.I. Director: ‘We Don’t Operate on Incomplete Information'

WASHINGTON — President Obama sharply criticized the decision by his F.B.I
“We don’t operate on incomplete information,” Mr
“When this was investigated thoroughly the last time, the conclusion of the F.B.I., the conclusion of the Justice Department, the conclusion of repeated congressional investigations was that she had made some mistakes but that there wasn’t anything there that was prosecutable,” Mr
Follow
Declaring that he had “made a very deliberate effort to make sure that I don’t look like I’m meddling in what are supposed to be independent processes for making these assessments,” Mr
And I wouldn’t be supporting her if I didn’t have absolute confidence in her integrity and her interest in making sure that young people have a better future.’’
and insisted he had not meant to criticize Mr
Comey’s decision to update Congress on the status of an ongoing investigation, Mr
Agents discovered the emails last 

In [20]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
            """
         Initilize the text summarizer.
         Words that have a frequency term lower than min_cut 
         or higer than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut 
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """ 
      Compute the frequency of each of word.
      Input: 
       word_sent, a list of sentences already tokenized.
      Output: 
       freq, a dictionary where freq[w] is the frequency of w.
    """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # frequencies normalization and fitering
        m = float(max(freq.values()))
        for w in freq.keys():
            freq[w] = freq[w]/m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences which represent the summary of text."""
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
            sents_idx = self._rank(ranking, n)    
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)


IndentationError: unindent does not match any outer indentation level (<ipython-input-20-0641fa42a199>, line 14)

In [21]:
#https://github.com/miso-belica/sumy
!pip install sumy

Collecting sumy
  Downloading sumy-0.4.1-py2.py3-none-any.whl (50kB)
[K    100% |████████████████████████████████| 51kB 2.1MB/s 
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz
Building wheels for collected packages: docopt, breadability
  Running setup.py bdist_wheel for docopt ... [?25l- \ done
[?25h  Stored in directory: /Users/veenakumar/Library/Caches/pip/wheels/b2/16/5f/c33a2bb5f2dce71205f8e65cbfd05647d79d441282be31fd82
  Running setup.py bdist_wheel for breadability ... [?25l- \ done
[?25h  Stored in directory: /Users/veenakumar/Library/Caches/pip/wheels/1c/a2/95/71125ff5340c414140d87638e9408b8e76f0d006a48f5b876a
Successfully built docopt breadability
Installing collected packages: docopt, breadability, sumy
Successfully installed breadability-0.1.20 docopt-0.6.2 sumy-0.4.1
[33mYou are using pip version 8.1.2, however version 9.0.0 is available.
You shou

In [22]:
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


In [29]:
from sumy.summarizers.lex_rank import LexRankSummarizer #We're choosing Lexrank, other algorithms are also built in

file = "practice.txt" #name of the plain-text file
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer = Summarizer()

summary = summarizer(parser.document, 3) #Summarize the document with 5 sentences

for sentence in summary:
    print sentence

The eukaryotic cell spends most of its "life" in interphase of the cell cycle, which can be subdivided into the three phases, G1, S and G2.
During interphase, the cell does what it is supposed to do.
Though cells have many common functions, such as DNA replication, they also have certain specific functions.
