In [1]:
# coding=UTF-8
from __future__ import division
import re

# This is a naive text summarization algorithm
# Created by Shlomi Babluki
# April, 2013



In [2]:
class SummaryTool(object):
    """Naive extractive text summarizer.

    Each sentence is scored by how many words it shares with every other
    sentence of the text. The summary is the title followed by the
    highest-scoring sentence of each paragraph.
    """

    def split_content_to_sentences(self, content):
        """Naively split ``content`` into sentences.

        Every newline is treated as a sentence end, then the text is split
        on ". ". The pieces are returned unstripped and may be empty.
        """
        content = content.replace("\n", ". ")
        return content.split(". ")

    def split_content_to_paragraphs(self, content):
        """Naively split ``content`` into paragraphs on blank lines ("\\n\\n")."""
        return content.split("\n\n")

    def sentences_intersection(self, sent1, sent2):
        """Return the normalized word overlap between two sentences.

        The result is the number of distinct shared words divided by the
        average number of distinct words in the two sentences, so it lies
        between 0 and 1. Returns 0 when neither sentence contains a word.
        """
        # split(" ") yields '' for empty strings and for repeated, leading or
        # trailing spaces. Discard those so blank sentences do not "share a
        # word" with each other and stray spaces do not dilute the score.
        s1 = set(sent1.split(" "))
        s1.discard("")
        s2 = set(sent2.split(" "))
        s2.discard("")

        # Nothing to compare (also avoids dividing by zero below).
        total_words = len(s1) + len(s2)
        if total_words == 0:
            return 0

        # Normalize by the average number of words. The float literal keeps
        # this a true division even without `from __future__ import division`.
        return len(s1.intersection(s2)) / (total_words / 2.0)

    def format_sentence(self, sentence):
        """Return ``sentence`` with every non-word character removed.

        Letters, digits and underscores are kept (regex ``\\W+`` is stripped).
        The result is used as the key of the sentences dictionary.
        """
        return re.sub(r'\W+', '', sentence)

    def get_senteces_ranks(self, content):
        """Score every sentence of ``content``.

        Returns a dictionary mapping the formatted sentence (see
        ``format_sentence``) to its rank: the sum of its intersections with
        every other sentence. Sentences that format to the same key share one
        entry; the last one wins.
        """
        sentences = self.split_content_to_sentences(content)
        n = len(sentences)

        # The intersection is symmetric, so each unordered pair is evaluated
        # once and credited to both sentences. For any given sentence the
        # contributions are still added in increasing order of the other
        # sentence's index.
        scores = [0] * n
        for i in range(n):
            for j in range(i + 1, n):
                shared = self.sentences_intersection(sentences[i], sentences[j])
                scores[i] += shared
                scores[j] += shared

        # Build the sentences dictionary keyed by the formatted sentence.
        sentences_dic = {}
        for sentence, score in zip(sentences, scores):
            sentences_dic[self.format_sentence(sentence)] = score
        return sentences_dic

    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-ranked sentence of ``paragraph``.

        Returns "" when the paragraph has fewer than two sentence pieces or
        when none of its sentences has a positive rank. Sentences missing
        from ``sentences_dic`` are treated as having rank 0.
        """
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs.
        if len(sentences) < 2:
            return ""

        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if not strip_s:
                continue
            # .get(): a sentence the dictionary does not know about must not
            # abort the whole summary with a KeyError.
            rank = sentences_dic.get(strip_s, 0)
            # Strict comparison: the first sentence wins a tie.
            if rank > max_value:
                max_value = rank
                best_sentence = s

        return best_sentence

    def get_summary(self, title, content, sentences_dic):
        """Build the summary text.

        The result is the stripped title, a blank line, and then the best
        sentence of each paragraph of ``content`` (paragraphs that yield no
        sentence are skipped), joined with newlines.
        """
        paragraphs = self.split_content_to_paragraphs(content)

        # Title first, followed by an empty separator line.
        summary = [title.strip(), ""]

        # Add the best sentence from each paragraph.
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return "\n".join(summary)




In [6]:
# Demo input for the summarizer: a title and a multi-paragraph body.
# In the original script these statements lived inside a `main()` function
# (run with "python summary_tool.py"); here they execute at notebook top level.

    # Demo
    # NOTE(review): the original demo credited its text to
    # "http://thenextweb.com/apps/2013/03/21/swayy-discover-curate-content/".
    # The text below mentions NUS-Bot and is presumably an excerpt from the
    # nusbotpaper.txt file referenced further down - confirm the source.

# The title is stripped by SummaryTool.get_summary before it is emitted.
title = """
<< The Title of Article >>
"""

# Abandoned attempts to load the text from a local file, kept for reference.
# NOTE(review): the paths are machine-specific and written as plain (non-raw)
# string literals, so "\000" and "\n" inside them would be read as escape
# sequences - use a raw string or forward slashes if this is ever revived.
#    content = open('G:\Tool_PGM\Python\000Code\nusbotpaper.txt', encoding = "ISO-8859-1", errors='ignore')
#    with codecs.open('G:\Tool_PGM\Python\000Code\nusbotpaper.txt', "r", encoding="ISO-8859-1", errors='ignore') as fcontent:
#    content = fcontent
#    f = open('G:\Tool_PGM\Python\000Code\nusbotpaper.txt', encoding = "ISO-8859-1", errors='ignore')
#    content = Rtf15Reader.read(f)
#    f.close()

# Text to summarize. SummaryTool splits paragraphs on blank lines and treats
# every newline (as well as every ". ") as a sentence boundary, so each line
# below is handled as at least one sentence.
content = """
1 INTRODUCTION
It was built based on UAlbertaBot 2013 architecture, which employs a Multi-Agent based design system
NUS-Bot originated a high-level strategic reasoning mechanism, consisting Intelligence Preparation of the Battlefield (IPB) and Layered Influence Map (LIM)
It is a systematic, continuous process of analyzing the threat and environment in a specific geographic area
Applying the IPB process helps the commander selectively apply and maximize his combat power at critical points in time and space on the battlefield via:
LIM’s update frequency can vary in order to produce different granularity for different scale of reasoning requirements, to satisfy real-time computation expenditure

The dendrogram of the hierarchical clustering shows 2 distant clusters, with the larger of the 2 breaking into another 2 clusters
The clustering algorithm was run iteratively from 2 to 10 initial centroids and the resulting Sum (withinSS) plot showed a massive drop in the different from the previous within cluster sum of squared error at the iteration using 4 clusters
The first 2 principal components preserved 80% of the information in the data.

A possible alternative to the above methodology would be to apply outlier detection techniques such as local outlier factor (LOF) or local density cluster-based outlier factor (LDCOF) to identify and remove such outliers first before commencing hierarchical and the subsequent k-means clustering
These games run very long, have low to medium game scores per second, have relatively low winner and loser scores, and low winner/loser scores (0.4 to around 5)? It suggests that the gameplay was largely defensive and focused on construction and resource gathering
The large range in winner/loser scores from 0.4 to 32 suggests that the dominant strategy was played against a wide variety of other strategies
It suggests that the dominant engagement strategy in this game was that of balanced aggressive builders who manage to progress to mid-game with mid-level units and yet constant engage their opponent's units and buildings which explains the high game scores/second.

    """

In [7]:
    # Create a SummaryTool object
st = SummaryTool()

# Rank every sentence of the demo text, then keep the best one per paragraph.
sentences_dic = st.get_senteces_ranks(content)
summary = st.get_summary(title, content, sentences_dic)

# Sizes, in characters, of the input (title + body) and of the summary.
original_length = len(title) + len(content)
summary_length = len(summary)

# Emit the summary, a blank separator line, and the size statistics.
# "Summary Ratio" is the percentage of the original text that was dropped.
report_lines = [
    summary,
    "",
    "Original Length %s" % original_length,
    "Summary Length %s" % summary_length,
    "Summary Ratio: %s" % (100 - (100 * (summary_length / original_length))),
]
print("\n".join(report_lines))

#   output file location: G:\Tool_PGM\Python\
#     f = open("nusbot_summary.txt","w") 
#     f.write("Original Length %s" % (len(title) + len(content)))
#     f.write("\n")
#     f.write("Summary Length %s" % len(summary))
#     f.write("\n")
#     f.write("Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
#     f.write("\n")
#     f.write("\n")
#     f.write(summary)
#     f.close()


<< The Title of Article >>

It is a systematic, continuous process of analyzing the threat and environment in a specific geographic area
The clustering algorithm was run iteratively from 2 to 10 initial centroids and the resulting Sum (withinSS) plot showed a massive drop in the different from the previous within cluster sum of squared error at the iteration using 4 clusters
The large range in winner/loser scores from 0.4 to 32 suggests that the dominant strategy was played against a wide variety of other strategies

Original Length 2159
Summary Length 521
Summary Ratio: 75.8684576193
