## This is a naive text summarization algorithm
#### Created by Shlomi Babluki
#### April, 2013


## www.KudosData.com
#### Updated by Sam GU Zhan, to support input text in Chinese
#### March, 2017

In [1]:
# coding=UTF-8
from __future__ import division
import re

# Python2 unicode & float-division support:
# from __future__ import unicode_literals, division

In [2]:
# %matplotlib inline
# import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

import io

# 中文字符和语言处理库
import jieba

# 机器学习库 sklearn 分类学习模型库
#from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer # 数据结构变换：把 Dict 转换为 稀疏矩阵
# from sklearn.linear_model import LogisticRegression  # 逻辑回归分类模型
# from sklearn.pipeline import make_pipeline # 封装机器学习模型流程
# from sklearn.metrics import confusion_matrix, roc_curve, auc

# Display settings so that Chinese text can be rendered in matplotlib figures.
# NOTE(review): `mpl` is not imported explicitly anywhere in this notebook, so it
# presumably comes from this star import, which also places every other pylab
# name into the notebook namespace -- confirm before replacing it.
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default sans-serif font
mpl.rcParams['axes.unicode_minus'] = False # work around the minus sign '-' being drawn as a square box in saved figures
mpl.rcParams['font.size'] = 14 # set the font size

# Seed NumPy's global random number generator.
np.random.seed(88)

In [3]:
# Python 3
# Small helper for Chinese word segmentation: returns ONE string in which the
# segmented words are separated by single spaces.
def KudosData_word_tokenizer(foo):
    """Segment ``foo`` with jieba and return the tokens joined by spaces.

    ``foo`` is converted with ``str()`` first. ``cut_all=True`` is passed to
    ``jieba.cut`` (see the jieba documentation for the exact segmentation
    mode this selects).
    """
    return str(' '.join(jieba.cut(str(foo), cut_all=True)))
# Python2
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
# def KudosData_word_tokenizer(foo):
#     seg_token = jieba.cut(foo, cut_all=True)
#     seg_str = ' '.join(seg_token)
#     return seg_str

In [4]:
class SummaryTool(object):
    """Naive extractive summarizer for Chinese (or mixed Chinese/English) text.

    Every sentence of the document is scored by the amount of tokens it shares
    with all other sentences; the summary consists of the best-scoring sentence
    of each paragraph.  Sentences are delimited internally with the Chinese
    full stop "。".
    """

    # Substitutions applied IN THIS ORDER by split_content_to_sentences().
    # The order matters: two-character patterns such as "。," must run before
    # the single characters they contain are deleted.
    _SENTENCE_REWRITES = (
        # line breaks and sentence-ending marks become the delimiter
        ("\n", "。"),
        ("；", "。"),
        ("？", "。"),
        ("！", "。"),
        # a closing bracket / quote terminates the sentence it belongs to
        (")", ")。"),
        ("”", "”。"),
        ("’", "’。"),
        # remaining brackets and the pipe act as plain delimiters
        ("[", "。"),
        ("]", "。"),
        ("{", "。"),
        ("}", "。"),
        ("|", "。"),
        # delimiter directly followed by ASCII punctuation: drop both, which
        # glues the following text onto the previous sentence
        ("。,", ""),
        ("。-", ""),
        ("。_", ""),
        ("。<", ""),
        ("。(", ""),
        # leftover ASCII punctuation is simply removed
        (",", ""),
        ("-", ""),
        ("_", ""),
        ("<", ""),
        (">", ""),
        ("(", ""),
        (")", ""),
        # same two steps for full-width / typographic punctuation
        ("。·", ""),
        ("。／", ""),
        ("。–", ""),
        ("。—", ""),
        ("。－", ""),
        ("。，", ""),
        ("。、", ""),
        ("。《", ""),
        ("。（", ""),
        ("。“", ""),
        ("。‘", ""),
        ("·", ""),
        ("／", ""),
        ("–", ""),
        ("—", ""),
        ("－", ""),
        ("，", ""),
        ("、", ""),
        ("《", ""),
        ("》", ""),
        ("（", ""),
        ("）", ""),
        ("“", ""),
        ("”", ""),
        ("‘", ""),
        ("’", ""),
        # NOTE: removing spaces suits Chinese text but merges the words of
        # English sentences embedded in it.
        (" ", ""),
        # tab and ideographic space (U+3000) separate sentences
        ("\t", "。"),
        ("\u3000", "。"),
    )

    # Applied after duplicate delimiters were collapsed: a colon (full-width or
    # ASCII) never ends a sentence, so it is removed together with a delimiter
    # that directly follows it.
    _COLON_REWRITES = (
        ("：。", ""),
        ("：", ""),
        (":。", ""),
        (":", ""),
    )

    def split_content_to_paragraphs(self, content):
        """Split ``content`` into a list of paragraph strings.

        A blank line, or a line starting with two spaces or a tab, begins a new
        paragraph.  Remaining tabs become spaces, and runs of blank lines and
        runs of spaces are collapsed.  The normalised text is printed for
        debugging before it is split on "\\n\\n".
        """
        # A line indented by two spaces or by a tab opens a new paragraph.
        content = content.replace("\n  ", "\n\n")
        content = content.replace("\n\t", "\n\n")

        # Any other tab is ordinary whitespace.
        content = content.replace("\t", " ")

        # Collapse runs of any length in one pass (a fixed number of
        # str.replace() passes leaves residue on long runs).
        content = re.sub(r"\n{3,}", "\n\n", content)
        content = re.sub(r" {2,}", " ", content)

        # debug:
        print('')
        print('.................................................................')
        print('-->> splitting below text into paragraphs -- original input text:')
        print(content)
        print('.................................................................')

        return content.split("\n\n")

    def split_content_to_sentences(self, content):
        """Split a text or paragraph into a list of sentence strings.

        Punctuation is normalised with ``_SENTENCE_REWRITES``, repeated
        delimiters are collapsed, colons are removed, the result is printed for
        debugging and finally split on "。".  The list may contain empty
        strings (e.g. a trailing one when the text ends with a delimiter).
        """
        for old, new in self._SENTENCE_REWRITES:
            content = content.replace(old, new)

        # Collapse any run of delimiters into a single one.
        content = re.sub("。{2,}", "。", content)

        for old, new in self._COLON_REWRITES:
            content = content.replace(old, new)

        print('')
        print('-->> splitting below paragraph into sentences')
        print(content)
        print('')
        return content.split("。")

    def _token_set(self, sentence):
        """Return the set of non-empty tokens the tokenizer yields for ``sentence``."""
        # www.KudosData.com
        tokenized = KudosData_word_tokenizer(sentence)
        # "".split(" ") yields [''], so empty tokens must be dropped explicitly;
        # otherwise they would count as words shared by every sentence.
        return set(token for token in tokenized.split(" ") if token)

    def _token_overlap(self, tokens1, tokens2):
        """Shared-token count of two token sets, normalised by their average size."""
        total = len(tokens1) + len(tokens2)
        # Both sentences are empty: nothing to compare (and avoid dividing by 0).
        if total == 0:
            print('# If there is not intersection, just return 0')
            return 0

        # We normalize the result by the average number of words
        return len(tokens1.intersection(tokens2)) / (total / 2)

    def sentences_intersection(self, sent1, sent2):
        """Calculate the normalised token intersection between two sentences.

        Returns the number of distinct tokens both sentences share divided by
        the average number of distinct tokens, or 0 when neither sentence
        contains a token.
        """
        return self._token_overlap(self._token_set(sent1), self._token_set(sent2))

    def format_sentence(self, sentence):
        """Strip every non-word character (regex ``\\W``) from ``sentence``.

        The result is used as the key of the sentence in the ranks dictionary.
        """
        return re.sub(r'\W+', '', sentence)

    def get_senteces_ranks(self, content):
        """Convert ``content`` into a dictionary of sentence ranks.

        Key   = the formatted sentence (see ``format_sentence``).
        Value = the sum of its intersections with every other sentence.
        """
        print(' ')
        print('start: get_senteces_ranks(self, content) ..................................')

        # Split the text into sentences and tokenize each of them exactly once
        # (tokenizing inside the pair loop would repeat the work n times).
        sentences = self.split_content_to_sentences(content)
        token_sets = [self._token_set(sentence) for sentence in sentences]

        # The score of a sentence is the sum of all its intersections.
        sentences_dic = {}
        for i, sentence in enumerate(sentences):
            score = 0
            for j, other_tokens in enumerate(token_sets):
                if i != j:
                    score += self._token_overlap(token_sets[i], other_tokens)
            sentences_dic[self.format_sentence(sentence)] = score

            # www.KudosData.com
            print('################# successfully completed: get_senteces_ranks() : %d' % i)
        return sentences_dic

    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-ranked sentence of ``paragraph``.

        Returns "" when the paragraph splits into fewer than two sentences, and
        a placeholder text when no sentence has a rank above 0.  Sentences that
        are missing from ``sentences_dic`` are treated as rank 0.
        """
        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            print('==>> Ignore short paragraphs')
            return ""

        best_sentence = "### Best_sentence NOT produced for this paragraph ###"
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if not strip_s:
                continue
            # Paragraph pre-processing can yield sentence boundaries that differ
            # from those of the whole document, so the key may be absent;
            # indexing directly would raise KeyError.
            score = sentences_dic.get(strip_s, 0)
            if score > max_value:
                max_value = score
                best_sentence = s

        return best_sentence

    def get_summary(self, title, content, sentences_dic):
        """Build the summary: the best sentence of every paragraph, one per line.

        ``title`` is accepted for interface compatibility but is not included
        in the output; the summary starts with an empty line instead.
        """
        # Split the text into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # The title is intentionally not displayed; keep an empty first line.
        summary = [""]

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return ("\n").join(summary)




# Main()

### process inputs

In [5]:
# Load the complete input document; the file is decoded as UTF-8 text.
with io.open('input_text.txt', mode='r', encoding='utf8') as f:
    text = f.read()

    # with io.open('output_text.txt','w',encoding='utf8') as f:
#     f.write(text)

In [6]:
# The text read from the input file is the content to be summarised.
content = text

# content

In [7]:
# Placeholder title (a newline before and after the text, as in a triple-quoted block)
title = '\n<Dummy Title>\n'

title

'\n<Dummy Title>\n'

# Run()

In [8]:
    # Create a SummaryTool object
st = SummaryTool()

# Rank every sentence of the document
sentences_dic = st.get_senteces_ranks(content)

# Build the summary from the sentence ranks
summary = st.get_summary(title, content, sentences_dic)

# Report the lengths and the percentage by which the summary is shorter
original_length = len(title) + len(content)
summary_length = len(summary)
print("Original Length %s" % original_length)
print("Summary Length %s" % summary_length)
print("Summary Ratio: %s" % (100 - (100 * (summary_length / original_length))))
print("")

# Print the summary itself
print(summary)

#   output file location: G:\Tool_PGM\Python\
#     f = open("nusbot_summary.txt","w") 
#     f.write("Original Length %s" % (len(title) + len(content)))
#     f.write("\n")
#     f.write("Summary Length %s" % len(summary))
#     f.write("\n")
#     f.write("Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
#     f.write("\n")
#     f.write("\n")
#     f.write(summary)
#     f.close()


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.100 seconds.
DEBUG:jieba:Loading model cost 2.100 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


 
start: get_senteces_ranks(self, content) ..................................

-->> splitting below paragraph into sentences
汉语。编辑。维基百科自由的百科全书。本条目存在隐藏的内容可能损害或改善读者的阅览体验。请协助改善条目以符合维基百科的标准2015年9月12日。一般应该仅由特定标准化模板提供折叠资料表格勿因故事剧情或项目混杂而隐藏。内容应该考虑其他方式呈现。重复记载过度细节与无助了解主题的堆砌内容等需要考虑除去。中文重定向于此书面语参见汉语书面语文字参见汉字。关于中国人使用的语言参见中国语言。两岸三地东南亚官方非官方组织各自对于汉语的定义语音词汇参见现代标准汉语。汉语。Hanyutradsimp.svg。正体字和简体字书写的汉语。区域中国大陆香港澳门台湾新加坡马来西亚印度尼西亚泰国越南柬埔寨缅甸圣诞岛以及世界各地的华人社区。母语使用人数约15亿2015。15000万人作为第二语言日期不详。语系。汉藏语系。汉语族。汉语。早期形式。上古汉语。中古汉语。近代汉语。汉语。标准形式。现代标准汉语。粤语。方言。官话。晋语。吴语含上海话。徽语。赣语。湘语。闽北语。闽南语。闽东语。闽中语。莆仙语。客家语。粤语。瓦乡话。文字汉字注音符号汉语拼音小儿经。官方地位。作为官方语言中华人民共和国。香港。澳门。中华民国。新加坡。承认少数语言加拿大。马来西亚。美国。印尼。英国。菲律宾。管理机构中华民国教育部终身教育司。中华人民共和国国家语言文字工作委员会。新加坡推广华语理事会。马来西亚华语规范理事会。香港语文教育及研究常务委员会。语言代码。ISO6391zh。ISO6392chiB。zhoT。ISO6393分别为cdo闽东语。cjy晋语。cmn官话。cpx莆仙语。czh徽语。czo闽中语。dng东干语。gan赣语。hak客家语。hsn湘语。ltc中古汉语。lzh文言文。mnp闽北语。nan闽南语。och上古汉语。wuu吴语。wxa瓦乡话。yue粤语。汉语使用者分布。作为主要语言官方语言及母语。多于500万使用者。多于100万使用者。多于50万使用者。多于10万使用者。汉语使用者主要居住点。汉语。Mapofsiniticlanguagesfullzh.svg。大中华区汉

In [9]:
# Write the length statistics followed by the summary to a UTF-8 text file.
original_length = len(title) + len(content)  # computed once, used twice below
with io.open('output_topic_summary.txt','w',encoding='utf8') as f:
    f.write("Original Length : %s" % original_length)
    f.write("\n")
    f.write("Summary  Length : %s" % len(summary))
    f.write("\n")
    f.write("Summary  Ratio  : %s" % (100 - (100 * (len(summary) / original_length))))
    f.write("\n")
    f.write("\n")
    f.write(summary)
# No explicit f.close(): leaving the with-block closes the file.