In [1]:
# coding=UTF-8
# from __future__ import division
import re

# This is a naive text summarization algorithm
# Created by Shlomi Babluki
# April, 2013

# www.KudosData.com
# Sam GU Zhan
# March, 2017

# Python2 unicode & float-division support:
from __future__ import unicode_literals, division

In [2]:
# %matplotlib inline
# import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

import io

# 中文字符和语言处理库
import jieba

# 机器学习库 sklearn 分类学习模型库
#from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer # 数据结构变换：把 Dict 转换为 稀疏矩阵
# from sklearn.linear_model import LogisticRegression  # 逻辑回归分类模型
# from sklearn.pipeline import make_pipeline # 封装机器学习模型流程
# from sklearn.metrics import confusion_matrix, roc_curve, auc

# 中文显示设置
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体  
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题 
mpl.rcParams['font.size'] = 14 # 设置字体大小

np.random.seed(88)

In [3]:
# Python3
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
# def KudosData_word_tokenizer(foo):
#     seg_token = jieba.cut(str(foo), cut_all=True)
#     seg_str = str(' '.join(seg_token))
#     return seg_str
# Python2
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
def KudosData_word_tokenizer(foo):
    seg_token = jieba.cut(foo, cut_all=True)
    seg_str = ' '.join(seg_token)
    return seg_str

In [11]:
class SummaryTool(object):

    # Naive method for splitting a text into sentences
    def split_content_to_sentences(self, content):
#         content = content.replace("\n", ". ")
#         return content.split(". ")
        content = content.replace("\n", "。")
        content = content.replace("？", "。")
        content = content.replace("！", "。")
        content = content.replace("；", "。")
        content = content.replace("，", "。")
        return content.split("。")

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        return content.split("\n\n")

    # Caculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):
        # www.KudosData.com
        sent1chn = KudosData_word_tokenizer(sent1)
        sent2chn = KudosData_word_tokenizer(sent2)

        # split the sentence into words/tokens
#         s1 = set(sent1.split(" "))
#         s2 = set(sent2.split(" "))
        s1 = set(sent1chn.split(" "))
        s2 = set(sent2chn.split(" "))
        
        # debug
        print(sent1)
        print(sent1chn)
        print(s1)
        print('')

        
        # If there is not intersection, just return 0
        if (len(s1) + len(s2)) == 0:
            print('# If there is not intersection, just return 0')
            return 0


        # We normalize the result by the average number of words
        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)

    # Format a sentence - remove all non-alphbetic chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        sentence = re.sub(r'\W+', '', sentence)
        return sentence

    # Convert the content into a dictionary <K, V>
    # k = The formatted sentence
    # V = The rank of the sentence
    def get_senteces_ranks(self, content):

        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)

        # Calculate the intersection of every two sentences
        n = len(sentences)
        # [Sam python 2.7 -> 3.4] values = [[0 for x in xrange(n)] for x in xrange(n)]
        values = [[0 for x in range(n)] for x in range(n)]
        for i in range(0, n):
            for j in range(0, n):
                values[i][j] = self.sentences_intersection(sentences[i], sentences[j])

        # Build the sentences dictionary
        # The score of a sentences is the sum of all its intersection
        sentences_dic = {}
        for i in range(0, n):
            score = 0
            for j in range(0, n):
                if i == j:
                    continue
                score += values[i][j]
            sentences_dic[self.format_sentence(sentences[i])] = score
        return sentences_dic

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):

        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if strip_s:
                if sentences_dic[strip_s] > max_value:
                    max_value = sentences_dic[strip_s]
                    best_sentence = s

        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):

        # Split the content into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = []
        summary.append(title.strip())
        summary.append("")

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return ("\n").join(summary)




# Main()

### process inputs

In [12]:
# article = pd.read_csv('input_article.csv', encoding='utf-8')

In [13]:
# process Unicode text input
with io.open('input_text.txt','r',encoding='utf8') as f:
    text = f.read()

    # with io.open('output_text.txt','w',encoding='utf8') as f:
#     f.write(text)

In [14]:
content = text

# content

In [15]:
title = '''
<Dummy Title>
'''

title

u'\n<Dummy Title>\n'

In [16]:
    # Create a SummaryTool object
st = SummaryTool()

    # Build the sentences dictionary
sentences_dic = st.get_senteces_ranks(content)

    # Build the summary with the sentences dictionary
summary = st.get_summary(title, content, sentences_dic)

    # print(the ratio between the summary length and the original length
print("Original Length %s" % (len(title) + len(content)))
print("Summary Length %s" % len(summary))
print("Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
print("")

    # print the summary
print(summary)

#   output file locaiotn: G:\Tool_PGM\Python\
#     f = open("nusbot_summary.txt","w") 
#     f.write("Original Length %s" % (len(title) + len(content)))
#     f.write("\n")
#     f.write("Summary Length %s" % len(summary))
#     f.write("\n")
#     f.write("Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
#     f.write("\n")
#     f.write("\n")
#     f.write(summary)
#     f.close()


第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u738b'])

第一章 黑魔王崛起
第一 第一章 一章   黑 魔王 崛起
set([u'', u'\u4e00\u7ae0', u'\u5d1b\u8d77', u'\u7b2c\u4e00', u'\u7b2c\u4e00\u7ae0', u'\u9ed1', u'\u9b54\u7

In [17]:
with io.open('output_topic_summary.txt','w',encoding='utf8') as f:
    f.write("Original Length : %s" % (len(title) + len(content)))
    f.write("\n")
    f.write("Summary  Length : %s" % len(summary))
    f.write("\n")
    f.write("Summary  Ratio  : %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))
    f.write("\n")
    f.write("\n")
    f.write(summary)
    f.close()