# Chinese text summarization algorithm
### www.KudosData.com
#### By: Sam GU Zhan
#### March, 2017

# Imports

In [1]:
# coding=UTF-8
from __future__ import division
import re

# Python2 unicode & float-division support:
# from __future__ import unicode_literals, division

In [2]:
# %matplotlib inline
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

import io

# 中文字符和语言处理库
import jieba

# 机器学习库 sklearn 分类学习模型库
#from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer # 数据结构变换：把 Dict 转换为 稀疏矩阵
# from sklearn.linear_model import LogisticRegression  # 逻辑回归分类模型
# from sklearn.pipeline import make_pipeline # 封装机器学习模型流程
# from sklearn.metrics import confusion_matrix, roc_curve, auc

# Display settings so that matplotlib can render Chinese text.
# NOTE(review): `mpl` is presumably provided by this star import, which also
# pulls many other pylab names into the notebook namespace — confirm whether an
# explicit `import matplotlib as mpl` would be enough.
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei'] # set the default sans-serif font
mpl.rcParams['axes.unicode_minus'] = False # avoid the minus sign '-' being drawn as a box in saved figures
mpl.rcParams['font.size'] = 14 # set the font size

# Seed NumPy's global random number generator
np.random.seed(88)

# Define Functions

In [3]:
# Python3
# Chinese word segmentation helper: returns one string, tokens separated by spaces
def KudosData_word_tokenizer(foo):
    """Segment ``foo`` with jieba in full mode and join the tokens with spaces.

    ``foo`` is converted with ``str()`` first, so non-string input is accepted.
    Returns a single ``str`` in which consecutive tokens are separated by one
    space character.
    """
    tokens = jieba.cut(str(foo), cut_all=True)
    joined = ' '.join(tokens)
    return str(joined)
# Python2
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
# def KudosData_word_tokenizer(foo):
#     seg_token = jieba.cut(foo, cut_all=True)
#     seg_str = ' '.join(seg_token)
#     return seg_str

# Input text

In [4]:
# Read the whole input article as Unicode text (the file is decoded as UTF-8)
with io.open('input_text.txt','r',encoding='utf8') as f:
    text = f.read()

# `content` is the article text that the following cells process
content = text

# Placeholder title; it is stripped and placed at the top of the generated summary
title = '''
<Dummy Title>
'''

In [5]:
# content

In [6]:
def format_sentence(text):
    """Return ``text`` with every run of Tab characters removed."""
    tab_run = re.compile(r'\t+')
    return tab_run.sub('', text)

In [7]:
def linebreak_conversion_win_linux(text):
    """Normalise Windows line endings and full-width spaces.

    Every carriage return is dropped (so a CR+LF pair becomes a bare LF) and
    every ideographic space (U+3000) is turned into an ordinary space.
    """
    substitutions = (
        (r'\r', ''),        # drop carriage returns
        (r'\u3000', ' '),   # ideographic space -> normal space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [8]:
def clean_some_whitespace(text): # Does not remove normal Space
    """Strip layout whitespace while keeping ordinary spaces.

    Steps, in order:
      1. delete all tabs, form feeds, vertical tabs and newlines
         (removing newlines merges the lines of a paragraph);
      2. delete spaces that sit between two adjacent Paragraph-/Sentence-Tags;
      3. collapse every run of spaces into a single space.
    """
    # Each of these control characters is removed outright
    for control_run in (r'\t+', r'\f+', r'\v+', r'\n+'):
        text = re.sub(control_run, '', text)

    # "...^*#   #*^..." -> "...^*##*^...": no spaces between consecutive tags
    text = re.sub(r'(\^\*\#)( +)(\#\*\^)', '^*##*^', text)

    # two or more spaces -> one space
    return re.sub(r' +', ' ', text)

In [9]:
### Define Paragraph-Tag =  
#   #*^P^*#

### Define Sentence-Tag =  
#   #*^S^*#


In [10]:
# add a special tag to the end of each paragraph
def tag_paragraph(text):
    """Replace paragraph boundaries in ``text`` with the Paragraph-Tag ``#*^P^*#``.

    The substitutions are applied in this order:
      1. newline followed by two or more spaces;
      2. newline, one tab, then one or more spaces;
      3. newline, optional spaces, newline (a blank line);
      4. a run of consecutive Paragraph-Tags is merged into a single tag.

    NOTE(review): the original comment for pattern 2 said "two or more Tabs",
    but the pattern requires spaces after a single tab — confirm the intent.
    """
    paragraph_tag = '#*^P^*#'
    boundary_patterns = (
        r'((\n ) +)+',
        r'((\n\t) +)+',
        r'(\n( *)\n)+',
        r'(\#\*\^P\^\*\#)+',
    )
    for pattern in boundary_patterns:
        text = re.sub(pattern, paragraph_tag, text)

    return text

In [11]:
# add a special tag to the end of each sentence
def tag_sentence(text):
    """Insert the Sentence-Tag ``#*^S^*#`` after every sentence terminator.

    Terminators are the Chinese marks 。！？； and the English marks ! ? ; .
    A run of the same terminator (e.g. "。。。") is reduced to one terminator
    followed by one tag.

    When several tags end up next to each other (mixed terminators such as
    "？！"), they are merged into a single tag and the FIRST terminator is
    kept.  Previously this merge step rewrote the terminator of every tagged
    sentence to '。', which turned questions and exclamations — including
    English ones — into statements.

    Finally, a tag directly before a closing quote ” is removed (the sentence
    continues after the quote), and a tag directly before a Paragraph-Tag is
    removed (the Paragraph-Tag already ends the sentence).
    """
    sentence_tag = '#*^S^*#'

    # One terminator, possibly repeated -> that terminator once + Sentence-Tag.
    # The tag itself contains none of the terminators, so one pass is enough.
    text = re.sub(r'([。！？；!?;])\1*', r'\1' + sentence_tag, text)

    # merge two or more adjacent Sentence-Tags -> 1 Sentence-Tag, keeping the
    # first terminator; a single tag is left untouched
    text = re.sub(r'(\W?\#\*\^S\^\*\#)(?:\W?\#\*\^S\^\*\#)+', r'\1', text)

    # remove a Sentence-Tag immediately before an ending ”
    text = re.sub(r'\#\*\^S\^\*\#”', '”', text)

    # remove a Sentence-Tag immediately before a Paragraph-Tag
    text = re.sub(r'(\#\*\^S\^\*\#)( *)(\#\*\^P\^\*\#)', '#*^P^*#', text)

    return text

### Start tagging:

In [12]:
# Strip carriage returns and convert full-width spaces in the raw article text
content_format = linebreak_conversion_win_linux(content)
# content_format

In [13]:
# Mark paragraph boundaries with the Paragraph-Tag (must run while newlines are still present)
content_format = tag_paragraph(content_format)
# content_format

In [14]:
# Remove tabs/newlines and collapse repeated spaces; remaining line breaks inside a paragraph are merged away
content_format = clean_some_whitespace(content_format)
# content_format

In [15]:
# Mark sentence ends with the Sentence-Tag
content_format = tag_sentence(content_format)
# content_format

In [16]:
# Run the whitespace clean-up again on the tagged text (e.g. to drop spaces
# between adjacent tags), then display the fully tagged article
content_format = clean_some_whitespace(content_format)
content_format

'《黄金时代》王小波#*^P^*#一 #*^P^*#我二十一岁时，正在云南插队。#*^S^*#陈清扬当时二十六岁，就在我插队的地方当医生。#*^S^*#我在山下十四队，她在山上十五队。#*^S^*#有一天她从山上下来，和我讨论她不是破鞋的问题。#*^S^*#那时我还不大认识她，只能说有一点知道。#*^S^*#她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。#*^S^*#因为破鞋偷汉，而她没有愉过汉。#*^S^*#虽然她丈夫已经住了一年监狱，但她没有偷过汉。#*^S^*#在此之前也未偷过汉。#*^S^*#所以她简直不明白，人们为什么要说她是破鞋。#*^S^*#如果我要安慰她，并不困难。#*^S^*#我可以从逻辑上证明她不是破鞋。#*^S^*#如果陈清扬是破鞋，即陈清扬偷汉，则起码有一个某人为其所偷。#*^S^*#如今不能指出某人，所以陈清扬偷汉不能成立。#*^S^*#但是我偏说，陈清扬就是破鞋，而且这一点毋庸置疑。#*^P^*#陈清扬找我证明她不是破鞋，起因是我找她打针。#*^S^*#这事经过如下：农忙时队长不叫我犁田，而是叫我去插秧，这样我的腰就不能经常直立，认识我的人都知道，我的腰上有旧伤，而且我身高在一米九以上。#*^S^*#如此插了一个月，我腰痛难忍，不打封闭就不能入睡。#*^S^*#我们队医务室那一把针头镀层剥落，而且都有倒钩，经常把我腰上的肉钩下来。#*^S^*#后来我的腰就像中了散弹枪，伤痕久久不褪。#*^S^*#就在这种情况下，我想起十五队的队医陈清扬是北医大毕业的大夫，对针头和勾针大概还能分清，所以我去找她看病，看完病回来，不到半个小时，她就追到我屋里来，要我证明她不是破鞋。#*^P^*#陈清扬说，她丝毫也不藐视破鞋。#*^S^*#据她观察，破鞋都很善良，乐于助人，而且最不乐意让人失望。#*^S^*#因此她对破鞋还有一点钦佩。#*^S^*#问题不在于破鞋好不好，而在于她根本不是破鞋。#*^S^*#就如一只猫不是一只狗一样。#*^S^*#假如一只猫被人叫成一只狗，它也会感到很不自在。#*^S^*#现在大家都管她叫被鞋，弄得她魂不守舍，几乎连自己是谁都不知道了。#*^P^*#陈清扬在我的草房里时，裸臂赤腿穿一件白大褂，和她在山上那间医务室里装束一样，所不同的是披散的长发用个手绢束住，脚上也多了一双拖鞋。#*^S^*#看了她的样子，


### Transfer tagged text to Pandas Dataframe

In [17]:
# Split a tagged text into paragraphs
def split_article_to_paragraphs(text):
    """Cut ``text`` at every Paragraph-Tag and return the pieces as a list.

    Text without any Paragraph-Tag yields a one-element list.
    """
    paragraph_tag = "#*^P^*#"
    paragraphs = text.split(paragraph_tag)
    return paragraphs


In [18]:
# Split a tagged paragraph into sentences
def split_paragraph_to_sentences(text):
    """Cut ``text`` at every Sentence-Tag and return the pieces as a list.

    Paragraph-Tags are not treated specially here; text without any
    Sentence-Tag yields a one-element list.
    """
    sentence_tag = "#*^S^*#"
    sentences = text.split(sentence_tag)
    return sentences


In [19]:
# Outer loop walks the paragraphs, inner loop walks the sentences of each one.
# Every sentence becomes one row holding the sentence text, its running id in
# the article, its position inside the paragraph and the paragraph number
# (all 1-based); 'class_rank' and 'score' are left as empty strings.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: appending with df.loc[len(df)] inside the loop re-allocates the frame
# on every row and becomes quadratic for long articles.
article_columns = ['sentence', 'sentence_id', 'sentence_id_paragraph', 'paragraph_id', 'class_rank', 'score']

# split_article_to_paragraphs:
article_paragraphs = split_article_to_paragraphs(content_format)

article_records = []
df_sentence_id = 0  # running sentence counter across the whole article

for paragraph_id, paragraph in enumerate(article_paragraphs, start=1):
    # split_paragraph_to_sentences:
    article_paragraphs_sentences = split_paragraph_to_sentences(paragraph)

    for sentence_id_paragraph, sentence in enumerate(article_paragraphs_sentences, start=1):
        df_sentence_id += 1
        article_records.append([sentence, df_sentence_id, sentence_id_paragraph, paragraph_id, '', ''])

# Passing `columns` keeps the expected header even if no row was produced
df_article = pd.DataFrame(article_records, columns=article_columns)



In [20]:
# Display the sentence table (one row per sentence of the tagged article)
df_article

Unnamed: 0,sentence,sentence_id,sentence_id_paragraph,paragraph_id,class_rank,score
0,《黄金时代》王小波,1,1,1,,
1,一,2,1,2,,
2,我二十一岁时，正在云南插队。,3,1,3,,
3,陈清扬当时二十六岁，就在我插队的地方当医生。,4,2,3,,
4,我在山下十四队，她在山上十五队。,5,3,3,,
5,有一天她从山上下来，和我讨论她不是破鞋的问题。,6,4,3,,
6,那时我还不大认识她，只能说有一点知道。,7,5,3,,
7,她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。,8,6,3,,
8,因为破鞋偷汉，而她没有愉过汉。,9,7,3,,
9,虽然她丈夫已经住了一年监狱，但她没有偷过汉。,10,8,3,,


### Calculate importance score for each sentence

### temp testing - reusing

In [21]:
class SummaryTool(object):
    """Extractive summariser for text marked with Paragraph- and Sentence-Tags.

    The input is expected to contain ``#*^P^*#`` between paragraphs and
    ``#*^S^*#`` between sentences.  Each sentence is scored by how many tokens
    it shares with every other sentence of the text; the summary is the title
    followed by the best-scoring sentence of each paragraph.
    """

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        """Print ``content`` for debugging and return it split at Paragraph-Tags."""
        # debug:
        print('')
        print('.........................................................................')
        print('-->> splitting below text into paragraphs by #*^P^*# - processed input text:')
        print(content)
        print('.........................................................................')

        return content.split("#*^P^*#")

    # Naive method for splitting a text/paragraphs into sentences
    def split_content_to_sentences(self, content):
        """Print ``content`` for debugging and return its list of sentences.

        Paragraph-Tags are first converted to Sentence-Tags so that a
        paragraph boundary also counts as a sentence boundary.
        """
        content = content.replace("#*^P^*#", "#*^S^*#") # convert Paragraph-Tag

        print('')
        print('-->> splitting below paragraph into sentences:')
        print(content)

        return content.split("#*^S^*#")

    def _token_set(self, sentence):
        """Tokenise ``sentence`` and return the set of its non-empty tokens.

        Splitting the tokenizer output on single spaces yields empty strings
        (at least one for an empty sentence); they are dropped so that they
        are not counted as a word shared by every pair of sentences.
        """
        tokens = KudosData_word_tokenizer(sentence).split(" ")
        return set(token for token in tokens if token)

    def _overlap(self, s1, s2):
        """Return the shared-token count of two token sets, normalised by their average size."""
        # Two empty token sets share nothing; this also avoids a division by zero
        if (len(s1) + len(s2)) == 0:
            return 0

        # We normalize the result by the average number of words
        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)

    # Calculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):
        """Return the normalised token overlap between two sentences.

        Both sentences are tokenised with ``KudosData_word_tokenizer``.  The
        result is the number of distinct shared tokens divided by the average
        number of distinct tokens of the two sentences, or 0 when neither
        sentence has any token.
        """
        return self._overlap(self._token_set(sent1), self._token_set(sent2))

    # Format a sentence - remove all non-word chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        """Return ``sentence`` with every run of non-word characters removed."""
        sentence = re.sub(r'\W+', '', sentence)
        return sentence

    # Convert the content into a dictionary <K, V>
    # k = The formatted sentence
    # V = The rank of the sentence
    def get_senteces_ranks(self, content):
        """Score every sentence of ``content``.

        Returns a dict mapping the formatted sentence (see
        ``format_sentence``) to its score: the sum of its overlaps with all
        other sentences.  Sentences that format to the same key share one
        entry; the later sentence's score wins.
        """
        print(' ')
        print('starting: get_senteces_ranks(self, content) ...')

        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)

        # Tokenise each sentence exactly once; tokenising inside the pairwise
        # loop would repeat the segmentation for every pair of sentences
        token_sets = [self._token_set(sentence) for sentence in sentences]
        n = len(sentences)

        # Build the sentences dictionary
        # The score of a sentence is the sum of all its intersections
        sentences_dic = {}
        for i in range(0, n):
            score = 0
            for j in range(0, n):
                if i == j:
                    continue
                score += self._overlap(token_sets[i], token_sets[j])
            sentences_dic[self.format_sentence(sentences[i])] = score

            # www.KudosData.com
            print('>>>>>>>>>>>>>>>>>>> successfully completed: get_senteces_ranks() : %d' % i)
        return sentences_dic

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-scoring sentence of ``paragraph``.

        Returns "" for a paragraph with fewer than two sentences, and a
        placeholder string when no sentence has a score above 0.  A sentence
        missing from ``sentences_dic`` is treated as having score 0 instead
        of raising ``KeyError``.
        """
        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            print('==>> Ignore above short paragraph')
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = "### Best_sentence NOT produced for this paragraph ###"
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if strip_s:
                candidate_value = sentences_dic.get(strip_s, 0)
                if candidate_value > max_value:
                    max_value = candidate_value
                    best_sentence = s

        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):
        """Return the summary text: title, a blank line, then one sentence per paragraph.

        Paragraphs for which ``get_best_sentence`` returns an empty string
        contribute nothing.  Lines are joined with newlines.
        """
        # Split the text into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = []
        # If necessary, can remove title from displaying
        summary.append(title.strip())
        summary.append("")

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return ("\n").join(summary)




In [22]:
# From here on `content` holds the tagged text rather than the raw file text.
# NOTE(review): this overwrites the raw article, so re-running the earlier
# tagging cells afterwards would presumably start from already-tagged text.
content = content_format
# content

# Run Topic Summarization
### Get important sentence(s) of each paragraph from input text

In [23]:
    # Create a SummaryTool object
st = SummaryTool()

# Score every sentence of the tagged article
sentences_dic = st.get_senteces_ranks(content)

# Pick the best sentence of each paragraph and assemble the summary
summary = st.get_summary(title, content, sentences_dic)

# Report how long the summary is compared with the original (title + content)
original_length = len(title) + len(content)
summary_length = len(summary)
summary_ratio = 100 * (summary_length / original_length)

print("Original Length : %s" % original_length)
print("Summary  Length : %s" % summary_length)
print("Summary  Ratio  : %s %%" % summary_ratio)
print("")

# Show the summary itself
print(summary)


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.795 seconds.
DEBUG:jieba:Loading model cost 1.795 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


 
starting: get_senteces_ranks(self, content) ...

-->> splitting below paragraph into sentences:
《黄金时代》王小波#*^S^*#一 #*^S^*#我二十一岁时，正在云南插队。#*^S^*#陈清扬当时二十六岁，就在我插队的地方当医生。#*^S^*#我在山下十四队，她在山上十五队。#*^S^*#有一天她从山上下来，和我讨论她不是破鞋的问题。#*^S^*#那时我还不大认识她，只能说有一点知道。#*^S^*#她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。#*^S^*#因为破鞋偷汉，而她没有愉过汉。#*^S^*#虽然她丈夫已经住了一年监狱，但她没有偷过汉。#*^S^*#在此之前也未偷过汉。#*^S^*#所以她简直不明白，人们为什么要说她是破鞋。#*^S^*#如果我要安慰她，并不困难。#*^S^*#我可以从逻辑上证明她不是破鞋。#*^S^*#如果陈清扬是破鞋，即陈清扬偷汉，则起码有一个某人为其所偷。#*^S^*#如今不能指出某人，所以陈清扬偷汉不能成立。#*^S^*#但是我偏说，陈清扬就是破鞋，而且这一点毋庸置疑。#*^S^*#陈清扬找我证明她不是破鞋，起因是我找她打针。#*^S^*#这事经过如下：农忙时队长不叫我犁田，而是叫我去插秧，这样我的腰就不能经常直立，认识我的人都知道，我的腰上有旧伤，而且我身高在一米九以上。#*^S^*#如此插了一个月，我腰痛难忍，不打封闭就不能入睡。#*^S^*#我们队医务室那一把针头镀层剥落，而且都有倒钩，经常把我腰上的肉钩下来。#*^S^*#后来我的腰就像中了散弹枪，伤痕久久不褪。#*^S^*#就在这种情况下，我想起十五队的队医陈清扬是北医大毕业的大夫，对针头和勾针大概还能分清，所以我去找她看病，看完病回来，不到半个小时，她就追到我屋里来，要我证明她不是破鞋。#*^S^*#陈清扬说，她丝毫也不藐视破鞋。#*^S^*#据她观察，破鞋都很善良，乐于助人，而且最不乐意让人失望。#*^S^*#因此她对破鞋还有一点钦佩。#*^S^*#问题不在于破鞋好不好，而在于她根本不是破鞋。#*^S^*#就如一只猫不是一只狗一样。#*^S^*#假如一只猫被人叫成一只狗，它也会感到很不自在。#*^S^*#现在大家都管她叫被鞋，弄得她魂不守舍，几乎

### Output results to a file

In [24]:
# Write the length statistics, a blank line and the summary to a UTF-8 file.
# The report is assembled first so the file is written in a single call.
original_length = len(title) + len(content)
report_lines = [
    "Original Length : %s" % original_length,
    "Summary  Length : %s" % len(summary),
    "Summary  Ratio  : %s %%" % (100 * (len(summary) / original_length)),
    "",
    summary,
]

with io.open('output_topic_summary.txt','w',encoding='utf8') as f:
    # The with-block closes the file on exit, so no explicit f.close() is needed
    f.write("\n".join(report_lines))