# Chinese text summarization algorithm
### www.KudosData.com
#### By: Sam GU Zhan
#### March, 2017

# Imports

In [1]:
# coding=UTF-8
from __future__ import division
import re

# Python2 unicode & float-division support:
# from __future__ import unicode_literals, division

In [2]:
# %matplotlib inline
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

import io

# 中文字符和语言处理库
import jieba

# 机器学习库 sklearn 分类学习模型库
#from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer # 数据结构变换：把 Dict 转换为 稀疏矩阵
# from sklearn.linear_model import LogisticRegression  # 逻辑回归分类模型
# from sklearn.pipeline import make_pipeline # 封装机器学习模型流程
# from sklearn.metrics import confusion_matrix, roc_curve, auc

# 中文显示设置
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体  
mpl.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题 
mpl.rcParams['font.size'] = 14 # 设置字体大小

np.random.seed(88)

# Define Functions

In [3]:
# Python3
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
def KudosData_word_tokenizer(foo):
    # remove lead & tail spaces firstly:
    foo = foo.strip()
    seg_token = jieba.cut(str(foo), cut_all=True)
    seg_str = str(' '.join(seg_token)).strip()

    return seg_str
# Python2
# 中文分词功能小函数， 输出 字符串， 各词组由空格分隔
# def KudosData_word_tokenizer(foo):
#     seg_token = jieba.cut(foo, cut_all=True)
#     seg_str = ' '.join(seg_token)
#     return seg_str

In [4]:
# Python3
# 中文分词功能小函数， 输出 dictionary： { key 词组， value 计数 }
def KudosData_word_count(foo):
    # remove lead & tail spaces firstly:
    foo = foo.strip()
    seg_token = jieba.cut(str(foo), cut_all=True)
    seg_str = str(' '.join(seg_token)).strip()
    seg_count = pd.value_counts(str(seg_str).lower().split(' '))
    seg_count = seg_count.to_dict() 
    seg_count.pop('', None) # remove EMPTY dict key: ''
    return seg_count
# Python2
# 中文分词功能小函数， 输出 dictionary： { key 词组， value 计数 }
# def KudosData_word_count(foo):
#     seg_token = jieba.cut(foo, cut_all=True)
#     seg_str = '^'.join(seg_token)
#     seg_count = pd.value_counts(seg_str.lower().split('^'))
#     return seg_count.to_dict()

# Input text

In [5]:
# process Unicode text input
with io.open('input_text.txt','r',encoding='utf8') as f:
    text = f.read()

content = text

title = '''
<Dummy Title>
'''

In [6]:
# content

In [7]:
def format_sentence(text):
#     sentence = re.sub(r'\W+', '#', sentence)
    text = re.sub(r'\t+', '', text) # remove one or more Tab
    return text

In [8]:
def linebreak_conversion_win_linux(text):
    text = re.sub(r'\r', '', text) # remove one or more Tab
    text = re.sub(r'\u3000', ' ', text) # convert white space: \u3000    
    return text

In [9]:
def clean_some_whitespace(text): # Does not remove normal Space
#     sentence = re.sub(r'\W+', '#', sentence)
    text = re.sub(r'\t+', '', text) # remove one or more Tab
    text = re.sub(r'\f+', '', text) # remove one or more
    text = re.sub(r'\v+', '', text) # remove one or more
    text = re.sub(r'\n+', '', text) # remove one or more \n, this is to merge sentences within paragraph

    text = re.sub(r'(\^\*\#)( +)(\#\*\^)', '^*##*^', text) # remove one or more Spaces between Paragraph-Tags or Sentence-Tags

    text = re.sub(r' +', ' ', text) # merge two or more Spaces to 1 Space

    # remove lead & tail spaces:
    text =text.strip()

#     text = re.sub(r'( +)\W', r'\W', text) # remove Spaces before special char
#     text = re.sub(r'\W( +)', r'\W', text) # remove Spaces after special char
    
    return text

In [10]:
### Define Paragraph-Tag =  
#   #*^P^*#

### Define Sentence-Tag =  
#   #*^S^*#


In [11]:
# add a special tage to end of each paragraph
def tag_paragraph(text):
    text = re.sub(r'((\n ) +)+', '#*^P^*#', text) # Tag paragraph, pattern: \n + two or more Spaces
    text = re.sub(r'((\n\t) +)+', '#*^P^*#', text) # Tag paragraph, pattern: \n + two or more Tabs
    text = re.sub(r'(\n( *)\n)+', '#*^P^*#', text) # Tag paragraph, pattern: \n + zero or more Spaces + \n
    text = re.sub(r'(\#\*\^P\^\*\#)+', '#*^P^*#', text) # merge two or more Paragraph-Tags -> 1 Paragraph-Tag

    return text

In [12]:
# add a special tage to end of each sentence
def tag_sentence(text):
    text = re.sub(r'。+', '。#*^S^*#', text) # Tag sentence - Chinese
    text = re.sub(r'！+', '！#*^S^*#', text) # Tag sentence - Chinese
    text = re.sub(r'\？+', '？#*^S^*#', text) # Tag sentence - Chinese
    text = re.sub(r'；+', '；#*^S^*#', text) # Tag sentence - Chinese

    text = re.sub(r'!+', '!#*^S^*#', text) # Tag sentence - English
    text = re.sub(r'\?+', '?#*^S^*#', text) # Tag sentence - English
    text = re.sub(r';+', ';#*^S^*#', text) # Tag sentence - English
   
    # merge two or more sentence-Tags -> 1 Sentence-Tag
    text = re.sub(r'(\W?(\#\*\^S\^\*\#))+', '。#*^S^*#', text)

    # remove a Sentence-Tag immediately before an ending ”
    text = re.sub(r'\#\*\^S\^\*\#”', '”', text) 
    
    # remove a Sentence-Tag immediately before a Paragraph-Tag
    text = re.sub(r'(\#\*\^S\^\*\#)( *)(\#\*\^P\^\*\#)', '#*^P^*#', text) 
    
    text = re.sub(r'(\#\*\^P\^\*\#)+', '#*^P^*#', text) # merge two or more Paragraph-Tags -> 1 Paragraph-Tag

    
    return text

### Start tagging:

In [13]:
content_format = linebreak_conversion_win_linux(content)
# content_format

In [14]:
content_format = tag_paragraph(content_format)
# content_format

In [15]:
content_format = clean_some_whitespace(content_format)
# content_format

In [16]:
content_format = tag_sentence(content_format)
# content_format

In [17]:
content_format = clean_some_whitespace(content_format)
content_format

'《黄金时代》王小波#*^P^*#一 #*^P^*#我二十一岁时，正在云南插队。#*^S^*#陈清扬当时二十六岁，就在我插队的地方当医生。#*^S^*#我在山下十四队，她在山上十五队。#*^S^*#有一天她从山上下来，和我讨论她不是破鞋的问题。#*^S^*#那时我还不大认识她，只能说有一点知道。#*^S^*#她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。#*^S^*#因为破鞋偷汉，而她没有愉过汉。#*^S^*#虽然她丈夫已经住了一年监狱，但她没有偷过汉。#*^S^*#在此之前也未偷过汉。#*^S^*#所以她简直不明白，人们为什么要说她是破鞋。#*^S^*#如果我要安慰她，并不困难。#*^S^*#我可以从逻辑上证明她不是破鞋。#*^S^*#如果陈清扬是破鞋，即陈清扬偷汉，则起码有一个某人为其所偷。#*^S^*#如今不能指出某人，所以陈清扬偷汉不能成立。#*^S^*#但是我偏说，陈清扬就是破鞋，而且这一点毋庸置疑。#*^P^*#陈清扬找我证明她不是破鞋，起因是我找她打针。#*^S^*#这事经过如下：农忙时队长不叫我犁田，而是叫我去插秧，这样我的腰就不能经常直立，认识我的人都知道，我的腰上有旧伤，而且我身高在一米九以上。#*^S^*#如此插了一个月，我腰痛难忍，不打封闭就不能入睡。#*^S^*#我们队医务室那一把针头镀层剥落，而且都有倒钩，经常把我腰上的肉钩下来。#*^S^*#后来我的腰就像中了散弹枪，伤痕久久不褪。#*^S^*#就在这种情况下，我想起十五队的队医陈清扬是北医大毕业的大夫，对针头和勾针大概还能分清，所以我去找她看病，看完病回来，不到半个小时，她就追到我屋里来，要我证明她不是破鞋。#*^P^*#陈清扬说，她丝毫也不藐视破鞋。#*^S^*#据她观察，破鞋都很善良，乐于助人，而且最不乐意让人失望。#*^S^*#因此她对破鞋还有一点钦佩。#*^S^*#问题不在于破鞋好不好，而在于她根本不是破鞋。#*^S^*#就如一只猫不是一只狗一样。#*^S^*#假如一只猫被人叫成一只狗，它也会感到很不自在。#*^S^*#现在大家都管她叫被鞋，弄得她魂不守舍，几乎连自己是谁都不知道了。#*^P^*#陈清扬在我的草房里时，裸臂赤腿穿一件白大褂，和她在山上那间医务室里装束一样，所不同的是披散的长发用个手绢束住，脚上也多了一双拖鞋。#*^S^*#看了她的样子，


### Transfer tagged text to Pandas Dataframe

In [18]:
# Split a text into paragraphs
def split_article_to_paragraphs(text):
#     text = text.replace("#*^P^*#", "#*^S^*#") # convert Paragraph-Tag        
    return text.split("#*^P^*#")


In [19]:
# Split a paragraph into sentences
def split_paragraph_to_sentences(text):
#     text = text.replace("#*^P^*#", "#*^S^*#") # convert Paragraph-Tag        
    return text.split("#*^S^*#")


In [20]:
# 1st loop Paragraphs list, 2nd loop Sentences list
# create a few new columns, then write into dataframe, together with original Sentence string

# define empty dataframe:
df_article = pd.DataFrame(columns=('sentence', 
                                   'sentence_word_count', 
                                   'sentence_id', 
                                   'sentence_id_paragraph', 'paragraph_id', 'class_rank', 'score'))
df_sentence_id = 0

# split_article_to_paragraphs:
article_paragraphs = split_article_to_paragraphs(content_format)

for i in range(0, len(article_paragraphs)):
    # split_paragraph_to_sentences:
    article_paragraphs_sentences = split_paragraph_to_sentences(article_paragraphs[i].strip())

    for j in range(0, len(article_paragraphs_sentences)):
        if article_paragraphs_sentences[j].strip() != '':
            df_sentence_id = df_sentence_id + 1
            # write to dataframe:
            df_article.loc[len(df_article)] = [article_paragraphs_sentences[j].strip(), 
                                               len(article_paragraphs_sentences[j].strip()), 
                                               df_sentence_id, 
                                               j+1, i+1, '', '']


In [21]:
# Make sure no empty sentences:
print('Number of empty sentences in dataframe: %d ' % len(df_article[df_article['sentence'] == '']))

Number of empty sentences in dataframe: 0 


In [22]:
df_article

Unnamed: 0,sentence,sentence_word_count,sentence_id,sentence_id_paragraph,paragraph_id,class_rank,score
0,《黄金时代》王小波,9,1,1,1,,
1,一,1,2,1,2,,
2,我二十一岁时，正在云南插队。,14,3,1,3,,
3,陈清扬当时二十六岁，就在我插队的地方当医生。,22,4,2,3,,
4,我在山下十四队，她在山上十五队。,16,5,3,3,,
5,有一天她从山上下来，和我讨论她不是破鞋的问题。,23,6,4,3,,
6,那时我还不大认识她，只能说有一点知道。,19,7,5,3,,
7,她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。,36,8,6,3,,
8,因为破鞋偷汉，而她没有愉过汉。,15,9,7,3,,
9,虽然她丈夫已经住了一年监狱，但她没有偷过汉。,22,10,8,3,,


### Calculate importance score for each sentence

### [Optional Reference] word_tokenizer

In [23]:
# KudosData_word_tokenizer
df_article['sentence_tokenized'] = df_article['sentence'].apply(lambda x: KudosData_word_tokenizer(x))

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\TELESC~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.768 seconds.
DEBUG:jieba:Loading model cost 1.768 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


### [Optional Reference] Term Frequency

In [24]:
# KudosData_word_count
df_article['sentence_tf'] = df_article['sentence'].apply(lambda x: KudosData_word_count(x))

### [Optional Reference] TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = df_article['sentence_tokenized']

vectorizer = TfidfVectorizer()

# my_stopword_list = ['and','to','the','of', 'in']
#vectorizer = TfidfVectorizer(stop_words=my_stopword_list)

# choice of no nomalization of tfidf output (not recommended)
#vectorizer = TfidfVectorizer(norm=None)

# TF-IDF score
tfidf = vectorizer.fit_transform(corpus)

# IDF score
idf_dict = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

# TF is in df_article[['sentence_tf']]

In [26]:
### 把TF-iDF数值赋予相对应的词组
tfidf = tfidf.tocsr()

n_docs = tfidf.shape[0]
tfidftables = [{} for _ in range(n_docs)]
terms = vectorizer.get_feature_names()

for i, j in zip(*tfidf.nonzero()):
    tfidftables[i][terms[j]] = tfidf[i, j]

In [27]:
# Document-Term-Matrix's TF-IDF matrix size:
print ('This tfidf matrix is a very large table: [ %d rows/docs X %d columns/words ]' 
       % (tfidf.shape[0], tfidf.shape[1]))
print ('It contains %d eliments: one score per word per document !'
       % (tfidf.shape[0] * tfidf.shape[1]))

This tfidf matrix is a very large table: [ 1456 rows/docs X 3955 columns/words ]
It contains 5758480 eliments: one score per word per document !


In [28]:
# Add tfidf score into dataframe 
df_article['tfidf'] = tfidftables

In [29]:
# df_article[['sentence', 'sentence_tokenized', 'tfidf']].head()
df_article[['sentence', 'sentence_tokenized', 'sentence_tf', 'tfidf']]

Unnamed: 0,sentence,sentence_tokenized,sentence_tf,tfidf
0,《黄金时代》王小波,黄金 黄金时代 时代 王小波 小波,"{'王小波': 1, '黄金时代': 1, '小波': 1, '时代': 1, '黄金': 1}","{'黄金': 0.417605484371, '时代': 0.417605484371, '..."
1,一,一,{'一': 1},{}
2,我二十一岁时，正在云南插队。,我 二十 二十一 二十一岁 十一 十一岁 一岁 时 正在 云南 插队,"{'十一岁': 1, '云南': 1, '十一': 1, '二十一岁': 1, '二十': ...","{'一岁': 0.334452413499, '十一岁': 0.334452413499, ..."
3,陈清扬当时二十六岁，就在我插队的地方当医生。,陈 清扬 当时 二十 二十六 二十六岁 十六 十六岁 六岁 就 在 我 插队 的 地方 ...,"{'的': 1, '就': 1, '插队': 1, '地方': 1, '医生': 1, '清...","{'二十六': 0.346726964056, '十六岁': 0.346726964056,..."
4,我在山下十四队，她在山上十五队。,我 在 山下 十四 四队 她 在 山上 十五 五队,"{'十四': 1, '四队': 1, '山上': 1, '十五': 1, '她': 1, '...","{'十四': 0.461899610352, '四队': 0.461899610352, '..."
5,有一天她从山上下来，和我讨论她不是破鞋的问题。,有 一天 她 从 山上 上下 下来 和 我 讨论 她 不是 破鞋 的 问题,"{'的': 1, '山上': 1, '破鞋': 1, '她': 2, '下来': 1, '上...","{'上下': 0.424145930069, '一天': 0.407328288271, '..."
6,那时我还不大认识她，只能说有一点知道。,那时 我 还 不大 认识 她 只能 说 有 一点 知道,"{'一点': 1, '她': 1, '那时': 1, '有': 1, '我': 1, '还'...","{'一点': 0.354980862442, '不大': 0.428789491976, '..."
7,她要讨论的事是这祥的：虽然所有的人都说她是一个破鞋，但她以为自己不是的。,她 要 讨论 的 事 是 这 祥 的 虽然 所有 的 人 都 说 她 是 一个 破鞋 ...,"{'以为': 1, '要': 1, '破鞋': 1, '但': 1, '她': 3, '事'...","{'以为': 0.387291263007, '自己': 0.308576860009, '..."
8,因为破鞋偷汉，而她没有愉过汉。,因为 破鞋 偷 汉 而 她 没有 愉 过 汉,"{'而': 1, '过': 1, '破鞋': 1, '没有': 1, '她': 1, '因为...","{'因为': 0.599879892089, '破鞋': 0.559203184229, '..."
9,虽然她丈夫已经住了一年监狱，但她没有偷过汉。,虽然 她 丈夫 已经 住 了 一年 监狱 但 她 没有 偷 过 汉,"{'已经': 1, '但': 1, '她': 2, '了': 1, '偷': 1, '监狱'...","{'监狱': 0.472081018283, '虽然': 0.358119614565, '..."


### Calculate score for each sentence, based on pair-wise sentence comparison/intersection:

In [30]:
# experiment: use tf-idf and len(sentence) to calculate score
# tmp_mean = tmp_sum / len(df_article['sentence'][i])

# for i in range(0,len(df_article)):
#     if len(df_article['tfidf'][i]) == 0:
#         df_article['score'][i] = 0
#     else:
#         tmp_sum = 0
#         for key, values in df_article['tfidf'][i].items():
#             tmp_sum += values
        
#         tmp_mean = tmp_sum / len(df_article['sentence_tokenized'][i])
#         df_article['score'][i] = tmp_mean 


### Experiment:
### Calculate score for each sentence, based on pair-wise sentence comparison/intersection:

In [31]:
# Caculate raw intersection score between pair of two sentences, from df_article['sentence_tokenized']
def sentences_intersection(sent1tokenized, sent2tokenized):
    # www.KudosData.com - Chinese
    # split the sentence into words/tokens
    s1 = set(sent1tokenized.split(" "))
    s2 = set(sent2tokenized.split(" "))

    # If there is not intersection, just return 0
    if (len(s1) + len(s2)) == 0:
        print('# If there is not intersection, just return 0')
        return 0

    # Normalize the result by the average number of words
    return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2)


In [32]:
# Calculate important score of every pair of sentences

n = len(df_article['sentence_tokenized'])
        
# [Sam python 2.7 -> 3.4] values = [[0 for x in xrange(n)] for x in xrange(n)]
df_score_raw_values = [[0 for x in range(n)] for x in range(n)]
for i in range(0, n):
    for j in range(0, n):
        df_score_raw_values[i][j] = sentences_intersection(df_article['sentence_tokenized'][i], 
                                                           df_article['sentence_tokenized'][j])

# The score of a sentence is the sum of all its intersection
sentences_dic = {}

for i in range(0, n):
    df_score = 0
    for j in range(0, n):
        if i == j:
            continue
        df_score += df_score_raw_values[i][j]
    df_article['score'][i] = df_score


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
df_article.head()

Unnamed: 0,sentence,sentence_word_count,sentence_id,sentence_id_paragraph,paragraph_id,class_rank,score,sentence_tokenized,sentence_tf,tfidf
0,《黄金时代》王小波,9,1,1,1,,107.43,黄金 黄金时代 时代 王小波 小波,"{'王小波': 1, '黄金时代': 1, '小波': 1, '时代': 1, '黄金': 1}","{'黄金': 0.417605484371, '时代': 0.417605484371, '..."
1,一,1,2,1,2,,4.30814,一,{'一': 1},{}
2,我二十一岁时，正在云南插队。,14,3,1,3,,145.021,我 二十 二十一 二十一岁 十一 十一岁 一岁 时 正在 云南 插队,"{'十一岁': 1, '云南': 1, '十一': 1, '二十一岁': 1, '二十': ...","{'一岁': 0.334452413499, '十一岁': 0.334452413499, ..."
3,陈清扬当时二十六岁，就在我插队的地方当医生。,22,4,2,3,,192.697,陈 清扬 当时 二十 二十六 二十六岁 十六 十六岁 六岁 就 在 我 插队 的 地方 ...,"{'的': 1, '就': 1, '插队': 1, '地方': 1, '医生': 1, '清...","{'二十六': 0.346726964056, '十六岁': 0.346726964056,..."
4,我在山下十四队，她在山上十五队。,16,5,3,3,,207.951,我 在 山下 十四 四队 她 在 山上 十五 五队,"{'十四': 1, '四队': 1, '山上': 1, '十五': 1, '她': 1, '...","{'十四': 0.461899610352, '四队': 0.461899610352, '..."


### Generate class_rank

In [34]:
# sort firstly
df_article = df_article.sort_values(by=['paragraph_id', 'score'], ascending=[True, False]).reset_index(drop=True)

In [35]:
# df_article.head()

In [36]:
# Generate Class_Rank

current_class_rank = 0
current_paragraph_id = 0

for i in range(0, len(df_article)):
    if df_article['paragraph_id'][i] != current_paragraph_id: # change of Paragraph, thus reset class_rank
        current_class_rank = 1
        current_paragraph_id = df_article['paragraph_id'][i]
    else:
        current_class_rank = current_class_rank + 1
        
    df_article['class_rank'][i] = current_class_rank


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [37]:
# sort Dataframe to 'result lookup mode'
df_article = df_article.sort_values(by=['class_rank', 'score', 'paragraph_id', 'sentence_id'], 
                                    ascending=[True, False, True, True]).reset_index(drop=True)

# Display only
# df_article.sort_values(by=['class_rank', 'score', 'paragraph_id', 'sentence_id'], 
#                        ascending=[True, False, True, True])

### Extract results based on user parameters
Max number of words

% of original number of words, etc

In [38]:
parm_max_word = 200
parm_max_percent = 0.15

In [42]:
df_article_display = df_article[0:30]

In [43]:
df_article_display = df_article_display.sort_values(by=['sentence_id'], ascending=[True]).reset_index(drop=True)
df_article_display[['sentence', 'paragraph_id', 'sentence_id_paragraph', 'class_rank', 'score']]

Unnamed: 0,sentence,paragraph_id,sentence_id_paragraph,class_rank,score
0,我是这么想的：假如我想证明她不是破鞋，就能证明她不是破鞋，那事情未免太容易了。,8,1,1,231.181
1,原来又有了另一种传闻，说她在和我搞破鞋。,11,4,1,237.559
2,她说：我什么也没有说。,35,1,1,269.224
3,后来她又改变了主意去找我，是因为所有的人都说她是破鞋，因此所有的人都是敌人。,39,4,1,238.451
4,我是想研究一下她的结构，这也是在她的许可之下。,47,5,1,241.894
5,我听了这话不高兴，她也发现了。,48,3,1,243.883
6,她来的时候，我没有盼着她来。,67,3,1,251.814
7,可是她说，快，混蛋，还拧我的腿。,72,4,1,249.731
8,陈清扬说，我叫人惦记上了。,78,3,1,249.351
9,他还说，我的行为够上了坏分子。,81,2,1,246.096


### Output results to a file

In [41]:
'''

with io.open('output_topic_summary.txt','w',encoding='utf8') as f:
    f.write("Original Length : %s" % (len(title) + len(content)))
    f.write("\n")
    f.write("Summary  Length : %s" % len(summary))
    f.write("\n")
    f.write("Summary  Ratio  : %s %%" % (100 * (len(summary) / (len(title) + len(content)))))
    f.write("\n")
    f.write("\n")
    f.write(summary)
    f.close()
    
    '''

'\n\nwith io.open(\'output_topic_summary.txt\',\'w\',encoding=\'utf8\') as f:\n    f.write("Original Length : %s" % (len(title) + len(content)))\n    f.write("\n")\n    f.write("Summary  Length : %s" % len(summary))\n    f.write("\n")\n    f.write("Summary  Ratio  : %s %%" % (100 * (len(summary) / (len(title) + len(content)))))\n    f.write("\n")\n    f.write("\n")\n    f.write(summary)\n    f.close()\n    \n    '