In [1]:
import pandas as pd
import numpy as np

import networkx as nx

from os import listdir
import re

import plotly.offline as py
import plotly.graph_objects as go


from src.text_rank import TextRank

In [2]:
# Folder holding the policy text files. The trailing slash matters: the loading
# cell builds each path by plain string concatenation with the file name.
text_directory = "output/policy_text/"

In [3]:
# Load every file found in `text_directory` into `text_list` (one string per
# file) with line breaks and ideographic spaces (U+3000) stripped out.
#
# listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to make positional lookups such as text_list[2] reproducible.
policy_dir = sorted(listdir(text_directory))

text_list = []

for file_name in policy_dir:
    file_path = text_directory + file_name

    # Read-only mode is sufficient ("r+" additionally required write access),
    # and the encoding is pinned so the Chinese text does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Literal replacements: no regular expression needed.
    text = text.replace("\n", "").replace("\u3000", "")

    text_list.append(text)


# Document used for ad-hoc inspection (third file in sorted order).
sample_text = text_list[2]

In [8]:
# Arguments handed to TextRank(allowPOS, stopwords, span) in the extraction loop.
allowPOS = ["n", "v"]        # presumably part-of-speech tags to keep; confirm in src.text_rank
stopwords = ["为了", "依法"]   # words passed to TextRank as its stopword list
span = 3                      # presumably the co-occurrence window size; confirm in src.text_rank

In [6]:
# Run TextRank over every loaded document and pool the results of all
# documents into one flat list.
all_keywords = []

for text in text_list:
    # A fresh TextRank object is created for each document.
    tr_keyword = TextRank(allowPOS, stopwords, span)

    # The second argument (5) is presumably the number of keywords requested
    # per document; confirm against src.text_rank.
    all_keywords.extend(tr_keyword.text_rank(text, 5))
    

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/1y/w6y_szxn0m12nsrrdnzgp4540000gn/T/jieba.cache
Loading model cost 1.268 seconds.
Prefix dict has been built successfully.


In [127]:
# Keep only the first element of each TextRank result (the keyword itself).
# Entries that are already plain strings are passed through unchanged, so
# re-running this cell does not truncate every keyword to its first character.
all_keywords = [
    entry if isinstance(entry, str) else entry[0]
    for entry in all_keywords
]

In [128]:
# Collapse duplicate keywords collected across documents.
all_keywords = {*all_keywords}

In [129]:
# Show how many keywords are currently held in all_keywords.
len(all_keywords)

564

In [112]:
# Dump the collected keywords to disk, one per line.
# - "w" is enough for a pure write ("w+" also opened the file for reading).
# - The encoding is explicit because the keywords are Chinese text and the
#   default encoding depends on the platform locale.
# - Keywords are written in sorted order so the file is identical between runs
#   (set iteration order for strings varies from one interpreter run to the next).
with open("output/keyword_level_1.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in sorted(all_keywords))

In [130]:
# Read the keyword filter list, one entry per line (newlines still attached).
# The encoding is pinned so the file decodes the same way on every platform
# instead of relying on the locale default.
with open("data/stopwords_keyword_filter.txt", encoding="utf-8") as f:
    keywords_stopwords = f.readlines()
    

In [131]:
# Trim surrounding whitespace (including the trailing newline) from each entry.
keywords_stopwords = list(map(str.strip, keywords_stopwords))

In [132]:
# Drop every keyword that appears in the filter list.
# The filter list is turned into a set once so each membership test is a hash
# lookup rather than a scan of the whole list; the result is unchanged.
keywords_stopword_set = set(keywords_stopwords)
keywords_filtered = [word for word in all_keywords if word not in keywords_stopword_set]

In [133]:
# Show how many keywords remain after removing the filter-list entries.
len(keywords_filtered)

326

In [136]:
# Persist the filtered keywords, one per line.
# "w" replaces the unnecessary "w+", the encoding is explicit for the Chinese
# text, and the keywords are sorted so the output file is stable between runs.
with open("output/filtered_keywords.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in sorted(keywords_filtered))

In [23]:
# Re-point sample_text at index 1 of text_list; this replaces the value
# assigned in the loading cell, which used index 2.
sample_text = text_list[1]

In [24]:
# TextRank configuration for the single-document graph study below.
allowPOS = ["n", "v"]        # presumably part-of-speech tags to keep; confirm in src.text_rank
stopwords = ["为了", "依法"]   # words passed to TextRank as its stopword list
span = 3                      # presumably the co-occurrence window size; confirm in src.text_rank

# Extractor instance whose helper methods are used by the graph cell.
tr_keyword = TextRank(allowPOS, stopwords, span)

In [26]:
# Text relationship study

## Cut the sample text into a list of words
word_pair = tr_keyword._cut(sample_text)

## Build the co-occurrence graph, then convert it to a normalised matrix
co_graph = tr_keyword.co_occurance_matrix(word_pair)
df = tr_keyword.co_occur_graph_to_matrix(co_graph, normalization=True)

## Visualise the text graph

word_list = list(df.columns)

# from_numpy_array labels nodes 0..n-1; map each position back to its word.
name_dict = dict(enumerate(word_list))

nx_graph = nx.relabel_nodes(nx.from_numpy_array(df.values), name_dict)




### Use the Plotly graphing function

#### Suppress edges with low weight
# Edges below the cut-off stay in the graph but have their weight set to 0;
# the edge-drawing loop only draws edges whose weight is greater than 0.
weight_cutoff = 0.5

for edge in nx_graph.edges():
    edge_attrs = nx_graph.edges()[edge]  # attribute dict of this edge (mutated in place)

    if edge_attrs['weight'] < weight_cutoff:
        edge_attrs['weight'] = 0


# spring_layout starts from random positions; a fixed seed makes the node
# coordinates (and therefore the figure) reproducible between runs.
layout_seed = 42
pos_ = nx.spring_layout(nx_graph, seed=layout_seed)



#### Create the edge plot

def make_edge(x, y, text, width):
    """Build the Plotly line trace that draws one graph edge.

    Parameters
    ----------
    x, y
        Coordinate sequences forwarded unchanged to ``go.Scatter``.
    text
        Hover label; it is wrapped in a one-element list before being passed on.
    width
        Line width of the edge.

    Returns
    -------
    go.Scatter
        A trace in ``'lines'`` mode, coloured cornflowerblue, with
        ``hoverinfo='text'``.
    """
    line_style = dict(width=width, color='cornflowerblue')

    return go.Scatter(
        x=x,
        y=y,
        mode='lines',
        line=line_style,
        hoverinfo='text',
        text=[text],
    )




# One line trace per edge that still has a positive weight.
edge_trace = []

for edge in nx_graph.edges():
    weight = nx_graph.edges()[edge]['weight']

    # Skip everything that is not strictly positive (same test as `weight > 0`).
    if not weight > 0:
        continue

    char_1, char_2 = edge

    x0, y0 = pos_[char_1]
    x1, y1 = pos_[char_2]

    hover_text = char_1 + '--' + char_2 + ': ' + str(weight)

    # Line width grows with the edge weight.
    edge_trace.append(
        make_edge([x0, x1, None], [y0, y1, None], hover_text,
                  width=0.3 * weight ** 1.75)
    )


#### Create the node plot

# Collect per-node values first, then build the scatter trace in one go.
node_names = list(nx_graph.nodes())
node_coords = [pos_[name] for name in node_names]

node_size = 5 * 1  # uniform marker size (could be varied with a node weight)

node_trace = go.Scatter(
    x=[coord[0] for coord in node_coords],
    y=[coord[1] for coord in node_coords],
    text=['<b>' + name + '</b>' for name in node_names],
    textposition="top center",
    textfont_size=10,
    mode='markers+text',
    hoverinfo='none',
    marker=dict(
        color=['cornflowerblue' for _ in node_names],
        size=[node_size for _ in node_names],
        line=None,
    ),
)




#### Set the layout

# Axis settings shared by x and y: no grid lines, no zero line.
bare_axis = {'showgrid': False, 'zeroline': False}

layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)',  # transparent outer background
    plot_bgcolor='rgba(0,0,0,0)',   # transparent plotting area
    xaxis=dict(bare_axis),
    yaxis=dict(bare_axis),
)

fig = go.Figure(layout=layout)

# Edges go in first, the node trace is added after them.
for trace in edge_trace:
    fig.add_trace(trace)
fig.add_trace(node_trace)

# Hide the legend and the tick labels on both axes.
fig.update_layout(showlegend=False)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# Render the figure.
fig.show()


In [27]:
# Display the full text of the sample document (note: prints the whole string).
sample_text

'财税〔2017〕90号各省、自治区、直辖市、计划单列市财政厅（局）、国家税务局、地方税务局，新疆生产建设兵团财务局：现将租入固定资产进项税额抵扣等增值税政策通知如下：一、自2018年1月1日起，纳税人租入固定资产、不动产，既用于一般计税方法计税项目，又用于简易计税方法计税项目、免征增值税项目、集体福利或者个人消费的，其进项税额准予从销项税额中全额抵扣。二、自2018年1月1日起，纳税人已售票但客户逾期未消费取得的运输逾期票证收入，按照“交通运输服务”缴纳增值税。纳税人为客户办理退票而向客户收取的退票费、手续费等收入，按照“其他现代服务”缴纳增值税。三、自2018年1月1日起，航空运输销售代理企业提供境外航段机票代理服务，以取得的全部价款和价外费用，扣除向客户收取并支付给其他单位或者个人的境外航段机票结算款和相关费用后的余额为销售额。其中，支付给境内单位或者个人的款项，以发票或行程单为合法有效凭证；支付给境外单位或者个人的款项，以签收单据为合法有效凭证，税务机关对签收单据有疑义的，可以要求其提供境外公证机构的确认证明。航空运输销售代理企业，是指根据《航空运输销售代理资质认可办法》取得中国航空运输协会颁发的“航空运输销售代理业务资质认可证书”，接受中国航空运输企业或通航中国的外国航空运输企业委托，依照双方签订的委托销售代理合同提供代理服务的企业。四、自2016年5月1日至2017年6月30日，纳税人采取转包、出租、互换、转让、入股等方式将承包地流转给农业生产者用于农业生产，免征增值税。本通知下发前已征的增值税，可抵减以后月份应缴纳的增值税，或办理退税。五、根据《财政部 税务总局关于资管产品增值税有关问题的通知》（财税〔2017〕56号）有关规定，自2018年1月1日起,资管产品管理人运营资管产品提供的贷款服务、发生的部分金融商品转让业务，按照以下规定确定销售额：（一）提供贷款服务，以2018年1月1日起产生的利息及利息性质的收入为销售额；（二）转让2017年12月31日前取得的股票（不包括限售股）、债券、基金、非货物期货，可以选择按照实际买入价计算销售额，或者以2017年最后一个交易日的股票收盘价（2017年最后一个交易日处于停牌期间的股票，为停牌前最后一个交易日收盘价）、债券估值（中债金融估值中心有限公司或中证指数有限公司提供的债券估值）、基金份额净值、非货物期货结