### Read ckiptagger & Dataframe

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.dia import dia_matrix
from scipy.sparse import vstack 
import re
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

path = "./data"
ws = WS(path)

df = pd.read_csv('data_ETL2noPunc.csv')
# Replace '@' with ' ' in original dataframe
df.token = df.token.apply(lambda text: text.replace('@',' '))

### Tf-idf for Tokenized Text in Dataframe

In [2]:
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(df['token'])
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
vocab = tf.vocabulary_

### Newly Entered Text Preprocess function
- Remove Punctuation
- Remove Spaces
- Sentence Segment

In [3]:
# Remove Punc., Remove Space, Words Segment
def Preprocess(text):
    rule = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')
    text = rule.sub(' ',str(text))
    text = re.sub(' +', '',text)
    text = ws([text], sentence_segmentation=True)
    cut_corpus = []
    for word in text:
        cut_corpus.append(' '.join(word))
    return cut_corpus

In [4]:
tmptext = "真的難懂採   購的狀  況"
tmptext = Preprocess(tmptext)
print(tmptext)

['真的 難懂 採購 的 狀況']


### Recommend Law Function
1. Add new text in tf-idf matrix
2. Calculate new cosine similarity
3. Find top 10 similar text using Cosine Similarity
4. Showt top 10 similar texts and the law corresponding to those texts

In [5]:
def recommend_law(text, vocab = vocab, tfidf_matrix = tfidf_matrix):
    text = Preprocess(text)
    tf = TfidfVectorizer(vocabulary = vocab)
    new_tf = tf.fit_transform(text) 
    new_tfidf_matrix = vstack([tfidf_matrix,new_tf])
    new_cos_sim = cosine_similarity(new_tfidf_matrix, new_tfidf_matrix)
    newvocab = tf.vocabulary_
    sim_score = np.sort(new_cos_sim[new_cos_sim.shape[0]-1])[::-1][1:11]
    tmp_top_10_law = df[['Ex_Tittle','CE_Item2','CE_Comment']].iloc[np.argsort(new_cos_sim[new_cos_sim.shape[0]-1])[::-1][1:11]]
    tmp_top_10_law['similarity_score'] = [round(score*100,1) for score in sim_score]
    return newvocab, new_tfidf_matrix, new_cos_sim, tmp_top_10_law

newtext = "標金保證金未符合規定，請改進"
vocab, tfidf_matrix, cos_sim, top_10_law = recommend_law(newtext)

In [6]:
# 依序內文(Comment)相似程度排序，顯示相似的歷史資料中所使用的法律
top_10_law.drop(columns = ['CE_Comment']) # 資料保密 不顯示CE_Comment

Unnamed: 0,Ex_Tittle,CE_Item2,similarity_score
2083,政府採購法,30,42.8
1563,政府採購法,30,42.1
1760,政府採購法,30,41.5
1743,政府採購法,30,41.5
2138,政府採購法,30,41.0
1758,政府採購法,30,39.8
8024,政府採購法,30,37.6
14752,政府採購法,30,37.4
3131,政府採購法,30,36.9
1979,政府採購法,30,33.7
