In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jieba 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.dia import dia_matrix
from scipy.sparse import vstack 
import re

### Preprocessing

In [2]:
# https://clay-atlas.com/blog/2019/09/24/python-chinese-tutorial-ckiptagger/
# from ckiptagger import WS, POS, NER

# ws = WS("./data")
# pos = POS("./data")
# ner = NER("./data")

def remove_punctuation(text):
    rule = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')
    text = rule.sub(' ',str(text))
    return text
# https://codertw.com/%E7%A8%8B%E5%BC%8F%E8%AA%9E%E8%A8%80/356827/

# Define a function for cutting words 
def chinese_word_seg(text):
    return " ".join(jieba.cut(text,HMM = True)) # using HMM method

def Preprocess(text):
    text = chinese_word_seg(remove_punctuation(text))
    text = ' '.join(text.split())
    return(text)

In [82]:
df = pd.read_excel("200801至202008缺失類型.xlsx", sheets = "法規")

def df_Preprocessing(df):
    # rename columns
    df.columns = [col.split("\n",1)[1] for col in df.columns]
    # select None-NA data
    df = df[(df.CE_Item2.isnull()==False) & (df.Ex_Tittle.isnull() == False)]
    # drop useless columns
    df.drop(columns = ['CE_CI_Block','CE_Item3','CE_Item4','CE_Item5'],inplace = True)
    df = df[df.Ex_Tittle.isin(['政府採購法','政府採購法施行細則','採購評選委員會審議規則','採購評選委員會組織準則'])]
#     df["tokenize"] = df.CE_Comment.apply(Preprocess)
    df['CE_Item2'] = df['CE_Item2'].apply(lambda string: str(string).partition(' ')[0])
#     df['recommend']= df.apply(lambda x: '%s_%s' % (x['Ex_Tittle'],x['CE_Item2']),axis = 1)
    return df

df = df_Preprocessing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [89]:
df.CE_Comment.apply(remove_punctuation).iloc[60]

'本次契約變更案部分內容係增列 水保監造服務費  惟本採購案原屬細部設計顧問服務 如當初評選項目僅針對廠商提供之 細部設計顧問 DDC  標的為主 契約變更將 監造服務 納入 是否符合政府採購法第22條第1項第4款 原有採購 之規定 請檢討澄釋  政府採購法第22條第1項第4款 '

### CountVecotizer (Text Frequency)

In [4]:
tf_vectorizer = CountVectorizer(strip_accents = None)
tf = tf_vectorizer.fit_transform(df['tokenize'])

In [5]:
words = np.array(tf_vectorizer.get_feature_names())
weight = tf.toarray().sum(axis = 0)
wordfq = pd.DataFrame({'word':words,'freq':weight})
wordfq.sort_values(by = 'freq',inplace = True, ascending = False)

### Tf-idf
https://stackabuse.com/text-classification-with-python-and-scikit-learn/ <br>
https://stackoverflow.com/questions/44461931/adding-a-new-document-to-the-term-document-matrix-for-similarity-calculations

In [58]:
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(df['tokenize'])
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
vocab = tf.vocabulary_

In [81]:
newtext = "一直新增唷"
def addtext(text, vocab = vocab, tfidf_matrix = tfidf_matrix):
    tf = TfidfVectorizer(vocabulary = vocab)
    new_tf = tf.fit_transform([Preprocess(text)]) 
    # print(p_new.shape)
    new_tfidf_matrix = vstack([tfidf_matrix,new_tf])
    cossim = cosine_similarity(new_tfidf_matrix, new_tfidf_matrix)
    newvocab = tf.vocabulary_
    return newvocab, new_tfidf_matrix, cossim 

vocab, tfidf_matrix, cos_sim = addtext(new_text)

## Note
1. 先新增至tfidf找相近text 
2. 對應至df的law
3. 經專業人員判斷後再填寫正確法律回dataframe <br>
https://stackoverflow.com/questions/8897593/how-to-compute-the-similarity-between-two-text-documents

### Recommendation System

transform back to dataframe - law

In [18]:
indices = pd.Series(df.index)

def recommendations(entered, cosine_sim = cos_sim):
    recommended_law = []
    index = indices[indices == entered].index[0]
    similarity_scores = pd.Series(cosine_sim[index].sort_values(ascending = False))
    top_10_law = list(similarity_scores.iloc[1:11].index)
    for i in top_10_law:
        recommended_law.append(list(df.index)[i])
    return recommended_law
# https://www.geeksforgeeks.org/movie-recommender-based-on-plot-summary-using-tf-idf-vectorization-and-cosine-similarity/