In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.nan)

import jieba
jieba.set_dictionary('dict.txt')                  # 繁體字典
jieba.load_userdict('special_v3.txt')             # 法律專用詞字典
#with open('stop.txt', encoding='utf_8') as f:    # 停止字字典
#    stops = f.read().split('\n')
import re

import gensim
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score

import keras
from keras import Input
from keras import backend as K
from keras.layers import *
from keras.models import *
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD, Adam
from keras.regularizers import l2
from keras.engine.topology import Layer
from keras.backend.tensorflow_backend import set_session
from keras.utils import to_categorical, plot_model

Building prefix dict from C:\Users\user\Desktop\Python\判決書\損害賠償\dict.txt ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.u199ece6fffe78be19b7dda1547b04bc1.cache
Loading model cost 0.640 seconds.
Prefix dict has been built succesfully.
Using TensorFlow backend.


### 資料前置

In [2]:
df = pd.read_csv('CFD_all_Y_v4.csv',encoding='utf_8_sig')

In [3]:
# df為所有損害賠償判決書  DF為分類標籤整理為1:1的判決書資料

df = df.dropna(subset = ['案件描述','判決主文類別'])
df = df[df['判決主文類別'] != 3]
df1 = df[df['判決主文類別'] == 1]
df2 = df[df['判決主文類別'] == 2]
df3 = df1.sample(len(df2), random_state=87)
DF = pd.concat([df3, df2])
DF['判決主文類別'] = DF['判決主文類別']-1
DF = DF.sample(frac=1)                       # 打亂順序

DF['判決主文類別'].value_counts()             # 1代表原告訴求駁回

1    4531
0    4531
Name: 判決主文類別, dtype: int64

### 文字處理

In [4]:
# 主文斷詞斷句

main_text = DF['案件描述'].tolist()
cat_list = DF['判決主文類別'].tolist()

# 主文斷詞  ['第一篇', '第二篇']
main_text_list = [' '.join(jieba.cut( txt, cut_all=False)) for txt in main_text]

# 主文斷句  [['第一篇第一句','第一篇第二句'], ['第二篇第一句','第二篇第二句']]
main_sentence_list = [re.split('，|。', i) for i in main_text_list]             

In [5]:
# 重要參數

MAX_SENT_LENGTH = 20    # 單句最大詞數
MAX_SENTS = 100         # 單篇最多句數
MAX_NB_WORDS = 150000   # 字典上限
EMBEDDING_DIM = 100     # 詞向量維度

In [7]:
# 文字資料前處理

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(main_text_list)

# 訂好矩陣形狀 (樣本數, 單篇文章最多句數, 單句最大詞數)
data = np.zeros((len(main_text_list), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')   

# 多切掉少補零 (補在後面)
for i, sentences in enumerate(main_sentence_list):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    data[i,j,k] = tokenizer.word_index[word]
                    k=k+1                    
word_index = tokenizer.word_index

labels = to_categorical(np.asarray(cat_list))
print('data:(樣本數, 單篇最多句數, 單句最大詞數)', data.shape)
print('labels:', labels.shape)

data:(樣本數, 單篇最多句數, 單句最大詞數) (9062, 100, 20)
labels: (9062, 2)


### 切訓練、驗證、測試資料

In [8]:
VALIDATION_SPLIT = 0.1     # 驗証資料比例
TEST_SPLIT = 0.1           # 測試資料比例

In [9]:
# 訓練資料 驗證資料 測試資料

p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))

x_train = data[:p1]
y_train = labels[:p1]

x_val = data[p1:p2]
y_val = labels[p1:p2]

x_test = data[p2:]
y_test = labels[p2:]

print( 'train : '+str(len(x_train)) )
print( 'val : '+str(len(x_val)) )
print( 'test : '+str(len(x_test)) )

train : 7249
val : 906
test : 907


In [10]:
# 全零 Array，用於Attention層的非監督學習

y_train_sent_score = np.zeros((len(y_train), MAX_SENTS))
y_train_word_score = np.zeros((len(y_train), MAX_SENT_LENGTH))
y_val_sent_score = np.zeros((len(y_val), MAX_SENTS))
y_val_word_score = np.zeros((len(y_val), MAX_SENT_LENGTH))

## 模型建構

### 詞向量-Word2Vec

In [16]:
# 排列順序按照Tokenizer在fit之後的詞順序，作為權重餵給 embedding Layer

# 讀取預先訓練好的word2vec模型
w2v_model = Word2Vec.load("w2v_model.model")

# 因Keras保留一層全零層所以需要加1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))                    

# 將word_list對應到word2vec詞向量
for word, i in word_index.items(): 
    if str(word) in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[str(word)],dtype='float32')       
        
# 設定word2vec為embedding層
embedding_layer = Embedding(len(word_index) + 1,                                    
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],                              
                            input_length=MAX_SENT_LENGTH,
                            trainable=False)  # 已經訓練過word2vec了，因此不加入Keras訓練

### Attention層

In [12]:
# 定義Attention層

'''
代碼來源：https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d
論文：https://arxiv.org/abs/1512.08756

'''

def dot_product(x, kernel):
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

class Attention(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(a)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

### Keras建構模型

In [17]:
# w2v_GRU_Attention

"""
模型結構：
 案件描述輸入 > 句子輸入 > word2vec詞向量 > 雙向GRU > Attention層 (產出詞重要度) > 連接層 >
                                         雙向GRU > Attention層 (產出句子重要度) > 連接層 > 判決結果
model input: 案件描述
model output: 判決結果、句子重要度

model2 input: 句子
model2 output: 詞重要度

"""

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
l_embedded = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, dropout=0.2, return_sequences=True))(l_embedded)
l_att, word_scores = Attention(return_attention=True)(l_lstm)
l_dense = Dense(100, activation='relu')(l_att)
Word_Model = Model(sentence_input, l_dense)

verdict_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
verdict_encoder = TimeDistributed(Word_Model)(verdict_input)
l_lstm_sent = Bidirectional(GRU(100, dropout=0.2, return_sequences=True))(verdict_encoder)
l_att_sent, sentence_scores = Attention(return_attention=True)(l_lstm_sent)
l_dense_sent = Dense(100, activation='relu')(l_att_sent)
preds = Dense(labels.shape[1], activation='softmax')(l_dense_sent)


model = Model(verdict_input, output=(preds, sentence_scores))
model2 = Model(sentence_input, output=(word_scores))

model.compile(loss='binary_crossentropy', loss_weights=[1,0], optimizer='rmsprop', metrics=['acc'])

In [18]:
# 模型一 : 分類結果與句的權重
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100, 20)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 100)          13715820  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 200)          120600    
_________________________________________________________________
attention_3 (Attention)      [(None, 200), (None, 100) 300       
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 202       
Total params: 13,857,022
Trainable params: 282,122
Non-trainable params: 13,574,900
__________________________________________________________

In [19]:
# 模型二 : 字的權重
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 20, 100)           13574900  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 20, 200)           120600    
_________________________________________________________________
attention_2 (Attention)      [(None, 200), (None, 20)] 220       
Total params: 13,695,720
Trainable params: 120,820
Non-trainable params: 13,574,900
_________________________________________________________________


In [22]:
# FIT起來 !!!!!

model.fit(x_train, [y_train, y_train_sent_score], epochs=3, validation_data=([x_val, [y_val, y_val_sent_score]]))

Train on 7249 samples, validate on 906 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2123d55d7b8>

In [23]:
# 模型測試資料判決準確率

Y_test = y_test.dot([0,1])
test_pred_label = np.argmax(model.predict(x_test)[0], axis=1)

accuracy=accuracy_score(Y_test ,test_pred_label)
print('Test Data Acc : '+str(accuracy))

Test Data Acc : 0.8302094818081588


### 輸出model結果(判決預測、句子重要度)

In [53]:
# 測試資料丟模型跑結果

y_test_pred = model.predict(x_test)

y_test_pred_label = y_test_pred[0]    # 判決結果
y_test_pred_score = y_test_pred[1]    # 句子重要度

In [60]:
# 測試用文章

test_victor = 150

In [61]:
# y_test_pred_score[test_victor]        # 測試用文章句子重要度
# y_test_pred_score[test_victor].sum()  # 權重相加要約等於1 (會有np浮點數計算誤差)

In [62]:
# 將數字映回文字

sentences = x_test[test_victor].tolist()
reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) # 將 word_index 字典反過來

def sequence_to_text(list_of_indices):
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)
sent_list = list(map(sequence_to_text, sentences)) # 對映回來

In [63]:
# 整理為 dataframe

df_match_attscore = pd.DataFrame()
sent_se = pd.Series(sent_list)
df_match_attscore['句子'] = sent_se.values
weight_list = y_test_pred_score[test_victor].tolist()
weight_se = pd.Series(weight_list)
df_match_attscore['分數'] = weight_se.values

In [64]:
# 判決結果

y_test_pred_label[test_victor]

array([0.781666  , 0.21833403], dtype=float32)

In [65]:
# 句子重要度

pd.set_option("display.max_rows",100)
df_match_attscore

Unnamed: 0,句子,分數
0,"[原告, 主張, ：, 被告, 羅栩亮, 為, 兆良, 科技, 股份有限公司, （, 下稱,...",0.031984
1,"[被告, 黃泳學, 為, 投資, 未, 上市, 、, 上櫃, 公司股票, 並, 非法, 經營...",0.031984
2,"[兆良, 公司, 透過, 電話, 承銷, None, None, None, None, N...",0.031961
3,"[並, 利用, 媒體, 向, 投資人, 喊話, None, None, None, None...",0.031883
4,"[宣稱, 兆良, 公司, 和, 國際, 醫療, 器材, 大, 廠, 合作, None, No...",0.031937
5,"[簽署, 備忘錄, None, None, None, None, None, None, ...",0.031939
6,"[原告, 受此, 吸引, 乃, 投資, 購買, 兆良, 公司, 未, 上市, 股票, Non...",0.031985
7,"[惟, 兆良, 公司, 確屬, 空殼, 公司, 而, 有, 詐騙, 投資人, 之, 事實, ...",0.031988
8,"[原告, 因, 被告, 共同, 故意, 詐欺, 之, 行為, 而, 受騙, None, No...",0.031988
9,"[先後, 於, 民國, 103, 年, 6, 月, 3, 日, 、, 同年, 8, 月, 1...",0.031988


In [72]:
# 存檔
# df_match_attscore.to_csv('案例1Attention.csv', encoding='utf_8_sig')