# -*- encoding:utf-8 -*-
"""
@author: Mr.zhang
@file: net.py
@time: 20-3-18 10:16 AM
@description: Builds the pre-trained word-embedding matrix and the CNN text-classification model.
"""
import os
import numpy as np
import config
import codecs
from keras import Sequential, regularizers
from keras_preprocessing import text
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense


class Init_embedded_matrix():
    def __init__(self):
        self.embedded_matrix_size = config.embedded_matrix_size
        self.tokenizer = self.get_tokenizer()
        self.word_embedding = self.loading_Word_Embedding()

    def get_tokenizer(self):
        """Restore the fitted tokenizer from the JSON file written by the data-preprocessing step."""
        self.tokenizer_path = config.data_preprocessing_config().tokenizer_path
        if not os.path.exists(self.tokenizer_path):
            print("please run data_preprocessing to generate '{}'".format(self.tokenizer_path))
            exit()
        with open(self.tokenizer_path, "r") as f:
            tokenizer_json = f.read()
        tokenizer = text.tokenizer_from_json(tokenizer_json)
        return tokenizer
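
    # A minimal sketch (an assumption, not code from this repo) of how the
    # tokenizer JSON consumed above is typically produced by the
    # data_preprocessing step, using the same keras_preprocessing API:
    #
    #   from keras_preprocessing.text import Tokenizer
    #   tokenizer = Tokenizer(num_words=config.embedded_matrix_size)
    #   tokenizer.fit_on_texts(train_texts)  # train_texts: list of raw strings
    #   with open(config.data_preprocessing_config().tokenizer_path, "w") as f:
    #       f.write(tokenizer.to_json())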

    def loading_Word_Embedding(self):
        """
        Load the pre-trained word vectors (wiki.zh.vec) into a dict.
        :return: embeddings_index mapping word -> np.array of shape (300,)
        """
        from tqdm import tqdm
        print('loading word embeddings...')
        embeddings_index = {}  # word -> vector dict
        with codecs.open(config.wiki_zh_vec_path, encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.rstrip().rsplit(' ')  # strip trailing whitespace, then split on spaces
                if len(values) < 3:
                    continue  # skip the "<vocab_size> <dim>" header line that fastText .vec files usually start with
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')  # word vector as np.array
                embeddings_index[word] = coefs
        print('found %s word vectors' % len(embeddings_index))  # e.g. 111052 word vectors
        return embeddings_index
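
    # Illustrative line format of the vectors loaded above (values are made up):
    #   "<word> <v1> <v2> ... <v300>", one word per line, e.g.
    #   中国 0.0123 -0.4520 ... 0.0881
    # fastText .vec exports usually begin with a "<vocab_size> <dim>" header
    # line, which the length check in the loop above skips.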

    def get_embedding_matrix(self, embed_dim=300):
        """
        :param embed_dim: embedding dimension, default 300
        :return: (word_matrix, nb_words, embed_dim)

        word_index:     {word1: 1, word2: 2, word3: 3, ...}
        word_embedding: {word1: [300,], word2: [300,], word3: [300,], ...}
        word_matrix:    array of shape (nb_words, 300); row i holds the vector of the word whose index is i
        """
        # Prepare the embedding matrix, limited to a maximum number of words.
        print('preparing embedding matrix...')
        words_not_found = []  # words without a pre-trained vector
        # cap the matrix at the smaller of the configured size and the actual vocabulary size
        nb_words = min(self.embedded_matrix_size, len(self.tokenizer.word_index))
        word_matrix = np.zeros((nb_words, embed_dim))  # zero-initialised matrix, e.g. shape (100000, 300)
        for word, i in self.tokenizer.word_index.items():
            if i >= nb_words:  # indices beyond the cap are ignored
                continue
            embedding_vector = self.word_embedding.get(word)  # (300,) vector from wiki.zh.vec, or None
            if embedding_vector is not None and len(embedding_vector) > 0:
                word_matrix[i] = embedding_vector
            else:
                # words not found in the embedding index keep their all-zero row
                words_not_found.append(word)
        print('number of null word embeddings: %d' % np.sum(np.sum(word_matrix, axis=1) == 0))  # rows that stayed all zero
        # It's interesting to look at the words not found in the embeddings
        if words_not_found:
            print("sample words not found: ", np.random.choice(words_not_found, min(100, len(words_not_found))))
        return word_matrix, nb_words, embed_dim


class Model_init_parameter_config():
    def __init__(self, data, column_names):
        self.data = data
        self.data_column_name = column_names[0]
        self.class_column_name = column_names[1]
        self.num_class = self.get_num_class()
        self.input_max_seq_len = config.input_max_seq_len

    def get_num_class(self):
        """Count the distinct labels in the class column."""
        label_names = self.data[self.class_column_name].unique()
        num_classes = len(label_names)
        print("label_names:", label_names)
        return num_classes


class Init_model():
    def __init__(self):
        self.embedded_matrix, self.nb_words, self.embed_dim = Init_embedded_matrix().get_embedding_matrix()

    def Model(self,
              num_classes,
              input_max_seq_len,
              num_filters=512,
              weight_decay=1e-4):
        self.model = Sequential()
        # frozen embedding layer initialised with the pre-trained matrix
        self.model.add(Embedding(self.nb_words, self.embed_dim,
                                 weights=[self.embedded_matrix],
                                 input_length=input_max_seq_len,
                                 trainable=False,
                                 name="input"))
        self.model.add(Dropout(0.2))
        self.model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(256, activation='relu'))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(16, activation="relu", kernel_regularizer=regularizers.l2(weight_decay)))
        self.model.add(Dense(num_classes, activation="sigmoid", name="output"))
        self.model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
        return self.model
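
    # Note: sigmoid + binary_crossentropy scores every class independently
    # (a multi-label setup). If the labels are mutually exclusive, a common
    # alternative (not what this file does) would be:
    #   self.model.add(Dense(num_classes, activation="softmax", name="output"))
    #   self.model.compile(optimizer="adam", loss="categorical_crossentropy",
    #                      metrics=["accuracy"])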


if __name__ == '__main__':
    import pandas as pd

    data = pd.read_csv(config.train_data_path, sep=",", header=0)
    model_parame = Model_init_parameter_config(data, config.column_names)
    print(model_parame.get_num_class())
    model = Init_model().Model(model_parame.num_class, model_parame.input_max_seq_len)
    model.summary()
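
    # A hedged sketch of the next step (not present in this file): turn the raw
    # text column into padded index sequences and fit the model. The calls below
    # exist in keras_preprocessing/pandas; the batch size and epoch count are
    # illustrative assumptions.
    #
    #   from keras_preprocessing.sequence import pad_sequences
    #   tokenizer = text.tokenizer_from_json(
    #       open(config.data_preprocessing_config().tokenizer_path).read())
    #   seqs = tokenizer.texts_to_sequences(data[model_parame.data_column_name].astype(str))
    #   x = pad_sequences(seqs, maxlen=model_parame.input_max_seq_len)
    #   y = pd.get_dummies(data[model_parame.class_column_name]).values
    #   model.fit(x, y, batch_size=64, epochs=5, validation_split=0.1)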