In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
EMBEDDING_FILE='F:/BaiduDiskDownload/sgns.zhihu.word'
TRAIN_DATA_FILE='data/trainingset.csv'
TEST_DATA_FILE='data/testa.csv'

In [3]:
flag = 0
for big_file in open(EMBEDDING_FILE, encoding='utf-8'):
    flag = flag + 1
    print(big_file[:100])
    if flag == 10:
        break

， -0.141209 0.270277 0.315273 0.063904 0.018635 0.188373 0.018318 0.143256 -0.110122 -0.056711 -0.12
的 -0.063668 0.245708 0.077410 -0.232004 0.074862 0.269867 -0.032787 0.127365 0.049791 -0.012506 -0.2
。 -0.141476 0.323197 0.234520 0.035564 0.112336 0.233337 0.019396 0.034489 -0.094744 -0.184300 -0.09
了 0.069129 0.268287 0.020775 0.001612 0.076566 0.150568 -0.144764 -0.046387 0.015076 0.224718 0.1939
和 -0.100068 0.259799 -0.227210 -0.212539 -0.126704 0.123879 -0.152488 0.242198 -0.302427 0.303838 -0
是 -0.166937 0.368780 0.165970 -0.074695 0.277038 0.228472 -0.157383 0.115142 -0.321472 0.004853 -0.0
、 -0.080736 0.240971 -0.020342 0.056467 0.121436 0.253740 -0.095139 0.062403 0.059077 0.058006 -0.12
一个 -0.326003 0.425066 -0.205147 -0.175781 0.140471 0.097350 -0.167934 0.295831 -0.138174 0.138699 -0
我 0.005664 0.407404 -0.153532 0.161316 0.168288 0.259601 -0.267191 0.001471 -0.149739 -0.156568 -0.1
有 -0.524946 0.342831 0.059886 0.433878 0.533997 -0.003256 -0.038177 -0.012324 -0.448267 0.2

In [12]:
embed_size = 300 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1000 # max number of words in a comment to use

In [5]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

In [6]:
sentiments = train.columns.tolist()[2:]

In [7]:
list_sentences_train = train["content"].fillna("_na_").values
y = train[sentiments].values
list_sentences_test = test["content"].fillna("_na_").values

In [8]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [9]:
from tqdm import tqdm as tq

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in tq(open(EMBEDDING_FILE, encoding='utf-8')))

259922it [00:27, 9604.52it/s] 


In [10]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.0040492876, 0.17262922)

In [13]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in tq(word_index.items()):
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

100%|█████████████████████████████████████████████████████████████████████| 717350/717350 [00:00<00:00, 1789228.16it/s]


In [14]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(300, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)

Train on 94500 samples, validate on 10500 samples
Epoch 1/2
 1088/94500 [..............................] - ETA: 32:29:57 - loss: 0.7282 - acc: 0.137 - ETA: 32:08:25 - loss: 0.5702 - acc: 0.119 - ETA: 31:38:10 - loss: 0.3666 - acc: 0.108 - ETA: 31:26:31 - loss: 0.1242 - acc: 0.102 - ETA: 31:32:13 - loss: -0.1699 - acc: 0.10 - ETA: 31:44:23 - loss: -0.4852 - acc: 0.09 - ETA: 31:56:35 - loss: -0.8388 - acc: 0.09 - ETA: 32:17:33 - loss: -1.3237 - acc: 0.09 - ETA: 32:32:35 - loss: -1.8408 - acc: 0.09 - ETA: 32:43:13 - loss: -2.3854 - acc: 0.09 - ETA: 32:47:17 - loss: -2.9452 - acc: 0.09 - ETA: 32:56:29 - loss: -3.6323 - acc: 0.09 - ETA: 33:08:41 - loss: -4.2479 - acc: 0.10 - ETA: 33:21:51 - loss: -5.0060 - acc: 0.10 - ETA: 33:32:36 - loss: -5.6791 - acc: 0.10 - ETA: 33:46:27 - loss: -6.3902 - acc: 0.10 - ETA: 33:56:08 - loss: -6.9867 - acc: 0.10 - ETA: 34:08:20 - loss: -7.5973 - acc: 0.10 - ETA: 34:15:52 - loss: -8.1563 - acc: 0.10 - ETA: 34:25:31 - loss: -8.6568 - acc: 0.10 - ETA: 34:33:46

KeyboardInterrupt: 

In [None]:
# loss不收敛，放弃LSTM