In [0]:
import pandas as pd

In [0]:
# Fetch the demo repository; later cells read `dianping.csv` and `zh.zip`
# from the directory this clone creates.
!git clone https://github.com/wshuyi/demo-chinese-text-classification-lstm-keras.git

In [0]:
from pathlib import Path

In [0]:
# Relative path of the cloned repository directory; data files are resolved
# against it in later cells.
mypath = Path().joinpath("demo-chinese-text-classification-lstm-keras")

In [0]:
# Load the CSV shipped inside the cloned repository into a DataFrame.
df = pd.read_csv(mypath.joinpath('dianping.csv'))

In [0]:
# Preview the first rows of the freshly loaded CSV.
df.head()

In [0]:
!pip install jieba

In [0]:
import jieba

In [0]:
# Segment every comment with jieba and store the tokens, joined by single
# spaces, in a new `text` column.
df['text'] = df['comment'].apply(lambda comment: " ".join(jieba.lcut(comment)))

In [0]:
# Check that the new space-separated `text` column was added.
df.head()

In [0]:
# Keep only the segmented text and its sentiment label; the other columns are
# not used by any later cell.
df = df.loc[:, ['text', 'sentiment']]

In [0]:
# Confirm only the `text` and `sentiment` columns remain.
df.head()

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [0]:
# maxlen:    length passed to `pad_sequences` for every sample.
# max_words: vocabulary cap passed to the tokenizer; also the number of rows
#            in the embedding matrix.
maxlen, max_words = 100, 10_000

In [0]:
# Fit a tokenizer capped at `max_words` on the segmented text, then convert
# each text into its list of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

In [0]:
# Inspect the container type returned by `texts_to_sequences`.
type(sequences)

In [0]:
# Show the first encoded sample only (the slice keeps the output short).
sequences[:1]

In [0]:
# Print the token count of the first five samples: the lengths differ, which
# is why the sequences are padded to a fixed length next.
for token_count in map(len, sequences[:5]):
  print(token_count)

In [0]:
# Bring every sequence to exactly `maxlen` entries. The library defaults are
# spelled out: padding and truncation both happen at the front.
data = pad_sequences(sequences, maxlen=maxlen, padding='pre', truncating='pre')

In [0]:
# Display the padded array returned by `pad_sequences`.
data

In [0]:
# Word -> integer index mapping built by the fitted tokenizer; used later to
# place pretrained vectors in the matching rows of the embedding matrix.
word_index = tokenizer.word_index

In [0]:
# Inspect the type of the tokenizer's vocabulary mapping.
type(word_index)

In [0]:
# Printing the whole vocabulary floods the notebook output; show only the
# first ten (word, index) pairs as a sample.
list(word_index.items())[:10]

In [0]:
# Copy the sentiment column into a standalone NumPy array of targets.
labels = np.array(df['sentiment'])

In [0]:
# Display the label array before shuffling.
labels

In [0]:
# Shuffle samples and labels with one shared permutation so every row keeps
# its own label. A locally seeded generator makes the train/validation split
# reproducible on Restart & Run All without touching NumPy's global state.
# Note: re-running this cell alone permutes the already-shuffled arrays again.
SHUFFLE_SEED = 42
indices = np.random.default_rng(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [0]:
# Display the label array again, now in shuffled order.
labels

In [0]:
# 80 % of the rows (rounded down) go to training; the rest form the
# validation set.
n_samples = len(indices)
training_samples = int(0.8 * n_samples)
validation_samples = n_samples - training_samples

In [0]:
# Number of rows assigned to the training split.
training_samples

In [0]:
# Number of rows assigned to the validation split.
validation_samples

In [0]:
# Cut the shuffled arrays into contiguous training and validation ranges;
# the same row ranges are applied to features and labels.
train_rows = slice(None, training_samples)
valid_rows = slice(training_samples, training_samples + validation_samples)

X_train, y_train = data[train_rows], labels[train_rows]
X_valid, y_valid = data[valid_rows], labels[valid_rows]

In [0]:
# Display the training feature array.
X_train

In [0]:
!pip install gensim

In [0]:
from gensim.models import KeyedVectors

In [0]:
# Archive inside the cloned repository; it is extracted in the next cell.
myzip = mypath.joinpath('zh.zip')

In [0]:
!unzip $myzip

In [0]:
# Load the word vectors from `zh.vec`, read as the text (non-binary)
# word2vec format -- the loader's default, written out here.
zh_model = KeyedVectors.load_word2vec_format('zh.vec', binary=False)

In [0]:
# Look at the first row of the loaded vector table.
zh_model.vectors[0]

In [0]:
# First five vocabulary keys in the model's stored order. gensim 4.0 removed
# `KeyedVectors.vocab` (accessing it raises AttributeError); `index_to_key`
# replaces it there, and `index2word` is the same list on gensim 3.x.
(zh_model.index_to_key if hasattr(zh_model, "index_to_key") else zh_model.index2word)[:5]

In [0]:
# Dimensionality of each word vector. `vector_size` is available on both
# gensim 3.x and 4.x, unlike `.vocab`, which gensim 4.0 removed.
zh_model.vector_size

In [0]:
# Width of the embedding matrix = dimensionality of the pretrained vectors.
# Read it from `vector_size` rather than measuring a vector looked up through
# `.vocab`, which gensim 4.0 removed.
embedding_dim = zh_model.vector_size

In [0]:
# Start from uniform random values in [0, 1): one row per vocabulary index,
# one column per embedding dimension. Rows that get no pretrained vector
# later keep their random initialisation.
embedding_matrix = np.random.random_sample((max_words, embedding_dim))

In [0]:
# Display the randomly initialised matrix (values in [0, 1)).
embedding_matrix

In [0]:
# Map the [0, 1) samples onto [-1, 1).
# NOTE: this cell is not idempotent -- running it again rescales the values
# that were already rescaled.
embedding_matrix = (embedding_matrix - 0.5) * 2

In [0]:
# Display the matrix after rescaling.
embedding_matrix

In [0]:
# Look up the pretrained vector of a single word.
zh_model.get_vector('的')

In [0]:
# `get_vector` raises KeyError for a word missing from the pretrained
# vocabulary. Catch it so that this demonstration lookup cannot abort a
# Restart & Run All; the vector is printed if the word does exist.
try:
    print(zh_model.get_vector("王树义"))
except KeyError as err:
    print(f"KeyError: {err}")

In [0]:
# Copy the pretrained vector of every tokenizer word whose index fits inside
# the matrix. Words absent from the pretrained model keep their random row.
for word, i in word_index.items():
    if i < max_words:
        try:
            embedding_matrix[i] = zh_model.get_vector(word)
        except KeyError:
            # Only "word not in the pretrained vocabulary" is expected here;
            # any other error (e.g. a shape mismatch) should surface instead
            # of being silently swallowed by a bare `except`.
            continue
            

In [0]:
# Display the matrix after the pretrained vectors have been copied in.
embedding_matrix

In [0]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM

# Size of the LSTM layer's output.
units = 32

# Embedding -> LSTM -> single sigmoid unit, declared in one go.
model = Sequential([
    Embedding(max_words, embedding_dim),
    LSTM(units),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [0]:
# Load the prepared matrix into the first (embedding) layer and freeze it so
# that training does not update those weights.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [0]:
# Configure training: binary cross-entropy loss, RMSprop optimizer, and
# accuracy reported under the key 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train for 10 epochs in batches of 32, evaluating on the held-out split after
# each epoch; `history` keeps the per-epoch metrics for the plots below.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

# Persist the trained model to disk.
model.save("mymodel.h5")

In [0]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by `model.fit`.
logs = history.history
acc, val_acc = logs['acc'], logs['val_acc']
loss, val_loss = logs['loss'], logs['val_loss']

# Epoch numbers for the x-axis, starting at 1.
epochs = range(1, 1 + len(acc))


def _plot_curves(train_values, valid_values, metric, title):
    """Draw training (dots) and validation (line) curves on the current figure."""
    plt.plot(epochs, train_values, 'bo', label=f'Training {metric}')
    plt.plot(epochs, valid_values, 'b', label=f'Validation {metric}')
    plt.title(title)
    plt.legend()


_plot_curves(acc, val_acc, 'acc', 'Training and validation accuracy')

# Open a second figure so the loss curves do not land on the accuracy plot.
plt.figure()
_plot_curves(loss, val_loss, 'loss', 'Training and validation loss')

plt.show()