# Классификатор на основе модели от Google

Модель:
*  https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [0]:
from gensim.models import KeyedVectors
from keras.layers.core import Dense, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
import codecs
import pandas as pd
import gensim
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Activation
from keras.models import Model
import sklearn
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import save_model, load_model
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import RegexpTokenizer

In [3]:
# Colab-only step: mount Google Drive so files stored there become readable.
# NOTE(review): later cells use relative 'drive/My Drive/...' paths, which
# presumably resolve because the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Resources shared by all tokenize() calls. They are created lazily on the
# first call, so defining the function does not need the NLTK data yet.
_TOKENIZE_RESOURCES = {}

def tokenize(text):
  """Lowercase `text`, split it into words and drop English stop words.

  The tokenizer object and the stop-word set are built once and reused,
  instead of being re-created (and the stop-word list re-read) on every call.

  Args:
    text: string to tokenize.

  Returns:
    List of lowercase tokens matching ``[a-zA-Z]+`` that are not in NLTK's
    English stop-word list, in their original order.
  """
  if "stop_words" not in _TOKENIZE_RESOURCES:
    # "stop_words" is stored last: if loading it fails, the next call retries.
    _TOKENIZE_RESOURCES["regex_tokenizer"] = RegexpTokenizer('[a-zA-Z]+')
    _TOKENIZE_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))

  words = _TOKENIZE_RESOURCES["regex_tokenizer"].tokenize(text.lower())
  stop_words = _TOKENIZE_RESOURCES["stop_words"]
  return [w for w in words if w not in stop_words]

In [0]:
def transform_from_categorical(prediction, intents):
  """Translate predicted class ids back into intent labels.

  Args:
    prediction: array-like with a ``shape`` attribute; ``prediction[i]`` is
      the class id predicted for sample ``i``.
    intents: sequence of labels, indexed by class id.

  Returns:
    List with one label per entry along the first axis of `prediction`.
  """
  sample_count = prediction.shape[0]
  return [intents[prediction[pos]] for pos in range(sample_count)]

def transform_to_categorical(intents_for_each, unique_intents):
  """One-hot encode intent labels.

  Args:
    intents_for_each: iterable with the label of every sample.
    unique_intents: list of distinct labels; a label's position in this list
      is used as its class id.

  Returns:
    The result of ``to_categorical`` for the class ids, with
    ``len(unique_intents)`` classes.

  Raises:
    ValueError: if a label is not present in `unique_intents`.
  """
  class_ids = [unique_intents.index(label) for label in intents_for_each]
  class_count = len(unique_intents)
  return to_categorical(class_ids, class_count)

In [63]:
# Fetch the NLTK data packages used by this notebook.
# NOTE(review): tokenize() only calls stopwords.words() and RegexpTokenizer, so
# 'punkt' looks unnecessary for the cells shown here — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Загрузка данных и модели от Google

In [7]:
%%time
# Training data: a CSV whose 'text' and 'intents' columns are read further down.
file_csv = pd.read_csv('drive/My Drive/ForGensim/train.csv')
# Pretrained Google News word2vec vectors, loaded from the gzipped binary file.
# The recorded timing for this cell is close to two minutes.
word2vec_model = KeyedVectors.load_word2vec_format(
    "drive/My Drive/ForGensim//GoogleNews-vectors-negative300.bin.gz",
    binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 1min 46s, sys: 5.28 s, total: 1min 51s
Wall time: 1min 53s


In [0]:
# Number of entries in the pretrained word2vec vocabulary.
# NOTE(review): not referenced by any later cell shown here.
VOCAB_SIZE = len(word2vec_model.vocab)
EMBED_SIZE = 300    # embedding width: sizes the embedding matrix and the Embedding layer
NUM_FILTERS = 256   # number of Conv1D filters
NUM_WORDS = 3       # Conv1D kernel_size, i.e. consecutive tokens covered by one filter
BATCH_SIZE = 64     # batch size passed to model.fit
NUM_EPOCHS = 10     # number of epochs passed to model.fit

In [0]:
# Initial values for the corpus statistics computed further down.
counter = collections.Counter()  # token -> number of occurrences in the corpus
maxlen = 0                       # length, in tokens, of the longest sentence

Посмотрим на несколько слов из словаря модели word2vec

In [66]:
# Show ten vocabulary entries (iteration positions 15010-15019); islice avoids
# building a list of the whole vocabulary just to look at a small window.
from itertools import islice
list(islice(word2vec_model.vocab, 15010, 15020))

['John_F._Kennedy',
 'ideals',
 'insane',
 'Dow_Jones_Industrial_Average',
 'Guillen',
 'Established',
 'lip',
 'SS',
 'Drop',
 'prominence']

Работа с набором данных (удаление стоп-слов, получение частотного словаря, максимальной длины)

In [0]:
all_texts = file_csv['text']        # utterances: the classifier input
all_intents = file_csv['intents']   # intent label of each utterance: the target

In [0]:
# Tokenize every utterance; each entry of the result is the token list
# returned by tokenize() for one row.
sentences = np.array(all_texts.apply(tokenize))

In [0]:
# Corpus statistics, rebuilt from scratch inside this cell so that re-running
# it cannot double-count tokens or keep a stale maximum from an earlier run.
# Token frequencies over all tokenized sentences:
counter = collections.Counter(word for words in sentences for word in words)
# Length of the longest tokenized sentence (0 for an empty corpus); later used
# as the padded sequence length.
maxlen = max((len(words) for words in sentences), default=0)

Подготовка данных для обучения на основе модели от Google

In [0]:
# Number of rows in the embedding table: one per distinct token plus one extra.
# NOTE(review): the extra row is presumably for index 0, which pad_sequences
# uses as the padding value by default — confirm.
vocab_sz = len(counter) + 1

In [0]:
# Fit the Keras tokenizer on the already tokenized sentences (lists of words)
# to obtain a word -> integer index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [0]:
# Encode every token list as a list of integer word indices.
xs =  tokenizer.texts_to_sequences(sentences)

In [0]:
# One-hot targets; a class id is the label's position in all_intents.unique().
ys = transform_to_categorical(all_intents, all_intents.unique().tolist())

In [0]:
# Bring every index sequence to length maxlen so the inputs form one 2-D array.
X = pad_sequences(xs, maxlen=maxlen)
Y = ys  # alias of the one-hot targets

In [82]:
# Hold out 30% of the samples; the fixed random_state keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.3, random_state=42)
# Shapes of the four parts, in the order train X, test X, train Y, test Y.
print(*(part.shape for part in (Xtrain, Xtest, Ytrain, Ytest)))

(7961, 22) (3413, 22) (7961, 5) (3413, 5)


Создание keras модели

In [0]:
# Word -> index mapping learned by the Keras tokenizer. The name `word2index`
# was previously used here without being defined anywhere in the notebook.
word2index = tokenizer.word_index

# Embedding matrix initialised from the pretrained word2vec vectors: row i gets
# the vector of the word mapped to index i. Rows of words that word2vec does
# not know, and any row never assigned, stay all-zero.
embedding_weights = np.zeros((vocab_sz, EMBED_SIZE))
for word, index in word2index.items():
    try:
        vector = word2vec_model[word]
    except KeyError:
        # Word is absent from the pretrained vocabulary: keep the zero row.
        continue
    embedding_weights[index, :] = vector

In [0]:
# Number of distinct intent labels; used as the size of the output layer.
COUNT_CLASSES = all_intents.unique().shape[0]

In [0]:
# 1-D CNN text classifier: pretrained (but still trainable) embeddings ->
# spatial dropout -> convolution over NUM_WORDS-token windows ->
# global max pooling -> softmax over the intent classes.
model = Sequential([
    Embedding(
        vocab_sz,
        EMBED_SIZE,
        input_length=maxlen,
        weights=[embedding_weights],
        trainable=True,
    ),
    SpatialDropout1D(0.2),
    Conv1D(
        filters=NUM_FILTERS,
        kernel_size=NUM_WORDS,
        activation="relu",
    ),
    GlobalMaxPooling1D(),
    Dense(COUNT_CLASSES, activation="softmax"),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [86]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 22, 300)           2773500   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 22, 300)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 20, 256)           230656    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 1285      
Total params: 3,005,441
Trainable params: 3,005,441
Non-trainable params: 0
_________________________________________________________________


Обучение keras модели

In [87]:
# Checkpoint file on Drive, written with save_best_only enabled.
best_model_checkpoint = ModelCheckpoint(
    'drive/My Drive/ForGensim/google_keras_model.h5',
    save_best_only = True,
)

# NOTE(review): Xtest/Ytest serve as validation data here and are evaluated
# again after training, so no separate untouched test split exists.
history = model.fit(
    Xtrain,
    Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    callbacks=[best_model_checkpoint],
    validation_data=(Xtest, Ytest),
)

Train on 7961 samples, validate on 3413 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [88]:
# Evaluate the in-memory model on the held-out split. The print below treats
# score[0] as the loss and score[1] as the accuracy metric set in compile().
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

Test score: 0.015, accuracy: 0.995


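Точность модели на тестовой выборке — около 99,5 % (accuracy 0.995). Эта же выборка передавалась в `fit` как валидационная, поэтому оценка может быть несколько завышена.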

# Тестирование (проверка работы) классификатора на небольших данных

In [0]:
# Hand-written utterances used as a quick sanity check of the trained model.
test = ['Add track to my Digster Future Hits', 'what a temperature today in kirov', 'add to playlist my song my melody', 
        'What is the hottest temperature on earth right now?', 'Find a movie schedule for 12 hours from now', 'play ed sheeran song'] 

In [92]:
# Run the samples through the same tokenize() preprocessing as the training
# sentences before encoding them, so that training and inference share one
# set of token rules (letters only, lowercase, stop words removed).
sequences_test = tokenizer.texts_to_sequences([tokenize(sample) for sample in test])
sequences_test

[[2, 21, 576, 617, 111],
 [572, 147],
 [2, 3, 18, 193],
 [572, 1039, 231],
 [13, 5, 17, 87],
 [1, 7325, 18]]

In [0]:
# Pad the encoded samples to maxlen, the same length used for the training inputs.
X_predict = pad_sequences(sequences_test, maxlen=maxlen)

In [0]:
# Predicted class id of each sample: the index of its largest softmax output.
# This replaces Sequential.predict_classes(), which is no longer available in
# newer Keras releases; for a multi-class softmax output the result is the same.
prediction = np.argmax(model.predict(X_predict), axis=-1)

In [95]:
# Map class ids back to labels, using the same label order (all_intents.unique())
# that produced the one-hot targets.
answers = transform_from_categorical(prediction, all_intents.unique().tolist()) 
answers

['AddToPlaylist',
 'GetWeather',
 'AddToPlaylist',
 'GetWeather',
 'SearchScreeningEvent',
 'PlayMusic']