In [1]:
import pandas as pd
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

Загружаем список товаров из файла

In [2]:
# The file's first column has no header and mirrors the row numbers (it shows
# up as "Unnamed: 0" when read with defaults) -- presumably a DataFrame index
# that was written out on export. Load it as the index instead of keeping it
# as a meaningless extra column.
data = pd.read_csv('goods_10K.csv', index_col=0)
data.head()

Unnamed: 0,name,category
0,Поло Print Bar,10
1,Футболка VseMayki.Ru,10
2,Майка Print Bar,10
3,"Friedrich Wilhelm Hermann Wagener ""Staats- un...",7
4,Шорты Quiksilver,10


Задаем список классов: индекс в массиве соответствует числу в колонке **category**

In [3]:
# Human-readable class names. The position of a name in this list is the
# integer stored in the `category` column (e.g. index 10 is the clothing
# category), so the order must not be changed. len(labels) is also used as
# the size of the model's output layer.
labels = [
    'Авто',
    'Товары для здоровья',
    'Электроника',
    'Бытовая техника',
    'Строительство и ремонт',
    'Товары для дома',
    'Детские товары',
    'Досуг и развлечения',
    'Компьютерная техника',
    'Товары для красоты',
    'Одежда, обувь и аксессуары',
    'Продукты',
    'Спорт и отдых',
    'Дача, сад и огород',
    'Товары для животных',
]

Разобьем данные на 2 набора: тренировочный и тестовый

In [4]:
# Inputs are the product names; targets are the integer category ids.
X, y = data['name'], data['category']

In [5]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y,
)

Преобразуем метки классов (номера категорий) в one-hot векторы

In [8]:
# Peek at the raw integer category ids before they are one-hot encoded.
y_train.head()

5985    10
3641    10
6765    10
3578    10
5491    10
Name: category, dtype: int64

In [9]:
# One-hot encode the integer category ids. num_classes is pinned to the size
# of `labels` so both matrices always have exactly one column per known class.
# Left unset, to_categorical sizes its output from the largest id present in
# each array, so the train and test matrices could end up with different
# widths (and disagree with the Dense(len(labels)) output layer) whenever the
# highest-numbered category is missing from one of them.
y_train_final = to_categorical(y_train, num_classes=len(labels))
y_test_final = to_categorical(y_test, num_classes=len(labels))

In [10]:
# Show the first ten one-hot encoded targets.
print(y_train_final[:10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]


In [11]:
# Sanity check: each split must have as many target rows as input rows.
print(
    f"{X_train.shape} {y_train_final.shape}",
    f"{X_test.shape} {y_test_final.shape}",
    sep="\n",
)

(7000,) (7000, 15)
(3000,) (3000, 15)


In [12]:
# Maximum number of unique tokens (words) kept in the vocabulary
MAX_WORDS = 50000
# Placeholder token substituted for words that are not in the vocabulary
OOV_TOKEN = '<OOV>'
# Maximum sentence length, in tokens
MAX_SEQUENCE_LENGTH = 10
# Size of each word-embedding vector
EMBEDDING_DIM = 100
# How many times the whole training set is run through during training
EPOCHS = 3
# Number of samples per gradient update
BATCH_SIZE = 16
# Seed TensorFlow's global random generator so that weight initialisation and
# dropout are repeatable from run to run. The data split is already seeded via
# random_state; without this the training itself was not reproducible.
# NOTE(review): assumes TensorFlow 2.x, where tf.random.set_seed is available.
SEED = 42
tf.random.set_seed(SEED)

In [13]:
tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    # Strip punctuation and digits. The backslash is spelled `\\` because a
    # bare `\]` is an invalid escape sequence in a normal string literal
    # (a SyntaxWarning on recent Python versions); the resulting runtime
    # string is unchanged.
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~0123456789',
    lower=True,
    oov_token=OOV_TOKEN)
# Build the vocabulary from the training split only, so words that occur only
# in the test split are mapped to OOV_TOKEN instead of leaking into the model.
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Всего %s уникальных токенов' % len(word_index))

Всего 7540 уникальных токенов


In [14]:
def preprocess_text(texts):
    """Turn raw strings into fixed-width integer sequences for the model.

    Every text is converted to a list of token ids by the notebook-level
    ``tokenizer`` (which must already be fitted); the lists are then handed to
    ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH`` and otherwise default
    settings.

    Args:
        texts: Iterable of strings, e.g. a pandas Series of product names.

    Returns:
        The 2-D integer array produced by ``pad_sequences``: one row per input
        text and ``MAX_SEQUENCE_LENGTH`` columns.
    """
    token_id_sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_id_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return padded

In [15]:
# Vectorise both splits with the same fitted tokenizer, then confirm that each
# matrix has one row per sample and MAX_SEQUENCE_LENGTH columns.
X_train_final, X_test_final = preprocess_text(X_train), preprocess_text(X_test)
print(X_train_final.shape, X_test_final.shape, sep="\n")

(7000, 10)
(3000, 10)


In [16]:
# Show the first ten padded token-id rows.
print(X_train_final[:10])

[[   0    0    0    0    0    0    0    2    4    3]
 [   0    0    0    0    0    0    0    2    4    3]
 [   0    0    0    0    0    0    0   16    4    3]
 [   0    0    0    0    0    0    0    0   67 2200]
 [   0    0    0    0    0    0    0    8  835  517]
 [   0    0    0    0    0    0    0  211  638  212]
 [   0    0    0  134  124 2201 2202   71  164 2203]
 [   0    0    0    0    0 1244 2204   19 2205   93]
 [   0    0    0    0    0    0    0    7    4    3]
 [   0    0    0    0    0    0    0    0   34 2206]]


Обучение нейросети

In [17]:
# Architecture: Embedding -> SpatialDropout1D -> LSTM -> softmax over `labels`.
model = Sequential([
    # Learns one EMBEDDING_DIM-sized vector per token id.
    # NOTE(review): input_dim is MAX_WORDS (50000) although the tokenizer
    # reported far fewer unique tokens, so most embedding rows are probably
    # never used -- consider sizing it from the fitted vocabulary.
    layers.Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    layers.SpatialDropout1D(0.2),
    layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per entry in `labels`.
    layers.Dense(len(labels), activation="softmax"),
])
# categorical_crossentropy pairs with the one-hot targets built by to_categorical.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 100)           5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 10, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 15)                1515      
Total params: 5,081,915
Trainable params: 5,081,915
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the classifier; 10% of the training rows are set aside by Keras as a
# validation set that is evaluated after every epoch.
model.fit(
    X_train_final,
    y_train_final,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=0.1,
)

Train on 6300 samples, validate on 700 samples
Epoch 1/3
Epoch 2/3

Сохраняем обученную модель и токенизатор

In [None]:
def save_model(model, path):
    """Persist a Keras model to ``path`` via ``tf.keras.models.save_model``.

    The call is made with ``overwrite=True`` (replace an existing file),
    ``include_optimizer=True`` (keep the optimizer alongside the model) and
    ``save_format=None`` (let Keras choose the on-disk format).

    Args:
        model: The Keras model to save.
        path: Destination path for the saved model.

    Returns:
        None.
    """
    save_options = dict(overwrite=True, include_optimizer=True, save_format=None)
    tf.keras.models.save_model(model, path, **save_options)

In [None]:
import pickle

def save_tokenizer(tokenizer, path):
    """Serialize ``tokenizer`` to ``path`` with pickle.

    The file is opened in binary write mode, so any existing content is
    replaced, and the highest pickle protocol supported by the running
    interpreter is used.

    Args:
        tokenizer: The (picklable) tokenizer object to store.
        path: Destination file path.

    Returns:
        None.
    """
    with open(path, mode='wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(tokenizer)

In [None]:
# Persist both artifacts: the pickled tokenizer is needed together with the
# saved model to turn new texts into the same token ids at inference time.
save_model(model=model, path="goods_classifier.h5")
save_tokenizer(tokenizer=tokenizer, path="tokenizer.pickle")