In [1]:
import pandas as pd

# Анализ данных

In [2]:
# Load only the two columns this notebook works with:
# 'Product' (mapped to ProductIndex further down) and 'Summary' (the text that is cleaned and tokenized).
fields = ['Product', 'Summary']
data = pd.read_csv("bugs-2018-04-05.csv", usecols=fields)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Product,Summary
0,Thunderbird,e-mail tag is not visible when thread is colla...
1,Calendar,Make use of Assert.jsm in xpcshell tests
2,Core,Add touch-action regions to the layer EventReg...
3,Core,Transition more fields of FrameMetrics to use ...
4,Calendar,Getter fails in calender-migration-dialog on f...


In [3]:
# Per-column summary statistics of the loaded frame.
data.describe()

Unnamed: 0,Product,Summary
count,10000,10000
unique,78,9952
top,Core,reboots-scl1
freq,2122,8


In [4]:
import re
RE_WORDS = re.compile(r'''
    # Find words in a string. Order matters!
    [A-Z]+(?=[A-Z][a-z]) |  # All upper case before a capitalized word
    [A-Z]?[a-z]+ |  # Capitalized words / all lower case
    [A-Z]+ |  # All upper case
    \d+  # Numbers
''', re.VERBOSE)

# Every run of characters that is not an ASCII letter or digit acts as a word separator.
_RE_NON_ALNUM = re.compile(r'[^A-Za-z0-9]+')


def clean_text(text):
    """Normalise a bug summary into a lower-case, space-separated word string.

    Steps:
      1. Replace everything except ASCII letters and digits with a space.
      2. Split the remainder into words with RE_WORDS, which also breaks
         CamelCase identifiers apart ("FrameMetrics" -> "Frame", "Metrics").
      3. Drop words of one or two characters unless they are all upper case
         (so "OS" survives while "to" and "12" do not).
      4. Join the surviving words with single spaces and lower-case the result.

    Args:
        text: The raw summary. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV field, or None) is treated as
            an empty summary.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    # re.sub is required here: str.replace() would look for the pattern text
    # literally and therefore never strip anything.
    text = _RE_NON_ALNUM.sub(' ', text)
    kept_words = (word for word in RE_WORDS.findall(text) if len(word) > 2 or word.isupper())
    return ' '.join(kept_words).lower()

# Clean every summary. Applying the function to the 'Summary' column directly
# avoids building a row object per record, which DataFrame.apply(axis=1) does.
data['CleanSummary'] = data['Summary'].apply(clean_text)

from nltk.tokenize import RegexpTokenizer
# Split the cleaned text into tokens on runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

data['Tokens'] = data['CleanSummary'].apply(tokenizer.tokenize)
# Persist the frame with the new columns so later cells can reload it.
data.to_pickle('dataframe.pkl')

data.tail()


Unnamed: 0,Product,Summary,CleanSummary,Tokens
9995,Webmaker,[Meta] Solution to user's preference locale se...,meta solution user preference locale setting a...,"[meta, solution, user, preference, locale, set..."
9996,Webmaker,Figure out a way to instantiate variables in a...,figure out way instantiate variables angular,"[figure, out, way, instantiate, variables, ang..."
9997,Webmaker,Change SECRET_SESSION to match other apps,change secret session match other apps,"[change, secret, session, match, other, apps]"
9998,Webmaker,Missing selectize in define details-form,missing selectize define details form,"[missing, selectize, define, details, form]"
9999,Webmaker,Popcorn video from three months ago - Text fie...,popcorn video from three months ago text field...,"[popcorn, video, from, three, months, ago, tex..."


In [5]:
# Corpus statistics gathered in one pass over the token lists:
# every token occurrence, and the token count of each summary.
all_words = []
sentence_lengths = []
for tokens in data['Tokens']:
    all_words.extend(tokens)
    sentence_lengths.append(len(tokens))

# Distinct tokens in alphabetical order.
VOCAB = sorted(set(all_words))

total_words = len(all_words)
vocab_size = len(VOCAB)
print("%s words total, with a vocabulary size of %s" % (total_words, vocab_size))
longest = max(sentence_lengths)
print("Max sentence length is %s" % longest)

83058 words total, with a vocabulary size of 8622
Max sentence length is 37


In [6]:
# Give each distinct product a 1-based integer id, numbered in the order
# in which the products first appear in the frame.
categories = {
    product: position
    for position, product in enumerate(data['Product'].unique(), start=1)
}
# Attach the numeric id of its product to every row.
data['ProductIndex'] = data['Product'].map(categories)
print(categories)

# Этот код - пробовал объединять схожие категории, не нужен сейчас
#keys = sorted(list(set(data['Product'])))
#print(keys)
#data['ProductIndex'] = [keys.index(key) for key in data['Product']]

#joinedKeys = {}
#index = 6
#for key in keys:
#    k = key.lower()
#    if "mozilla" in k or "bugzilla" in k or "web" in k:
#        joinedKeys.setdefault(key, 0)
#    elif "firefox" in k:
#        joinedKeys.setdefault(key, 1)
#    elif "penelope" in k or "documentation" in k:
#        joinedKeys.setdefault(key, 2)
#    elif "core" in k or "task" in k or "socorro" in k or "seamonkey" in k or "tool" in k or "develop" in k:
#        joinedKeys.setdefault(key, 3)
#    elif "testing" in k:
#        joinedKeys.setdefault(key, 4)
#    elif "infractructure" in k or "release" in k or "tracking" in k:
#        joinedKeys.setdefault(key, 5)
    # next is for other - too little data for that
#    elif "community building" in k or "directory" in k or "skywriter" in k or "shield" in k or "nspr" in k \
#            or "nss" in k or "privacy" in k or "snippets" in k or "jss" in k:
#        joinedKeys.setdefault(key, 100)
#    else:
#        joinedKeys.setdefault(key, index)
#        index += 1

#joinedKeysList = list(joinedKeys.keys())
#print(len(joinedKeysList))
#print(len(set(joinedKeys.values())))
#print(joinedKeys)
#data['JoinedProductIndex'] = [joinedKeys[key] for key in data['Product']]
# Persist the frame (now including ProductIndex) to the pickle file that the model cell reads back with pd.read_pickle.
data.to_pickle('dataframe.pkl')
# Show the last rows of the frame.
data.tail()

{'Thunderbird': 1, 'Calendar': 2, 'Core': 3, 'Firefox OS': 4, 'www.mozilla.org': 5, 'Toolkit': 6, 'Firefox': 7, 'SeaMonkey': 8, 'Chat Core': 9, 'Firefox for Android Graveyard': 10, 'Mozilla Labs': 11, 'Firefox Build System': 12, 'Tree Management Graveyard': 13, 'Toolkit Graveyard': 14, 'Participation Infrastructure': 15, 'Developer Documentation': 16, 'MailNews Core': 17, 'Mozilla Localizations': 18, 'Webtools': 19, 'Instantbird': 20, 'Cloud Services': 21, 'Firefox for Metro': 22, 'Firefox for Android': 23, 'Firefox Health Report Graveyard': 24, 'Other Applications': 25, 'addons.mozilla.org Graveyard': 26, 'Websites Graveyard': 27, 'support.mozilla.org': 28, 'Firefox Affiliates Graveyard': 29, 'Bugzilla': 30, 'Websites': 31, 'Infrastructure & Operations': 32, 'mozilla.org Graveyard': 33, 'Release Engineering': 34, 'Tracking': 35, 'Firefox for iOS': 36, 'Hello (Loop)': 37, 'Mozilla QA': 38, 'Mozilla QA Graveyard': 39, 'Core Graveyard': 40, 'Tech Evangelism': 41, 'Testing': 42, 'develope

Unnamed: 0,Product,Summary,CleanSummary,Tokens,ProductIndex
9995,Webmaker,[Meta] Solution to user's preference locale se...,meta solution user preference locale setting a...,"[meta, solution, user, preference, locale, set...",51
9996,Webmaker,Figure out a way to instantiate variables in a...,figure out way instantiate variables angular,"[figure, out, way, instantiate, variables, ang...",51
9997,Webmaker,Change SECRET_SESSION to match other apps,change secret session match other apps,"[change, secret, session, match, other, apps]",51
9998,Webmaker,Missing selectize in define details-form,missing selectize define details form,"[missing, selectize, define, details, form]",51
9999,Webmaker,Popcorn video from three months ago - Text fie...,popcorn video from three months ago text field...,"[popcorn, video, from, three, months, ago, tex...",51


In [7]:
# Код для выкидывания данных по редким категориям - тоже не нужен

#keys = sorted(list(set(data['Product'])))
#print("Number of products with more then 300 records")
#prodCount = 0
#keyCounts = {}
#for key in keys:
#    count = len(data[(data['Product'] == key)])
#    if count > 300:
#        keyCounts[key] = count
#        prodCount +=1
    
#keyCounts = sorted(keyCounts.items(), key=operator.itemgetter(1))
#print(keyCounts)
#print(keyCounts[0][0])
#print(prodCount)

In [8]:
#print(keyCounts.keys())
#for key in keys:
#    if key not in keyCounts.keys():
#        data = data[(data['Product'] != key)]
#        print(key)
        
#data.tail()
        

In [9]:
# Код создания индексов для оставшихся top 10 категорий - уже не нужен, так как ничего не выкидываю

#data = pd.read_pickle('dataframe.pkl')
#topKeys = list(keyCounts.keys())
#print(topKeys)
#data['Top10ProductIndex'] = [topKeys.index(key) for key in data['Product']]
#data.to_pickle('dataframe.pkl')
#data.head()


Создаю сеть

In [1]:
import pandas as pd
import random

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPool2D, MaxPool1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from sklearn.utils import shuffle

# One seed for both the row shuffle and the train/test split, so that a
# re-run works on the same partition of the data.
RANDOM_STATE = 0

data = pd.read_pickle('dataframe.pkl')
data = data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Corpus statistics: they size the padded sequences and the embedding table.
all_words = [word for tokens in data['Tokens'] for word in tokens]
sentence_lengths = [len(tokens) for tokens in data['Tokens']]
max_sentence_length = max(sentence_lengths)
vocabulary = sorted(set(all_words))
words_total = len(all_words)
vocabulary_size = len(vocabulary)
print("%s всего слов, размер словаря %s" % (words_total, vocabulary_size))
print("Макс длина текстов %s" % max_sentence_length)

summaries = data['Tokens'].tolist()
products = data['ProductIndex'].tolist()

# ProductIndex starts at 1, so the one-hot width has to be (largest index + 1);
# column 0 stays unused. Using the maximum instead of the number of distinct
# ids keeps this correct even if the ids are not contiguous.
products_number = max(products) + 1
print("Всего категорий:", products_number)

# Keras keeps only words whose index is < num_words and indices start at 1,
# so num_words must be vocabulary_size + 1 to retain the whole vocabulary.
tokenizer = Tokenizer(num_words=vocabulary_size + 1)
tokenizer.fit_on_texts(summaries)
textSequences = tokenizer.texts_to_sequences(summaries)

# Left-pad every sequence with zeros up to the longest summary.
textSequences = pad_sequences(textSequences, maxlen=max_sentence_length)
# Disabled experiment: shuffle the word order inside each summary.
#[random.shuffle(sentence) for sentence in textSequences]

X_train, X_test, y_train, y_test = train_test_split(textSequences, products, random_state=RANDOM_STATE)
print(len(X_train))
print(len(X_test))
print(X_train[0:5])
print(X_test[0:5])

print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

# One-hot encode the class ids.
y_train = to_categorical(y_train, products_number)
y_test = to_categorical(y_test, products_number)
print('y_train форма:', y_train.shape)
print('y_test форма:', y_test.shape)

# Hyper-parameters.
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 10
batch_size = 150

# Network: embedding -> parallel convolutions over 3/4/5-token windows ->
# max-over-time pooling -> dropout -> softmax over the product classes.
inputs = Input(shape=(max_sentence_length,), dtype='int32')
# input_dim is vocabulary_size + 1 because index 0 is the padding value.
embedding = Embedding(input_dim=vocabulary_size+1, output_dim=embedding_dim, input_length=max_sentence_length)(inputs)
# Add a trailing channel axis so Conv2D can slide over (tokens, embedding_dim).
reshape = Reshape((max_sentence_length,embedding_dim,1))(embedding)

pooled_outputs = []
for filter_size in filter_sizes:
    # The kernel spans the full embedding width, so it only moves along the token axis.
    conv = Conv2D(num_filters, kernel_size=(filter_size, embedding_dim), padding='valid',
                  kernel_initializer='normal', activation='relu')(reshape)
    # Pool over every window position, leaving one value per filter.
    pooled = MaxPool2D(pool_size=(max_sentence_length - filter_size + 1, 1), strides=(1,1),
                       padding='valid')(conv)
    pooled_outputs.append(pooled)

concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(products_number, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

# Write the weights to disk whenever validation accuracy improves.
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
                             mode='auto')
# Alternative optimizer, currently not used:
#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Author's note: Keras accepts either binary or categorical cross-entropy here,
# but the metrics then have to be chosen to match. Details:
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print("Traning Model...")
# The checkpoint callback must be passed to fit(); otherwise it never runs.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
          validation_data=(X_test, y_test), callbacks=[checkpoint])  # starts training



  from ._conv import register_converters as _register_converters


Using TensorFlow backend.


83058 всего слов, размер словаря 8622
Макс длина текстов 37
Всего категорий: 79


7500
2500
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  321 1468 3177   88
  1723  278  327 1024    5  279    8 1840   65]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    7 3581 3582    2  573 1393]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0  469 6052 2213  759  999  102 1944 6053]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0  225  272   98   57    1  270    3  789]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0   77
  1330  134   10    7   31    1  553  500  602]]
[[   0    0    0    0    0   

Traning Model...


Train on 7500 samples, validate on 2500 samples
Epoch 1/10


 150/7500 [..............................] - ETA: 7:50 - loss: 4.3720 - acc: 0.0133

 300/7500 [>.............................] - ETA: 6:02 - loss: 4.1090 - acc: 0.1133

 450/7500 [>.............................] - ETA: 5:15 - loss: 3.8688 - acc: 0.1467

 600/7500 [=>............................] - ETA: 4:52 - loss: 3.7362 - acc: 0.1650

 750/7500 [==>...........................] - ETA: 4:20 - loss: 3.6392 - acc: 0.1707

 900/7500 [==>...........................] - ETA: 3:57 - loss: 3.5374 - acc: 0.1833

1050/7500 [===>..........................] - ETA: 3:39 - loss: 3.4624 - acc: 0.1924

1200/7500 [===>..........................] - ETA: 3:26 - loss: 3.3998 - acc: 0.1992

1350/7500 [====>.........................] - ETA: 3:13 - loss: 3.3687 - acc: 0.2030

1500/7500 [=====>........................] - ETA: 3:02 - loss: 3.3304 - acc: 0.2127

1650/7500 [=====>........................] - ETA: 2:58 - loss: 3.3153 - acc: 0.2091















































































Epoch 2/10


 150/7500 [..............................] - ETA: 2:21 - loss: 2.3143 - acc: 0.4533

 300/7500 [>.............................] - ETA: 2:21 - loss: 2.3683 - acc: 0.4267

 450/7500 [>.............................] - ETA: 2:28 - loss: 2.3452 - acc: 0.4378

 600/7500 [=>............................] - ETA: 2:33 - loss: 2.3440 - acc: 0.4217

 750/7500 [==>...........................] - ETA: 2:37 - loss: 2.3650 - acc: 0.4213

 900/7500 [==>...........................] - ETA: 2:29 - loss: 2.3956 - acc: 0.4133

1050/7500 [===>..........................] - ETA: 2:22 - loss: 2.3849 - acc: 0.4190

1200/7500 [===>..........................] - ETA: 2:16 - loss: 2.3909 - acc: 0.4175

1350/7500 [====>.........................] - ETA: 2:11 - loss: 2.3716 - acc: 0.4163

1500/7500 [=====>........................] - ETA: 2:07 - loss: 2.3754 - acc: 0.4227

1650/7500 [=====>........................] - ETA: 2:04 - loss: 2.3618 - acc: 0.4279















































































Epoch 3/10


 150/7500 [..............................] - ETA: 3:01 - loss: 1.7656 - acc: 0.5333

 300/7500 [>.............................] - ETA: 2:40 - loss: 1.7967 - acc: 0.5433

 450/7500 [>.............................] - ETA: 2:32 - loss: 1.9041 - acc: 0.5267

 600/7500 [=>............................] - ETA: 2:27 - loss: 1.8564 - acc: 0.5500

 750/7500 [==>...........................] - ETA: 2:20 - loss: 1.7894 - acc: 0.5747

 900/7500 [==>...........................] - ETA: 2:13 - loss: 1.7479 - acc: 0.5833

1050/7500 [===>..........................] - ETA: 2:08 - loss: 1.7407 - acc: 0.5838

1200/7500 [===>..........................] - ETA: 2:03 - loss: 1.7283 - acc: 0.5817

1350/7500 [====>.........................] - ETA: 2:00 - loss: 1.6975 - acc: 0.5933

1500/7500 [=====>........................] - ETA: 1:56 - loss: 1.6983 - acc: 0.5947

1650/7500 [=====>........................] - ETA: 1:52 - loss: 1.7149 - acc: 0.5873















































































Epoch 4/10


 150/7500 [..............................] - ETA: 2:21 - loss: 1.2625 - acc: 0.7400

 300/7500 [>.............................] - ETA: 2:12 - loss: 1.2106 - acc: 0.7200

 450/7500 [>.............................] - ETA: 2:09 - loss: 1.1787 - acc: 0.7333

 600/7500 [=>............................] - ETA: 2:06 - loss: 1.1740 - acc: 0.7300

 750/7500 [==>...........................] - ETA: 2:02 - loss: 1.1987 - acc: 0.7213

 900/7500 [==>...........................] - ETA: 1:59 - loss: 1.1983 - acc: 0.7256

1050/7500 [===>..........................] - ETA: 1:56 - loss: 1.1790 - acc: 0.7248

1200/7500 [===>..........................] - ETA: 1:53 - loss: 1.2180 - acc: 0.7175

1350/7500 [====>.........................] - ETA: 1:51 - loss: 1.2170 - acc: 0.7178

1500/7500 [=====>........................] - ETA: 1:48 - loss: 1.1978 - acc: 0.7233

1650/7500 [=====>........................] - ETA: 1:45 - loss: 1.1909 - acc: 0.7236















































































Epoch 5/10


 150/7500 [..............................] - ETA: 2:08 - loss: 0.9578 - acc: 0.7800

 300/7500 [>.............................] - ETA: 2:04 - loss: 0.9694 - acc: 0.7800

 450/7500 [>.............................] - ETA: 2:02 - loss: 0.9323 - acc: 0.7911

 600/7500 [=>............................] - ETA: 1:59 - loss: 0.9201 - acc: 0.8017

 750/7500 [==>...........................] - ETA: 1:56 - loss: 0.8936 - acc: 0.8067

 900/7500 [==>...........................] - ETA: 1:55 - loss: 0.9009 - acc: 0.8044

1050/7500 [===>..........................] - ETA: 1:52 - loss: 0.8824 - acc: 0.8086

1200/7500 [===>..........................] - ETA: 1:49 - loss: 0.8833 - acc: 0.8050

1350/7500 [====>.........................] - ETA: 1:47 - loss: 0.8716 - acc: 0.8089

1500/7500 [=====>........................] - ETA: 1:44 - loss: 0.8732 - acc: 0.8060

1650/7500 [=====>........................] - ETA: 1:41 - loss: 0.8777 - acc: 0.8036















































































Epoch 6/10


 150/7500 [..............................] - ETA: 2:26 - loss: 0.6570 - acc: 0.8467

 300/7500 [>.............................] - ETA: 2:19 - loss: 0.6700 - acc: 0.8333

 450/7500 [>.............................] - ETA: 2:19 - loss: 0.6318 - acc: 0.8556

 600/7500 [=>............................] - ETA: 2:24 - loss: 0.6156 - acc: 0.8583

 750/7500 [==>...........................] - ETA: 2:20 - loss: 0.6132 - acc: 0.8600

 900/7500 [==>...........................] - ETA: 2:16 - loss: 0.6333 - acc: 0.8533

1050/7500 [===>..........................] - ETA: 2:12 - loss: 0.6266 - acc: 0.8562

1200/7500 [===>..........................] - ETA: 2:07 - loss: 0.6285 - acc: 0.8600

1350/7500 [====>.........................] - ETA: 2:02 - loss: 0.6399 - acc: 0.8548

1500/7500 [=====>........................] - ETA: 1:58 - loss: 0.6372 - acc: 0.8587

1650/7500 [=====>........................] - ETA: 1:54 - loss: 0.6336 - acc: 0.8588















































































Epoch 7/10


 150/7500 [..............................] - ETA: 2:09 - loss: 0.3693 - acc: 0.9333

 300/7500 [>.............................] - ETA: 2:13 - loss: 0.4238 - acc: 0.9067

 450/7500 [>.............................] - ETA: 2:09 - loss: 0.4184 - acc: 0.9111

 600/7500 [=>............................] - ETA: 1:52 - loss: 0.4481 - acc: 0.9083

 750/7500 [==>...........................] - ETA: 1:40 - loss: 0.4377 - acc: 0.9160

 900/7500 [==>...........................] - ETA: 1:32 - loss: 0.4517 - acc: 0.9133

1050/7500 [===>..........................] - ETA: 1:26 - loss: 0.4382 - acc: 0.9105

1200/7500 [===>..........................] - ETA: 1:31 - loss: 0.4316 - acc: 0.9108

1350/7500 [====>.........................] - ETA: 1:31 - loss: 0.4201 - acc: 0.9119

1500/7500 [=====>........................] - ETA: 1:28 - loss: 0.4273 - acc: 0.9093

1650/7500 [=====>........................] - ETA: 1:32 - loss: 0.4285 - acc: 0.9073















































































Epoch 8/10


 150/7500 [..............................] - ETA: 2:01 - loss: 0.3203 - acc: 0.9400

 300/7500 [>.............................] - ETA: 1:46 - loss: 0.2895 - acc: 0.9433

 450/7500 [>.............................] - ETA: 1:36 - loss: 0.2707 - acc: 0.9467

 600/7500 [=>............................] - ETA: 1:38 - loss: 0.2872 - acc: 0.9467

 750/7500 [==>...........................] - ETA: 1:34 - loss: 0.2888 - acc: 0.9427

 900/7500 [==>...........................] - ETA: 1:28 - loss: 0.2880 - acc: 0.9467

1050/7500 [===>..........................] - ETA: 1:22 - loss: 0.2958 - acc: 0.9457

1200/7500 [===>..........................] - ETA: 1:18 - loss: 0.3039 - acc: 0.9442

1350/7500 [====>.........................] - ETA: 1:15 - loss: 0.3112 - acc: 0.9415

1500/7500 [=====>........................] - ETA: 1:13 - loss: 0.3038 - acc: 0.9433

1650/7500 [=====>........................] - ETA: 1:11 - loss: 0.2996 - acc: 0.9436















































































Epoch 9/10


 150/7500 [..............................] - ETA: 1:09 - loss: 0.1618 - acc: 0.9800

 300/7500 [>.............................] - ETA: 1:06 - loss: 0.1807 - acc: 0.9733

 450/7500 [>.............................] - ETA: 1:04 - loss: 0.1946 - acc: 0.9711

 600/7500 [=>............................] - ETA: 1:02 - loss: 0.2161 - acc: 0.9717

 750/7500 [==>...........................] - ETA: 1:01 - loss: 0.2351 - acc: 0.9667

 900/7500 [==>...........................] - ETA: 59s - loss: 0.2165 - acc: 0.9700 

1050/7500 [===>..........................] - ETA: 58s - loss: 0.2115 - acc: 0.9686

1200/7500 [===>..........................] - ETA: 57s - loss: 0.2061 - acc: 0.9683

1350/7500 [====>.........................] - ETA: 55s - loss: 0.2009 - acc: 0.9704

1500/7500 [=====>........................] - ETA: 54s - loss: 0.2001 - acc: 0.9707

1650/7500 [=====>........................] - ETA: 52s - loss: 0.2009 - acc: 0.9703















































































Epoch 10/10


 150/7500 [..............................] - ETA: 1:07 - loss: 0.0819 - acc: 1.0000

 300/7500 [>.............................] - ETA: 1:04 - loss: 0.0646 - acc: 0.9967

 450/7500 [>.............................] - ETA: 1:02 - loss: 0.0812 - acc: 0.9933

 600/7500 [=>............................] - ETA: 1:01 - loss: 0.1029 - acc: 0.9850

 750/7500 [==>...........................] - ETA: 1:00 - loss: 0.1047 - acc: 0.9853

 900/7500 [==>...........................] - ETA: 59s - loss: 0.1386 - acc: 0.9800 

1050/7500 [===>..........................] - ETA: 57s - loss: 0.1328 - acc: 0.9810

1200/7500 [===>..........................] - ETA: 56s - loss: 0.1297 - acc: 0.9817

1350/7500 [====>.........................] - ETA: 55s - loss: 0.1277 - acc: 0.9830

1500/7500 [=====>........................] - ETA: 53s - loss: 0.1236 - acc: 0.9833

1650/7500 [=====>........................] - ETA: 52s - loss: 0.1215 - acc: 0.9848















































































<keras.callbacks.History at 0x11a2307f0>

In [2]:
import pandas as pd
import random

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPool2D, MaxPool1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from sklearn.utils import shuffle

# Load the tokenised bug summaries from 'dataframe.pkl'.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle this notebook produced.
data = pd.read_pickle('dataframe.pkl')
# Shuffle the rows with a fixed seed. Without it every run feeds a different row
# order into the seeded train/test split below, so results are not reproducible.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)

# Corpus statistics: every token occurrence, tokens per summary, and the distinct words.
all_words = [word for tokens in data['Tokens'] for word in tokens]
sentence_lengths = [len(tokens) for tokens in data['Tokens']]
max_sentence_length = max(sentence_lengths)  # later used as the padded sequence length
vocabulary = sorted(set(all_words))
words_total = len(all_words)
vocabulary_size = len(vocabulary)
print("%s всего слов, размер словаря %s" % (words_total, vocabulary_size))
print("Макс длина текстов %s" % max_sentence_length)

# Inputs are the token lists; targets are the integer product labels.
summaries = data['Tokens'].tolist()
products = data['ProductIndex'].tolist()

# Width of the one-hot target: to_categorical needs num_classes > largest label.
# max(...) + 1 is right whether the labels start at 0 or at 1, so no manual "+1"
# toggle is needed when switching label sets.
products_number = max(products) + 1
print("Всего категорий:", products_number)

# Keras' Tokenizer numbers words from 1 and keeps only indices strictly below
# num_words, so num_words must be vocabulary_size + 1 to keep every word
# (passing vocabulary_size silently drops the rarest one). Index 0 stays
# reserved for padding, matching the embedding sized vocabulary_size + 1.
tokenizer = Tokenizer(num_words=vocabulary_size + 1)
tokenizer.fit_on_texts(summaries)
textSequences = tokenizer.texts_to_sequences(summaries)

# Pad every sequence to the longest summary; the recorded output shows the zeros on the left.
textSequences = pad_sequences(textSequences, maxlen=max_sentence_length)
# Disabled experiment (presumably checks whether word order matters - confirm before re-enabling):
#[random.shuffle(sentence) for sentence in textSequences]

# No test_size is given, so the library default split is used (7500/2500 in the recorded run).
X_train, X_test, y_train, y_test = train_test_split(textSequences, products, random_state=0)
print(len(X_train))
print(len(X_test))
print(X_train[0:5])
print(X_test[0:5])

print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

# One-hot encode the labels for the softmax / categorical_crossentropy model.
y_train = to_categorical(y_train, products_number)
y_test = to_categorical(y_test, products_number)
print('y_train форма:', y_train.shape)
print('y_test форма:', y_test.shape)

# Model hyperparameters.
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

# Training-loop settings, read by the fit call further down.
epochs = 10
batch_size = 150


def _conv_branch(features, kernel_size):
    """Run `features` through one ReLU Conv1D with `num_filters` filters of width `kernel_size`."""
    layer = Conv1D(num_filters, kernel_size=kernel_size, padding='valid',
                   kernel_initializer='normal', activation='relu')
    return layer(features)


def _pool_over_time(conv_output, kernel_size):
    """Max-pool `conv_output` with a window as long as the 'valid' convolution's output."""
    window = max_sentence_length - kernel_size + 1
    return MaxPool1D(pool_size=window, strides=1, padding='valid')(conv_output)


# Integer word ids -> dense embeddings.
inputs = Input(shape=(max_sentence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size+1, output_dim=embedding_dim,
                      input_length=max_sentence_length)(inputs)

# One convolution per filter width, all reading the same embedding ...
conv_0, conv_1, conv_2 = [_conv_branch(embedding, size) for size in filter_sizes]

# ... each reduced to a single position by its full-length max-pool.
maxpool_0, maxpool_1, maxpool_2 = [
    _pool_over_time(branch, size)
    for branch, size in zip((conv_0, conv_1, conv_2), filter_sizes)
]

# Join the pooled branches, flatten, regularise and classify.
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(products_number, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)


#model = Sequential()
#model.add(Embedding(vocabulary_size+1, embedding_dim, input_length=max_sentence_length))
#model.add(Conv1D(num_filters, 7, activation='relu'))
#model.add(MaxPooling1D(3))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(MaxPooling1D(3))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(GlobalAveragePooling1D())
#model.add(Dropout(drop))
#model.add(Dense(products_number, activation='softmax'))



# Save the weights whenever validation accuracy beats the best value so far;
# the file name records the epoch number and the val_acc reached.
# NOTE(review): 'val_acc' matches the 'acc' key in this notebook's recorded logs;
# newer Keras releases name it 'val_accuracy' - adjust if the library is upgraded.
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
                             mode='auto')
# Alternative optimiser, currently unused:
#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Keras quirk: either binary_crossentropy or categorical_crossentropy can be used
# as the loss here, but the metrics then have to be specified to match.
# Details:
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print("Traning Model...")
# The checkpoint callback must be handed to fit(); it was previously constructed
# but never used, so no weights were ever written.
# NOTE(review): the test split doubles as validation data, so checkpoint
# selection sees the test set - consider a separate validation split.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test, y_test),
          callbacks=[checkpoint])  # starts training



83058 всего слов, размер словаря 8622
Макс длина текстов 37
Всего категорий: 79


7500
2500
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0   49   60  247  437 2016 5871    1  274]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
  4904 3572 1228    3 4905 1228  224 1156 1799]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    7 1938 1785  571   84  733]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0  541    8 2222  268  411   90]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  183  209  363    4  159  936    4   51
   936   51  936    4  909  189 1192  403  730]]
[[   0    0    0    0    0   

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


Traning Model...


Train on 7500 samples, validate on 2500 samples
Epoch 1/10


 150/7500 [..............................] - ETA: 1:33 - loss: 4.3950 - acc: 0.0000e+00

 300/7500 [>.............................] - ETA: 1:15 - loss: 4.1547 - acc: 0.0933    

 450/7500 [>.............................] - ETA: 1:08 - loss: 3.9164 - acc: 0.1356

 600/7500 [=>............................] - ETA: 1:04 - loss: 3.7488 - acc: 0.1483

 750/7500 [==>...........................] - ETA: 1:02 - loss: 3.6389 - acc: 0.1653

 900/7500 [==>...........................] - ETA: 59s - loss: 3.5267 - acc: 0.1800 

1050/7500 [===>..........................] - ETA: 58s - loss: 3.4531 - acc: 0.1886

1200/7500 [===>..........................] - ETA: 56s - loss: 3.4022 - acc: 0.1942

1350/7500 [====>.........................] - ETA: 55s - loss: 3.3706 - acc: 0.1956

1500/7500 [=====>........................] - ETA: 53s - loss: 3.3502 - acc: 0.1933

1650/7500 [=====>........................] - ETA: 51s - loss: 3.3355 - acc: 0.1933















































































Epoch 2/10


 150/7500 [..............................] - ETA: 1:08 - loss: 2.4735 - acc: 0.3667

 300/7500 [>.............................] - ETA: 1:03 - loss: 2.5263 - acc: 0.3833

 450/7500 [>.............................] - ETA: 1:01 - loss: 2.4671 - acc: 0.3978

 600/7500 [=>............................] - ETA: 1:00 - loss: 2.4866 - acc: 0.3783

 750/7500 [==>...........................] - ETA: 58s - loss: 2.4709 - acc: 0.3947 

 900/7500 [==>...........................] - ETA: 57s - loss: 2.4242 - acc: 0.4078

1050/7500 [===>..........................] - ETA: 57s - loss: 2.4118 - acc: 0.4181

1200/7500 [===>..........................] - ETA: 56s - loss: 2.3989 - acc: 0.4233

1350/7500 [====>.........................] - ETA: 54s - loss: 2.3841 - acc: 0.4319

1500/7500 [=====>........................] - ETA: 52s - loss: 2.3758 - acc: 0.4333

1650/7500 [=====>........................] - ETA: 50s - loss: 2.3609 - acc: 0.4388















































































Epoch 3/10


 150/7500 [..............................] - ETA: 1:07 - loss: 1.7294 - acc: 0.5933

 300/7500 [>.............................] - ETA: 1:02 - loss: 1.6246 - acc: 0.6167

 450/7500 [>.............................] - ETA: 1:01 - loss: 1.6142 - acc: 0.6222

 600/7500 [=>............................] - ETA: 1:04 - loss: 1.6344 - acc: 0.6283

 750/7500 [==>...........................] - ETA: 1:06 - loss: 1.5980 - acc: 0.6413

 900/7500 [==>...........................] - ETA: 1:04 - loss: 1.6134 - acc: 0.6311

1050/7500 [===>..........................] - ETA: 1:02 - loss: 1.6354 - acc: 0.6238

1200/7500 [===>..........................] - ETA: 1:01 - loss: 1.6173 - acc: 0.6233

1350/7500 [====>.........................] - ETA: 58s - loss: 1.6030 - acc: 0.6296 

1500/7500 [=====>........................] - ETA: 57s - loss: 1.5977 - acc: 0.6287

1650/7500 [=====>........................] - ETA: 55s - loss: 1.5901 - acc: 0.6309















































































Epoch 4/10


 150/7500 [..............................] - ETA: 1:05 - loss: 1.2209 - acc: 0.7467

 300/7500 [>.............................] - ETA: 1:02 - loss: 1.1321 - acc: 0.7567

 450/7500 [>.............................] - ETA: 1:00 - loss: 1.1656 - acc: 0.7578

 600/7500 [=>............................] - ETA: 1:02 - loss: 1.1714 - acc: 0.7517

 750/7500 [==>...........................] - ETA: 1:04 - loss: 1.1540 - acc: 0.7427

 900/7500 [==>...........................] - ETA: 1:03 - loss: 1.1600 - acc: 0.7433

1050/7500 [===>..........................] - ETA: 1:03 - loss: 1.1836 - acc: 0.7381

1200/7500 [===>..........................] - ETA: 1:01 - loss: 1.1560 - acc: 0.7442

1350/7500 [====>.........................] - ETA: 1:02 - loss: 1.1357 - acc: 0.7452

1500/7500 [=====>........................] - ETA: 1:01 - loss: 1.1259 - acc: 0.7467

1650/7500 [=====>........................] - ETA: 1:01 - loss: 1.1634 - acc: 0.7352















































































Epoch 5/10


 150/7500 [..............................] - ETA: 1:16 - loss: 0.7011 - acc: 0.8467

 300/7500 [>.............................] - ETA: 1:22 - loss: 0.7420 - acc: 0.8567

 450/7500 [>.............................] - ETA: 1:18 - loss: 0.8908 - acc: 0.8222

 600/7500 [=>............................] - ETA: 1:15 - loss: 0.8746 - acc: 0.8183

 750/7500 [==>...........................] - ETA: 1:10 - loss: 0.8619 - acc: 0.8147

 900/7500 [==>...........................] - ETA: 1:07 - loss: 0.8463 - acc: 0.8167

1050/7500 [===>..........................] - ETA: 1:04 - loss: 0.8354 - acc: 0.8238

1200/7500 [===>..........................] - ETA: 1:02 - loss: 0.8208 - acc: 0.8258

1350/7500 [====>.........................] - ETA: 1:00 - loss: 0.8354 - acc: 0.8193

1500/7500 [=====>........................] - ETA: 57s - loss: 0.8448 - acc: 0.8173 

1650/7500 [=====>........................] - ETA: 57s - loss: 0.8369 - acc: 0.8182















































































Epoch 6/10


 150/7500 [..............................] - ETA: 1:36 - loss: 0.6035 - acc: 0.8600

 300/7500 [>.............................] - ETA: 1:47 - loss: 0.6154 - acc: 0.8700

 450/7500 [>.............................] - ETA: 1:59 - loss: 0.6323 - acc: 0.8644

 600/7500 [=>............................] - ETA: 1:55 - loss: 0.6251 - acc: 0.8650

 750/7500 [==>...........................] - ETA: 1:53 - loss: 0.6133 - acc: 0.8720

 900/7500 [==>...........................] - ETA: 1:51 - loss: 0.6164 - acc: 0.8689

1050/7500 [===>..........................] - ETA: 1:44 - loss: 0.5911 - acc: 0.8695

1200/7500 [===>..........................] - ETA: 1:38 - loss: 0.5848 - acc: 0.8700

1350/7500 [====>.........................] - ETA: 1:32 - loss: 0.5844 - acc: 0.8674

1500/7500 [=====>........................] - ETA: 1:27 - loss: 0.5905 - acc: 0.8647

1650/7500 [=====>........................] - ETA: 1:22 - loss: 0.5920 - acc: 0.8655















































































Epoch 7/10


 150/7500 [..............................] - ETA: 1:02 - loss: 0.5309 - acc: 0.8933

 300/7500 [>.............................] - ETA: 1:00 - loss: 0.5042 - acc: 0.8967

 450/7500 [>.............................] - ETA: 59s - loss: 0.4687 - acc: 0.9022 

 600/7500 [=>............................] - ETA: 58s - loss: 0.4572 - acc: 0.9033

 750/7500 [==>...........................] - ETA: 57s - loss: 0.4450 - acc: 0.9053

 900/7500 [==>...........................] - ETA: 56s - loss: 0.4426 - acc: 0.9078

1050/7500 [===>..........................] - ETA: 54s - loss: 0.4422 - acc: 0.9105

1200/7500 [===>..........................] - ETA: 53s - loss: 0.4182 - acc: 0.9158

1350/7500 [====>.........................] - ETA: 52s - loss: 0.4266 - acc: 0.9133

1500/7500 [=====>........................] - ETA: 50s - loss: 0.4182 - acc: 0.9140

1650/7500 [=====>........................] - ETA: 49s - loss: 0.4266 - acc: 0.9103















































































Epoch 8/10


 150/7500 [..............................] - ETA: 1:01 - loss: 0.2566 - acc: 0.9600

 300/7500 [>.............................] - ETA: 59s - loss: 0.2531 - acc: 0.9567 

 450/7500 [>.............................] - ETA: 58s - loss: 0.2557 - acc: 0.9578

 600/7500 [=>............................] - ETA: 57s - loss: 0.2580 - acc: 0.9550

 750/7500 [==>...........................] - ETA: 56s - loss: 0.2687 - acc: 0.9547

 900/7500 [==>...........................] - ETA: 55s - loss: 0.2890 - acc: 0.9522

1050/7500 [===>..........................] - ETA: 54s - loss: 0.2847 - acc: 0.9533

1200/7500 [===>..........................] - ETA: 53s - loss: 0.2857 - acc: 0.9533

1350/7500 [====>.........................] - ETA: 51s - loss: 0.2767 - acc: 0.9541

1500/7500 [=====>........................] - ETA: 50s - loss: 0.2844 - acc: 0.9520

1650/7500 [=====>........................] - ETA: 49s - loss: 0.2810 - acc: 0.9527















































































Epoch 9/10


 150/7500 [..............................] - ETA: 1:03 - loss: 0.1832 - acc: 0.9667

 300/7500 [>.............................] - ETA: 1:02 - loss: 0.1885 - acc: 0.9667

 450/7500 [>.............................] - ETA: 1:01 - loss: 0.1899 - acc: 0.9711

 600/7500 [=>............................] - ETA: 59s - loss: 0.1904 - acc: 0.9700 

 750/7500 [==>...........................] - ETA: 57s - loss: 0.1786 - acc: 0.9760

 900/7500 [==>...........................] - ETA: 56s - loss: 0.1856 - acc: 0.9744

1050/7500 [===>..........................] - ETA: 55s - loss: 0.1844 - acc: 0.9743

1200/7500 [===>..........................] - ETA: 54s - loss: 0.1841 - acc: 0.9717

1350/7500 [====>.........................] - ETA: 53s - loss: 0.1856 - acc: 0.9696

1500/7500 [=====>........................] - ETA: 52s - loss: 0.1996 - acc: 0.9693

1650/7500 [=====>........................] - ETA: 50s - loss: 0.1983 - acc: 0.9697















































































Epoch 10/10


 150/7500 [..............................] - ETA: 1:12 - loss: 0.0946 - acc: 1.0000

 300/7500 [>.............................] - ETA: 1:06 - loss: 0.1072 - acc: 0.9933

 450/7500 [>.............................] - ETA: 1:03 - loss: 0.1075 - acc: 0.9911

 600/7500 [=>............................] - ETA: 1:01 - loss: 0.1094 - acc: 0.9917

 750/7500 [==>...........................] - ETA: 1:01 - loss: 0.1213 - acc: 0.9853

 900/7500 [==>...........................] - ETA: 1:00 - loss: 0.1319 - acc: 0.9833

1050/7500 [===>..........................] - ETA: 58s - loss: 0.1252 - acc: 0.9848 

1200/7500 [===>..........................] - ETA: 56s - loss: 0.1215 - acc: 0.9850

1350/7500 [====>.........................] - ETA: 55s - loss: 0.1179 - acc: 0.9859

1500/7500 [=====>........................] - ETA: 53s - loss: 0.1210 - acc: 0.9860

1650/7500 [=====>........................] - ETA: 52s - loss: 0.1176 - acc: 0.9867















































































<keras.callbacks.History at 0x11a2dea20>

In [None]:
from keras_text.data import Dataset