In [1]:
import pandas as pd

# Анализ данных

In [2]:
fields = ['Product', 'Summary']
data = pd.read_csv("bugs-2018-04-05.csv", usecols=fields)
data.head()

Unnamed: 0,Product,Summary
0,Thunderbird,e-mail tag is not visible when thread is colla...
1,Calendar,Make use of Assert.jsm in xpcshell tests
2,Core,Add touch-action regions to the layer EventReg...
3,Core,Transition more fields of FrameMetrics to use ...
4,Calendar,Getter fails in calender-migration-dialog on f...


In [3]:
data.describe()

Unnamed: 0,Product,Summary
count,10000,10000
unique,78,9952
top,Core,reboots-scl1
freq,2122,8


In [6]:
import re
RE_WORDS = re.compile(r'''
    # Find words in a string. Order matters!
    [A-Z]+(?=[A-Z][a-z]) |  # All upper case before a capitalized word
    [A-Z]?[a-z]+ |  # Capitalized words / all lower case
    [A-Z]+ |  # All upper case
    \d+  # Numbers
''', re.VERBOSE)

def clean_text(text):
    text = text.replace(r"[^A-Za-z0-9]", " ")
    text = re.sub("[\]\[_'\\\/.,;:\-()<>\"]", " ", text)
    text = ' '.join(word for word in RE_WORDS.findall(text) if len(word)>2 or word.isupper())
    text = text.lower()
    return text

data['CleanSummary'] = data.apply(lambda x: clean_text(x['Summary']), axis=1)

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

data['Tokens'] = data['CleanSummary'].apply(tokenizer.tokenize)
data.to_pickle('dataframe.pkl')

data.tail()


Unnamed: 0,Product,Summary,CleanSummary,Tokens
9995,Webmaker,[Meta] Solution to user's preference locale se...,meta solution user preference locale setting a...,"[meta, solution, user, preference, locale, set..."
9996,Webmaker,Figure out a way to instantiate variables in a...,figure out way instantiate variables angular,"[figure, out, way, instantiate, variables, ang..."
9997,Webmaker,Change SECRET_SESSION to match other apps,change secret session match other apps,"[change, secret, session, match, other, apps]"
9998,Webmaker,Missing selectize in define details-form,missing selectize define details form,"[missing, selectize, define, details, form]"
9999,Webmaker,Popcorn video from three months ago - Text fie...,popcorn video from three months ago text field...,"[popcorn, video, from, three, months, ago, tex..."


In [7]:
all_words = [word for tokens in data['Tokens'] for word in tokens]
sentence_lengths = [len(tokens) for tokens in data['Tokens']]
VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

83058 words total, with a vocabulary size of 8622
Max sentence length is 37


In [8]:
categories = {}
keys = enumerate(data['Product'].unique())
for key, value in keys:
    categories[value] = key + 1
data['ProductIndex'] = data['Product'].map(categories)
print(categories)

# Этот код - пробовал объединять схожие категории, не нужен сейчас
#keys = sorted(list(set(data['Product'])))
#print(keys)
#data['ProductIndex'] = [keys.index(key) for key in data['Product']]

#joinedKeys = {}
#index = 6
#for key in keys:
#    k = key.lower()
#    if "mozilla" in k or "bugzilla" in k or "web" in k:
#        joinedKeys.setdefault(key, 0)
#    elif "firefox" in k:
#        joinedKeys.setdefault(key, 1)
#    elif "penelope" in k or "documentation" in k:
#        joinedKeys.setdefault(key, 2)
#    elif "core" in k or "task" in k or "socorro" in k or "seamonkey" in k or "tool" in k or "develop" in k:
#        joinedKeys.setdefault(key, 3)
#    elif "testing" in k:
#        joinedKeys.setdefault(key, 4)
#    elif "infractructure" in k or "release" in k or "tracking" in k:
#        joinedKeys.setdefault(key, 5)
    # next is for other - too litle data for that
#    elif "community building" in k or "directory" in k or "skywriter" in k or "shield" in k or "nspr" in k \
#            or "nss" in k or "privacy" in k or "snippets" in k or "jss" in k:
#        joinedKeys.setdefault(key, 100)
#    else:
#        joinedKeys.setdefault(key, index)
#        index += 1

#joinedKeysList = list(joinedKeys.keys())
#print(len(joinedKeysList))
#print(len(set(joinedKeys.values())))
#print(joinedKeys)
#data['JoinedProductIndex'] = [joinedKeys[key] for key in data['Product']]
data.to_pickle('dataframe.pkl')
data.tail()

{'Thunderbird': 1, 'Calendar': 2, 'Core': 3, 'Firefox OS': 4, 'www.mozilla.org': 5, 'Toolkit': 6, 'Firefox': 7, 'SeaMonkey': 8, 'Chat Core': 9, 'Firefox for Android Graveyard': 10, 'Mozilla Labs': 11, 'Firefox Build System': 12, 'Tree Management Graveyard': 13, 'Toolkit Graveyard': 14, 'Participation Infrastructure': 15, 'Developer Documentation': 16, 'MailNews Core': 17, 'Mozilla Localizations': 18, 'Webtools': 19, 'Instantbird': 20, 'Cloud Services': 21, 'Firefox for Metro': 22, 'Firefox for Android': 23, 'Firefox Health Report Graveyard': 24, 'Other Applications': 25, 'addons.mozilla.org Graveyard': 26, 'Websites Graveyard': 27, 'support.mozilla.org': 28, 'Firefox Affiliates Graveyard': 29, 'Bugzilla': 30, 'Websites': 31, 'Infrastructure & Operations': 32, 'mozilla.org Graveyard': 33, 'Release Engineering': 34, 'Tracking': 35, 'Firefox for iOS': 36, 'Hello (Loop)': 37, 'Mozilla QA': 38, 'Mozilla QA Graveyard': 39, 'Core Graveyard': 40, 'Tech Evangelism': 41, 'Testing': 42, 'develope

Unnamed: 0,Product,Summary,CleanSummary,Tokens,ProductIndex
9995,Webmaker,[Meta] Solution to user's preference locale se...,meta solution user preference locale setting a...,"[meta, solution, user, preference, locale, set...",51
9996,Webmaker,Figure out a way to instantiate variables in a...,figure out way instantiate variables angular,"[figure, out, way, instantiate, variables, ang...",51
9997,Webmaker,Change SECRET_SESSION to match other apps,change secret session match other apps,"[change, secret, session, match, other, apps]",51
9998,Webmaker,Missing selectize in define details-form,missing selectize define details form,"[missing, selectize, define, details, form]",51
9999,Webmaker,Popcorn video from three months ago - Text fie...,popcorn video from three months ago text field...,"[popcorn, video, from, three, months, ago, tex...",51


In [9]:
# Код для выкидывания данных по редким категориям - тоже не нужен

#keys = sorted(list(set(data['Product'])))
#print("Number of products with more then 300 records")
#prodCount = 0
#keyCounts = {}
#for key in keys:
#    count = len(data[(data['Product'] == key)])
#    if count > 300:
#        keyCounts[key] = count
#        prodCount +=1
    
#keyCounts = sorted(keyCounts.items(), key=operator.itemgetter(1))
#print(keyCounts)
#print(keyCounts[0][0])
#print(prodCount)

In [10]:
#print(keyCounts.keys())
#for key in keys:
#    if key not in keyCounts.keys():
#        data = data[(data['Product'] != key)]
#        print(key)
        
#data.tail()
        

In [11]:
# Код создания индексов для оставшихся top 10 категорий - эже не нужен, так как ничего не выкидываю

#data = pd.read_pickle('dataframe.pkl')
#topKeys = list(keyCounts.keys())
#print(topKeys)
#data['Top10ProductIndex'] = [topKeys.index(key) for key in data['Product']]
#data.to_pickle('dataframe.pkl')
#data.head()


Создаю сеть

In [1]:
import pandas as pd
import random

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPool2D, MaxPool1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from sklearn.utils import shuffle

data = pd.read_pickle('dataframe.pkl')
data = data.sample(frac=1).reset_index(drop=True)

all_words = [word for tokens in data['Tokens'] for word in tokens]
sentence_lengths = [len(tokens) for tokens in data['Tokens']]
max_sentence_length = max(sentence_lengths)
vocabulary = sorted(list(set(all_words)))
words_total=len(all_words)
vocabulary_size=len(vocabulary)
print("%s всего слов, размер словаря %s" % (words_total, vocabulary_size))
print("Макс длина текстов %s" % max_sentence_length)

summaries = data['Tokens'].tolist()
products = data['ProductIndex'].tolist()

products_number=len(list(set(products)))+1 # так как индекса идут с 1, для top10 было с 0 и прибавлять ничего не надо
print("Всего категорий:", products_number)
tokenizer = Tokenizer(vocabulary_size)
tokenizer.fit_on_texts(summaries)
textSequences = tokenizer.texts_to_sequences(summaries)

textSequences = pad_sequences(textSequences, maxlen=max_sentence_length)
#[random.shuffle(sentence) for sentence in textSequences]

X_train, X_test, y_train, y_test = train_test_split(textSequences, products, random_state=0)
print(len(X_train))
print(len(X_test))
print(X_train[0:5])
print(X_test[0:5])

print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

y_train = to_categorical(y_train, products_number)
y_test = to_categorical(y_test, products_number)
print('y_train форма:', y_train.shape)
print('y_test форма:', y_test.shape)

embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 10
batch_size = 150

inputs = Input(shape=(max_sentence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size+1, output_dim=embedding_dim, input_length=max_sentence_length)(inputs)
reshape = Reshape((max_sentence_length,embedding_dim,1))(embedding)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal',
                activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal',
                activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal',
                activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(max_sentence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(max_sentence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(max_sentence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(maxpool_2)
dropout = Dropout(drop)(flatten)
output = Dense(products_number, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
                             mode='auto')
#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# А здесь особенность keras - можно использовать и binary  и categorical - но тогда надо правильно задавать metrics
# Вот тут про это подробно
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print("Traning Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test, y_test))  # starts training



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


83058 всего слов, размер словаря 8622
Макс длина текстов 37
Всего категорий: 79


7500
2500
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    7  832 2053    1  953  430]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 4881   10  125  590]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0   16 1577  522    2  224    2   32]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
   179   77 3041  410   19 2476   77  410    4]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  299  134  650  204  173  421   23 1626
   621  289  101 4930  345  306  213 3589  130]]
[[   0    0    0    0    0   

Train on 7500 samples, validate on 2500 samples
Epoch 1/10


 150/7500 [..............................] - ETA: 55s - loss: 4.3713 - acc: 0.0467

 300/7500 [>.............................] - ETA: 43s - loss: 4.2227 - acc: 0.0700

 450/7500 [>.............................] - ETA: 37s - loss: 4.0326 - acc: 0.1333

 600/7500 [=>............................] - ETA: 35s - loss: 3.8811 - acc: 0.1617

 750/7500 [==>...........................] - ETA: 32s - loss: 3.7924 - acc: 0.1627

 900/7500 [==>...........................] - ETA: 31s - loss: 3.7201 - acc: 0.1700

1050/7500 [===>..........................] - ETA: 30s - loss: 3.6594 - acc: 0.1800

1200/7500 [===>..........................] - ETA: 29s - loss: 3.5876 - acc: 0.1900

1350/7500 [====>.........................] - ETA: 28s - loss: 3.5395 - acc: 0.1933

1500/7500 [=====>........................] - ETA: 27s - loss: 3.5176 - acc: 0.1900

1650/7500 [=====>........................] - ETA: 26s - loss: 3.4848 - acc: 0.1927















































































Epoch 2/10


 150/7500 [..............................] - ETA: 31s - loss: 2.5967 - acc: 0.3467

 300/7500 [>.............................] - ETA: 30s - loss: 2.5814 - acc: 0.3567

 450/7500 [>.............................] - ETA: 29s - loss: 2.7005 - acc: 0.3067

 600/7500 [=>............................] - ETA: 29s - loss: 2.7183 - acc: 0.3083

 750/7500 [==>...........................] - ETA: 28s - loss: 2.7182 - acc: 0.3187

 900/7500 [==>...........................] - ETA: 27s - loss: 2.7407 - acc: 0.3111

1050/7500 [===>..........................] - ETA: 27s - loss: 2.7251 - acc: 0.3267

1200/7500 [===>..........................] - ETA: 26s - loss: 2.7334 - acc: 0.3283

1350/7500 [====>.........................] - ETA: 26s - loss: 2.7193 - acc: 0.3341

1500/7500 [=====>........................] - ETA: 25s - loss: 2.7124 - acc: 0.3360

1650/7500 [=====>........................] - ETA: 24s - loss: 2.7024 - acc: 0.3358















































































Epoch 3/10


 150/7500 [..............................] - ETA: 32s - loss: 2.0197 - acc: 0.5000

 300/7500 [>.............................] - ETA: 30s - loss: 1.9783 - acc: 0.5000

 450/7500 [>.............................] - ETA: 30s - loss: 2.0382 - acc: 0.4911

 600/7500 [=>............................] - ETA: 29s - loss: 2.0717 - acc: 0.4900

 750/7500 [==>...........................] - ETA: 28s - loss: 2.0578 - acc: 0.5040

 900/7500 [==>...........................] - ETA: 28s - loss: 2.0766 - acc: 0.5033

1050/7500 [===>..........................] - ETA: 27s - loss: 2.0375 - acc: 0.5124

1200/7500 [===>..........................] - ETA: 26s - loss: 2.0159 - acc: 0.5133

1350/7500 [====>.........................] - ETA: 26s - loss: 2.0203 - acc: 0.5148

1500/7500 [=====>........................] - ETA: 25s - loss: 1.9940 - acc: 0.5213

1650/7500 [=====>........................] - ETA: 24s - loss: 1.9968 - acc: 0.5236















































































Epoch 4/10


 150/7500 [..............................] - ETA: 37s - loss: 1.6506 - acc: 0.6133

 300/7500 [>.............................] - ETA: 36s - loss: 1.5498 - acc: 0.6400

 450/7500 [>.............................] - ETA: 33s - loss: 1.5262 - acc: 0.6578

 600/7500 [=>............................] - ETA: 31s - loss: 1.5620 - acc: 0.6517

 750/7500 [==>...........................] - ETA: 30s - loss: 1.5571 - acc: 0.6507

 900/7500 [==>...........................] - ETA: 29s - loss: 1.5460 - acc: 0.6467

1050/7500 [===>..........................] - ETA: 28s - loss: 1.5810 - acc: 0.6305

1200/7500 [===>..........................] - ETA: 27s - loss: 1.5906 - acc: 0.6300

1350/7500 [====>.........................] - ETA: 26s - loss: 1.5828 - acc: 0.6348

1500/7500 [=====>........................] - ETA: 26s - loss: 1.5847 - acc: 0.6353

1650/7500 [=====>........................] - ETA: 25s - loss: 1.5999 - acc: 0.6285















































































Epoch 5/10


 150/7500 [..............................] - ETA: 30s - loss: 1.2660 - acc: 0.7000

 300/7500 [>.............................] - ETA: 29s - loss: 1.1917 - acc: 0.7233

 450/7500 [>.............................] - ETA: 29s - loss: 1.1965 - acc: 0.7311

 600/7500 [=>............................] - ETA: 28s - loss: 1.1988 - acc: 0.7183

 750/7500 [==>...........................] - ETA: 27s - loss: 1.2078 - acc: 0.7173

 900/7500 [==>...........................] - ETA: 27s - loss: 1.2348 - acc: 0.7067

1050/7500 [===>..........................] - ETA: 26s - loss: 1.2683 - acc: 0.7038

1200/7500 [===>..........................] - ETA: 26s - loss: 1.2713 - acc: 0.7050

1350/7500 [====>.........................] - ETA: 25s - loss: 1.2434 - acc: 0.7126

1500/7500 [=====>........................] - ETA: 24s - loss: 1.2204 - acc: 0.7213

1650/7500 [=====>........................] - ETA: 24s - loss: 1.2175 - acc: 0.7273















































































Epoch 6/10


 150/7500 [..............................] - ETA: 32s - loss: 1.0303 - acc: 0.7933

 300/7500 [>.............................] - ETA: 31s - loss: 1.0122 - acc: 0.7867

 450/7500 [>.............................] - ETA: 30s - loss: 0.9634 - acc: 0.7889

 600/7500 [=>............................] - ETA: 29s - loss: 0.9575 - acc: 0.7883

 750/7500 [==>...........................] - ETA: 28s - loss: 0.9400 - acc: 0.7907

 900/7500 [==>...........................] - ETA: 28s - loss: 0.9544 - acc: 0.7856

1050/7500 [===>..........................] - ETA: 27s - loss: 0.9715 - acc: 0.7857

1200/7500 [===>..........................] - ETA: 26s - loss: 0.9749 - acc: 0.7850

1350/7500 [====>.........................] - ETA: 26s - loss: 0.9702 - acc: 0.7852

1500/7500 [=====>........................] - ETA: 25s - loss: 0.9673 - acc: 0.7847

1650/7500 [=====>........................] - ETA: 24s - loss: 0.9680 - acc: 0.7867















































































Epoch 7/10


 150/7500 [..............................] - ETA: 31s - loss: 0.5724 - acc: 0.8733

 300/7500 [>.............................] - ETA: 30s - loss: 0.6688 - acc: 0.8633

 450/7500 [>.............................] - ETA: 30s - loss: 0.7122 - acc: 0.8511

 600/7500 [=>............................] - ETA: 29s - loss: 0.7658 - acc: 0.8283

 750/7500 [==>...........................] - ETA: 29s - loss: 0.7527 - acc: 0.8333

 900/7500 [==>...........................] - ETA: 28s - loss: 0.7460 - acc: 0.8378

1050/7500 [===>..........................] - ETA: 27s - loss: 0.7616 - acc: 0.8324

1200/7500 [===>..........................] - ETA: 26s - loss: 0.7593 - acc: 0.8333

1350/7500 [====>.........................] - ETA: 26s - loss: 0.7468 - acc: 0.8356

1500/7500 [=====>........................] - ETA: 25s - loss: 0.7626 - acc: 0.8293

1650/7500 [=====>........................] - ETA: 24s - loss: 0.7633 - acc: 0.8309















































































Epoch 8/10


 150/7500 [..............................] - ETA: 31s - loss: 0.5462 - acc: 0.9000

 300/7500 [>.............................] - ETA: 30s - loss: 0.5844 - acc: 0.8800

 450/7500 [>.............................] - ETA: 29s - loss: 0.5664 - acc: 0.8867

 600/7500 [=>............................] - ETA: 29s - loss: 0.5885 - acc: 0.8733

 750/7500 [==>...........................] - ETA: 28s - loss: 0.5946 - acc: 0.8747

 900/7500 [==>...........................] - ETA: 28s - loss: 0.5903 - acc: 0.8722

1050/7500 [===>..........................] - ETA: 27s - loss: 0.5814 - acc: 0.8762

1200/7500 [===>..........................] - ETA: 26s - loss: 0.5799 - acc: 0.8783

1350/7500 [====>.........................] - ETA: 25s - loss: 0.5691 - acc: 0.8815

1500/7500 [=====>........................] - ETA: 25s - loss: 0.5703 - acc: 0.8800

1650/7500 [=====>........................] - ETA: 24s - loss: 0.5818 - acc: 0.8776















































































Epoch 9/10


 150/7500 [..............................] - ETA: 32s - loss: 0.4082 - acc: 0.9067

 300/7500 [>.............................] - ETA: 32s - loss: 0.4357 - acc: 0.9000

 450/7500 [>.............................] - ETA: 30s - loss: 0.4079 - acc: 0.9178

 600/7500 [=>............................] - ETA: 29s - loss: 0.4141 - acc: 0.9133

 750/7500 [==>...........................] - ETA: 29s - loss: 0.4158 - acc: 0.9133

 900/7500 [==>...........................] - ETA: 28s - loss: 0.4570 - acc: 0.9078

1050/7500 [===>..........................] - ETA: 27s - loss: 0.4553 - acc: 0.9095

1200/7500 [===>..........................] - ETA: 26s - loss: 0.4625 - acc: 0.9050

1350/7500 [====>.........................] - ETA: 25s - loss: 0.4538 - acc: 0.9089

1500/7500 [=====>........................] - ETA: 25s - loss: 0.4546 - acc: 0.9093

1650/7500 [=====>........................] - ETA: 24s - loss: 0.4641 - acc: 0.9067















































































Epoch 10/10


 150/7500 [..............................] - ETA: 35s - loss: 0.4008 - acc: 0.9267

 300/7500 [>.............................] - ETA: 32s - loss: 0.3512 - acc: 0.9200

 450/7500 [>.............................] - ETA: 31s - loss: 0.3357 - acc: 0.9244

 600/7500 [=>............................] - ETA: 30s - loss: 0.3485 - acc: 0.9217

 750/7500 [==>...........................] - ETA: 30s - loss: 0.3455 - acc: 0.9253

 900/7500 [==>...........................] - ETA: 29s - loss: 0.3354 - acc: 0.9311

1050/7500 [===>..........................] - ETA: 28s - loss: 0.3430 - acc: 0.9305

1200/7500 [===>..........................] - ETA: 27s - loss: 0.3528 - acc: 0.9258

1350/7500 [====>.........................] - ETA: 26s - loss: 0.3524 - acc: 0.9267

1500/7500 [=====>........................] - ETA: 25s - loss: 0.3440 - acc: 0.9273

1650/7500 [=====>........................] - ETA: 25s - loss: 0.3402 - acc: 0.9273















































































<keras.callbacks.History at 0x11a9df5c0>

In [2]:
import pandas as pd
import random

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPool2D, MaxPool1D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from sklearn.utils import shuffle

data = pd.read_pickle('dataframe.pkl')
data = data.sample(frac=1).reset_index(drop=True)

all_words = [word for tokens in data['Tokens'] for word in tokens]
sentence_lengths = [len(tokens) for tokens in data['Tokens']]
max_sentence_length = max(sentence_lengths)
vocabulary = sorted(list(set(all_words)))
words_total=len(all_words)
vocabulary_size=len(vocabulary)
print("%s всего слов, размер словаря %s" % (words_total, vocabulary_size))
print("Макс длина текстов %s" % max_sentence_length)

summaries = data['Tokens'].tolist()
products = data['ProductIndex'].tolist()

products_number=len(list(set(products)))+1 # так как индекса идут с 1, для top10 было с 0 и прибавлять ничего не надо
print("Всего категорий:", products_number)
tokenizer = Tokenizer(vocabulary_size)
tokenizer.fit_on_texts(summaries)
textSequences = tokenizer.texts_to_sequences(summaries)

textSequences = pad_sequences(textSequences, maxlen=max_sentence_length)
#[random.shuffle(sentence) for sentence in textSequences]

X_train, X_test, y_train, y_test = train_test_split(textSequences, products, random_state=0)
print(len(X_train))
print(len(X_test))
print(X_train[0:5])
print(X_test[0:5])

print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

y_train = to_categorical(y_train, products_number)
y_test = to_categorical(y_test, products_number)
print('y_train форма:', y_train.shape)
print('y_test форма:', y_test.shape)

embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 10
batch_size = 150

inputs = Input(shape=(max_sentence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size+1, output_dim=embedding_dim, input_length=max_sentence_length)(inputs)

conv_0 = Conv1D(num_filters, kernel_size=filter_sizes[0], padding='valid', kernel_initializer='normal',
                activation='relu')(embedding)
conv_1 = Conv1D(num_filters, kernel_size=filter_sizes[1], padding='valid', kernel_initializer='normal',
                activation='relu')(embedding)
conv_2 = Conv1D(num_filters, kernel_size=filter_sizes[2], padding='valid', kernel_initializer='normal',
                activation='relu')(embedding)
#conv_3 = Conv1D(num_filters, 8, activation='relu')(embedding)
#conv_4 = Conv1D(num_filters, 5, activation='relu')(embedding)
#conv_5 = Conv1D(num_filters, 3, activation='relu')(embedding)

maxpool_0 = MaxPool1D(pool_size=(max_sentence_length - filter_sizes[0] + 1), strides=1, padding='valid')(conv_0)
maxpool_1 = MaxPool1D(pool_size=(max_sentence_length - filter_sizes[1] + 1), strides=1, padding='valid')(conv_1)
maxpool_2 = MaxPool1D(pool_size=(max_sentence_length - filter_sizes[2] + 1), strides=1, padding='valid')(conv_2)
#maxpool_3 = MaxPool1D(pool_size=8)(conv_3)
#maxpool_4 = MaxPool1D(pool_size=5)(conv_4)
#maxpool_5 = MaxPool1D(pool_size=3)(conv_5)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(products_number, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)


#model = Sequential()
#model.add(Embedding(vocabulary_size+1, embedding_dim, input_length=max_sentence_length))
#model.add(Conv1D(num_filters, 7, activation='relu'))
#model.add(MaxPooling1D(3))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(MaxPooling1D(3))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(Conv1D(num_filters, 3, activation='relu'))
#model.add(GlobalAveragePooling1D())
#model.add(Dropout(drop))
#model.add(Dense(products_number, activation='softmax'))



checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
                             mode='auto')
#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# А здесь особенность keras - можно использовать и binary  и categorical - но тогда надо правильно задавать metrics
# Вот тут про это подробно
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print("Traning Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_test, y_test))  # starts training



83058 всего слов, размер словаря 8622
Макс длина текстов 37
Всего категорий: 79


7500
2500
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0   49   60  247  437 2016 5871    1  274]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
  4904 3572 1228    3 4905 1228  224 1156 1799]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    7 1938 1785  571   84  733]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0  541    8 2222  268  411   90]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0  183  209  363    4  159  936    4   51
   936   51  936    4  909  189 1192  403  730]]
[[   0    0    0    0    0   

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


Traning Model...


Train on 7500 samples, validate on 2500 samples
Epoch 1/10


 150/7500 [..............................] - ETA: 1:33 - loss: 4.3950 - acc: 0.0000e+00

 300/7500 [>.............................] - ETA: 1:15 - loss: 4.1547 - acc: 0.0933    

 450/7500 [>.............................] - ETA: 1:08 - loss: 3.9164 - acc: 0.1356

 600/7500 [=>............................] - ETA: 1:04 - loss: 3.7488 - acc: 0.1483

 750/7500 [==>...........................] - ETA: 1:02 - loss: 3.6389 - acc: 0.1653

 900/7500 [==>...........................] - ETA: 59s - loss: 3.5267 - acc: 0.1800 

1050/7500 [===>..........................] - ETA: 58s - loss: 3.4531 - acc: 0.1886

1200/7500 [===>..........................] - ETA: 56s - loss: 3.4022 - acc: 0.1942

1350/7500 [====>.........................] - ETA: 55s - loss: 3.3706 - acc: 0.1956

1500/7500 [=====>........................] - ETA: 53s - loss: 3.3502 - acc: 0.1933

1650/7500 [=====>........................] - ETA: 51s - loss: 3.3355 - acc: 0.1933















































































Epoch 2/10


 150/7500 [..............................] - ETA: 1:08 - loss: 2.4735 - acc: 0.3667

 300/7500 [>.............................] - ETA: 1:03 - loss: 2.5263 - acc: 0.3833

 450/7500 [>.............................] - ETA: 1:01 - loss: 2.4671 - acc: 0.3978

 600/7500 [=>............................] - ETA: 1:00 - loss: 2.4866 - acc: 0.3783

 750/7500 [==>...........................] - ETA: 58s - loss: 2.4709 - acc: 0.3947 

 900/7500 [==>...........................] - ETA: 57s - loss: 2.4242 - acc: 0.4078

1050/7500 [===>..........................] - ETA: 57s - loss: 2.4118 - acc: 0.4181

1200/7500 [===>..........................] - ETA: 56s - loss: 2.3989 - acc: 0.4233

1350/7500 [====>.........................] - ETA: 54s - loss: 2.3841 - acc: 0.4319

1500/7500 [=====>........................] - ETA: 52s - loss: 2.3758 - acc: 0.4333

1650/7500 [=====>........................] - ETA: 50s - loss: 2.3609 - acc: 0.4388















































































Epoch 3/10


 150/7500 [..............................] - ETA: 1:07 - loss: 1.7294 - acc: 0.5933

 300/7500 [>.............................] - ETA: 1:02 - loss: 1.6246 - acc: 0.6167

 450/7500 [>.............................] - ETA: 1:01 - loss: 1.6142 - acc: 0.6222

 600/7500 [=>............................] - ETA: 1:04 - loss: 1.6344 - acc: 0.6283

 750/7500 [==>...........................] - ETA: 1:06 - loss: 1.5980 - acc: 0.6413

 900/7500 [==>...........................] - ETA: 1:04 - loss: 1.6134 - acc: 0.6311

1050/7500 [===>..........................] - ETA: 1:02 - loss: 1.6354 - acc: 0.6238

1200/7500 [===>..........................] - ETA: 1:01 - loss: 1.6173 - acc: 0.6233

1350/7500 [====>.........................] - ETA: 58s - loss: 1.6030 - acc: 0.6296 

1500/7500 [=====>........................] - ETA: 57s - loss: 1.5977 - acc: 0.6287

1650/7500 [=====>........................] - ETA: 55s - loss: 1.5901 - acc: 0.6309















































































Epoch 4/10


 150/7500 [..............................] - ETA: 1:05 - loss: 1.2209 - acc: 0.7467

 300/7500 [>.............................] - ETA: 1:02 - loss: 1.1321 - acc: 0.7567

 450/7500 [>.............................] - ETA: 1:00 - loss: 1.1656 - acc: 0.7578

 600/7500 [=>............................] - ETA: 1:02 - loss: 1.1714 - acc: 0.7517

 750/7500 [==>...........................] - ETA: 1:04 - loss: 1.1540 - acc: 0.7427

 900/7500 [==>...........................] - ETA: 1:03 - loss: 1.1600 - acc: 0.7433

1050/7500 [===>..........................] - ETA: 1:03 - loss: 1.1836 - acc: 0.7381

1200/7500 [===>..........................] - ETA: 1:01 - loss: 1.1560 - acc: 0.7442

1350/7500 [====>.........................] - ETA: 1:02 - loss: 1.1357 - acc: 0.7452

1500/7500 [=====>........................] - ETA: 1:01 - loss: 1.1259 - acc: 0.7467

1650/7500 [=====>........................] - ETA: 1:01 - loss: 1.1634 - acc: 0.7352















































































Epoch 5/10


 150/7500 [..............................] - ETA: 1:16 - loss: 0.7011 - acc: 0.8467

 300/7500 [>.............................] - ETA: 1:22 - loss: 0.7420 - acc: 0.8567

 450/7500 [>.............................] - ETA: 1:18 - loss: 0.8908 - acc: 0.8222

 600/7500 [=>............................] - ETA: 1:15 - loss: 0.8746 - acc: 0.8183

 750/7500 [==>...........................] - ETA: 1:10 - loss: 0.8619 - acc: 0.8147

 900/7500 [==>...........................] - ETA: 1:07 - loss: 0.8463 - acc: 0.8167

1050/7500 [===>..........................] - ETA: 1:04 - loss: 0.8354 - acc: 0.8238

1200/7500 [===>..........................] - ETA: 1:02 - loss: 0.8208 - acc: 0.8258

1350/7500 [====>.........................] - ETA: 1:00 - loss: 0.8354 - acc: 0.8193

1500/7500 [=====>........................] - ETA: 57s - loss: 0.8448 - acc: 0.8173 

1650/7500 [=====>........................] - ETA: 57s - loss: 0.8369 - acc: 0.8182





































































