In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, SpatialDropout1D, LSTM, Dense, Dropout
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow_addons.metrics import F1Score
from gensim.models import KeyedVectors

# Sanity check: report how many GPU devices TensorFlow can see before training.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
# Load the three saved arrays from the working directory, in the same order as
# before: training inputs, training targets, test inputs.
# NOTE(review): presumably train/test hold padded token-id sequences and labels
# holds integer class ids -- confirm against the preprocessing notebook.
train, labels, test = (
    np.load(path) for path in ('train.npy', 'target.npy', 'test.npy')
)

In [None]:
# Category names, one per output unit of the classifier (41 entries).
# The position of a name in this list is the class id used to decode predictions.
# NOTE(review): presumably this order matches the label encoding used to build
# target.npy -- confirm against the preprocessing step.
classes = [
    'CATEGORY:LVL1:IT',
    'CATEGORY:LVL1:covid19',
    'CATEGORY:LVL1:авто',
    'CATEGORY:LVL1:бизнесифинансы',
    'CATEGORY:LVL1:военнаятехника',
    'CATEGORY:LVL1:выгодныепокупки',
    'CATEGORY:LVL1:гаджеты',
    'CATEGORY:LVL1:громкиепроисшествия',
    'CATEGORY:LVL1:детииматеринство',
    'CATEGORY:LVL1:еда',
    'CATEGORY:LVL1:животные',
    'CATEGORY:LVL1:здоровье',
    'CATEGORY:LVL1:знаменитости',
    'CATEGORY:LVL1:игрыикиберспорт',
    'CATEGORY:LVL1:интернетмаркетинг',
    'CATEGORY:LVL1:история',
    'CATEGORY:LVL1:карьера',
    'CATEGORY:LVL1:киноисериалы',
    'CATEGORY:LVL1:лайфстайл',
    'CATEGORY:LVL1:модаистиль',
    'CATEGORY:LVL1:наука',
    'CATEGORY:LVL1:недвижимость',
    'CATEGORY:LVL1:общество',
    'CATEGORY:LVL1:оружие',
    'CATEGORY:LVL1:охотаирыбалка',
    'CATEGORY:LVL1:полезныесоветы',
    'CATEGORY:LVL1:политика',
    'CATEGORY:LVL1:праваизаконы',
    'CATEGORY:LVL1:правильноепитание',
    'CATEGORY:LVL1:психология',
    'CATEGORY:LVL1:путешествия',
    'CATEGORY:LVL1:ремонтистроительство',
    'CATEGORY:LVL1:рукоделиеихэндмэйд',
    'CATEGORY:LVL1:садоводство',
    'CATEGORY:LVL1:саморазвитие',
    'CATEGORY:LVL1:спорт',
    'CATEGORY:LVL1:тесты',
    'CATEGORY:LVL1:технологиииизобретения',
    'CATEGORY:LVL1:уходикосметика',
    'CATEGORY:LVL1:фитнес',
    'CATEGORY:LVL1:экология',
]

# Reverse lookup: class id (list position) -> category name.
indices = dict(enumerate(classes))

In [None]:
# Shuffle samples and labels together with ONE shared permutation.
# The previous approach (re-seed, then shuffle each array separately) only keeps
# rows paired when both arrays have exactly the same length; a mismatch would
# silently misalign samples and labels. Fail loudly instead.
seed = 42
if len(train) != len(labels):
    raise ValueError(
        f'train and labels must have the same number of rows, '
        f'got {len(train)} and {len(labels)}'
    )

np.random.seed(seed)
shuffle_order = np.random.permutation(len(train))

# Shuffling up front matters because Keras' `validation_split` (used at fit time)
# slices the validation set from the END of the arrays before its own shuffling.
train = train[shuffle_order]
labels = labels[shuffle_order]

In [None]:
# One-hot encode the integer class ids.
# `num_classes` is pinned to the size of the class list: without it the width is
# inferred from the largest label present, so a missing top class id would yield
# fewer columns than the model's output layer expects.
# NOTE: this cell is not idempotent -- re-running it on already one-hot labels
# would encode them a second time; restart from the load cell instead.
labels = utils.to_categorical(labels, num_classes=len(classes))

In [None]:
# Restore the fitted Keras tokenizer. The file is opened in a `with` block so the
# handle is closed deterministically, and with an explicit encoding so loading
# does not depend on the platform default.
with open('tokenizer.json', 'r', encoding='utf-8') as tokenizer_file:
    tokenizer = tokenizer_from_json(tokenizer_file.read())

# Pretrained word vectors in word2vec text format.
wv = KeyedVectors.load_word2vec_format('w2v.txt')

# One row per tokenizer index. The +1 adds row 0, which no word_index entry is
# written to here (presumably the padding index -- confirm with the preprocessing).
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, wv.vector_size))

# Words from the tokenizer vocabulary that have no pretrained vector; their rows
# stay all-zero.
unknown = []
for word, i in tokenizer.word_index.items():
    if word in wv:
        embedding_matrix[i] = wv[word]
    else:
        unknown.append(word)

# Displayed as the cell output: number of out-of-vocabulary words.
len(unknown)

In [None]:
# Dimensions for the Embedding layer, read off the pretrained matrix
# (rows = vocabulary size, columns = vector size) and the first training sample.
num_words = embedding_matrix.shape[0]
embed_dim = embedding_matrix.shape[1]
seq_len = len(train[0])
print('num_words', num_words, 'embed_dim', embed_dim, 'seq_len', seq_len)

In [None]:
# Number of output units / F1 classes, derived from the class list instead of a
# hard-coded 41 so the model stays in sync if the list changes.
num_classes = len(classes)

model = Sequential()

# Frozen embedding layer initialised from the pretrained word vectors.
model.add(Embedding(num_words, embed_dim, weights=[embedding_matrix], input_length=seq_len, trainable=False))
model.add(SpatialDropout1D(0.25))

# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second can consume it.
# NOTE(review): per the Keras LSTM docs a non-zero `recurrent_dropout` rules out
# the fast cuDNN kernel, which makes 1000-unit layers slow on GPU -- consider
# whether it is worth the cost.
model.add(Bidirectional(LSTM(1000, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(1000, dropout=0.2, recurrent_dropout=0.2)))

# Dense classification head.
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.33))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.33))
model.add(Dense(128, activation='relu'))

# Single-label, multi-class output: softmax yields a probability distribution
# over the classes, which is what categorical cross-entropy expects. (The former
# sigmoid activation produced independent per-class scores that do not sum to 1.)
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', F1Score(num_classes, 'weighted', name='f1')])

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the
# model built above, as a quick check of the architecture before training.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 67, 100)           55814700  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 67, 100)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 67, 2000)          8808000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 2000)              24008000  
_________________________________________________________________
dense (Dense)                (None, 256)               512256    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               3

In [None]:
# Training hyper-parameters. `batch_size` is reused by the prediction cell.
epochs = 30
batch_size = 1000
validation_split = 0.1

# Multiply the learning rate by 0.3 whenever validation loss has not improved
# for 2 epochs, never going below 1e-5.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

# Train on the shuffled data, holding out `validation_split` of it for validation.
history = model.fit(
    train,
    labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    shuffle=True,
    callbacks=[lr_on_plateau],
)

In [None]:
# Build the submission file: one predicted category name per test document.
# The doc_id frame gets its own name; previously it was assigned to `test`,
# which overwrote the tokenized test array and caused `model.predict` to be fed
# the doc_id column instead of the model inputs.
submission = pd.read_csv('test.csv', usecols=['doc_id'])

# Predict on the tokenized test sequences loaded from test.npy.
# NOTE(review): assumes rows of test.npy are in the same order as test.csv -- confirm.
probabilities = model.predict(test, batch_size=batch_size, verbose=1)
if len(probabilities) != len(submission):
    raise ValueError(
        f'prediction count ({len(probabilities)}) does not match the number of '
        f'doc_ids in test.csv ({len(submission)})'
    )

# Map the highest-scoring class id of each row back to its category name.
submission['target'] = [indices[class_id] for class_id in np.argmax(probabilities, axis=1)]
submission.to_csv('res.csv', index=False)