In [1]:
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt

# Задание 1

In [2]:
# предобработка и сплит на выборки

data = pd.read_csv('lenta_40k.csv.zip')

def preprocess(text):
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return tokens

# словарь
vocab = Counter()
for text in data.text:
    vocab.update(preprocess(text))

# отфильтрованный словарь
filtered_vocab = set()
for word in vocab:
    if vocab[word] > 30:
        filtered_vocab.add(word)

# индексируем слова
word2id = {'PAD':0}
for word in filtered_vocab:
    word2id[word] = len(word2id)
id2word = {i:word for word, i in word2id.items()}

# переводим тексты в последовательности индексов
X = []
for text in data.text:
    tokens = preprocess(text)
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)


MAX_LEN = max(len(x) for x in X)
MEAN_LEN = np.median([len(x) for x in X])
MAX_LEN = int(MEAN_LEN + 30)

# паддинг
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

id2label = {i:label for i, label in enumerate(set(data.topic.values))}
label2id = {l:i for i, l in id2label.items()}

y = keras.utils.to_categorical([label2id[label] for label in data.topic.values])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05)


In [3]:
# callback-параметры

checkpoint = keras.callbacks.ModelCheckpoint('model.weights.h5',
                                                monitor='recall@precision',
                                                verbose=1,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                mode='max',
                                                save_freq='epoch'
                                               )

# для остановки
early_stop = keras.callbacks.EarlyStopping(monitor='val_rec@prec',
                                              min_delta=0.01,
                                              patience=5,
                                           # взяла побольше; пусть терпит,
                                           # потому что господь с ним еще не закончил
                                              verbose=1,
                                              mode='max',
                                              )

In [4]:
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs)

# параллельные слои
convs = []
for ks in [3,4,5,7,10]:
    conv1 = keras.layers.Conv1D(kernel_size=ks, filters=32, padding='same',strides=1)(embeddings)
    conv2 = keras.layers.Conv1D(kernel_size=ks, filters=24, padding='same',strides=1,
                                  kernel_regularizer='l2', activation='relu')(conv1)

    convs.append(conv2)

# конкатенация параллельных слоев
concat = keras.layers.concatenate(convs, axis=2)

conv3 = keras.layers.Conv1D(kernel_size=3, filters=24, padding='same',strides=1,
                                   activation='relu')(concat)

# пулинг
pool = keras.layers.AveragePooling1D(pool_size=5)(conv3)

# делаем дропаут, чтобы избежать переобучения
drop2 = keras.layers.Dropout(0.5)(pool)

# делаем слои плоскими
flatten = keras.layers.Flatten()(drop2)

# делаем слои полносвязными
dense = keras.layers.Dense(50, activation='relu')(flatten)

# выходной слой
outputs = keras.layers.Dense(len(label2id), activation='softmax')(dense)

model = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=[keras.metrics.RecallAtPrecision(0.8, name='rec@prec')],
              )

model.summary()

In [5]:
# пробуем обучать

model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=2000,
         epochs=100,
         callbacks=[checkpoint, early_stop])

  outputs = tnn.conv1d(


Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 231ms/step - loss: 1.4034 - rec@prec: 5.4348e-06 - val_loss: 0.9935 - val_rec@prec: 0.0000e+00
Epoch 2/100


  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 229ms/step - loss: 0.8962 - rec@prec: 2.0764e-05 - val_loss: 0.6426 - val_rec@prec: 0.0032
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 233ms/step - loss: 0.5822 - rec@prec: 2.3641e-04 - val_loss: 0.4229 - val_rec@prec: 0.0424
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 232ms/step - loss: 0.3863 - rec@prec: 0.0512 - val_loss: 0.2912 - val_rec@prec: 0.1344
Epoch 5/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 262ms/step - loss: 0.2662 - rec@prec: 0.1823 - val_loss: 0.2081 - val_rec@prec: 0.3070
Epoch 6/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 235ms/step - loss: 0.1881 - rec@prec: 0.3825 - val_loss: 0.1614 - val_rec@prec: 0.4049
Epoch 7/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 236ms/step - loss: 0.1393 - rec@prec: 0.5147 - val_loss: 0.1277 - val_rec@prec: 0.4919
Epoch 8/100
[1m22/22[0m 

<keras.src.callbacks.history.History at 0x7cc57dfc53d0>

# Задание 2

In [None]:
# не шмог :(