## Домашка (10 баллов)


Обучите модель с минимум 15 слоями, где у каждого слоя разные параметры (Dropout, Conv1d, Pooling и Dense считаются слоями, остальное нет). Как минимум 4 слоя должны быть наложены друг на друга, и как минимум 2 слоя (или последовательности слоев) должны быть параллельными. Должен быть хотя бы один слой каждого типа.

При обучении используйте колбек для отслеживания лучшей модели. Ориентируйтесь на F1-меру. Качество модели не должно быть околонулевым. Если метрики не растут, то попробуйте пообучать подольше или перестроить саму сеть.

Советы: Начните с небольших сетей и постепенно добавляйте, не пытайтесь сразу собрать все слои. Иногда кернел может крашиться просто так или из-за слишком больших матриц.


In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"

import keras

import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Show which Keras version this notebook is running against.
print(keras.__version__)

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the dataset from the zipped CSV; later cells read its `text` and
# `topic` columns.
data = pd.read_csv('lenta_40k.csv.zip')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
def preprocess(text):
    """Split *text* into lowercase tokens with edge punctuation removed.

    The text is lowercased and split on whitespace; ASCII punctuation
    (``string.punctuation``) is stripped from both ends of every piece,
    and pieces that become empty are dropped.

    Returns:
        A list of non-empty token strings in their original order.
    """
    stripped = (piece.strip(punctuation) for piece in text.lower().split())
    return [piece for piece in stripped if piece]

In [None]:
# Count how often every token occurs across the whole corpus.
vocab = Counter(token for text in data.text for token in preprocess(text))

# Keep only the tokens that occur more than 30 times.
filtered_vocab = {word for word, count in vocab.items() if count > 30}

In [None]:
# Display how many tokens passed the frequency filter.
len(filtered_vocab)

In [None]:
# Token <-> id lookup tables.
# Id 0 is reserved for padding and id 1 for out-of-vocabulary tokens.
# Texts are encoded with `word2id.get(token, 1)`, so without the 'UNK'
# entry every unknown token would share id 1 with the first real
# vocabulary word. The upper-case keys cannot clash with vocabulary
# entries because tokens are lowercased during preprocessing.
word2id = {'PAD': 0, 'UNK': 1}

for word in filtered_vocab:
    # The next free id is always the current size of the table.
    word2id[word] = len(word2id)

# Reverse mapping, used to turn ids back into tokens.
id2word = {i: word for word, i in word2id.items()}

In [None]:
# Encode every text as a list of vocabulary ids; tokens that are missing
# from `word2id` fall back to id 1.
X = [
    [word2id.get(token, 1) for token in preprocess(text)]
    for text in data.text
]

In [None]:
# Sequence-length statistics of the encoded texts.
# Despite its name, MEAN_LEN holds the *median* length.
seq_lengths = [len(ids) for ids in X]
MAX_LEN = max(seq_lengths)
MEAN_LEN = np.median(seq_lengths)

In [None]:
# Display the longest and the median sequence length.
MAX_LEN, MEAN_LEN

In [None]:
# Replace the true maximum with a much shorter cap: the median length
# (stored in MEAN_LEN) plus 31 tokens.
# NOTE(review): the rationale for the offset of 31 is not shown here.
MAX_LEN = int(MEAN_LEN + 31)

In [None]:
# Bring every id sequence to exactly MAX_LEN entries. The padding side,
# truncation side and fill value are left at the library defaults
# (presumably zeros added at the front — TODO confirm against Keras docs).
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

In [None]:
# Display the shape of the padded input matrix.
X.shape

In [None]:
# Label <-> id lookup tables.
# The unique topics are sorted before numbering: iterating a bare `set`
# of strings has no guaranteed order between interpreter sessions, which
# would make label ids (and any saved weights) inconsistent across runs.
# `key=str` keeps the sort working even if a non-string value (e.g. a
# missing topic) is present.
id2label = dict(enumerate(sorted(set(data.topic.values), key=str)))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Map each topic to its integer id, then one-hot encode the ids.
label_ids = [label2id[topic] for topic in data.topic.values]
y = keras.utils.to_categorical(label_ids)

In [None]:
# Hold out 5% of the samples for validation.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05)

In [None]:
# Callbacks shared by the training cells below.
# Both monitor the weighted F1 score on the *validation* data
# ('val_f1_score'). Monitoring the training metric ('f1_score') would
# pick the epoch that fits the training set best, not the one that
# generalizes best.
# NOTE(review): the same two objects are passed to every model's fit()
# call, so all models write to the same 'model.weights.h5' file; use a
# separate checkpoint per model if the weights need to be kept apart.

# Save the weights whenever validation F1 reaches a new maximum.
checkpoint = keras.callbacks.ModelCheckpoint(
    'model.weights.h5',
    monitor='val_f1_score',
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
    mode='max',
    save_freq='epoch',
)

# Stop when validation F1 has not improved by at least 0.04 for 3 epochs.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_f1_score',
    min_delta=0.04,
    patience=3,
    verbose=1,
    mode='max',
)

### Модель №1 - 3 слоя

In [None]:
# Model 1: embedding -> one convolution -> flatten -> dense -> softmax.
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(
    input_dim=len(word2id),
    output_dim=30,
)(inputs)

conv = keras.layers.Conv1D(filters=32, kernel_size=10, strides=1)(embeddings)

# Flatten the convolution output into one feature vector per sample.
flat = keras.layers.Flatten()(conv)
hidden = keras.layers.Dense(30, activation='relu')(flat)

# One softmax unit per topic label.
outputs = keras.layers.Dense(len(label2id), activation='softmax')(hidden)

model = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.0005)
metrics = [
    keras.metrics.RecallAtPrecision(0.8, name='rec@prec'),
    keras.metrics.F1Score(average="weighted"),
]
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=metrics,
)

In [None]:
# Train model 1 for up to 10 epochs in batches of 2000 samples, evaluating
# on the held-out split and using the checkpoint / early-stopping callbacks.
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=2000, epochs=10, callbacks=[checkpoint, early_stop])

In [None]:
# Plot the per-epoch training and validation metrics of model 1.
history = model.history.history
print(history.keys())

# Label every curve with its own metric key so the legend shows which
# lines are recall@precision and which are F1 (the previous generic
# 'train' / 'val' labels did not say which metric they belonged to).
for key in ('rec@prec', 'val_rec@prec', 'f1_score', 'val_f1_score'):
    plt.plot(history[key], label=key)

plt.title('model metrics')
plt.ylabel('metric')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Print the layer-by-layer summary of model 1.
model.summary()

### Модель №2 - 6 слоёв

In [None]:
# Model 2: embedding -> two stacked convolutions -> average pooling ->
# flatten -> dense -> dropout -> softmax.
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(
    input_dim=len(word2id),
    output_dim=30,
)(inputs)

conv_a = keras.layers.Conv1D(filters=32, kernel_size=5, strides=1)(embeddings)
# The second convolution moves with stride 3.
conv_b = keras.layers.Conv1D(filters=32, kernel_size=5, strides=3)(conv_a)
pooled = keras.layers.AveragePooling1D(pool_size=10)(conv_b)

# Flatten the pooled output into one feature vector per sample.
flat = keras.layers.Flatten()(pooled)

hidden = keras.layers.Dense(60, activation='relu')(flat)
regularized = keras.layers.Dropout(0.4)(hidden)

# One softmax unit per topic label.
outputs = keras.layers.Dense(len(label2id), activation='softmax')(regularized)

model2 = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.002)
metrics = [
    keras.metrics.RecallAtPrecision(0.8, name='rec@prec'),
    keras.metrics.F1Score(average="weighted"),
]
model2.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=metrics,
)

In [None]:
# Train model 2 for up to 10 epochs in batches of 3000 samples, evaluating
# on the held-out split and using the checkpoint / early-stopping callbacks.
model2.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=3000, epochs=10, callbacks=[checkpoint, early_stop])

In [None]:
# Plot the per-epoch training and validation metrics of model 2.
history = model2.history.history
print(history.keys())

# Label every curve with its own metric key so the legend shows which
# lines are recall@precision and which are F1 (the previous generic
# 'train' / 'val' labels did not say which metric they belonged to).
for key in ('rec@prec', 'val_rec@prec', 'f1_score', 'val_f1_score'):
    plt.plot(history[key], label=key)

plt.title('model metrics')
plt.ylabel('metric')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Print the layer-by-layer summary of model 2.
model2.summary()

### Модель №3 - 15 слоёв

In [None]:
# Model 3: five parallel convolutional branches over shared embeddings,
# concatenated and followed by a common convolution / dense head.
# The trailing "#N" markers count the layers required by the assignment.
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=300)(inputs)

convs = []

# One branch per kernel size. `ks` sets the kernel size of the branch's
# first convolution; previously it was unused, which made all five
# branches structurally identical. Every convolution and pooling uses
# padding='same', so the branch output shape does not depend on `ks` and
# the branches can still be concatenated.
for ks in [3, 4, 5, 7, 10]:
    dropout1 = keras.layers.Dropout(0.2)(embeddings)  #1
    conv1 = keras.layers.Conv1D(kernel_size=ks, filters=32, padding='same', strides=1)(dropout1)  #2
    pool1 = keras.layers.AveragePooling1D(pool_size=3, padding="same")(conv1)  #3
    conv2 = keras.layers.Conv1D(kernel_size=3, filters=32, padding='same', strides=1)(pool1)  #4
    pool2 = keras.layers.AveragePooling1D(pool_size=5, padding="same")(conv2)  #5
    conv3 = keras.layers.Conv1D(kernel_size=4, filters=32, padding='same', strides=1, kernel_regularizer='l2', activation='relu')(pool2)  #6
    pool3 = keras.layers.AveragePooling1D(pool_size=7, padding="same")(conv3)  #7
    dropout2 = keras.layers.Dropout(0.6)(pool3)  #8
    # Dense is applied to the still-3D tensor, i.e. along its last axis.
    dense1 = keras.layers.Dense(40, activation='relu', kernel_regularizer='l2')(dropout2)  #9

    convs.append(dense1)

# Join the branch outputs along the last (feature) axis.
concat = keras.layers.concatenate(convs, axis=2)
conv4 = keras.layers.Conv1D(kernel_size=3, filters=32, strides=1, padding="same")(concat)  #10
pool4 = keras.layers.AveragePooling1D(pool_size=4, padding="same")(conv4)  #11

dropout3 = keras.layers.Dropout(0.1)(pool4)  #12
# Flatten into one feature vector per sample for the dense head.
flat = keras.layers.Flatten()(dropout3)  #13

dense2 = keras.layers.Dense(60, activation='relu', kernel_regularizer='l2')(flat)  #14
dropout4 = keras.layers.Dropout(0.3)(dense2)  #15

# One softmax unit per topic label.
outputs = keras.layers.Dense(len(label2id), activation='softmax')(dropout4)

model3 = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.005)
model3.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=[
        keras.metrics.RecallAtPrecision(0.8, name='rec@prec'),
        keras.metrics.F1Score(average="weighted"),
    ],
)

In [None]:
# Train model 3 for up to 15 epochs in batches of 2000 samples, evaluating
# on the held-out split and using the checkpoint / early-stopping callbacks.
model3.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=2000, epochs=15, callbacks=[checkpoint, early_stop])

In [None]:
# Plot the per-epoch training and validation metrics of model 3.
history = model3.history.history
print(history.keys())

# Label every curve with its own metric key so the legend shows which
# lines are recall@precision and which are F1 (the previous generic
# 'train' / 'val' labels did not say which metric they belonged to).
for key in ('rec@prec', 'val_rec@prec', 'f1_score', 'val_f1_score'):
    plt.plot(history[key], label=key)

plt.title('model metrics')
plt.ylabel('metric')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Print the layer-by-layer summary of model 3.
model3.summary()