# У меня очистились все аутпуты
Но весь код должен запускаться, если есть _keras_, _tensorflow_ и _gensim_. _Requirements_ не делал, потому что в итоге в прод это не пошло.

In [23]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from nltk import word_tokenize, sent_tokenize

import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model as keras_load_model
from keras.layers import Dense, Flatten, Embedding, LSTM, SpatialDropout1D, Input, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.metrics import AUC
import tensorflow as tf

from multiprocessing import cpu_count
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

from typing import List, Tuple
from functools import partial
from numba import jit
from gensim.models import Word2Vec

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

In [7]:
train_clean = '../data/train_preprocessed.csv'
val_clean = '../data/val_preprocessed.csv'

train_dirty = '../data/train.csv'
val_dirty = '../data/val.csv'

In [8]:
from functools import wraps
from time import perf_counter

def timing(function):
    """Time measuring decorator."""
    @wraps(function)
    def wrap(*args, **kwargs):
        """Wrap function for the timing decorator."""
        time_start = perf_counter()
        function_result = function(*args, **kwargs)
        time_end = perf_counter()
        print(
            "func {a} took: {c:0.5f}s.".format(
                a=function.__name__,
                c=time_end - time_start,
            )
        )
        return function_result
    return wrap

In [24]:
class BiLSTM_maker:
    def __init__(self,
                 train_path: str,
                 val_path: str,
                 emb_dims: int,
                 min_words_count: int,
                 window_size: int,
                 max_len_tokenizer: int,
                 lstm_epochs: int,
                 lstm_batch_size: int,         
        ):
        self.train_path = train_path
        self.val_path = val_path
        self.emb_dims = emb_dims
        self.min_words_count = min_words_count
        self.window_size = window_size
        self.max_len_tokenizer = max_len_tokenizer
        self.lstm_epochs = lstm_epochs
        self.lstm_batch_size = lstm_batch_size

    @timing
    def get_dataset(self, path: str) -> Tuple[List, pd.DataFrame]:
        df = pd.read_csv(path)
        
        descriptions = df['title_and_description'].fillna("").str.split().values
        return df['is_bad'], descriptions

    @timing
    def train_word2vec(self, data, path: str):
        self.w2v = Word2Vec(data, vector_size=self.emb_dims, min_count=self.min_words_count, window=self.window_size)
        self.w2v.save(path)

    def load_word2vec(self, path: str):
        self.w2v = Word2Vec.load(path)

    def train_tokenizer(self, data):
        self.tokenizer = Tokenizer(num_words=self.max_len_tokenizer, lower=False)
        self.tokenizer.fit_on_texts(data)

    @timing
    def tokenize_dataset(self, dataset):
        sequences = self.tokenizer.texts_to_sequences(dataset)
        return pad_sequences(sequences, maxlen=self.max_len_tokenizer)

    def create_embedding_matrix(self):
        self.vocab_size = len(self.tokenizer.word_index) + 1
        self.embedding_matrix = np.zeros((self.vocab_size, self.emb_dims))

        for word, i in tqdm(self.tokenizer.word_index.items()):
            try:
                self.embedding_matrix[i] = self.w2v.wv[word]
            except KeyError:
                continue


    @timing
    def create_BiLSTM(self):
        self.BiLSTM = Sequential()
        self.BiLSTM.add(Embedding(input_dim=self.vocab_size,
                                  output_dim=self.emb_dims,
                                  input_length=self.max_len_tokenizer,
                                  weights = [self.embedding_matrix],
                                  ))
        self.BiLSTM.add(Bidirectional(LSTM(64, dropout=0.25, recurrent_dropout=0.1)))
        self.BiLSTM.add(Dense(10))
        self.BiLSTM.add(Dropout(0.3))
        self.BiLSTM.add(Dense(1, activation='sigmoid'))
        
        self.learning_rate_reduction = ReduceLROnPlateau(monitor="val_auc", 
                                                         patience=3, 
                                                         verbose=1, 
                                                         factor=0.5, 
                                                         min_lr=0.00001,
                                                         )
        self.BiLSTM.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=AUC())
        del self.embedding_matrix

    @timing
    def train_BiLSTM(self, path: str):
        checkpoint = ModelCheckpoint(path,
                                     monitor="val_auc",
                                     verbose=1,
                                     save_best_only=True,
                                     mode='auto',
                                     period=1,
        )
        self.BiLSTM.fit(self.data_train,
                        self.y_train,
                        epochs=self.lstm_epochs,
                        batch_size=self.lstm_batch_size,
                        validation_data=(self.data_val, self.y_val),
                        callbacks=[self.learning_rate_reduction, checkpoint],
        )
        
        self.BiLSTM.save(path)

    def load_BiLSTM(self, path: str):
        self.BiLSTM = keras_load_model(path)

    @timing
    def text_saver(self):
        train = pd.read_csv(self.train_path)
        train["LSTM_result"] = self.BiLSTM.predict_proba(self.data_train)
        train.to_csv(self.train_path)
        del train
        
        val = pd.read_csv(self.val_path)
        val["LSTM_result"] = self.BiLSTM.predict_proba(self.data_val)
        val.to_csv(self.val_path)
        del val

    @timing
    def process(self, preload_w2v: bool, w2v_path: str, preload_lstm: bool, lstm_path: str, save_text: bool):
        self.y_train, self.data_train = self.get_dataset(self.train_path)
        self.y_val, self.data_val = self.get_dataset(self.val_path)
        print("datasets loaded")

        if not preload_w2v:
            self.train_word2vec(self.data_train, w2v_path)
            print("w2v trained")
        else:
            self.load_word2vec(w2v_path)
            print("w2v loaded")

        self.train_tokenizer(self.data_train)
        self.data_train = self.tokenize_dataset(self.data_train)
        self.data_val = self.tokenize_dataset(self.data_val)
        print("texts tokenized")

        self.create_embedding_matrix()
        self.create_BiLSTM()
        if not preload_lstm:
            self.train_BiLSTM(lstm_path)
            print("BiLSTM trained")
        else:
            self.load_BiLSTM(lstm_path)
            print("BiLSTM loaded")

        if save_text:
            self.text_saver()
            print("text saved")
        print("end")

    def __del__(self):
        del self.train_path
        del self.val_path
        del self.emb_dims
        del self.min_words_count
        del self.window_size
        del self.max_len_tokenizer
        del self.lstm_epochs
        del self.lstm_batch_size
        del self.learning_rate_reduction
        del self.BiLSTM
        del self.y_train
        del self.data_train
        del self.y_val
        del self.data_val

In [25]:
import datetime
print(datetime.datetime.now())

clean_BiLSTM = BiLSTM_maker(train_clean, val_clean, 128, 200000, 5, 100, 4, 1000)
clean_BiLSTM.process(True, "../lib/word2vec/word2vec_clean.model", False, "../lib/lstm/lstm_clean.h5", False)

2021-07-27 13:10:25.622611


Exception ignored in: <function BiLSTM_maker.__del__ at 0x7fc0c4730700>
Traceback (most recent call last):
  File "<ipython-input-19-fb3e18496fd5>", line 154, in __del__
AttributeError: learning_rate_reduction


func get_dataset took: 13.87005s.
func get_dataset took: 0.24938s.
datasets loaded
w2v loaded
func tokenize_dataset took: 12.86924s.
func tokenize_dataset took: 0.51680s.
texts tokenized


  0%|          | 0/1610389 [00:00<?, ?it/s]

<class 'int'> <class 'int'> <class 'int'> <class 'list'>


NotImplementedError: Cannot convert a symbolic Tensor (bidirectional_2/forward_lstm_2/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [26]:
clean_BiLSTM.embedding_matrix

<tf.Tensor: shape=(1610390, 128), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.4330686 ,  0.77231926,  0.1537738 , ...,  0.22119492,
        -0.08767278,  0.13130182],
       [ 0.11100317,  0.1846495 , -0.19686203, ..., -0.24070218,
         0.2207837 , -0.27435434],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)>

In [None]:
%%time
y_val_bilstm = clean_BiLSTM.BiLSTM.predict_proba(clean_BiLSTM.data_val)

In [None]:
roc_auc = roc_auc_score(clean_BiLSTM.y_val, y_val_bilstm)
print('ROC_AUC score for val data using LogReg:', roc_auc)

In [None]:
def metrics_printer(X_train, X_val, y_train, y_val, model):
    headers = ['Model', 'Dataset', 'Metric', 'Value']
    table = []
    model_name = type(model).__name__
    for dataset, ds_type, labels in zip([X_train, X_val], ["train", "val"], [y_train, y_val]):
        accuracy = accuracy_score(labels, model.predict(dataset))
        labels_pred = model.predict_proba(dataset)[:, 1]
        rocauc = roc_auc_score(labels, labels_pred)
        table.append([model_name, ds_type, "accuracy", accuracy])
        table.append([model_name, ds_type, "AUC", rocauc])
    print(tabulate(table, headers=headers, tablefmt='orgtbl'))

In [None]:
dirty_BiLSTM = BiLSTM(train_dirty, val_dirty)
dirty_BiLSTM.process(False, "../lib/word2vec/word2vec_dirty.model", False, "../lib/lstm/lstm_dirty.h5")
del dirty_BiLSTM

In [None]:
!free -mh