In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

In [3]:
import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm

In [4]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

In [5]:
engine = create_engine('postgresql+psycopg2://postgres:mypassword@192.168.1.174/process_text_processing')

In [21]:
data = pd.read_sql("""
    SELECT "text" FROM textdocuments WHERE "text" ~ '^[а-яА-Я[:punct:]\s]+$' OFFSET 1000 LIMIT 1000
""", engine)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [23]:
corpus = [t.split("\n") for t in data.text]
raw_text = " ".join(data.text)

In [9]:
chars = sorted(list(set(raw_text)))

In [8]:
del data

In [67]:
tokenizer = Tokenizer(
    lower=True,
    num_words = 100000
)

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [68]:
print(total_words)
total_words = 100000

177274


In [None]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [11]:
# pad sequences 
# max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len = 40
print(max_sequence_len)

40


In [12]:
input_sequences = np.array(pad_sequences(input_sequences[:300000], maxlen=max_sequence_len, padding='pre'))

In [None]:
# create predictors and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(xs, ys, epochs=500, verbose=1)

In [None]:
plot_graphs(history, 'accuracy')

In [None]:
seed_text = "Laurence went to dublin"
next_words = 100
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict_classes(token_list, verbose=0)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

----------

In [10]:
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [24]:
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1120797
Total Vocab:  88


In [27]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in tqdm(range(0, n_chars - seq_length, 1)):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int.get(char, 0) for char in seq_in])
    dataY.append(char_to_int.get(seq_out, 0))
    
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

100%|██████████| 1120697/1120697 [00:15<00:00, 70937.69it/s]

Total Patterns:  1120697





In [28]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = tf.keras.utils.to_categorical(dataY)

In [13]:
del dataX

In [14]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [None]:
# load the network weights
# filename = "weights-improvement-19-1.9435.hdf5"
# model.load_weights(filename)

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [17]:
history = model.fit(X, y, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
plot_graphs(history, 'accuracy')

In [13]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
history = model.fit(X, y, epochs=5, verbose=1, batch_size=50)

Epoch 1/5
  671/22414 [..............................] - ETA: 1:28:43 - loss: 1.8995

In [15]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
" д Киева вынес решение о признании свидетельства на знак для товаров и услуг 'Сбербанк', владельцем к "


In [16]:
composition = ""

In [17]:
# generate characters
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    composition += result
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print( "\nDone.")


Done.


In [19]:
seq_in

['в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в',
 'к',
 'и',
 ' ',
 'п',
 'о',
 'с',
 'т',
 'а',
 'в']

In [18]:
composition

'ак поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки поставки постав