<a href="https://colab.research.google.com/github/valerih12/neural-zamay/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [220]:
import pandas as pd 

df = pd.read_csv('lines.csv')
df.head()
df.line

df.describe()

Unnamed: 0,id
count,20056.0
mean,10027.5
std,5789.812835
min,0.0
25%,5013.75
50%,10027.5
75%,15041.25
max,20055.0


In [221]:
import re

def clean_text(sentence):
    sentence = sentence.lower()
    
    sentence = re.sub("\n", "", sentence)

    sentence = re.sub("[-*/()\"’'#/@;:<>{}`+=~|.!?,]", "", sentence)
    
    return sentence

# New Section

In [222]:
df.line = df.line.astype(str).apply(lambda line: clean_text(line))

df.line = df.line.apply(lambda x: x.split())

sample_size=400
df = df.line.sample(n=sample_size)
df.describe()

count                 400
unique                390
top       [нахуй, мирона]
freq                    2
Name: line, dtype: object

In [223]:
x_train = [line[:-1] for line in df]
y_train = [line[1:] for line in df]

x_train[20], y_train[0]

(['любовь', 'и', 'боль', 'shiftdelete'],
 ['man', 'у', 'меня', 'есть', 'релиз', 'с', 'федуком'])

In [224]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df)
x_train = tokenizer.texts_to_sequences(x_train)
y_train = tokenizer.texts_to_sequences(y_train)

In [225]:
lengths = []

for sequence in x_train:
    lengths.append(len(sequence))
    
lengths = pd.Series(lengths)
lengths.describe()


count    400.000000
mean       5.425000
std        2.732772
min        0.000000
25%        4.000000
50%        5.000000
75%        7.000000
max       25.000000
dtype: float64

In [226]:
word2idx = tokenizer.word_index
idx2word = {value: key for key, value in word2idx.items()}


word2idx["<pad>"] = 0
idx2word[0] = "<pad>"

In [227]:
maxlen = 1000
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
print(vocab_size)


x_train = pad_sequences(x_train, maxlen=maxlen, padding='post', truncating='post')
y_train = pad_sequences(y_train, maxlen=maxlen, padding='post', truncating='post')

1483


In [228]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Input, LSTM, Embedding, Dropout, Flatten, Conv2D, MaxPooling1D, Activation, Bidirectional
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [229]:
GRU_size = 128
dropout=0.5

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
model.add(GRU(units=GRU_size, return_sequences=True))
model.add(Dropout(dropout)) 
model.add(Dense(vocab_size))

In [230]:
lr = 0.1
model.compile(optimizer=Adam(learning_rate=lr), loss=SparseCategoricalCrossentropy(from_logits=True))

In [None]:
import time

epochs = 15
time_start = time.time()

history = model.fit(x_train, y_train, epochs=epochs, verbose=1)

time_elapsed = time.time() - time_start
time_elapsed

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

In [None]:
model.save("zamay.h5")
# model = load_model("zamay.h5")

In [None]:
# save to json:  
hist_df = pd.DataFrame(history.history) 

hist_json_file = 'history.json' 
with open(hist_json_file, mode='w') as f:
    hist_df.to_json(f)

In [None]:
import numpy as np


raize_to_power = 20

def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def generate(word):
    inputs = np.zeros((1, 1))
    print(inputs)
    # inputs = [0, 0]
    print(word2idx[word])
    inputs[0] = word2idx[word]
    count = 1
    res = ''
    while count <= 100:
        pred = model.predict(inputs)
        # word = np.argmax(pred)
        p = NormalizeData(pred[0][0])
        p = np.power(p, raize_to_power)
        p /= p.sum()
        word = np.random.choice(len(pred[0][0]), p = p)
        if word >= vocab_size:
            word = vocab_size - 1

        inputs[0, 0] = word
        idx2word[word]
        res += idx2word[word] + ' '
        if count % 7 == 0:
            res = res + "\n"
        count += 1
    return res 

        
result_text = generate('я')
result_text

In [None]:
import matplotlib.pyplot as plt
plt.style.use('dark_background')

fig, ax = plt.subplots()

ax.plot(range(len(history.history['loss'])), history.history['loss'])

ax.set_title(f"""
dropout={dropout} GRU tensors={GRU_size} learning rate={lr} 

всего итераций={epochs} кол-во семплов={sample_size}
""",
            fontsize=10, loc='left')
ax.set_xlabel('кол-во итераций')
ax.set_ylabel('точность попаданий')

fig.text(0.1,-0.7,
f"""
Суммарное время тренировки = {round(time_elapsed / 60, 2)} минут
Семпл, грейд probablity distribution = {raize_to_power}

""" + result_text)
fig.tight_layout()

fig.savefig(f'GRUsize-{GRU_size}_epochs-{epochs}__sample-size-{sample_size}.png', dpi=300, bbox_inches='tight')
