In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout

from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

In [2]:
with open('text.txt', 'r') as file:
    text = file.read()

In [3]:
def text_preprocess(text):
    sent = nltk.sent_tokenize(text = text)
    corpus = ''
    for i in sent:
        review = re.sub('/s+', ' ', i)
        review = re.sub('/d', '', review)
        review = re.sub(',', '', review)
        tokens = nltk.word_tokenize(review)
        tokens = ' '.join(tokens)
        corpus += tokens
        
    return corpus

In [4]:
text_2 = text_preprocess(text)

In [5]:
def del_duplicate(text):
    tokens = text.split(' ')
    tokens = set(tokens)
    corpus = ' '.join(tokens)
    return corpus

In [6]:
text_3 = del_duplicate(text_2)

In [7]:
text_3

'immense extraordinary perfect signs do would drifted did room might story Holland some sneer attention dubious finally yet for high-power .It following never merely .â€śWedlock twice risen at wife He passions name scent Holmes fierce emotions there friend mess seven a in throw .Just strikes books .But had six mood out.â€ť chest felt those tell .As ( cocaine nightâ€ too rise centuries door habit â€śThen corner head told true Trepoff deeply careless nature new .Obviously out Jane thought intrusions daily home way left incidents employing is activity marriage on through clasped lived than .â€śI every part brilliantly imagine reigning .Grit things tragedy eye .All been nervous .He man given balanced getting lenses .From our canâ€™t itselfâ€ť notice glad more delicate but Mary abandoned you waved pacing Irene armchair sex as incorrigible your energy adjusted master interests between time pass manner excellent predominates own rubbed certainly you.â€ť .â€śIt singular menâ€™s .To .They journ

In [8]:
vocab_size = 1000
features_num = 10

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_3])

tokenizer_rep = tokenizer.texts_to_sequences([text_3])

In [10]:
def data_split(data):
    data = np.squeeze(np.array(data))
    X_len = 3
    X = []
    y = []

    for i in range(len(data)):
        if i > len(data) - X_len - 1:
            continue
        else:
            X.append(data[i:i + X_len])
            y.append(data[i + X_len])
        
    X = np.array(X)
    y = np.array(y)
    return X,y
        

In [11]:
X_train, y_train = data_split(tokenizer_rep)

In [12]:
y_train = tf.keras.utils.to_categorical(y_train, vocab_size)

In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("best_weights.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

In [15]:
model = Sequential()
model.add(Embedding(vocab_size, features_num, input_length = 3))
model.add(LSTM(units = 1000, return_sequences = True))
model.add(LSTM(units = 1000))
model.add(Dense(units = 1000, activation = 'relu'))
model.add(Dense(units = vocab_size, activation = 'sigmoid'))

model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy')

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 10)             10000     
_________________________________________________________________
lstm (LSTM)                  (None, 3, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              1001000   
Total params: 14,060,000
Trainable params: 14,060,000
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(X_train, y_train, epochs = 20, batch_size = 64, 
         callbacks = [checkpoint, reduce, tensorboard_Visualization])

Epoch 1/20

Epoch 00001: loss improved from inf to 6.90932, saving model to best_weights.h5
Epoch 2/20

Epoch 00002: loss improved from 6.90932 to 6.87814, saving model to best_weights.h5
Epoch 3/20

Epoch 00003: loss improved from 6.87814 to 6.60811, saving model to best_weights.h5
Epoch 4/20

Epoch 00004: loss improved from 6.60811 to 6.30399, saving model to best_weights.h5
Epoch 5/20

Epoch 00005: loss improved from 6.30399 to 6.21289, saving model to best_weights.h5
Epoch 6/20

Epoch 00006: loss improved from 6.21289 to 6.15280, saving model to best_weights.h5
Epoch 7/20

Epoch 00007: loss improved from 6.15280 to 6.12699, saving model to best_weights.h5
Epoch 8/20

Epoch 00008: loss improved from 6.12699 to 6.11995, saving model to best_weights.h5
Epoch 9/20

Epoch 00009: loss improved from 6.11995 to 6.09792, saving model to best_weights.h5
Epoch 10/20

Epoch 00010: loss improved from 6.09792 to 6.08293, saving model to best_weights.h5
Epoch 11/20

Epoch 00011: loss improved fro

<tensorflow.python.keras.callbacks.History at 0x2595e603ee0>

In [212]:
word_index = {}
for key in tokenizer.word_index.keys():
    word_index.update({tokenizer.word_index[key]: key})

In [215]:
def predict(text):
    tokenized_text = tokenizer.texts_to_sequences([text])
    
    y_pred = model.predict(tokenized_text)
    y_pred = y_pred.argmax()
    
    word = word_index[y_pred]
    return word

In [222]:
text = ['Why','have', 'you']

In [223]:
for i in range(40):
    to_pred = ' '.join(text[len(text)-3:])
    word = predict(to_pred)
    text.append(word)

text = ' '.join(text)

In [224]:
text

'Why have you over loathed â€śi police part journey created itselfâ€ť dreadful excellent high six did away came answered rooms saw â€śi akin be true true harness from from home holmes holmes strikes official few admit â€śi true introspective sherlock home canâ€™t books'

In [226]:
for key,value in tokenizer.word_index.items():
    print(key, value)

â€ť 1
he 2
you 3
with 4
my 5
drug 6
in 7
i 8
it 9
they 10
to 11
â€śmy 12
one 13
from 14
home 15
as 16
all 17
but 18
just 19
out 20
and 21
how 22
study 23
â€śi 24
his 25
vague 26
bell 27
fashion 28
while 29
has 30
deeply 31
introduce 32
saw 33
suits 34
readers 35
did 36
mental 37
â€śwedlock 38
which 39
memory 40
akin 41
summons 42
twice 43
disturbing 44
observing 45
position 46
thursday 47
abhorrent 48
blind 49
â€śit 50
under 51
sensitive 52
daily 53
were 54
admirable 55
throw 56
mind 57
eagerly 58
there 59
take 60
attracted 61
inside 62
results 63
scent 64
sufficient 65
emotions 66
their 67
false 68
up 69
those 70
for 71
baker 72
said 73
trained 74
careless 75
beyond 76
old 77
told 78
holland 79
cold 80
threw 81
rubbed 82
abandoned 83
admit 84
more 85
admirably 86
well 87
remembered 88
precise 89
felt 90
observerâ€ 91
faculties 92
happiness 93
however 94
lately 95
drowsiness 96
save 97
love 98
armchair 99
passed 100
word 101
things 102
led 103
temperament 104
would 105
returned 106
har

In [227]:
tokenizer.word_index.items()

dict_items([('â€ť', 1), ('he', 2), ('you', 3), ('with', 4), ('my', 5), ('drug', 6), ('in', 7), ('i', 8), ('it', 9), ('they', 10), ('to', 11), ('â€śmy', 12), ('one', 13), ('from', 14), ('home', 15), ('as', 16), ('all', 17), ('but', 18), ('just', 19), ('out', 20), ('and', 21), ('how', 22), ('study', 23), ('â€śi', 24), ('his', 25), ('vague', 26), ('bell', 27), ('fashion', 28), ('while', 29), ('has', 30), ('deeply', 31), ('introduce', 32), ('saw', 33), ('suits', 34), ('readers', 35), ('did', 36), ('mental', 37), ('â€śwedlock', 38), ('which', 39), ('memory', 40), ('akin', 41), ('summons', 42), ('twice', 43), ('disturbing', 44), ('observing', 45), ('position', 46), ('thursday', 47), ('abhorrent', 48), ('blind', 49), ('â€śit', 50), ('under', 51), ('sensitive', 52), ('daily', 53), ('were', 54), ('admirable', 55), ('throw', 56), ('mind', 57), ('eagerly', 58), ('there', 59), ('take', 60), ('attracted', 61), ('inside', 62), ('results', 63), ('scent', 64), ('sufficient', 65), ('emotions', 66), ('t

In [228]:
slownik = {1:'A', 2:'B', 3:'C', 3:'D'}

In [232]:
slownik2 = {value:key for key, value in slownik.items()}

In [233]:
slownik2

{'A': 1, 'B': 2, 'D': 3}