In [32]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout

from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

In [197]:
with open('text.txt', 'r') as file:
    text = file.read()

In [198]:
def text_preprocess(text):
    sent = nltk.sent_tokenize(text = text)
    corpus = ''
    for i in sent:
        review = re.sub('/s+', ' ', i)
        review = re.sub('/d', '', review)
        review = re.sub(',', '', review)
        tokens = nltk.word_tokenize(review)
        tokens = ' '.join(tokens)
        corpus += tokens
        
    return corpus

In [199]:
text_2 = text_preprocess(text)

In [201]:
def del_duplicate(text):
    tokens = text.split(' ')
    tokens = set(tokens)
    corpus = ' '.join(tokens)
    return corpus

In [202]:
text_3 = del_duplicate(text_2)

In [203]:
text_3

'.With .My vague bell fashion while has deeply introduce drug saw in suits readers did mental .â€śWedlock which memory akin summons twice disturbing observing position Thursday abhorrent blind I .â€śIt under sensitive daily .It were admirable .He throw mind eagerly there ; take attracted inside results scent sufficient emotions their false up those for Baker said trained they careless .Beyond old told Holland cold threw rubbed abandoned admit .I more admirably well-remembered â€ť precise felt observerâ€ faculties happiness to however lately drowsiness save love : armchair passed â€śmy word things led temperament one would returned hardly observation fail given gasogene crack lit .They pass on Irene never scored been first leather came put thought nervous shared Holmes rise yourself week Sherlock from dubious merely harness.â€ť almost home fancy questionable high-power 1888â€ trifle by have pounds deduce cigars as press successfully walk together .One nature with name sunk certainly men

In [204]:
vocab_size = 1000
features_num = 10

In [205]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_3])

tokenizer_rep = tokenizer.texts_to_sequences([text_3])

In [206]:
def data_split(data):
    data = np.squeeze(np.array(data))
    X_len = 3
    X = []
    y = []

    for i in range(len(data)):
        if i > len(data) - X_len - 1:
            continue
        else:
            X.append(data[i:i + X_len])
            y.append(data[i + X_len])
        
    X = np.array(X)
    y = np.array(y)
    return X,y
        

In [207]:
X_train, y_train = data_split(tokenizer_rep)

In [208]:
y_train = tf.keras.utils.to_categorical(y_train, vocab_size)

In [209]:
model = Sequential()
model.add(Embedding(vocab_size, features_num, input_length = 3))
model.add(LSTM(units = 1000, return_sequences = True))
model.add(LSTM(units = 1000))
model.add(Dense(units = 1000, activation = 'relu'))
model.add(Dense(units = vocab_size, activation = 'sigmoid'))

model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy')

In [210]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 3, 10)             10000     
_________________________________________________________________
lstm_12 (LSTM)               (None, 3, 1000)           4044000   
_________________________________________________________________
lstm_13 (LSTM)               (None, 1000)              8004000   
_________________________________________________________________
dense_12 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dense_13 (Dense)             (None, 1000)              1001000   
Total params: 14,060,000
Trainable params: 14,060,000
Non-trainable params: 0
_________________________________________________________________


In [211]:
model.fit(X_train, y_train, epochs = 60, batch_size = 64)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<tensorflow.python.keras.callbacks.History at 0x162b3ec7ac0>

In [212]:
word_index = {}
for key in tokenizer.word_index.keys():
    word_index.update({tokenizer.word_index[key]: key})

In [215]:
def predict(text):
    tokenized_text = tokenizer.texts_to_sequences([text])
    
    y_pred = model.predict(tokenized_text)
    y_pred = y_pred.argmax()
    
    word = word_index[y_pred]
    return word

In [222]:
text = ['Why','have', 'you']

In [223]:
for i in range(40):
    to_pred = ' '.join(text[len(text)-3:])
    word = predict(to_pred)
    text.append(word)

text = ' '.join(text)

In [224]:
text

'Why have you over loathed â€śi police part journey created itselfâ€ť dreadful excellent high six did away came answered rooms saw â€śi akin be true true harness from from home holmes holmes strikes official few admit â€śi true introspective sherlock home canâ€™t books'

In [226]:
for key,value in tokenizer.word_index.items():
    print(key, value)

â€ť 1
he 2
you 3
with 4
my 5
drug 6
in 7
i 8
it 9
they 10
to 11
â€śmy 12
one 13
from 14
home 15
as 16
all 17
but 18
just 19
out 20
and 21
how 22
study 23
â€śi 24
his 25
vague 26
bell 27
fashion 28
while 29
has 30
deeply 31
introduce 32
saw 33
suits 34
readers 35
did 36
mental 37
â€śwedlock 38
which 39
memory 40
akin 41
summons 42
twice 43
disturbing 44
observing 45
position 46
thursday 47
abhorrent 48
blind 49
â€śit 50
under 51
sensitive 52
daily 53
were 54
admirable 55
throw 56
mind 57
eagerly 58
there 59
take 60
attracted 61
inside 62
results 63
scent 64
sufficient 65
emotions 66
their 67
false 68
up 69
those 70
for 71
baker 72
said 73
trained 74
careless 75
beyond 76
old 77
told 78
holland 79
cold 80
threw 81
rubbed 82
abandoned 83
admit 84
more 85
admirably 86
well 87
remembered 88
precise 89
felt 90
observerâ€ 91
faculties 92
happiness 93
however 94
lately 95
drowsiness 96
save 97
love 98
armchair 99
passed 100
word 101
things 102
led 103
temperament 104
would 105
returned 106
har

In [227]:
tokenizer.word_index.items()

dict_items([('â€ť', 1), ('he', 2), ('you', 3), ('with', 4), ('my', 5), ('drug', 6), ('in', 7), ('i', 8), ('it', 9), ('they', 10), ('to', 11), ('â€śmy', 12), ('one', 13), ('from', 14), ('home', 15), ('as', 16), ('all', 17), ('but', 18), ('just', 19), ('out', 20), ('and', 21), ('how', 22), ('study', 23), ('â€śi', 24), ('his', 25), ('vague', 26), ('bell', 27), ('fashion', 28), ('while', 29), ('has', 30), ('deeply', 31), ('introduce', 32), ('saw', 33), ('suits', 34), ('readers', 35), ('did', 36), ('mental', 37), ('â€śwedlock', 38), ('which', 39), ('memory', 40), ('akin', 41), ('summons', 42), ('twice', 43), ('disturbing', 44), ('observing', 45), ('position', 46), ('thursday', 47), ('abhorrent', 48), ('blind', 49), ('â€śit', 50), ('under', 51), ('sensitive', 52), ('daily', 53), ('were', 54), ('admirable', 55), ('throw', 56), ('mind', 57), ('eagerly', 58), ('there', 59), ('take', 60), ('attracted', 61), ('inside', 62), ('results', 63), ('scent', 64), ('sufficient', 65), ('emotions', 66), ('t

In [228]:
slownik = {1:'A', 2:'B', 3:'C', 3:'D'}

In [232]:
slownik2 = {value:key for key, value in slownik.items()}

In [233]:
slownik2

{'A': 1, 'B': 2, 'D': 3}