In [47]:
#################### Data Processing ######################
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

##################### Model building #####################
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [73]:
# Raw corpus: the short passage that the following cells clean, tokenize and train on.
data="California is a state in the Western United States. California borders Oregon to the north, Nevada and Arizona to the east, the Mexican state of Baja California to the south; and has a coastline along the Pacific Ocean to the west."

data

'California is a state in the Western United States. California borders Oregon to the north, Nevada and Arizona to the east, the Mexican state of Baja California to the south; and has a coastline along the Pacific Ocean to the west.'

# Data Pre-Processing 

In [49]:
# Cleaning step 1: normalise case (overwrites `data`).
data= data.lower()           # lower-case the whole passage so tokens compare uniformly
data

'california is a state in the western united states. california borders oregon to the north, nevada and arizona to the east, the mexican state of baja california to the south; and has a coastline along the pacific ocean to the west.'

In [50]:
import string
string.punctuation  # the characters that remove_punctuation filters out

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [51]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped.

    All remaining characters (letters, digits, whitespace, ...) are kept in
    their original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [52]:
# Cleaning step 2: strip punctuation from the passage (overwrites `data`).
data=remove_punctuation(data)

data

'california is a state in the western united states california borders oregon to the north nevada and arizona to the east the mexican state of baja california to the south and has a coastline along the pacific ocean to the west'

In [53]:
import nltk

# 'punkt' backs word_tokenize; 'stopwords' backs nltk.corpus.stopwords, which the
# following cells read. Without the second download, a machine that has never
# fetched the corpus fails with LookupError on the first stopwords.words(...) call.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\L\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word set.
# NOTE(review): no cell visible in this notebook reads `stop_words`; the
# filtering cell calls stopwords.words() directly - confirm which was intended.
stop_words = set(stopwords.words('english'))
  
# Split the cleaned passage into word tokens.
text_tokens = word_tokenize(data)
print("Word Token:  \n",text_tokens)

Word Token:  
 ['california', 'is', 'a', 'state', 'in', 'the', 'western', 'united', 'states', 'california', 'borders', 'oregon', 'to', 'the', 'north', 'nevada', 'and', 'arizona', 'to', 'the', 'east', 'the', 'mexican', 'state', 'of', 'baja', 'california', 'to', 'the', 'south', 'and', 'has', 'a', 'coastline', 'along', 'the', 'pacific', 'ocean', 'to', 'the', 'west']


In [55]:
# Build the stop-word lookup once, outside the comprehension: stopwords.words()
# re-loads the word lists on every call, and a list makes each membership test a
# linear scan, so evaluating it per token is needlessly slow.
# NOTE: it is called with no language argument, so the lookup spans every
# language in the NLTK stopwords corpus, not only English.
all_stop_words = set(stopwords.words())

tokens_without_sw = [word
                     for word in text_tokens
                     if word not in all_stop_words]
print("Word Without StopWords:  \n",tokens_without_sw)

Word Without StopWords:  
 ['california', 'state', 'western', 'united', 'states', 'california', 'borders', 'oregon', 'north', 'nevada', 'arizona', 'east', 'mexican', 'state', 'baja', 'california', 'south', 'coastline', 'pacific', 'ocean', 'west']


In [56]:
# Re-join the surviving tokens into one space-separated string (overwrites `data`).
data = (" ").join(tokens_without_sw)
data

'california state western united states california borders oregon north nevada arizona east mexican state baja california south coastline pacific ocean west'

In [57]:
# Instantiating the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])  # learn the word -> integer index vocabulary from the passage
sequence_data = tokenizer.texts_to_sequences([data])[0]  # the passage as a list of word indices; [0] because one text was passed
sequence_data

[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 2, 13, 1, 14, 15, 16, 17, 18]

In [58]:
# Number of distinct words the tokenizer learned from the passage.
word2idx = tokenizer.word_index  # dict mapping each word to its integer index
print(len(word2idx))

18


In [59]:
# Show the full word -> index mapping.
print(word2idx)

{'california': 1, 'state': 2, 'western': 3, 'united': 4, 'states': 5, 'borders': 6, 'oregon': 7, 'north': 8, 'nevada': 9, 'arizona': 10, 'east': 11, 'mexican': 12, 'baja': 13, 'south': 14, 'coastline': 15, 'pacific': 16, 'ocean': 17, 'west': 18}


In [60]:
# +1 because the indices in word2idx start at 1, so index 0 also needs a slot
# when vocab_size is used as the embedding input size and the one-hot width.
vocab_size = len(word2idx) + 1
print(vocab_size)

19


In [61]:
# Re-display the encoded passage before it is cut into windows.
sequence_data

[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 2, 13, 1, 14, 15, 16, 17, 18]

In [62]:
# Slide a 4-wide window over the encoded passage: each entry holds three
# consecutive word indices followed by the index of the word that comes next.
sequences = [
    sequence_data[last - 3:last + 1]
    for last in range(3, len(sequence_data))
]

print(len(sequences))

18


In [63]:
# Inspect the generated windows.
sequences

[[1, 2, 3, 4],
 [2, 3, 4, 5],
 [3, 4, 5, 1],
 [4, 5, 1, 6],
 [5, 1, 6, 7],
 [1, 6, 7, 8],
 [6, 7, 8, 9],
 [7, 8, 9, 10],
 [8, 9, 10, 11],
 [9, 10, 11, 12],
 [10, 11, 12, 2],
 [11, 12, 2, 13],
 [12, 2, 13, 1],
 [2, 13, 1, 14],
 [13, 1, 14, 15],
 [1, 14, 15, 16],
 [14, 15, 16, 17],
 [15, 16, 17, 18]]

In [64]:
import numpy as np
# Convert the list of windows into a NumPy array (overwrites `sequences`).
sequences=np.array(sequences)
sequences

array([[ 1,  2,  3,  4],
       [ 2,  3,  4,  5],
       [ 3,  4,  5,  1],
       [ 4,  5,  1,  6],
       [ 5,  1,  6,  7],
       [ 1,  6,  7,  8],
       [ 6,  7,  8,  9],
       [ 7,  8,  9, 10],
       [ 8,  9, 10, 11],
       [ 9, 10, 11, 12],
       [10, 11, 12,  2],
       [11, 12,  2, 13],
       [12,  2, 13,  1],
       [ 2, 13,  1, 14],
       [13,  1, 14, 15],
       [ 1, 14, 15, 16],
       [14, 15, 16, 17],
       [15, 16, 17, 18]])

In [65]:
# Split every 4-index window into the model input (first three indices)
# and the training target (the fourth index).
X = np.array([window[0:3] for window in sequences])
Y = np.array([window[3] for window in sequences])

In [66]:
# Peek at the first five inputs and their targets.
print("Data" , X[:5])
print("Response" , Y[:5])

Data [[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 1]
 [5 1 6]]
Response [4 5 1 6 7]


In [67]:
# Integer targets, before one-hot encoding.
Y

array([ 4,  5,  1,  6,  7,  8,  9, 10, 11, 12,  2, 13,  1, 14, 15, 16, 17,
       18])

In [68]:
# One-hot encode the targets with vocab_size columns, matching the width of the
# model's softmax output layer.
# NOTE: this overwrites Y, so re-running this cell on its own would pass the
# already-encoded matrix back into to_categorical; rebuild X and Y first.
Y=to_categorical(Y,num_classes=vocab_size)
Y[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]], dtype=float32)

# Model Building

In [69]:
# Next-word classifier: three word indices in, one probability per
# vocabulary index out.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),  # each word index -> 10-dim vector
    LSTM(50, return_sequences=True),            # first LSTM, emits an output per time step
    LSTM(50),                                   # second LSTM, emits only its final output
    Dense(50, activation='relu'),               # hidden dense layer
    Dense(vocab_size, activation='softmax'),    # distribution over the vocabulary indices
])

In [70]:
# Print the layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             190       
                                                                 
 lstm_2 (LSTM)               (None, 3, 50)             12200     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 19)                969       
                                                                 
Total params: 36,109
Trainable params: 36,109
Non-trainable params: 0
_________________________________________________________________


In [71]:
# categorical_crossentropy pairs with the one-hot targets and the softmax output layer.
model.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics=['accuracy'])

In [72]:
# Train on all windows for 200 epochs; `r` keeps the object returned by fit.
# NOTE(review): no random seed is set in the cells visible here, so the trained
# weights (and the predictions further down) may differ between runs.
r = model.fit(X,Y,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


# Future Word Prediction

#### Description of the arguments
* model = the trained model built above
* tokenizer = the fitted tokenizer that converts words to integer indices
* enter_text = the input text supplied by the user

In [39]:
# List every vocabulary word together with its integer index.
for word, index in tokenizer.word_index.items():
    print(word,index)

california 1
state 2
western 3
united 4
states 5
borders 6
oregon 7
north 8
nevada 9
arizona 10
east 11
mexican 12
baja 13
south 14
coastline 15
pacific 16
ocean 17
west 18


In [40]:
# Manual walk-through of a single prediction.
enter_text='states california borders'
encoded = tokenizer.texts_to_sequences([enter_text])  # words -> their integer indices
encoded = np.array(encoded)  # wrap as an array holding one sequence
predicted= np.argmax(model.predict(encoded))  # position of the highest model output
predicted



7

In [41]:
def Predict_Next_Words(model,tokenizer,enter_text):
    """Append the model's most likely next word to ``enter_text``.

    Parameters
    ----------
    model : object exposing ``predict``
        Called with the encoded text; its output is reduced with ``np.argmax``
        to a single word index.
    tokenizer : object exposing ``texts_to_sequences`` and ``word_index``
        Encodes the text and maps the predicted index back to a word.
    enter_text : str
        Seed text. Trailing whitespace is ignored when building the result so
        that the seed and the predicted word are separated by one space.

    Returns
    -------
    str
        The seed text followed by a space and the predicted word. If no word
        in ``tokenizer.word_index`` has the predicted index, the seed text is
        returned on its own (without a dangling space).
    """
    encoded = np.array(tokenizer.texts_to_sequences([enter_text]))
    predicted = np.argmax(model.predict(encoded))

    # Reverse lookup index -> word; '' when the index has no vocabulary entry.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == predicted),
        '',
    )

    # Join only the non-empty parts so neither a trailing space in the seed nor
    # a missing prediction produces doubled or dangling spaces.
    seed = enter_text.rstrip()
    return ' '.join(part for part in (seed, predicted_word) if part)

In [42]:
# Re-display the cleaned corpus to pick seed phrases from.
data

'california state western united states california borders oregon north nevada arizona east mexican state baja california south coastline pacific ocean west'

In [43]:
# Seed with three consecutive words from the cleaned corpus.
print(Predict_Next_Words(model,tokenizer,'states california borders'))

states california borders oregon


In [44]:
# Shift the three-word seed one position further along the corpus.
print(Predict_Next_Words(model,tokenizer,'california borders oregon'))

california borders oregon north


In [45]:
# Shift the three-word seed one more position along the corpus.
print(Predict_Next_Words(model,tokenizer,'borders oregon north'))

borders oregon north nevada


In [46]:
# Note the trailing space inside this seed string.
print(Predict_Next_Words(model,tokenizer,'oregon north nevada '))

oregon north nevada  arizona


# Finished