In [1]:
#################### Data Processing ######################
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

##################### Model building #####################
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [2]:
# Raw training corpus: a short passage about California. Every later
# preprocessing cell rebinds `data` to a cleaned version of this string.
data="California is a state in the Western United States. California borders Oregon to the north, Nevada and Arizona to the east, the Mexican state of Baja California to the south; and has a coastline along the Pacific Ocean to the west."

data

'California is a state in the Western United States. California borders Oregon to the north, Nevada and Arizona to the east, the Mexican state of Baja California to the south; and has a coastline along the Pacific Ocean to the west.'

# Data Pre-Processing 

In [3]:
# Cleaning step 1: normalise case so that differently-cased spellings of a
# word (e.g. "California" / "california") become the same token.
data= data.lower()           # rebinds `data` to the lower-cased copy
data

'california is a state in the western united states. california borders oregon to the north, nevada and arizona to the east, the mexican state of baja california to the south; and has a coastline along the pacific ocean to the west.'

In [4]:
import string
# Show the ASCII punctuation characters that remove_punctuation filters out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    kept_chars = []
    for ch in text:
        # Skip punctuation; everything else is copied through unchanged.
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [6]:
# Cleaning step 2: strip punctuation from the lower-cased text (rebinds `data`).
data=remove_punctuation(data)

data

'california is a state in the western united states california borders oregon to the north nevada and arizona to the east the mexican state of baja california to the south and has a coastline along the pacific ocean to the west'

In [10]:
import nltk
# 'punkt' supplies the tokenizer models that nltk.tokenize.word_tokenize needs.
nltk.download('punkt')
# 'stopwords' supplies the corpus read through nltk.corpus.stopwords further
# down. It was never downloaded in this notebook, so on a fresh environment
# stopwords.words(...) would raise LookupError.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\suvar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words as a set.
# NOTE(review): the stop-word filtering cell below does not use this variable;
# it calls stopwords.words() directly.
stop_words = set(stopwords.words('english'))
  
# Split the cleaned text into a list of word tokens.
text_tokens = word_tokenize(data)
print("Word Token:  \n",text_tokens)

Word Token:  
 ['california', 'is', 'a', 'state', 'in', 'the', 'western', 'united', 'states', 'california', 'borders', 'oregon', 'to', 'the', 'north', 'nevada', 'and', 'arizona', 'to', 'the', 'east', 'the', 'mexican', 'state', 'of', 'baja', 'california', 'to', 'the', 'south', 'and', 'has', 'a', 'coastline', 'along', 'the', 'pacific', 'ocean', 'to', 'the', 'west']


In [12]:
# Build the stop-word lookup once, as a set. The previous version called
# stopwords.words() inside the comprehension, i.e. once per token, and then
# searched the resulting list linearly.
# NOTE: stopwords.words() without a language argument returns the stop words
# of every language in the corpus, not only English. That is the list the
# original filter used, so it is kept here to leave the result unchanged.
all_stop_words = set(stopwords.words())
tokens_without_sw = [word
                     for word in text_tokens
                     if word not in all_stop_words]
print("Word Without StopWords:  \n",tokens_without_sw)

Word Without StopWords:  
 ['california', 'state', 'western', 'united', 'states', 'california', 'borders', 'oregon', 'north', 'nevada', 'arizona', 'east', 'mexican', 'state', 'baja', 'california', 'south', 'coastline', 'pacific', 'ocean', 'west']


In [13]:
# Rebuild one space-separated string from the filtered tokens; this is the
# text the Keras Tokenizer is fitted on.
data = (" ").join(tokens_without_sw)
data

'california state western united states california borders oregon north nevada arizona east mexican state baja california south coastline pacific ocean west'

In [14]:
# Instantiating the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])  ## convert sentance to word  
sequence_data = tokenizer.texts_to_sequences([data])[0]  # mode concept for sequence 
sequence_data   

[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 2, 13, 1, 14, 15, 16, 17, 18]

In [15]:
# Getting the total number of words of the data.
word2idx = tokenizer.word_index  ####### index number to every token ro word 
print(len(word2idx))

18


In [16]:
# Show the full word -> index mapping.
print(word2idx)

{'california': 1, 'state': 2, 'western': 3, 'united': 4, 'states': 5, 'borders': 6, 'oregon': 7, 'north': 8, 'nevada': 9, 'arizona': 10, 'east': 11, 'mexican': 12, 'baja': 13, 'south': 14, 'coastline': 15, 'pacific': 16, 'ocean': 17, 'west': 18}


In [17]:
# The word indices in word2idx start at 1, so index 0 is never assigned to a
# word. The +1 makes vocab_size cover indices 0..len(word2idx); it is used as
# the Embedding input size and as the number of one-hot classes.
vocab_size = len(word2idx) + 1
print(vocab_size)

19


In [18]:
# Display the encoded text again before it is cut into training windows.
sequence_data

[1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 2, 13, 1, 14, 15, 16, 17, 18]

In [19]:
# Slide a window of 4 consecutive word indices over the encoded text.
# `end` is the position of the last word in the window, so each window is
# sequence_data[end-3 .. end]: three context words followed by the word that
# comes next.
sequences = [sequence_data[end - 3:end + 1]
             for end in range(3, len(sequence_data))]

print(len(sequences))

18


In [20]:
# Display the 4-word windows (as a list of lists).
sequences

[[1, 2, 3, 4],
 [2, 3, 4, 5],
 [3, 4, 5, 1],
 [4, 5, 1, 6],
 [5, 1, 6, 7],
 [1, 6, 7, 8],
 [6, 7, 8, 9],
 [7, 8, 9, 10],
 [8, 9, 10, 11],
 [9, 10, 11, 12],
 [10, 11, 12, 2],
 [11, 12, 2, 13],
 [12, 2, 13, 1],
 [2, 13, 1, 14],
 [13, 1, 14, 15],
 [1, 14, 15, 16],
 [14, 15, 16, 17],
 [15, 16, 17, 18]]

In [21]:
import numpy as np
# Convert the list of windows into a NumPy array (one row per window);
# this rebinds `sequences` from a list to an ndarray.
sequences=np.array(sequences)
sequences

array([[ 1,  2,  3,  4],
       [ 2,  3,  4,  5],
       [ 3,  4,  5,  1],
       [ 4,  5,  1,  6],
       [ 5,  1,  6,  7],
       [ 1,  6,  7,  8],
       [ 6,  7,  8,  9],
       [ 7,  8,  9, 10],
       [ 8,  9, 10, 11],
       [ 9, 10, 11, 12],
       [10, 11, 12,  2],
       [11, 12,  2, 13],
       [12,  2, 13,  1],
       [ 2, 13,  1, 14],
       [13,  1, 14, 15],
       [ 1, 14, 15, 16],
       [14, 15, 16, 17],
       [15, 16, 17, 18]])

In [22]:
# Split every window into model input and target:
#   X -> the first three word indices (context)
#   Y -> the fourth word index (the word to predict)
X = np.array([window[0:3] for window in sequences])
Y = np.array([window[3] for window in sequences])

In [23]:
# Preview the first five context rows and their target word indices.
print("Data" , X[:5])
print("Response" , Y[:5])

Data [[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 1]
 [5 1 6]]
Response [4 5 1 6 7]


In [24]:
# Target word indices, still integer-encoded at this point.
Y

array([ 4,  5,  1,  6,  7,  8,  9, 10, 11, 12,  2, 13,  1, 14, 15, 16, 17,
       18])

In [25]:
# One-hot encode the targets: each row gets vocab_size columns with a 1 at the
# target word's index.
# NOTE(review): this rebinds Y in place, so re-running the cell would one-hot
# encode the already encoded matrix. Re-run the X/Y split cell first.
Y=to_categorical(Y,num_classes=vocab_size)
Y[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]], dtype=float32)

# Model Building

In [26]:
# Next-word model: three word indices in, one probability per vocabulary
# index out. Layers are listed in the order the data flows through them.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),  # word index -> 10-dimensional vector
    LSTM(50, return_sequences=True),            # first LSTM; passes its output at every timestep to the next LSTM
    LSTM(50),                                   # second LSTM; passes on only its final output
    Dense(50, activation='relu'),               # hidden fully connected layer
    Dense(vocab_size, activation='softmax'),    # output layer, one unit per vocabulary index
])

In [27]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             190       
                                                                 
 lstm (LSTM)                 (None, 3, 50)             12200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 19)                969       
                                                                 
Total params: 36,109
Trainable params: 36,109
Non-trainable params: 0
_________________________________________________________________


In [28]:
# categorical_crossentropy matches the one-hot encoded targets built with
# to_categorical; accuracy is reported as an additional metric.
model.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics=['accuracy'])

In [29]:
# Train for 100 epochs on the (context, next word) pairs; the object returned
# by fit is kept in `r`.
# NOTE(review): no random seed is set in the visible notebook, so the trained
# weights, and with them the predictions shown below, can differ between runs.
r = model.fit(X,Y,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


# Future Word Prediction

#### Description of the inputs
* model = the trained model we built above
* tokenizer = the fitted tokenizer that converts words into integer indices
* enter_text = the input text the user provides

In [30]:
# List every vocabulary word next to the integer index the tokenizer gave it.
for vocab_word, vocab_index in tokenizer.word_index.items():
    print(vocab_word, vocab_index)

california 1
state 2
western 3
united 4
states 5
borders 6
oregon 7
north 8
nevada 9
arizona 10
east 11
mexican 12
baja 13
south 14
coastline 15
pacific 16
ocean 17
west 18


In [31]:
# Manual walk-through of a single prediction (the same steps that
# Predict_Next_Words wraps in a function).
enter_text='states california borders'
encoded = tokenizer.texts_to_sequences([enter_text])  # nested list: one sequence of word indices for the one input text
encoded = np.array(encoded)  # ndarray holding that one encoded text
predicted= np.argmax(model.predict(encoded))  # index of the highest score in the model output
predicted



16

In [32]:
def Predict_Next_Words(model,tokenizer,enter_text):
    """Return ``enter_text`` followed by the word the model predicts next.

    Parameters
    ----------
    model
        Object with a ``predict`` method that returns one score per
        vocabulary index for the encoded input (here: the trained Keras model).
    tokenizer
        Fitted Keras ``Tokenizer`` providing ``texts_to_sequences`` and the
        ``index_word`` reverse mapping (index -> word).
    enter_text : str
        Context words, separated by spaces.

    Returns
    -------
    str
        ``enter_text + ' ' + predicted_word``. ``predicted_word`` is the empty
        string when the predicted index has no word (e.g. index 0).
    """
    # One encoded sequence for the one input text. A Tokenizer created without
    # an oov_token skips unknown words, so the sequence can be shorter than
    # the text.
    encoded = np.array(tokenizer.texts_to_sequences([enter_text]))
    # Index with the highest score in the model output.
    predicted = int(np.argmax(model.predict(encoded)))
    # Direct reverse lookup instead of scanning word_index for a matching value.
    predicted_word = tokenizer.index_word.get(predicted, '')
    return enter_text + ' ' + predicted_word

In [33]:
# Show the cleaned training text again for reference when choosing prompts.
data

'california state western united states california borders oregon north nevada arizona east mexican state baja california south coastline pacific ocean west'

In [34]:
# Example 1: three consecutive words taken from the training text.
print(Predict_Next_Words(model,tokenizer,'states california borders'))

states california borders pacific


In [35]:
# Example 2: the context window shifted one word to the right.
print(Predict_Next_Words(model,tokenizer,'california borders oregon'))

california borders oregon north


In [36]:
# Example 3: the context window shifted one more word to the right.
print(Predict_Next_Words(model,tokenizer,'borders oregon north'))

borders oregon north nevada


In [37]:
# Example 4: note the trailing space in the prompt; Predict_Next_Words adds
# its own separator, so the printed result contains two spaces before the
# predicted word.
print(Predict_Next_Words(model,tokenizer,'oregon north nevada '))

oregon north nevada  north


# Finished