In [1]:
# source text
data = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.\n
             Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural-language generation.\n
             Natural language processing has its roots in the 1950s.\n
             Already in 1950, Alan Turing published an article titled Computing Machinery and Intelligence which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.\n
             The following is a list of some of the most commonly researched tasks in natural language processing.\n
             Some of these tasks have direct real-world applications, while others more commonly serve as subtasks that are used to aid in solving larger tasks.\n
             Though natural language processing tasks are closely intertwined, they can be subdivided into categories for convenience.\n"""


In [2]:
import nltk



In [3]:
nltk.download('punkt')
nltk.download('stopwords')
all_sentences=nltk.sent_tokenize(data)

all_words=[nltk.word_tokenize(sent) for sent in all_sentences]
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i]=[w for w in all_words[i] if w not in stopwords.words('english')]
    print(all_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['Natural', 'language', 'processing', '(', 'NLP', ')', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'language', ',', 'particular', 'program', 'computers', 'process', 'analyze', 'large', 'amounts', 'natural', 'language', 'data', '.'], ['Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'speech', 'recognition', ',', 'natural', 'language', 'understanding', ',', 'and', 'natural-language', 'generation', '.'], ['Natural', 'language', 'processing', 'has', 'its', 'roots', 'in', 'the', '1950s', '.'], ['Already', 'in', '1950', ',', 'Alan', 'Turing', 'published', 'an', 'article', 'titled', 'Computing', 'Machinery', 'and', 'Intelligence', 'which', 'proposed', 'what', 'is', 'now', 'called', 'the', 'Turing', 'test', 'as', 'a', 'criterion', 'of', 'intelligence', ',', 'a', 'task', 'that', 'involves', 'the', 'automated', 'interpretation', 'and', 'generation', 'of', 'natural', 

In [4]:
from gensim.models import Word2Vec
word2vec=Word2Vec(all_words,min_count=1)
word2vec.wv.most_similar('natural')

[('test', 0.20554330945014954),
 ('NLP', 0.1794852614402771),
 ('subfield', 0.1642543226480484),
 ('criterion', 0.16400839388370514),
 ('researched', 0.15037602186203003),
 ('task', 0.13987784087657928),
 ('larger', 0.1360655128955841),
 ('tasks', 0.13299749791622162),
 ('closely', 0.12206412851810455),
 ('applications', 0.09703784435987473)]

In [5]:
word2vec.wv.most_similar('language')

[('recognition', 0.19066877663135529),
 ('data', 0.18839259445667267),
 ('serve', 0.18468110263347626),
 ('NLP', 0.16145464777946472),
 ('roots', 0.16034404933452606),
 ('automated', 0.1600670963525772),
 ('Machinery', 0.1595340073108673),
 ('science', 0.13824574649333954),
 ('separate', 0.1350008249282837),
 ('speech', 0.12905026972293854)]

In [6]:

pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM , Embedding




In [8]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
encoded=tokenizer.texts_to_sequences([data])[0]
vocab_size=len(tokenizer.word_index)+1
print('Vocabulary Size: %d'%vocab_size )

Vocabulary Size: 108


In [9]:
import numpy as np
sequences=list()
for i in range(1,len(encoded)):
    sequence=encoded[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d'%len(sequences))
sequences=np.array(sequences)
X,y=sequences[:,0],sequences[:,1]

Total Sequences: 173


In [10]:
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=200, verbose=2)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             1080      
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 108)               5508      
                                                                 
Total params: 18788 (73.39 KB)
Trainable params: 18788 (73.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None

Epoch 1/200


6/6 - 5s - loss: 4.6822 - accuracy: 0.0289 - 5s/epoch - 827ms/step
Epoch 2/200
6/6 - 0s - loss: 4.6789 - accuracy: 0.0809 - 27ms/epoch - 4ms/step
Epoch 3/200
6/6 - 0s - loss: 4.6761 - accuracy: 0.0925 - 28ms/epoch - 5ms/step
Epoch 4/200
6/6 - 0s - loss: 4.6734 - 

Epoch 83/200
6/6 - 0s - loss: 2.4745 - accuracy: 0.4277 - 29ms/epoch - 5ms/step
Epoch 84/200
6/6 - 0s - loss: 2.4466 - accuracy: 0.4335 - 29ms/epoch - 5ms/step
Epoch 85/200
6/6 - 0s - loss: 2.4204 - accuracy: 0.4335 - 30ms/epoch - 5ms/step
Epoch 86/200
6/6 - 0s - loss: 2.3931 - accuracy: 0.4566 - 27ms/epoch - 4ms/step
Epoch 87/200
6/6 - 0s - loss: 2.3668 - accuracy: 0.4509 - 27ms/epoch - 4ms/step
Epoch 88/200
6/6 - 0s - loss: 2.3411 - accuracy: 0.4624 - 26ms/epoch - 4ms/step
Epoch 89/200
6/6 - 0s - loss: 2.3157 - accuracy: 0.4740 - 36ms/epoch - 6ms/step
Epoch 90/200
6/6 - 0s - loss: 2.2897 - accuracy: 0.4855 - 26ms/epoch - 4ms/step
Epoch 91/200
6/6 - 0s - loss: 2.2652 - accuracy: 0.4913 - 28ms/epoch - 5ms/step
Epoch 92/200
6/6 - 0s - loss: 2.2398 - accuracy: 0.4971 - 24ms/epoch - 4ms/step
Epoch 93/200
6/6 - 0s - loss: 2.2149 - accuracy: 0.5029 - 25ms/epoch - 4ms/step
Epoch 94/200
6/6 - 0s - loss: 2.1905 - accuracy: 0.5029 - 25ms/epoch - 4ms/step
Epoch 95/200
6/6 - 0s - loss: 2.1667 - a

Epoch 185/200
6/6 - 0s - loss: 0.8926 - accuracy: 0.7110 - 24ms/epoch - 4ms/step
Epoch 186/200
6/6 - 0s - loss: 0.8867 - accuracy: 0.7110 - 23ms/epoch - 4ms/step
Epoch 187/200
6/6 - 0s - loss: 0.8814 - accuracy: 0.7110 - 35ms/epoch - 6ms/step
Epoch 188/200
6/6 - 0s - loss: 0.8762 - accuracy: 0.7168 - 34ms/epoch - 6ms/step
Epoch 189/200
6/6 - 0s - loss: 0.8709 - accuracy: 0.7168 - 39ms/epoch - 7ms/step
Epoch 190/200
6/6 - 0s - loss: 0.8658 - accuracy: 0.7110 - 34ms/epoch - 6ms/step
Epoch 191/200
6/6 - 0s - loss: 0.8613 - accuracy: 0.7168 - 41ms/epoch - 7ms/step
Epoch 192/200
6/6 - 0s - loss: 0.8569 - accuracy: 0.7110 - 37ms/epoch - 6ms/step
Epoch 193/200
6/6 - 0s - loss: 0.8512 - accuracy: 0.7110 - 40ms/epoch - 7ms/step
Epoch 194/200
6/6 - 0s - loss: 0.8471 - accuracy: 0.7052 - 40ms/epoch - 7ms/step
Epoch 195/200
6/6 - 0s - loss: 0.8422 - accuracy: 0.7168 - 41ms/epoch - 7ms/step
Epoch 196/200
6/6 - 0s - loss: 0.8379 - accuracy: 0.7168 - 29ms/epoch - 5ms/step
Epoch 197/200
6/6 - 0s - los

<keras.src.callbacks.History at 0x1fb7a6f2f10>

In [11]:
def generate_seq(model,tokenizer,seed_text,n_words):
    in_text,result=seed_text,seed_text
    for _ in range (n_words):
        encoded=tokenizer.texts_to_sequences([in_text])[0]
        encoded=np.array(encoded)
        yhat=np.argmax(model.predict(encoded,verbose=0))
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index==yhat:
                out_word=word
                break
        in_text,result=out_word,result+' '+out_word
    return result

In [12]:
print(generate_seq(model,tokenizer,'natural',4))

natural language processing has its
