In [0]:
# Reference: https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/

In [0]:
# Source text: a tiny Hindi corpus of four sentences used as the training data.
# Each sentence ends with an explicit "\n" escape AND a real line break inside the
# triple-quoted literal, so the string holds a blank line between sentences
# (visible when the string is printed).
data = """सितारे दिन में नहीं दिखते\n
कुछ लोगों को चाय अच्छी लगती है और कुछ लोगों को कॉफ़ी\n
जब मैं अठारह साल का था मैंने गाड़ी चलानी सीखी और लाईसेंस बनवालिया\n
मुझे आपसे मिलकर बहुत खुशी हुई \n """

In [0]:
# Show the raw corpus so its contents and line breaks can be checked by eye.
print(data)

सितारे दिन में नहीं दिखते

कुछ लोगों को चाय अच्छी लगती है और कुछ लोगों को कॉफ़ी

जब मैं अठारह साल का था मैंने गाड़ी चलानी सीखी और लाईसेंस बनवालिया

मुझे आपसे मिलकर बहुत खुशी हुई 
 


In [0]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [0]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    On every step the text generated so far is integer-encoded with
    ``tokenizer``, pre-padded to ``max_length`` tokens, and passed to
    ``model.predict``; the most probable class index is mapped back to a word
    and appended to the text.

    Args:
        model: Trained model whose ``predict`` returns one row of per-class
            probabilities for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        max_length: Number of tokens the model expects as input.
        seed_text: Text to start generating from.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. If the predicted index has no entry in ``word_index``
        an empty word is appended, so only the space is added.
    """
    # Build the index -> word table once instead of scanning word_index on
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict_classes() no longer exists in current Keras releases; taking
        # the argmax of predict() gives the same class index and works on both
        # old and new versions.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(probabilities.argmax(axis=-1)[0])
        # map predicted word index to word ('' when the index is unknown)
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
    return in_text
 

In [0]:
# Learn the word -> integer mapping from the corpus, then encode the whole
# corpus as one flat list of word indices.
corpus = [data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
encoded = tokenizer.texts_to_sequences(corpus)[0]

In [0]:
# Vocabulary size is the number of known words plus one extra slot, because
# index 0 is not assigned to any word in word_index.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)
# Slide a 3-token window over the encoded corpus: 2 context words -> 1 target word.
sequences = [encoded[end - 3:end] for end in range(3, len(encoded) + 1)]

Vocabulary Size: 33


In [0]:
# Report how many training windows were built.
print('Total Sequences: %d' % len(sequences))
# Left-pad every window to the length of the longest one.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Total Sequences: 34
Max Sequence Length: 3


In [0]:
# Inputs are every column but the last; the last column is the word to predict,
# one-hot encoded over the vocabulary.
sequences = array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

# Model: word embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=max_length - 1),
    LSTM(50),
    Dense(vocab_size, activation='softmax'),
])

In [0]:
# Show the layer-by-layer architecture. summary() prints the table itself and
# returns None, so it is called directly; wrapping it in print() emitted a
# stray "None" line after the table.
model.summary()
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network; keep the returned History object in a variable so the per-epoch
# metrics stay available and the cell does not echo the object's repr.
history = model.fit(X, y, epochs=500, verbose=2)
# evaluate model

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             330       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 33)                1683      
Total params: 14,213
Trainable params: 14,213
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
 - 0s - loss: 3.4962 - acc: 0.0882
Epoch 2/500
 - 0s - loss: 3.4948 - acc: 0.1176
Epoch 3/500
 - 0s - loss: 3.4938 - acc: 0.0882
Epoch 4/500
 - 0s - loss: 3.4929 - acc: 0.0588
Epoch 5/500
 - 0s - loss: 3.4920 - acc: 0.0588
Epoch 6/500
 - 0s - loss: 3.4910 - acc: 0.0882
Epoch 7/500
 - 0s - loss: 3.4901 - acc: 0.1765
Epoch 8/500
 - 0s - loss: 3.4892 - acc: 0.2059
Epoch 9/500
 - 0s - l

Epoch 156/500
 - 0s - loss: 1.7795 - acc: 0.5294
Epoch 157/500
 - 0s - loss: 1.7631 - acc: 0.5588
Epoch 158/500
 - 0s - loss: 1.7478 - acc: 0.5588
Epoch 159/500
 - 0s - loss: 1.7319 - acc: 0.5882
Epoch 160/500
 - 0s - loss: 1.7168 - acc: 0.5882
Epoch 161/500
 - 0s - loss: 1.7014 - acc: 0.5882
Epoch 162/500
 - 0s - loss: 1.6866 - acc: 0.5882
Epoch 163/500
 - 0s - loss: 1.6718 - acc: 0.5882
Epoch 164/500
 - 0s - loss: 1.6574 - acc: 0.5882
Epoch 165/500
 - 0s - loss: 1.6432 - acc: 0.5882
Epoch 166/500
 - 0s - loss: 1.6285 - acc: 0.5882
Epoch 167/500
 - 0s - loss: 1.6143 - acc: 0.5882
Epoch 168/500
 - 0s - loss: 1.5997 - acc: 0.6765
Epoch 169/500
 - 0s - loss: 1.5864 - acc: 0.6765
Epoch 170/500
 - 0s - loss: 1.5719 - acc: 0.6765
Epoch 171/500
 - 0s - loss: 1.5578 - acc: 0.6765
Epoch 172/500
 - 0s - loss: 1.5440 - acc: 0.6765
Epoch 173/500
 - 0s - loss: 1.5293 - acc: 0.7059
Epoch 174/500
 - 0s - loss: 1.5157 - acc: 0.7059
Epoch 175/500
 - 0s - loss: 1.5014 - acc: 0.7059
Epoch 176/500
 - 0s 

 - 0s - loss: 0.3850 - acc: 0.9412
Epoch 324/500
 - 0s - loss: 0.3823 - acc: 0.9412
Epoch 325/500
 - 0s - loss: 0.3794 - acc: 0.9412
Epoch 326/500
 - 0s - loss: 0.3767 - acc: 0.9412
Epoch 327/500
 - 0s - loss: 0.3740 - acc: 0.9412
Epoch 328/500
 - 0s - loss: 0.3714 - acc: 0.9412
Epoch 329/500
 - 0s - loss: 0.3681 - acc: 0.9412
Epoch 330/500
 - 0s - loss: 0.3653 - acc: 0.9412
Epoch 331/500
 - 0s - loss: 0.3621 - acc: 0.9412
Epoch 332/500
 - 0s - loss: 0.3591 - acc: 0.9412
Epoch 333/500
 - 0s - loss: 0.3565 - acc: 0.9412
Epoch 334/500
 - 0s - loss: 0.3536 - acc: 0.9412
Epoch 335/500
 - 0s - loss: 0.3509 - acc: 0.9412
Epoch 336/500
 - 0s - loss: 0.3475 - acc: 0.9412
Epoch 337/500
 - 0s - loss: 0.3446 - acc: 0.9412
Epoch 338/500
 - 0s - loss: 0.3422 - acc: 0.9412
Epoch 339/500
 - 0s - loss: 0.3400 - acc: 0.9412
Epoch 340/500
 - 0s - loss: 0.3372 - acc: 0.9412
Epoch 341/500
 - 0s - loss: 0.3337 - acc: 0.9412
Epoch 342/500
 - 0s - loss: 0.3314 - acc: 0.9412
Epoch 343/500
 - 0s - loss: 0.3292

Epoch 491/500
 - 0s - loss: 0.1355 - acc: 0.9706
Epoch 492/500
 - 0s - loss: 0.1346 - acc: 0.9706
Epoch 493/500
 - 0s - loss: 0.1338 - acc: 0.9706
Epoch 494/500
 - 0s - loss: 0.1326 - acc: 0.9706
Epoch 495/500
 - 0s - loss: 0.1318 - acc: 0.9706
Epoch 496/500
 - 0s - loss: 0.1306 - acc: 0.9706
Epoch 497/500
 - 0s - loss: 0.1315 - acc: 0.9706
Epoch 498/500
 - 0s - loss: 0.1299 - acc: 0.9706
Epoch 499/500
 - 0s - loss: 0.1296 - acc: 0.9706
Epoch 500/500
 - 0s - loss: 0.1292 - acc: 0.9706


<keras.callbacks.History at 0x114d7ce80>

In [0]:
# (seed text, number of words to generate) pairs. The model consumes
# max_length-1 tokens of context. The last seed uses words that do not occur
# in the training corpus.
prompts = [
    ('सितारे दिन में', 2),
    ('कुछ लोगों को', 4),
    ('साल का था मैंने गाड़ी चलानी ', 2),
    ('आपसे मिलकर', 2),
    ('मेरा नाम ', 2),
]
for seed_text, n_words in prompts:
    print(generate_seq(model, tokenizer, max_length-1, seed_text, n_words))

सितारे दिन में नहीं दिखते
कुछ लोगों को कॉफ़ी जब मैं अठारह
साल का था मैंने गाड़ी चलानी  सीखी और
आपसे मिलकर बहुत खुशी
मेरा नाम  कुछ लोगों
