In [27]:
with open('faqs.txt', 'r', encoding='utf-8') as file:
    faqs = file.read()

In [28]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [29]:
tokenizer = Tokenizer()

In [30]:
tokenizer.fit_on_texts([faqs])

In [31]:
tokenizer.word_index

{'a': 1,
 'the': 2,
 'to': 3,
 'is': 4,
 'of': 5,
 'in': 6,
 'int': 7,
 'and': 8,
 'c': 9,
 'i': 10,
 'function': 11,
 'can': 12,
 'be': 13,
 'for': 14,
 'program': 15,
 'are': 16,
 'this': 17,
 '3': 18,
 'we': 19,
 'if': 20,
 '0': 21,
 'an': 22,
 'loop': 23,
 'value': 24,
 'file': 25,
 'as': 26,
 'b': 27,
 'float': 28,
 'array': 29,
 '1': 30,
 'code': 31,
 'using': 32,
 '→': 33,
 'or': 34,
 'used': 35,
 'it': 36,
 'ptr': 37,
 'write': 38,
 'memory': 39,
 'n': 40,
 'by': 41,
 'pointer': 42,
 '2': 43,
 'instructions': 44,
 'like': 45,
 'd': 46,
 'x': 47,
 'string': 48,
 'from': 49,
 'not': 50,
 'printf': 51,
 'variable': 52,
 '5': 53,
 'with': 54,
 'which': 55,
 'char': 56,
 'that': 57,
 'use': 58,
 'will': 59,
 'type': 60,
 '4': 61,
 'operator': 62,
 'call': 63,
 'integer': 64,
 'else': 65,
 'while': 66,
 'on': 67,
 'operators': 68,
 '8': 69,
 'variables': 70,
 'example': 71,
 'character': 72,
 'types': 73,
 'return': 74,
 'condition': 75,
 'user': 76,
 'case': 77,
 'main': 78,
 'funct

In [32]:
len(tokenizer.word_index)

1163

In [33]:
input_sequences = []
for sentence in faqs.split('\n'):
    tokenize_sentence = tokenizer.texts_to_sequences([sentence])[0]

    for i in range(1,len(tokenize_sentence)):
        input_sequences.append(tokenize_sentence[:i+1])

In [34]:
input_sequences

[[190, 146],
 [190, 146, 4],
 [190, 146, 4, 1],
 [190, 146, 4, 1, 602],
 [190, 146, 4, 1, 602, 14],
 [190, 146, 4, 1, 602, 14, 191],
 [190, 146, 4, 1, 602, 14, 191, 3],
 [190, 146, 4, 1, 602, 14, 191, 3, 427],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147, 45],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147, 45, 19],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147, 45, 19, 58],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147, 45, 19, 58, 604],
 [190, 146, 4, 1, 602, 14, 191, 3, 427, 54, 603, 147, 45, 19, 58, 604, 34],
 [190,
  146,
  4,
  1,
  602,
  14,
  191,
  3,
  427,
  54,
  603,
  147,
  45,
  19,
  58,
  604,
  34,
  605],
 [190,
  146,
  4,
  1,
  602,
  14,
  191,
  3,
  427,
  54,
  603,
  147,
  45,
  19,
  58,
  604,
  34,
  605,
  3],
 [190,
  146,
  4,
  1,
  602,
  14,
  191,
  3,
  427,
  54,
  603,

In [35]:
max_len = max([len(x) for x in input_sequences])

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [37]:
padded_input_sequences

array([[  0,   0,   0, ...,   0, 190, 146],
       [  0,   0,   0, ..., 190, 146,   4],
       [  0,   0,   0, ..., 146,   4,   1],
       ...,
       [  0,   0,   0, ..., 272,  37,  18],
       [  0,   0,   0, ...,  37,  18, 283],
       [  0,   0,   0, ...,  18, 283,   7]])

In [38]:
X = padded_input_sequences[:,:-1]

In [39]:
y = padded_input_sequences[:,-1]

In [40]:
print(X.shape,y.shape)

(6261, 104) (6261,)


In [41]:

from tensorflow.keras.utils import to_categorical
num_classes = np.max(y) + 1  # Automatically set the correct number of classes
y = to_categorical(y, num_classes=num_classes)

In [42]:
y.shape

(6261, 1164)

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional

In [48]:
model = Sequential()
model.add(Embedding(num_classes, 150, input_shape=(max_len - 1,)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(num_classes, activation='softmax'))

In [49]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [50]:
model.summary()

In [51]:
history = model.fit(X,y,epochs=100,validation_data=(X,y))

Epoch 1/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 85ms/step - accuracy: 0.0314 - loss: 6.3400 - val_accuracy: 0.0489 - val_loss: 5.7999
Epoch 2/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.0514 - loss: 5.7899 - val_accuracy: 0.0811 - val_loss: 5.4696
Epoch 3/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.0851 - loss: 5.4228 - val_accuracy: 0.1286 - val_loss: 4.9529
Epoch 4/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.1379 - loss: 4.9194 - val_accuracy: 0.1789 - val_loss: 4.4395
Epoch 5/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.1909 - loss: 4.4072 - val_accuracy: 0.2335 - val_loss: 3.9744
Epoch 6/100
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.2303 - loss: 3.9899 - val_accuracy: 0.2702 - val_loss: 3.5808
Epoch 7/10

In [52]:
import time
text = "C++ programming"

for i in range(20):
    # Tokenize
    token_text = tokenizer.texts_to_sequences([text])[0]
    # Padding
    padded_token_text = pad_sequences([token_text],maxlen=105,padding='pre')
    # Predict
    pos = np.argmax(model.predict(padded_token_text)).item()
    
    for word,index in tokenizer.word_index.items():
        if index == pos:
            text = text + " "+word
            print(text)
            time.sleep(1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
C++ programming is
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
C++ programming is a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
C++ programming is a medium
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
C++ programming is a medium for
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
C++ programming is a medium for us
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
C++ programming is a medium for us to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
C++ programming is a medium for us to communicate
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
C++ programming is a medium for us to communicate with
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
C++ programming is a medium for us to communicate with computers
[1m1/1