In [1]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 4.3 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.1


In [2]:
#importing the libraries
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import io
from google.colab import files

In [3]:
#reading, loading the dataset
with open('cwe-train.txt', 'r') as f:    
   text_data=f.read()

In [4]:
text_data

'chikale vinogile fana viya wanhu wochikala habung\'huke imisi, si kwa umelo na ung\'waji na unyolodo na uwashelati na ndwagi hebu migongo. one muhapula chinhu chochose kwa zina jangu nizamtendelani. yesu kamulongela petili, "bweleza zele jako muna iyala. "ufalume wa kuulanga ulinga vino. niye mulala mulangulizi, nokwandikila weye gayo mulondwa wangu yonikulonda muna ikweli. nomulongelani ukweli, munhu yoyose yohauhokela ufalume wa mulungu fana mwana mdoododo, hezakwingila muna ufalume uwo ng\'o!" iyo ni mfalume wa siku zose na hadanganika na hawoneka na yeli mulungu yaidumwe. one munhu yoyose yahawa na nzala, kolondeka yaje ukaye yake, muladi vondamuiting\'hane sekeyatagusigwe. lelo kwa mbuli ziya zimwandike kuwa. maabaho msenga wa kuulanga wekaidi kaitowa mhalati yake na chinhu fana mulima mkulu woukwaka moto chasigwa muibahali selusi ya bahali igaluka damu. msede yahandile misavu niiyo mwihi kugozela uhelelo wa isi na waja wogobola ni wasenga wa kuulanga. wouutendile uwoneke kwa wan

In [5]:
vocab = sorted(set(text_data)) 
vocab_to_int = {c: i for i, c in enumerate(vocab)}    
int_to_vocab = dict(enumerate(vocab))    
encoded = np.array([vocab_to_int[c] for c in text_data], dtype=np.int32)

In [6]:
#reference: https://towardsdatascience.com/word-and-character-based-lstms-12eb65f779c2 
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text_data) - maxlen, step):
    sentences.append(text_data[i : i + maxlen])
    next_chars.append(text_data[i + maxlen])

In [7]:
print("Number of sequences:", len(sentences))

Number of sequences: 201131


In [8]:
no_of_characters = sorted(list(set(text_data)))
print("Total characters:", len(no_of_characters))

Total characters: 32


In [9]:
X = np.zeros((len(sentences), maxlen, len(no_of_characters)), dtype=np.bool)
Y = np.zeros((len(sentences), len(no_of_characters)), dtype=np.bool)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [51]:
X.shape

(201131, 40, 32)

In [52]:
Y.shape

(201131, 32)

In [10]:
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, vocab_to_int[char]] = 1
    Y[i, vocab_to_int[next_chars[i]]] = 1

In [29]:
model = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(no_of_characters))),
        layers.LSTM(4),
        layers.Dense(len(no_of_characters), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer,metrics=['accuracy'])

In [12]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [13]:
#reference: https://medium.com/deep-learning-with-keras/char-level-text-generation-with-an-lstm-model-e55ba7ff18c2
epochs = 10
batch_size = 128

for epoch in range(epochs):
    model.fit(X, Y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(text_data) - maxlen - 1)
    for diversity in [0.5]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = text_data[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(no_of_characters)))
            for t, char in enumerate(sentence):
                x_pred[0, t, vocab_to_int[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = int_to_vocab[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.5
...Generating with seed: "la da pasaka na kalagiliza damu imizigwe"
...Generated:   usa musabuna mwe mula sangugola kuna si, wale mwa ga wa musa muya wa waka, wanga kumwa mula na hana lumula wa muluna yangu mbeha wala na imwa, mumwela nola muyongu kulalabela yele wana kulangwa kulumwa kusa wangu kwana mbuli mwa mumwa wana yama wangili na chike mani chini wa chila ni mula na waga wela muni muna yolamwa mwa mwa za kana walila wosa mulu wona mwengila kamwa wa yalingu musa kuna kula


Generating text after epoch: 1
...Diversity: 0.5
...Generating with seed: " yose na yeye mwenyewo kwa nzila ya chil"
...Generated:  a kusa kune muni kana wa wanhe mundu muvige ving'hesa yana muleng'hale nhu kasi wandu mana mwa ningi kuya mwa yoa wangu ilana muni vimwa mumula mwa mnha ita vina wa umwa wanhu mwaza ya iwa wa wani hanza yata dilanga yani mulu. hana kulaya wa ye. kukunina mwa mulunhu kakungu luna muno muluna wa wa kuhangu kulakeli yela chila yana k

In [35]:
#reading, loading the dataset
with open('cwe-test.txt', 'r') as f:    
   test=f.read()

In [36]:
test

'sang\'hanileni vinogile chila nzila imwili nayo, kwaviya siku zino ni za wihi. nhondo iyo itangigwa usungu, selusi ya mazi yawa masungu, wanhu wengi wang\'wile mazi yayo wadanganika kwaviya yagaluka masungu. kukala na minyong\'onyo muna ulung\'husesa lwa wanhu. mbali ingawa kuna wanhu wengi woigoda fana viya wanhu wa isi, na niye nizatenda ivo ivo. vino kumwenu kuna munhu mtamu? lung\'husesa lukulu lwa wanhu luiting\'hana mbaka wanhu wakala woibojoga. mbali deng\'heleleni kwingila muna yamagayo fana chilisito viyagaiye, muladi mudeng\'helele ng\'hani utunhizo wake vondaugubuligwe. ndugu zangu, sigesa kuwa nipata gweko. kwaviya ugima ni bola kufosa ndiya na lukuli ni bola kufosa viwalo. ndugu zangu muhamuwona munhu kabananga, mweye womulangulizigwa na muhe wa mulungu mzumeni munhu iyo kwa uhole, mbali mweye wenyewo muiteganye muladi sekemugezigwe. maabaho nikumbuka chiya chiyalongile mndewa, \'yohana kabatiza kwa mazi, mbali mweye mwizabatizigwa kwa muhe yelile.\' wambwiya zangu, simwa

In [37]:
vocab_test = sorted(set(test)) 
vocab_to_int_test = {c: i for i, c in enumerate(vocab_test)}    
int_to_vocab_test = dict(enumerate(vocab_test))    
encoded_test = np.array([vocab_to_int_test[c] for c in test], dtype=np.int32)

In [38]:
encoded_test

array([25,  9, 22, ...,  9,  7,  0], dtype=int32)

In [39]:
maxlen = 40
step = 3
sentences_test = []
next_chars_test = []
for i in range(0, len(test) - maxlen, step):
    sentences_test.append(test[i : i + maxlen])
    next_chars_test.append(test[i + maxlen])

In [41]:
print("Number of sequences:", len(sentences_test))

Number of sequences: 20559


In [42]:
test_char = sorted(list(set(test)))
print("Total characters:", len(test_char))

Total characters: 32


In [43]:
X_test = np.zeros((len(sentences_test), maxlen, len(test_char)), dtype=np.bool)
Y_test = np.zeros((len(sentences_test), len(test_char)), dtype=np.bool)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [45]:
for i, sentence in enumerate(sentences_test):
    for t, char in enumerate(sentence):
        X_test[i, t, vocab_to_int_test[char]] = 1
    Y_test[i, vocab_to_int_test[next_chars_test[i]]] = 1

In [47]:
model_test = keras.Sequential(
    [
        keras.Input(shape=(maxlen, len(test_char))),
        layers.LSTM(5),
        layers.Dense(len(test_char), activation="softmax"),
    ]
)
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model_test.compile(loss="categorical_crossentropy", optimizer=optimizer,metrics=['accuracy'])

In [54]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [55]:
X_test.shape

(20559, 40, 32)

In [56]:
Y_test.shape

(20559, 32)

In [59]:
epochs = 10
batch_size = 128

for epoch in range(epochs):
    model_test.fit(X_test, Y_test, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    start_index = random.randint(0, len(test) - maxlen - 1)
    for diversity in [0.5]:
        print("...Diversity:", diversity)

        generated = ""
        sentence = test[start_index : start_index + maxlen]
        print('...Generating with seed: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(test_char)))
            for t, char in enumerate(sentence):
                x_pred[0, t, vocab_to_int_test[char]] = 1.0
            preds = model_test.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = int_to_vocab_test[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        print("...Generated: ", generated)
        print()


Generating text after epoch: 0
...Diversity: 0.5
...Generating with seed: "vo ndugu zangu funyeni umoyo mbaka mndew"
...Generated:  a mwa ko ma "wa na winini ha kiya ka gu ma ya imo nha mu ka ka ge mula ngu se mwa la na wa ga wani  wa wa ho wa ma ka  gitenga wa we wa ke huma kena ngwena wana ku wula mila ka wa ya sala wa wa iyila nga ha ka mwe ka mu wa va ya gunike ila we ha ya ka mungugi we mi wa nga. ha ka ya ka male wa cha ya munu wa la mule la wengu ka miniwena wa  wa wa wa kanha u za kena ma gu wa wa mukuwa va ya ma yala 


Generating text after epoch: 1
...Diversity: 0.5
...Generating with seed: "nanzi,' yezakuwa kweli. "na mtumwa ija y"
...Generated:  a we he ni kuna ku ku na mu mulana za na mu mulu ka we mwa mu ka wa munhali wa kanga ya na munge wi mi wa wabu ya ka wa mungu wa mumu gula mwa ka melimwa wa wa ka kala nga wa ha ha wa kungenga mwa ka kula ka mwa kuingulila wa ki. mwe mu wa ki na mwila no mulo kula halanga mila na ku ka ko ya ku wa wala muwa wa kuwa mu mumwe na mil

In [61]:
pred=model_test.evaluate(X_test, Y_test)

