# LSTM model training
Change the paths, `project` and `train_len` and run the notebook.


In [None]:
# %tensorflow_version 2.x
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#     raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

In [None]:
import csv
import os
import numpy as np
import pandas as pd
import sklearn
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras import optimizers
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from pickle import dump, load
import tensorflow

In [None]:
class DataGenerator(tensorflow.keras.utils.Sequence):
    """Keras ``Sequence`` that streams training batches from a CSV file.

    Each batch is read from disk on demand with ``pandas.read_csv``. Every
    column except the last is returned as the model input, and the last
    column is one-hot encoded as the target.

    Args:
        data_path: Path of the CSV file. Its first line is consumed by pandas
            as the header row.
        data_size: Number of data rows in the file, not counting the header
            line. Only used to compute the number of batches.
        batch_size: Maximum number of rows per batch.
        vocabulary_size: Targets are one-hot encoded into
            ``vocabulary_size + 1`` classes.
        to_fit: Stored on the instance; not consulted by the methods below.
        shuffle: When true, the rows inside each batch are shuffled.
    """

    def __init__(self, data_path, data_size, batch_size, vocabulary_size,
                 to_fit=True, shuffle=True):
        self.data_path = data_path
        self.data_size = data_size
        self.batch_size = batch_size
        self.vocabulary_size = vocabulary_size
        self.to_fit = to_fit
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches needed to cover ``data_size`` rows."""
        return int(np.ceil(self.data_size / self.batch_size))

    def __getitem__(self, index):
        """Read batch ``index`` from the CSV and return ``(inputs, targets)``.

        Returns:
            A tuple of the input array (all columns but the last) and the
            one-hot encoded last column.
        """
        first_row = index * self.batch_size
        # File line 0 is the header, so data row j lives on file line j + 1.
        # Skipping lines 1..first_row makes the batch start exactly at data
        # row ``first_row``; stopping the range one earlier would re-read the
        # last row of the previous batch.
        # NOTE(review): each call appears to re-scan the file from the top to
        # reach its offset, so late batches cost more than early ones.
        df = pd.read_csv(
            self.data_path,
            skiprows=range(1, first_row + 1),
            nrows=self.batch_size)
        if self.shuffle:
            df = sklearn.utils.shuffle(df)
        x = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        return np.array(x), to_categorical(y, num_classes=self.vocabulary_size + 1)

In [None]:
def read_file(filepath):
    """Return the entire contents of the text file at ``filepath`` as one string."""
    with open(filepath) as handle:
        return handle.read()

# Run configuration: adjust these values for the project being trained.
project = 'eclipse'
fold = 9
train_len = 6
# Derived from train_len so the string and integer forms cannot drift apart.
train_len_str = str(train_len)
text_sequences = []

# NOTE(security): pickle can execute arbitrary code while loading; only load
# tokenizer files that come from a trusted source.
tokenizer_path = f'/content/drive/MyDrive/shared/LSTM-Kien/tokenizer/{project}/{project}.tk'
# Use a context manager so the file handle is closed once the tokenizer is loaded.
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

vocabulary_size = len(tokenizer.word_index)

In [None]:
def count_lines_csv(file_path):
    """Return the number of records in the CSV file at ``file_path``.

    Every record parsed by ``csv.reader`` is counted, including the first
    (header) line when the file has one. A quoted field that spans several
    physical lines counts as a single record.

    Args:
        file_path: Path of the CSV file to count.

    Returns:
        The record count as an ``int``; ``0`` for an empty file.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends. The context manager closes the file.
    with open(file_path, 'r', newline='') as input_file:
        # Count lazily instead of materialising every row in a list.
        return sum(1 for _ in csv.reader(input_file))

In [None]:
train_csv_path = f'/content/drive/MyDrive/shared/LSTM-Kien/train_data/{project}/{project}.csv'
batch_size = 2048
# count_lines_csv counts every CSV record, including the header line that
# pandas.read_csv consumes as column names. Subtract it so the size handed to
# the generator is the number of training rows.
train_data_size = max(count_lines_csv(train_csv_path) - 1, 0)
training_batch_generator = DataGenerator(train_csv_path, train_data_size, batch_size, vocabulary_size)

In [None]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize the next-token LSTM classifier.

    The network is Embedding (20 dimensions) -> LSTM (64 units) ->
    Dropout (0.15) -> Dense softmax over ``vocabulary_size`` classes. It is
    compiled with categorical cross-entropy, a default Adam optimizer and an
    accuracy metric.

    Args:
        vocabulary_size: Size of the embedding input and of the softmax output.
        seq_len: Length of each input sequence, passed as ``input_length``.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    layer_stack = [
        Embedding(vocabulary_size, 20, input_length=seq_len),
        LSTM(64),
        Dropout(0.15),
        Dense(vocabulary_size, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(),
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [None]:
model_path = f"/content/drive/MyDrive/shared/LSTM-Kien/model/{project}/{project}.h5"
# Resume from the saved model when one exists; otherwise build a fresh network
# so the first run does not fail on a missing file.
if os.path.isfile(model_path):
    model = load_model(model_path)
else:
    model = create_model(vocabulary_size + 1, train_len - 1)
# Overwrite the saved model whenever the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(model_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
epoch = 60
if __name__ == '__main__':
    model.fit(x=training_batch_generator,
              epochs=epoch,
              verbose=1,
              use_multiprocessing=True,
              callbacks=[checkpoint],
              shuffle=True)

Epoch 1/60
Epoch 1: loss improved from inf to 2.47309, saving model to /content/drive/MyDrive/shared/LSTM-Kien/model/eclipse/eclipse.h5
Epoch 2/60
Epoch 2: loss improved from 2.47309 to 2.44305, saving model to /content/drive/MyDrive/shared/LSTM-Kien/model/eclipse/eclipse.h5
Epoch 3/60
1017/5011 [=====>........................] - ETA: 9:37:57 - loss: 2.3878 - accuracy: 0.5991