Make a folder in your "My Drive" main folder of Google Drive named "5LSL0_final".<br>
Put this colab notebook file inside that folder.

# Mount Google Drive
Run the next cell to start working on the project!

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

# Clone GitHub repo
Run the next two cells to clone the GitHub repo. This has already been done, so it should not be necessary to do it again.

In [None]:
# Go to the project base folder in Google Drive
# (the space in "My Drive" is escaped with a backslash)
%cd /content/gdrive/My\ Drive/5LSL0_final

In [None]:
# Clone the GitHub repo into the current folder
# This only has to be done once!
!git clone https://github.com/tristan-deep/NLP_disaster_tweets

# Pull changes from GitHub repo
Run the next two cells before working on the project to sync the Google Drive folder with the GitHub repo.

In [None]:
# Go to the cloned repo folder so that git commands operate on the repository
%cd /content/gdrive/My\ Drive/5LSL0_final/NLP_disaster_tweets

In [None]:
# Pull the latest changes from the GitHub repo into this working copy
!git pull

# Try different things

Go to working folder

In [None]:
import os
# Make the cloned repo the working directory so relative paths resolve against it
os.chdir('/content/gdrive/My Drive/5LSL0_final/NLP_disaster_tweets')
%pwd # print current location

This is the dataloader with some changes to make it work for Colab indicated with # COLAB CHANGE

In [None]:
"""## DataLoader"""

import numpy as np
import pandas as pd
import tensorflow.keras as keras
from pathlib import Path
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

def TokenizeTweets(vocabulary_size=10000):
  """Fit a Keras ``Tokenizer`` on the tweets of the training set.

  The texts are read from the ``text`` column of ``dataset/train_set.csv``
  (path relative to the current working directory).

  Args:
    vocabulary_size: Forwarded to the tokenizer as ``num_words``.

  Returns:
    The ``Tokenizer`` after ``fit_on_texts`` has been called on all
    training tweets.
  """
  data = pd.read_csv(Path('dataset', 'train' + '_set.csv'))
  all_text = data.text.to_list()

  # Case is preserved (lower=False) and texts are split on single spaces.
  # Only the characters listed in `filters` are passed as filter characters;
  # note that . , ! ? : ; " ' / are not in that list.
  tokenizer = Tokenizer(num_words=vocabulary_size, filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n', lower=False, split=" ")

  # Train the tokenizer on the texts (training data)
  tokenizer.fit_on_texts(all_text)

  return tokenizer

class LoadTweets(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of tokenized, padded tweets.

    The data is read from ``dataset/<split>_set.csv`` (relative to the current
    working directory). Indexing the object returns a tuple ``(X, y)``:

    * ``X``: the ``text`` column of the batch rows converted to integer
      sequences by ``tokenizer`` and passed through ``pad_sequences`` with
      ``maxlen=max_length``.
    * ``y``: the ``target`` column with an extra axis added at position 1 for
      the ``'train'`` and ``'val'`` splits, ``None`` for the ``'test'`` split.

    The number of batches is ``floor(n_rows / batch_size)``, so a trailing
    partial batch is not served.
    """

    def __init__(self, tokenizer, split, batch_size=32, n_classes=2, shuffle=True, vocabulary_size = 10000, max_length=500):
        """Load the CSV of the requested split and prepare the batch order.

        Args:
            tokenizer: Already fitted tokenizer; it is used as-is, not refitted.
            split: ``'train'``, ``'val'`` or ``'test'``; selects the CSV file
                and whether labels are returned.
            batch_size: Number of rows per batch.
            n_classes: Stored on the instance; not used by the batch generation.
            shuffle: If true, the row order is reshuffled by ``on_epoch_end``.
            vocabulary_size: Stored on the instance; not used by the batch
                generation.
            max_length: ``maxlen`` used when padding/truncating the sequences.
        """
        # Let the Keras base class set itself up (a no-op on bases without
        # their own initializer).
        super().__init__()

        self.batch_size = batch_size
        self.split = split
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.data = pd.read_csv(Path('dataset', self.split + '_set.csv'))
        self.all_text = self.data.text.to_list()
        self.max_length = max_length
        self.tokenizer = tokenizer

        # not using ids but rather rows in csv file for convenience
        self.list_IDs = list(range(len(self.data)))

        self.vocabulary_size = vocabulary_size

        # Build the initial (optionally shuffled) index order
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate the batch with number ``index`` as an ``(X, y)`` tuple."""
        # Positions of this batch within the current index order
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions into row ids
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the index order, shuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build the padded sequences and labels for the given row ids.

        Raises:
            ValueError: If ``self.split`` is not ``'train'``, ``'val'`` or
                ``'test'``.
        """
        rows = self.data.loc[list_IDs_temp, :]

        # Only the tweet text is fed to the network for now; the keyword and
        # location columns are not used.
        # Convert list of strings into list of lists of integers
        X_sequences = self.tokenizer.texts_to_sequences(rows.text.to_list())
        # Truncate and pad input sequences
        X_pad = sequence.pad_sequences(X_sequences, maxlen=self.max_length)

        if self.split in ('train', 'val'):
            # COLAB CHANGE: the list is passed without extra square brackets
            y = np.expand_dims(rows.target.to_list(), axis=1)
        elif self.split == 'test':
            # for test set there are no labels
            y = None
        else:
            # Fail with a clear message instead of an UnboundLocalError on `y`
            raise ValueError(
                "split must be 'train', 'val' or 'test', got %r" % (self.split,))

        return X_pad, y

# Quick smoke test of the data loader: fit the tokenizer, build a generator
# with one tweet per batch and print the first batch decoded back to text.
if __name__ == '__main__':
    vocabulary_size =10000
    max_length = 500
    tokenizer = TokenizeTweets(vocabulary_size)

    gen = LoadTweets(tokenizer, split='train', batch_size=1, shuffle=False, vocabulary_size = vocabulary_size, max_length=max_length)

    # example that prints all the tweets of the first batch
    batch = gen[0]  # first of len(gen) batches

    X = batch[0]  # 0 -> padded integer sequences of the tweet texts, 1 -> target
    # Map the integer sequences back to text with the same tokenizer
    text_first_batch = gen.tokenizer.sequences_to_texts(X)
    print(text_first_batch)

I copied the LSTM model here, to easily change the architecture.

In [16]:
import tensorflow.keras as keras
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Input, Dense, LSTM, Conv2D, LeakyReLU, AvgPool2D, UpSampling2D, ReLU, MaxPooling2D, Reshape, Softmax, Activation, Flatten, Lambda, Conv2DTranspose, Dropout, Embedding
from tensorflow.keras.losses import MSE, categorical_crossentropy, binary_crossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

def create_model(max_words=10000, embedding_vecor_length=32, max_length=500, lstm_out=100, print_summary=True):
    """Build and compile the LSTM model for binary classification.

    Layers, in order: Embedding -> LSTM -> Dense(256) -> ReLU -> Dropout(0.5)
    -> Dense(1, name='out_layer') -> sigmoid. The model is compiled with
    binary cross-entropy loss, the 'adam' optimizer and the accuracy metric.

    Args:
        max_words: Input dimension of the embedding layer.
        embedding_vecor_length: Output dimension of the embedding layer.
            (The misspelled parameter name is kept so existing keyword
            callers keep working.)
        max_length: ``input_length`` of the embedding layer.
        lstm_out: Number of units of the LSTM layer.
        print_summary: If true, print the model summary after compiling.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(max_words, embedding_vecor_length, input_length = max_length))
    model.add(LSTM(lstm_out))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, name='out_layer'))
    model.add(Activation('sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',
                   metrics = ['accuracy'])

    if print_summary:
        # summary() prints the table itself and returns None; wrapping it in
        # print() would add a stray "None" line to the output.
        model.summary()
    return model

Start the tensorboard application

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

Create model, train and save

In [None]:
# TokenizeTweets, LoadTweets and create_model are defined in the cells above,
# so they are not imported from the repo modules here:
# from DataLoader import LoadTweets, TokenizeTweets
# from models.LSTM_model import create_model

from datetime import datetime
from tensorflow.keras.callbacks import TensorBoard
import os
from pathlib import Path
import numpy as np

save_model = True
# NOTE(review): the file name says "epochs-15" while model.fit below is called
# with epochs=1 -- confirm which one is intended.
model_name = 'lstm_embedding-100_out-100-epochs-15.h5'

# Every run logs to its own time-stamped sub-folder of logs/
run = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = Path("logs/"+run)

# Make sure the parent logs/ folder exists
os.makedirs(os.path.dirname(logdir), exist_ok=True)
tensorboard_callback = TensorBoard(log_dir=logdir)


# --- Create model ---
vocabulary_size = 1000
max_length = 100
embedding_vector_length = 100
lstm_out = 100

tokenizer = TokenizeTweets(vocabulary_size=vocabulary_size)

train_gen = LoadTweets(tokenizer, split='train',
                        batch_size = 32, shuffle=True,
                        vocabulary_size=vocabulary_size, max_length=max_length)
val_gen = LoadTweets(tokenizer, split='val',
                      batch_size = 32, shuffle=False,
                      vocabulary_size=vocabulary_size, max_length=max_length)

model = create_model(vocabulary_size, embedding_vector_length, max_length, lstm_out)


# --- Train model ---
model.fit(train_gen, validation_data = val_gen, epochs=1, callbacks = [tensorboard_callback])

if save_model:
    # Create the weights/ folder first so saving does not depend on it
    # already being present in the working directory.
    os.makedirs('weights', exist_ok=True)
    PATH = os.path.join('weights', model_name)
    model.save(PATH)

Show the tensorboard with the training logs

In [None]:
# Open TensorBoard inline, reading the runs stored under the logs/ folder
%tensorboard --logdir logs