In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.layers import Flatten, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input
import tensorflow_addons as tfa

import pandas as pd
import numpy as np
import pickle
import re

import matplotlib.pyplot as plt

In [None]:
# !pip3 install pickle5
# import pickle5 as pickle

In [None]:
# Fixed token-sequence length: passed as `maxlen` to pad_sequences and used as
# the model's input shape, so every sample is exactly this many token ids long.
MAXLEN = 300

In [None]:
# Function to create train and test sets
def load_data(csv_path='./Data/SampledArxiv.csv', text_column='Text',
              test_size=0.20, random_state=42):
    """Read the dataset CSV and split it into train and test partitions.

    Args:
        csv_path: Location of the CSV file to read.
        text_column: Name of the column holding the raw text.
        test_size: Fraction of rows assigned to the test partition.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the X values are
        lists of text entries and the y values are label arrays.
    """
    arxiv = pd.read_csv(csv_path)

    # Every column after the first is treated as a label column.
    # NOTE(review): this assumes the text column is the first column of the
    # CSV -- confirm against the data file.
    y = arxiv[arxiv.columns[1:]].values

    # Materialise the texts directly instead of copying them one by one.
    X = arxiv[text_column].tolist()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Function for tokenizing text
def get_tokens(X_train, X_test, tokenizer_path=None, save_tokenizer=False):
    """Turn raw train/test texts into fixed-length integer sequences.

    Args:
        X_train: Training texts; also used to fit a new tokenizer when no
            ``tokenizer_path`` is given.
        X_test: Test texts.
        tokenizer_path: Path of a pickled tokenizer to reuse. When falsy, a
            new ``Tokenizer(num_words=15000)`` is fitted on ``X_train``.
        save_tokenizer: When a new tokenizer is fitted, also pickle it to
            ``./Utils/tokenizer.pkl``.

    Returns:
        Tuple ``(X_train, X_test, tokenizer)`` where both X values are the
        outputs of ``pad_sequences(..., padding='post', maxlen=MAXLEN)``.
    """
    if tokenizer_path:
        # NOTE: pickle.load can execute arbitrary code; only load tokenizer
        # files that come from a trusted source.
        with open(tokenizer_path, 'rb') as tokenizer_file:
            tokenizer = pickle.load(tokenizer_file)
    else:
        tokenizer = Tokenizer(num_words=15000)
        tokenizer.fit_on_texts(X_train)
        if save_tokenizer:
            # Context manager guarantees the handle is closed even if the
            # dump raises.
            with open('./Utils/tokenizer.pkl', 'wb') as tokenizer_file:
                pickle.dump(tokenizer, tokenizer_file)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(X_train, padding='post', maxlen=MAXLEN)
    X_test = pad_sequences(X_test, padding='post', maxlen=MAXLEN)

    return X_train, X_test, tokenizer

In [None]:
#Function for generating embedding matrix using GloVe embeddings
def get_embeddings(tokenizer, embedding_matrix_path=None, save_matrix=False,
                   glove_path='./glove.840B.300d.txt', embedding_dim=300):
    """Build, or load, the embedding matrix for the tokenizer's vocabulary.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> row
            index.
        embedding_matrix_path: Path of a previously pickled matrix. When
            given, the matrix is loaded from this file and returned as-is.
        save_matrix: When the matrix is built from the GloVe file, also pickle
            it to ``./Utils/arxiv_embedding_matrix.pkl``.
        glove_path: Text file with one ``word v1 v2 ... vN`` entry per line.
        embedding_dim: Number of vector components per entry (N above).

    Returns:
        Array of shape ``(len(tokenizer.word_index) + 1, embedding_dim)``.
        Rows for words that do not appear in the GloVe file stay all-zero.
    """
    if embedding_matrix_path:
        # Load from the path the caller supplied (not a hard-coded file).
        # NOTE: pickle.load can execute arbitrary code; only load matrices
        # that come from a trusted source.
        with open(embedding_matrix_path, 'rb') as matrix_file:
            return pickle.load(matrix_file)

    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            # Split from the right: the last `embedding_dim` fields are always
            # the vector, so a token that itself contains spaces stays intact
            # instead of corrupting the vector of its first word.
            fields = line.rstrip('\r\n').rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                continue  # blank or truncated line
            # Only vocabulary words are kept, so the full GloVe table never
            # has to be held in memory.
            row = word_index.get(fields[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(fields[1:], dtype='float64')

    if save_matrix:
        with open('./Utils/arxiv_embedding_matrix.pkl', 'wb') as matrix_file:
            pickle.dump(embedding_matrix, matrix_file)

    return embedding_matrix

In [None]:
#Function for preprocessing text for inference
def pre_process(text, tokenizer):
    """Encode a single text as a one-row batch of MAXLEN token ids.

    The text is wrapped in a list so the tokenizer treats it as one document,
    then the resulting sequence is run through ``pad_sequences`` with
    ``padding='post'`` and ``maxlen=MAXLEN``.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    return pad_sequences(token_ids, padding='post', maxlen=MAXLEN)

In [None]:
#Function for building model(either using GloVe embeddings or without)
def build_model(tokenizer, use_glove=True, num_classes=11, embedding_matrix_path=None):
    """Assemble the Embedding -> LSTM -> Dropout -> Dense classifier.

    Args:
        tokenizer: Fitted tokenizer; its ``word_index`` size sets the
            embedding vocabulary.
        use_glove: When True, initialise a frozen embedding layer from
            ``get_embeddings``; otherwise the embedding layer is created
            without preset weights.
        num_classes: Number of sigmoid output units.
        embedding_matrix_path: Forwarded to ``get_embeddings`` so an already
            saved matrix can be reused without editing this function. When
            None the matrix is rebuilt from the GloVe file and saved.

    Returns:
        An uncompiled ``Model`` taking ``(MAXLEN,)`` integer sequences.
    """
    embedding_dim = 300
    vocab_size = len(tokenizer.word_index) + 1
    deep_inputs = Input(shape=(MAXLEN,))

    if use_glove:
        embedding_matrix = get_embeddings(
            tokenizer,
            embedding_matrix_path=embedding_matrix_path,
            save_matrix=True,
        )
        # trainable=False keeps the preset vectors fixed during training.
        embedding_layer = Embedding(
            vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False
        )(deep_inputs)
    else:
        embedding_layer = Embedding(vocab_size, embedding_dim)(deep_inputs)

    lstm1 = LSTM(128)(embedding_layer)
    drop = Dropout(0.3)(lstm1)
    # One sigmoid unit per class: each label gets its own score in [0, 1].
    dense_layer_1 = Dense(num_classes, activation='sigmoid')(drop)
    model = Model(inputs=deep_inputs, outputs=dense_layer_1)

    return model

In [None]:
# Train the GloVe-initialised LSTM classifier, save it, and report test metrics.
print(tf.config.list_physical_devices('GPU'))
X_train, X_test, y_train, y_test = load_data()
X_train, X_test, tokenizer = get_tokens(X_train, X_test, tokenizer_path=None, save_tokenizer=True)  # If tokenizer is already saved, set tokenizer_path to file path
model = build_model(tokenizer=tokenizer, use_glove=True)

# The model emits one sigmoid score per label and is trained with binary
# cross-entropy, so each label is decided independently. Without `threshold`,
# tfa's F1Score binarises predictions via argmax (exactly one label per
# sample); threshold=0.5 makes the F1 scores match the per-label outputs.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'acc',
        tfa.metrics.F1Score(num_classes=11, average='micro', threshold=0.5, name='F1_micro'),
        tfa.metrics.F1Score(num_classes=11, average='macro', threshold=0.5, name='F1_macro'),
    ],
)

history = model.fit(X_train, y_train, batch_size=256, epochs=3, verbose=1, validation_split=0.2)
model.save('./Models/lstm_Glove')

# evaluate() returns [loss, *metrics] in the order given to compile().
score = model.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])
print("Test F1 micro:", score[2])
print("Test F1 macro:", score[3])