In [1]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages')

In [2]:
import pandas as pd
import numpy as np
import pickle

import tensorflow as tf
import tensorflow_addons as tfa

import tensorflow.keras as keras
from keras import layers

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from keras.layers.experimental.preprocessing import TextVectorization
from keras import metrics
from keras.optimizers import Adam


# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

 The versions of TensorFlow you are currently using is 2.4.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [3]:
train_dict = pickle.load(open('data/train.pkl', 'rb'))
valid_dict = pickle.load(open('data/valid.pkl', 'rb'))
test_dict = pickle.load(open('data/test.pkl', 'rb'))

In [4]:
class_names = ['Case Report','Diagnosis','Epidemic Forecasting','General Info',
               'Mechanism','Prevention','Transmission','Treatment','']

In [5]:
# Drop empty 

def drop_empty(d:dict):
    d_new = {i:d[i] for i in d if len(d[i]['embeddings']) != 0 and len(d[i]['lemmas']) != 0}
    return d_new

train = drop_empty(train_dict)
valid = drop_empty(valid_dict)
test = drop_empty(test_dict)

In [6]:
embedding_matrix = np.concatenate([train_dict[i]['embeddings'] for i in train])

In [7]:
num_tokens = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]
EPOCHS=10
BATCH_SIZE=32
LEARNING_RATE=0.001
DROPOUT=0.7
POOL_LENGTH=8

In [8]:
# create the model
embedding_layer = Embedding(
    input_dim = embedding_matrix.shape[0],
    output_dim = embedding_matrix.shape[1],
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(DROPOUT)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         1087340100
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         192128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512 

In [9]:
train_samples = [' '.join(train[i]['lemmas']) for i in train]
val_samples = [' '.join(valid[i]['lemmas']) for i in valid]
test_samples = [' '.join(test[i]['lemmas']) for i in test]

# train_samples = [train[i]['input'] for i in train]
# val_samples = [valid[i]['input'] for i in valid]
# test_samples = [test[i]['input'] for i in test]

train_labels = [train[i]['label_vec'] for i in train]
val_labels = [valid[i]['label_vec'] for i in valid]
test_labels = [test[i]['label_vec'] for i in test]

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=300)
# text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(32)
vectorizer.adapt(train_samples)


In [10]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()
x_test = vectorizer(np.array([[s] for s in test_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

In [11]:
%%time

# opt = SGD(lr=0.01)

opt = keras.optimizers.Adam(learning_rate=LEARNING_RATE)

model.compile(
    loss="categorical_crossentropy", optimizer=opt, 
    metrics=["acc", 
             metrics.Precision(), 
             tf.keras.metrics.Recall(), 
             tfa.metrics.F1Score(num_classes=len(class_names), average='micro', name='F1_score'),
             metrics.AUC(name='my_auc')]
)
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1h 6min 7s, sys: 14min 54s, total: 1h 21min 2s
Wall time: 17min 5s


<tensorflow.python.keras.callbacks.History at 0x7fe2d8dee8b0>

In [12]:
scores = model.evaluate(x_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 31.78%


In [13]:
predictions = model.predict_classes(X_test).ravel()

AttributeError: 'Functional' object has no attribute 'predict_classes'