Following https://keras.io/examples/nlp/multi_label_classification/

In [48]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages')

from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
from keras.layers.experimental.preprocessing import StringLookup
from keras.layers.experimental.preprocessing import TextVectorization

from sklearn.model_selection import train_test_split
from ast import literal_eval

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle


In [101]:
# train = pd.read_csv('data/train.csv')
# val = pd.read_csv('data/valid.csv')
# test = pd.read_csv('data/test.csv')

train_dict = pickle.load(open('data/train.pkl', 'rb'))
valid_dict = pickle.load(open('data/valid.pkl', 'rb'))
test_dict = pickle.load(open('data/test.pkl', 'rb'))

In [102]:
# Save certain keys only

def load_dict(d:dict):
    
    d_new = {i:{'input': d[i]['input'],
                 'label': d[i]['label'].split(';'),
                 'label_vec': d[i]['label_vec'], 
                 'lemmas': d[i]['lemmas']}
              for i in d
              if len(d[i]['lemmas']) != 0}
    
    return d_new


train_dict = load_dict(train_dict)
valid_dict = load_dict(valid_dict)
test_dict = load_dict(test_dict)

In [116]:
train_data = pd.DataFrame.from_dict(train_dict, orient='index')
valid_data = pd.DataFrame.from_dict(valid_dict, orient='index')
test_data = pd.DataFrame.from_dict(test_dict, orient='index')

In [125]:
terms = tf.ragged.constant(train_data["label"].values)
lookup = StringLookup()
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)


print("Vocabulary:\n")
print(vocab)


Vocabulary:

['', '[UNK]', 'Prevention', 'Treatment', 'Diagnosis', 'Mechanism', 'Case Report', 'Transmission', 'General Info', 'Epidemic Forecasting']


In [126]:
train_data["input"].apply(lambda x: len(x.split(" "))).describe()


count    52416.000000
mean       149.346764
std        120.188635
min          2.000000
25%         16.000000
50%        155.000000
75%        243.000000
max       1921.000000
Name: input, dtype: float64

In [144]:
max_seqlen = 155
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE


def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe["label"].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(dataframe["input"]), label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)


In [145]:
train_dataset = make_dataset(train_data, is_train=True)
validation_dataset = make_dataset(valid_data, is_train=False)
test_dataset = make_dataset(test_data, is_train=False)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [5]:
train_dict = pickle.load(open('data/train.pkl', 'rb'))
valid_dict = pickle.load(open('data/valid.pkl', 'rb'))
test_dict = pickle.load(open('data/test.pkl', 'rb'))

In [6]:
class_names = ['Case Report','Diagnosis','Epidemic Forecasting','General Info',
               'Mechanism','Prevention','Transmission','Treatment','']

In [7]:
# Drop empty 

def drop_empty(d:dict):
    d_new = {i:d[i] for i in d if len(d[i]['embeddings']) != 0 and len(d[i]['lemmas']) != 0}
    return d_new

train = drop_empty(train_dict)
valid = drop_empty(valid_dict)
test = drop_empty(test_dict)

In [51]:
terms = tf.ragged.constant([train[i]['input'] for i in train])
lookup = StringLookup()
lookup.adapt(terms)
vocab = lookup.get_vocabulary()


def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a tuple of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
    return np.take(vocab, hot_indices)


In [59]:
dir(lookup)

['_TF_MODULE_IGNORED_PROPERTIES',
 '__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_activity_regularizer',
 '_add_state_variable',
 '_add_trackable',
 '_add_variable_with_custom_getter',
 '_auto_track_sub_layers',
 '_autocast',
 '_autographed_call',
 '_batch_input_shape',
 '_build_input_shape',
 '_call_accepts_kwargs',
 '_call_arg_was_passed',
 '_call_fn_arg_defaults',
 '_call_fn_arg_positions',
 '_call_fn_args',
 '_call_full_argspec',
 '_callable_losses',
 '_cast_single_input',
 '_checkpoint_dependencies',
 '_clear_losses',
 '_combiner',
 '_compute_dtype',
 '_compute_dtype_object',
 '

In [19]:
np.median([len(train[i]['input']) for i in train]) # average length of input string

1082.0

In [33]:
max_seqlength = 1082
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

vocabulary = set(np.concatenate([train[i]['lemmas'] for i in train]))
vocabulary_size = len(vocabulary)
print(vocabulary_size)


106516


In [72]:
# text_vectorizer = layers.TextVectorization(
#     max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
# )

train_samples = [' '.join(train[i]['lemmas']) for i in train]
val_samples = [' '.join(valid[i]['lemmas']) for i in valid]
test_samples = [' '.join(test[i]['lemmas']) for i in test]

train_labels = [train[i]['label_vec'] for i in train]
val_labels = [valid[i]['label_vec'] for i in valid]
test_labels = [test[i]['label_vec'] for i in test]

vectorizer = TextVectorization(max_tokens=vocabulary_size, output_sequence_length=300)
# text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(32)
vectorizer.adapt(train_samples)
vectorizer.adapt(val_samples)
vectorizer.adapt(test_samples)


NameError: name 'text_vectorizer' is not defined

In [75]:
text_vectorizer = TextVectorization(max_tokens=vocabulary_size, ngrams=2, output_mode="tf-idf")


with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_samples.map(lambda text, label: text))

train_dataset = train_samples.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)

validation_dataset = val_samples.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)

test_dataset = test_samples.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)


AttributeError: 'list' object has no attribute 'map'

In [68]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(len(class_names), activation="sigmoid"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model


In [70]:
epochs = 20

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

history = shallow_mlp_model.fit(
    train_samples, train_labels, epochs=epochs
)


def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("categorical_accuracy")


TypeError: float() argument must be a string or a number, not 'list'

Following https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [153]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages')

import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import torch

%matplotlib inline


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |  

[nltk_data]    |   Unzipping corpora/omw-1.4.zip.
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/opinion_lexicon.zip.
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pe08 to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/pe08.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package pil to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.zip.
[nltk_data]    | Downloading package pl196x to
[nltk_data]    |     /Users

[nltk_data]    |   Unzipping corpora/wordnet_ic.zip.
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/words.zip.
[nltk_data]    | Downloading package ycoe to
[nltk_data]    |     /Users/yohyoh.wang/nltk_data...
[nltk_data]    |   Unzipping corpora/ycoe.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection all


In [155]:
$URL = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
# Download Datasets
!wget -P 'data/' $URL
# Unzip
!tar xvzf 'data/rt-polaritydata.tar.gz' -C 'data/'


/bin/bash: wget: command not found
tar: Error opening archive: Failed to open 'data/rt-polaritydata.tar.gz'
