In [6]:
import torch
import tensorflow as tf
import os
import pandas as pd
import random
import torch.nn as nn
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras import layers

In [2]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to that
    sub-layer's input (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Width of the feed-forward output; it is also passed to the
            attention layer as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward net.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the block can be called without the flag; the dropout layers
                then decide for themselves whether they are in training mode.

        Returns:
            The layer-normalised sum of the first residual output and the
            feed-forward output.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the ids in ``x`` and add the embedding of each position index."""
        # Position indices 0 .. (size of the last axis of x) - 1.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [7]:
def get_lines(filename, encoding="utf-8"):
    """Return every line of a text file as a list of strings.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. It is stated
            explicitly so the result does not depend on the platform's default
            encoding; pass another codec (e.g. ``"latin-1"``) for files that
            are not UTF-8.

    Returns:
        A list with one string per line. Line terminators are kept, exactly as
        ``readlines()`` returns them.
    """
    with open(filename, "r", encoding=encoding) as f:
        return f.readlines()
    
# Read the positive and negative text files and build one labelled record
# ({"target": ..., "text": ...}) per line.
data_dir = "./data/"
pos_data_lines = get_lines(data_dir + "pos.txt")
neg_data_lines = get_lines(data_dir + "neg.txt")
sentences = [{"target": "POSITIVE", "text": txt} for txt in pos_data_lines]
sentences += [{"target": "NEGATIVE", "text": txt} for txt in neg_data_lines]

# Shuffle once, using a dedicated seeded generator so the split is the same on
# every run and the global `random` state is left untouched.
random.Random(42).shuffle(sentences)

# Cut three DISJOINT slices (80% train / 10% validation / 10% test).
# Previously the test set was a prefix of the training set and the validation
# set was drawn from a reshuffle of the same list, so all three overlapped and
# the reported validation/test scores were contaminated by training examples.
n_total = len(sentences)
train_end = round(n_total * 0.8)
val_end = train_end + round(n_total * 0.1)
train_samples = sentences[:train_end]
val_samples = sentences[train_end:val_end]
test_samples = sentences[val_end:]

In [11]:
# Reading in and preprocessing data
# NOTE(review): this cell rebinds `data_dir`, `train_samples`, `val_samples`
# and `test_samples`, replacing the values assigned from the pos/neg files in
# the previous cell.
data_dir = "./results/PubMed_20k_RCT_numbers_replaced_with_at_sign/"
# Full relative path of every entry found in `data_dir` (printed for inspection only).
filenames=[data_dir + filename for filename in os.listdir(data_dir)]
print(filenames)

# NOTE(review): `preprocess_data` is not defined anywhere in the visible
# notebook, so on a fresh kernel these calls raise NameError. Presumably it was
# left over from another notebook/session -- confirm whether this cell should
# be removed or the helper restored.
train_samples=preprocess_data(data_dir+"train.txt")
val_samples=preprocess_data(data_dir+"dev.txt")
test_samples=preprocess_data(data_dir+"test.txt")


['./results/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt', './results/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt', './results/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt']


In [10]:
# Vocabulary cap; not referenced by the other visible cells.
vocab_size = 20000
# Length passed to pad_sequences, i.e. the number of token ids kept per example.
maxlen = 200

# Separate each split's records into parallel lists of raw texts and labels.
x_train = [sample["text"] for sample in train_samples]
y_train = [sample["target"] for sample in train_samples]

x_val = [sample["text"] for sample in val_samples]
y_val = [sample["target"] for sample in val_samples]

x_test = [sample["text"] for sample in test_samples]
y_test = [sample["target"] for sample in test_samples]


In [11]:
# Upper bound on the token ids produced by the tokenizer; the embedding table
# built later is sized with the same value.
num_words = 68000

# Fit the vocabulary on the TRAINING texts only. Fitting on validation/test
# text as well leaks information about the held-out data into the model's
# inputs. Words never seen in training are mapped to a dedicated OOV id
# instead of being silently dropped from the sequence.
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

# Convert each split to integer sequences and pad/truncate them to `maxlen`.
# pad_sequences pads and truncates at the FRONT by default, so texts longer
# than `maxlen` keep their last `maxlen` tokens.
txt_2_sq = tokenizer.texts_to_sequences(x_train)
x_train = keras.preprocessing.sequence.pad_sequences(txt_2_sq, maxlen=maxlen)
txt_2_sq = tokenizer.texts_to_sequences(x_test)
x_test = keras.preprocessing.sequence.pad_sequences(txt_2_sq, maxlen=maxlen)
txt_2_sq = tokenizer.texts_to_sequences(x_val)
x_val = keras.preprocessing.sequence.pad_sequences(txt_2_sq, maxlen=maxlen)

In [12]:
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

# Turn the string labels in the "target" column into one-hot rows, the format
# expected by the categorical_crossentropy loss used when the model is compiled.
#
# The encoder is left at its default (sparse) output and densified with
# .toarray(), because the model is fed dense arrays. This avoids the `sparse=`
# keyword, which scikit-learn 1.2 renamed to `sparse_output=` and later removed.
one_hot_encoder = OneHotEncoder()

# OneHotEncoder expects a 2-D (n_samples, n_features) input, hence reshape(-1, 1).
# Fit on the training labels ONLY, then reuse the fitted encoder for the other
# splits: re-fitting per split can produce a different column order or width
# whenever a split does not contain exactly the same label set.
y_train = one_hot_encoder.fit_transform(
    train_df["target"].to_numpy().reshape(-1, 1)
).toarray()
y_val = one_hot_encoder.transform(
    val_df["target"].to_numpy().reshape(-1, 1)
).toarray()
y_test = one_hot_encoder.transform(
    test_df["target"].to_numpy().reshape(-1, 1)
).toarray()
y_train, y_val, y_test


(array([[1., 0.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [1., 0.],
        [1., 0.]]),
 array([[1., 0.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]]),
 array([[1., 0.],
        [0., 1.],
        [0., 1.],
        ...,
        [1., 0.],
        [0., 1.],
        [0., 1.]]))

In [13]:
embed_dim = 128  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
# One softmax unit per one-hot label column, so the output layer always matches
# the encoded targets instead of relying on a hard-coded class count.
num_classes = y_train.shape[1]

# Token ids -> token+position embeddings -> one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, num_words, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average over the sequence axis, then a small dense
# network with dropout before the softmax output.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

# categorical_crossentropy expects one-hot targets, which is how y_train and
# y_val are encoded.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)

2022-04-02 23:03:35.087642: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/2
   60/20000 [..............................] - ETA: 1:59:14 - loss: 0.7349 - accuracy: 0.5094

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out test split.
print("Evaluate on test data")
results = model.evaluate(x=x_test, y=y_test, batch_size=32)
print("test loss, test acc:", results)

# Run `predict` on the first three test rows; the result is the output of the
# model's final (softmax) layer.
print("Generate predictions for 3 samples")
preview_inputs = x_test[:3]
predictions = model.predict(preview_inputs)
print("predictions shape:", predictions.shape)