In [20]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import string
from nltk.corpus import stopwords
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re

import tensorflow as tf
from transformer import positional_encoding, EncoderLayer
# from tensorflow.keras.callbacks import EarlyStopping
# from transformers import BertTokenizer , TFBertModel
AUTO = tf.data.experimental.AUTOTUNE

In [3]:
data_dir = "kaggle/input/"
bert_dir = "kaggle/input/huggingface-bert-variants/bert-base-uncased/"
train_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/test.csv')
sample_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/sample_submission.csv')
bert_path = bert_dir + 'bert-base-uncased'
print(train_df.shape, test_df.shape, sample_df.shape)

(3911, 8) (3, 2) (3, 7)


In [4]:
size = train_df.shape[0]
train, validate = int(0.8*size), int(0.2*size)
validate_df = train_df.tail(validate).copy()
train_df = train_df.head(train).copy()
print(train_df.shape, validate_df.shape)

(3128, 8) (782, 8)


In [5]:
# Merging Train and Test Data
train_size = train_df.shape[0]
test_size = test_df.shape[0]
all_data = pd.concat((train_df, test_df)).reset_index(drop=True)
all_data.drop(['text_id'], axis=1, inplace=True)
validate_df.drop(['text_id'], axis=1, inplace=True)
print(all_data.shape, validate_df.shape)

(3131, 7) (782, 7)


In [6]:
lemmatizer = WordNetLemmatizer()

def preprocess(text) :
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+','', text)
    text = re.sub(r'@[0-9a-zA-Z]*\W+',' ' , text)

    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\#', ' ', text)
    text = re.sub(r'\'', ' ', text)

    list_text = text.split()
    text = ' '.join(list_text[:512])
    return text

In [18]:
all_data['full_text'] = all_data['full_text'].apply(lambda text : preprocess(text))
validate_df['full_text'] = validate_df['full_text'].apply(lambda text : preprocess(text))
all_data['pos_tag'] = all_data['full_text'].apply(lambda text: pos_tag(word_tokenize(text)))
validate_df['pos_tag'] = validate_df['full_text'].apply(lambda text: pos_tag(word_tokenize(text)))
# there are 36 possible pos_tags, we will later encode these in one-hot representation
all_data['pos_tag'] = all_data['pos_tag'].apply(lambda elem: elem[1])
validate_df['pos_tag'] = validate_df['pos_tag'].apply(lambda elem: elem[1])

In [8]:
train_data = all_data[:train_size].copy()
test_data = all_data[train_size:].copy()

print(train_data.shape, test_data.shape)

(3128, 7) (3, 7)


In [16]:
# tag parts of speech, add as feature
train_data['pos_tag']

0       [(i, NN), (think, VBP), (that, IN), (students,...
1       [(when, WRB), (a, DT), (problem, NN), (is, VBZ...
2       [(dear, NN), (,, ,), (principal, JJ), (if, IN)...
3       [(the, DT), (best, JJS), (time, NN), (in, IN),...
4       [(small, JJ), (act, NN), (of, IN), (kindness, ...
                              ...                        
3123    [(author, NN), (ralph, NN), (waldo, NN), (emer...
3124    [(we, PRP), (humans, NNS), (aren, VBP), (t, JJ...
3125    [(``, ``), (your, PRP$), (character, NN), (wil...
3126    [(the, DT), (school, NN), (board, NN), (plans,...
3127    [(is, VBZ), (good, JJ), (idea, NN), (for, IN),...
Name: pos_tag, Length: 3128, dtype: object

In [19]:
# "(E)lement-(Wi)se (Dense)" Layer for combining two embeddings (or other multi-feature time sequence data) with a Dense layer applied element-wise (so not exactly Dense, as in the output embedding the first position is only determined by a linear combination of the two embedding values in corresponding positions in the two input embeddings
class EWiDense(tf.keras.layers.Layer):
    def __init__(self, embedding_size, activation=None):
        super(EWiDense, self).__init__()
        w_init = tf.random_normal_initializer()
        self.w1 = tf.Variable(
            initial_value=w_init(shape=(1, 1, embedding_size), dtype="float32"),
            trainable=True,
        )
        self.w2 = tf.Variable(
            initial_value=w_init(shape=(1, 1, embedding_size), dtype="float32"),
            trainable=True,
        )
        b_init = tf.zeros_initializer()
        self.b1 = tf.Variable(
            initial_value=b_init(shape=(1, 1, embedding_size), dtype="float32"), trainable=True
        )
        self.b2 = tf.Variable(
            initial_value=b_init(shape=(1, 1, embedding_size), dtype="float32"), trainable=True
        )
        self.activation = activation

    def call(self, data):  # expected x of two embeddings of shape batch_size, seq_len, embedding_size
        emb1, emb2 = data
        emb1 = tf.multiply(emb1, self.w1) + self.b1
        emb2 = tf.multiply(emb2, self.w2) + self.b2
        out = emb1 + emb2
        if self.activation:
            self.activation(out)
        return out



In [None]:
# compare standard positional encoding with grammar + positional encodinng
# use encoder networks, but not the decoders because we don't have an output sequence really

class GrammarModel(tf.keras.Model):
    def __init__(self, embedding_size, num_layers, d_model, num_heads, dff,
               input_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.word_embedding = tf.keras.layers.Embedding(input_vocab_size, d_model) # replace with bert in the future for testing
        self.pos_embedding = tf.keras.layers.Embedding(36, d_model)
        self.EWiDenseLayer = EWiDense(embedding_size, activation=tf.keras.layers.LeakyReLU())
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)
        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.output = tf.keras.layers.Dense()


    def call(self, inputs):
        words, pos = inputs
        length = tf.shape(words)[1] # seq_len
        # combine embeddings
        x = self.EwiDenseLayer((self.word_embedding(words), self.pos_embedding(pos)))
        # add positional encoding
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        # dropout
        x = self.dropout(x)
        # add encoding layers
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        x = self.pooling(x)
        x = self.layernorm(x)
        return self.output(x)