In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import string
from nltk.corpus import stopwords
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.data import load
import re

import tensorflow as tf
from transformer import positional_encoding, EncoderLayer
# from tensorflow.keras.callbacks import EarlyStopping
# from transformers import BertTokenizer , TFBertModel
AUTO = tf.data.experimental.AUTOTUNE

In [2]:
data_dir = "kaggle/input/"
bert_dir = "kaggle/input/huggingface-bert-variants/bert-base-uncased/"
train_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/test.csv')
sample_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/sample_submission.csv')
bert_path = bert_dir + 'bert-base-uncased'
print(train_df.shape, test_df.shape, sample_df.shape)

(3911, 8) (3, 2) (3, 7)


In [3]:
size = train_df.shape[0]
train, validate = int(0.8*size), int(0.2*size)
valid_df = train_df.tail(validate).copy()
train_df = train_df.head(train).copy()
print(train_df.shape, valid_df.shape)

(3128, 8) (782, 8)


In [4]:
# Merging Train and Test Data
train_size = train_df.shape[0]
test_size = test_df.shape[0]
print(train_df.shape, test_df.shape, valid_df.shape)

(3128, 8) (3, 2) (782, 8)


In [5]:
lemmatizer = WordNetLemmatizer()

def preprocess(text) :
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+','', text)
    text = re.sub(r'@[0-9a-zA-Z]*\W+',' ' , text)

    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\#', ' ', text)
    text = re.sub(r'\'', ' ', text)

    list_text = text.split()
    text = ' '.join(list_text)
    return text

In [6]:
text_vocab = set()
pos_vocab = list(load('help/tagsets/upenn_tagset.pickle').keys())
for dataset in [train_df, valid_df, test_df]:
    dataset.drop(['text_id'], axis=1, inplace=True)
    dataset['full_text'] = dataset['full_text'].apply(lambda text : preprocess(text))
    dataset['pos_tag'] = dataset['full_text'].apply(lambda text: pos_tag(word_tokenize(text)))
    # there are 36 possible pos_tags
    dataset['pos'] = dataset['pos_tag'].apply(lambda text: ' '.join([elem[1] for elem in text]))
    dataset['tokens'] = dataset['pos_tag'].apply(lambda text: [elem[0] for elem in text])
    for tokens in dataset['tokens']:
        text_vocab.update(tokens)
    dataset['tokens'] = dataset['tokens'].apply(lambda text: ' '.join(text))
    dataset.drop(['full_text'], axis=1, inplace=True)
    dataset.drop(['pos_tag'], axis=1, inplace=True)
all_data = pd.concat((train_df, valid_df, test_df)).reset_index(drop=True)
max_text_len = max(len(elem) for elem in all_data['tokens'])

In [7]:
# tag parts of speech, add as feature
train_df.head()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,pos,tokens
0,3.5,3.5,3.0,3.0,4.0,3.0,"NN VBP IN NNS MD VB IN VBG IN NN , IN PRP VBP ...",i think that students would benefit from learn...
1,2.5,2.5,3.0,2.0,2.0,2.5,WRB DT NN VBZ DT NN PRP VBP TO VB PRP VB DT JJ...,when a problem is a change you have to let it ...
2,3.0,3.5,3.0,3.0,3.0,2.5,"NN , JJ IN JJ VBP DT NN NN IN VBG DT NN NN NN ...","dear , principal if u change the school policy..."
3,4.5,4.5,4.5,4.5,4.0,5.0,DT JJS NN IN NN VBZ WRB PRP VBP PRP . VB VBP I...,the best time in life is when you become yours...
4,2.5,3.0,3.0,3.0,2.5,2.5,JJ NN IN NN MD VB IN JJ NNS MD VB NNS TO VB JJ...,small act of kindness can impact in other peop...


In [207]:
# "(E)lement-(Wi)se (Dense)" Layer for combining two embeddings (or other multi-feature time sequence data) with a Dense layer applied element-wise (so not exactly Dense, as in the output embedding the first position is only determined by a linear combination of the two embedding values in corresponding positions in the two input embeddings)
# (We picked this name because it was funny)
class EWiDense(tf.keras.layers.Layer):
    def __init__(self, activation=None, **kwargs):
        super(EWiDense, self).__init__(**kwargs)
        self.activation = activation

    def build(self, input_shape):
        self.embedding_size = input_shape[0][-1]
        self.tile_shape = input_shape[0][1:-1]
        self.tile_shape = tf.concat([self.tile_shape, tf.convert_to_tensor([1])], 0)
        # print(self.tile_shape)
        self.w1 = self.add_weight(
            shape=(1, self.embedding_size),
            initializer="random_normal",
            trainable=True
        )
        # print(tf.shape(tf.tile(self.w1, self.tile_shape)))
        self.w2 = self.add_weight(
            shape=(1, self.embedding_size),
            initializer="random_normal",
            trainable=True
        )
        self.b1 = self.add_weight(
            shape=(1, self.embedding_size),
            initializer="zeros",
            trainable=True
        )

    def call(self, data):  # expected x of two embeddings of shape batch_size, seq_len, embedding_size
        if self.activation:
            return self.activation(tf.multiply(data[0], tf.tile(self.w1, self.tile_shape)) + tf.multiply(data[1], tf.tile(self.w2, self.tile_shape)) + tf.tile(self.b1, self.tile_shape))
        return tf.multiply(data[0], tf.tile(self.w1, self.tile_shape)) + tf.multiply(data[1], tf.tile(self.w2, self.tile_shape)) + tf.tile(self.b1, self.tile_shape)

In [208]:
# compare standard positional encoding with grammar + positional encodinng
# use encoder networks, but not the decoders because we don't have an output sequence really

class GrammarModel(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
               max_text_len, text_vocab, pos_vocab, dropout_rate=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model

        self.text_vectorization = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=max_text_len)
        self.text_vectorization.adapt(text_vocab)
        self.pos_vectorization = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=max_text_len)
        self.pos_vectorization.adapt(pos_vocab)
        self.word_embedding = tf.keras.layers.Embedding(self.text_vectorization.vocabulary_size(), d_model) # replace
        self.pos_embedding = tf.keras.layers.Embedding(self.pos_vectorization.vocabulary_size(), d_model)
        self.EWiDenseLayer = EWiDense(activation=tf.keras.layers.LeakyReLU())
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)
        # self.enc_layers = [
        #     EncoderLayer(d_model=d_model,
        #                  num_heads=num_heads,
        #                  dff=dff,
        #                  dropout_rate=dropout_rate)
        #     for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        words = inputs[:, 0]
        pos = inputs[:, 1]
        length = tf.shape(words)[0] # seq_len
        # combine embeddings
        words = self.text_vectorization(words)
        pos = self.pos_vectorization(pos)
        x = self.EWiDenseLayer((self.word_embedding(words), self.pos_embedding(pos)))
        # add positional encoding
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        # dropout
        x = self.dropout(x)
        # add encoding layers
        # for i in range(self.num_layers):
        #     x = self.enc_layers[i](x)
        x = self.pooling(x)
        x = self.layernorm(x)
        return self.dense(x)

In [209]:
## Column-wise RMSE
def MCRMSE(y_true, y_pred):
    mcrmse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(mcrmse), axis=-1, keepdims=True)

In [210]:
def create_model():
    num_layers = 4
    d_model = 128
    dff = 512
    num_heads = 8
    dropout_rate = 0.1
    model = GrammarModel(num_layers, d_model, num_heads, dff, max_text_len, np.array(list(text_vocab)), np.array(list(pos_vocab)), dropout_rate)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss=MCRMSE, metrics=MCRMSE)
    return model

In [211]:
model = create_model()
# model.summary()
batch_size = 5

In [198]:
model(tf.expand_dims(tf.convert_to_tensor(train_df.iloc[0][['tokens', 'pos']]), 0))
model.summary()

Model: "grammar_model_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_58 (Text  multiple                 0         
 Vectorization)                                                  
                                                                 
 text_vectorization_59 (Text  multiple                 0         
 Vectorization)                                                  
                                                                 
 embedding_55 (Embedding)    multiple                  2758144   
                                                                 
 embedding_56 (Embedding)    multiple                  4608      
                                                                 
 e_wi_dense_28 (EWiDense)    multiple                  384       
                                                                 
 dropout_50 (Dropout)        multiple             

In [199]:
tf.debugging.disable_traceback_filtering()

In [212]:
history = model.fit(
                    train_df[['tokens', 'pos']],
                    train_df['grammar'],
                    validation_data = (valid_df[['tokens', 'pos']], valid_df['grammar']),
                    steps_per_epoch= train_df.shape[0]//batch_size,
                    batch_size = batch_size,
                    epochs= 100,
                    verbose = 1,
                    shuffle= True)

Epoch 1/100
tf.Tensor([6192    1], shape=(2,), dtype=int32)
tf.Tensor([6192  128], shape=(2,), dtype=int32)


InvalidArgumentError: Graph execution error:

Detected at node 'gradient_tape/grammar_model_31/add/BroadcastGradientArgs' defined at (most recent call last):
    File "C:\Python310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Python310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Python310\lib\asyncio\base_events.py", line 600, in run_forever
      self._run_once()
    File "C:\Python310\lib\asyncio\base_events.py", line 1896, in _run_once
      handle._run()
    File "C:\Python310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2914, in run_cell
      result = self._run_cell(
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\patri\AppData\Local\Temp/ipykernel_21604/1202085082.py", line 1, in <module>
      history = model.fit(
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\utils\traceback_utils.py", line 61, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\engine\training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\engine\training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\engine\training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\engine\training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\engine\training.py", line 1027, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 526, in minimize
      grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "C:\Users\patri\OneDrive\Documents\Study\CSCI 1470\N-ELL-P\venv\lib\site-packages\keras\optimizers\optimizer_experimental\optimizer.py", line 259, in compute_gradients
      grads = tape.gradient(loss, var_list)
Node: 'gradient_tape/grammar_model_31/add/BroadcastGradientArgs'
Incompatible shapes: [5,6192,128] vs. [1,5,128]
	 [[{{node gradient_tape/grammar_model_31/add/BroadcastGradientArgs}}]] [Op:__inference_train_function_93617]

In [None]:
train_df[['tokens', 'pos']]