In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import string
from nltk.corpus import stopwords
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.data import load
import re

import tensorflow as tf
from transformer import positional_encoding, EncoderLayer
# from tensorflow.keras.callbacks import EarlyStopping
# from transformers import BertTokenizer , TFBertModel
AUTO = tf.data.experimental.AUTOTUNE

In [2]:
data_dir = "kaggle/input/"
bert_dir = "kaggle/input/huggingface-bert-variants/bert-base-uncased/"
train_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/test.csv')
sample_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/sample_submission.csv')
bert_path = bert_dir + 'bert-base-uncased'
print(train_df.shape, test_df.shape, sample_df.shape)

(3911, 8) (3, 2) (3, 7)


In [3]:
size = train_df.shape[0]
train, validate = int(0.8*size), int(0.2*size)
valid_df = train_df.tail(validate).copy()
train_df = train_df.head(train).copy()
print(train_df.shape, valid_df.shape)

(3128, 8) (782, 8)


In [4]:
# Merging Train and Test Data
train_size = train_df.shape[0]
test_size = test_df.shape[0]
print(train_df.shape, test_df.shape, valid_df.shape)

(3128, 8) (3, 2) (782, 8)


In [5]:
lemmatizer = WordNetLemmatizer()

def preprocess(text) :
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+','', text)
    text = re.sub(r'@[0-9a-zA-Z]*\W+',' ' , text)

    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\#', ' ', text)
    text = re.sub(r'\'', ' ', text)

    list_text = text.split()
    text = ' '.join(list_text)
    return text

In [6]:
text_vocab = set()
pos_vocab = list(load('help/tagsets/upenn_tagset.pickle').keys())
max_text_len = 0
truncate_to = 512
for dataset in [train_df, valid_df, test_df]:
    dataset.drop(['text_id'], axis=1, inplace=True)
    dataset['full_text'] = dataset['full_text'].apply(lambda text : preprocess(text))
    dataset['pos_tag'] = dataset['full_text'].apply(lambda text: pos_tag(word_tokenize(text)))
    # there are 36 possible pos_tags
    dataset['pos'] = dataset['pos_tag'].apply(lambda text: ' '.join([elem[1] for elem in text[:truncate_to]]))
    dataset['tokens'] = dataset['pos_tag'].apply(lambda text: [elem[0] for elem in text[:truncate_to]])
    for tokens in dataset['tokens']:
        text_vocab.update(tokens)
        max_text_len = max(max_text_len, len(tokens))
    dataset['tokens'] = dataset['tokens'].apply(lambda text: ' '.join(text))
    dataset.drop(['full_text'], axis=1, inplace=True)
    dataset.drop(['pos_tag'], axis=1, inplace=True)
all_data = pd.concat((train_df, valid_df, test_df)).reset_index(drop=True)

In [7]:
max_text_len

512

In [8]:
# tag parts of speech, add as feature
train_df.head()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,pos,tokens
0,3.5,3.5,3.0,3.0,4.0,3.0,"NN VBP IN NNS MD VB IN VBG IN NN , IN PRP VBP ...",i think that students would benefit from learn...
1,2.5,2.5,3.0,2.0,2.0,2.5,WRB DT NN VBZ DT NN PRP VBP TO VB PRP VB DT JJ...,when a problem is a change you have to let it ...
2,3.0,3.5,3.0,3.0,3.0,2.5,"NN , JJ IN JJ VBP DT NN NN IN VBG DT NN NN NN ...","dear , principal if u change the school policy..."
3,4.5,4.5,4.5,4.5,4.0,5.0,DT JJS NN IN NN VBZ WRB PRP VBP PRP . VB VBP I...,the best time in life is when you become yours...
4,2.5,3.0,3.0,3.0,2.5,2.5,JJ NN IN NN MD VB IN JJ NNS MD VB NNS TO VB JJ...,small act of kindness can impact in other peop...


In [9]:
# "(E)lement-(Wi)se (Dense)" Layer for combining two embeddings (or other multi-feature time sequence data) with a Dense layer applied element-wise (so not exactly Dense, as in the output embedding the first position is only determined by a linear combination of the two embedding values in corresponding positions in the two input embeddings)
# (We picked this name because it was funny)
class EWiDense(tf.keras.layers.Layer):
    def __init__(self, activation=None, **kwargs):
        super(EWiDense, self).__init__(**kwargs)
        self.activation = activation

    def build(self, input_shape):
        self.embedding_size = input_shape[0][-1]
        # print(self.tile_shape)
        self.w1 = self.add_weight(
            shape=[self.embedding_size],
            initializer="ones",
            trainable=True
        )
        # print(tf.shape(self.w1))
        self.w2 = self.add_weight(
            shape=[self.embedding_size],
            initializer="ones",
            trainable=True
        )
        self.b1 = self.add_weight(
            shape=[self.embedding_size],
            initializer="zeros",
            trainable=True
        )

    def call(self, data):  # expected x of two embeddings of shape batch_size, seq_len, embedding_size
        if self.activation:
            return self.activation(tf.multiply(data[0], self.w1) + tf.multiply(data[1], self.w2) + self.b1)
        return tf.multiply(data[0], self.w1) + tf.multiply(data[1], self.w2) + self.b1

In [10]:
# compare standard positional encoding with grammar + positional encodinng
# use encoder networks, but not the decoders because we don't have an output sequence really

class GrammarModel(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
               max_text_len, text_vocab, pos_vocab, dropout_rate=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        with tf.device("/cpu:0"):
            self.text_vectorization = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=max_text_len)
            self.text_vectorization.adapt(text_vocab)
            self.pos_vectorization = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=max_text_len)
            self.pos_vectorization.adapt(pos_vocab)
            self.word_embedding = tf.keras.layers.Embedding(self.text_vectorization.vocabulary_size(), d_model) # replace
            self.pos_embedding = tf.keras.layers.Embedding(self.pos_vectorization.vocabulary_size(), d_model)
        self.EWiDenseLayer = EWiDense(activation=tf.keras.layers.LeakyReLU())
        self.pos_encoding = tf.Variable(positional_encoding(length=max_text_len, depth=d_model), trainable=False)
        self.pos_scalar = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        words = inputs[:, 0]
        pos = inputs[:, 1]
        # combine embeddings
        words = self.text_vectorization(words)
        pos = self.pos_vectorization(pos)
        x = self.EWiDenseLayer((self.word_embedding(words), self.pos_embedding(pos)))
        # add positional encoding
        x = x * self.pos_scalar
        x = x + self.pos_encoding
        # dropout
        x = self.dropout(x)
        # add encoding layers
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        x = self.pooling(x)
        x = self.layernorm(x)
        return self.dense(x)

In [11]:
## Column-wise RMSE
def MCRMSE(y_true, y_pred):
    mcrmse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(mcrmse), axis=-1, keepdims=True)

In [12]:
def create_model():
    num_layers = 6
    d_model = 64
    dff = 256
    num_heads = 8
    dropout_rate = 0.2
    model = GrammarModel(num_layers, d_model, num_heads, dff, max_text_len, np.array(list(text_vocab)), np.array(list(pos_vocab)), dropout_rate)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5, clipnorm=1), loss=MCRMSE, metrics=MCRMSE, run_eagerly=True)
    return model

In [13]:
model = create_model()
# model.summary()

In [15]:
len(train_df.iloc[0][['tokens', 'pos']][0].split())

283

In [16]:
model.text_vectorization.vocabulary_size()

20133

In [17]:
max_text_len

512

In [94]:
model(tf.expand_dims(tf.convert_to_tensor(train_df.iloc[0][['tokens', 'pos']]), 0))
model.summary()

Model: "grammar_model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_16 (Text  multiple                 0         
 Vectorization)                                                  
                                                                 
 text_vectorization_17 (Text  multiple                 0         
 Vectorization)                                                  
                                                                 
 embedding_16 (Embedding)    multiple                  1307392   
                                                                 
 embedding_17 (Embedding)    multiple                  2304      
                                                                 
 e_wi_dense_8 (EWiDense)     multiple                  192       
                                                                 
 encoder_layer_28 (EncoderLa  multiple             

In [86]:
# tf.debugging.disable_traceback_filtering()

In [87]:
# import tensorflow as tf
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
checkpoint_filepath = 'tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    verbose=1,
    save_weights_only=False,
    mode='min',
    save_best_only=True)
history = model.fit(
                    train_df[['tokens', 'pos']],
                    train_df['grammar'],
                    validation_data = (valid_df[['tokens', 'pos']], valid_df['grammar']),
                    steps_per_epoch= train_df.shape[0]//4,
                    batch_size = 4,
                    epochs= 100,
                    verbose = 1,
                    shuffle= True,
                    callbacks=[model_checkpoint_callback])

Epoch 1/100
 73/782 [=>............................] - ETA: 2:13 - loss: 1.2985 - MCRMSE: 1.2985

In [96]:
history

<keras.callbacks.History at 0x1d5a544d870>

In [97]:
history.history

{'loss': [0.6319288015365601,
  0.5693314671516418,
  0.5644047260284424,
  0.563819169998169,
  0.5583122372627258,
  0.5539228916168213,
  0.5575403571128845,
  0.5539509654045105,
  0.5454583168029785,
  0.5210098028182983,
  0.5034996271133423,
  0.4937760829925537,
  0.4846763014793396,
  0.48725461959838867,
  0.47956550121307373,
  0.47537827491760254,
  0.4688221216201782,
  0.4692607522010803,
  0.46288731694221497,
  0.45620232820510864,
  0.457209974527359,
  0.4546971917152405,
  0.45028936862945557,
  0.4479743540287018,
  0.44166314601898193,
  0.4422048330307007,
  0.4352353811264038,
  0.4331895411014557,
  0.42566388845443726,
  0.42861512303352356,
  0.4187238812446594,
  0.42049524188041687,
  0.41448888182640076,
  0.4155365526676178,
  0.4093209505081177,
  0.40732190012931824,
  0.4102404713630676,
  0.3964894413948059,
  0.396979957818985,
  0.3952256143093109,
  0.3929370641708374,
  0.3880009055137634,
  0.38459107279777527,
  0.37893083691596985,
  0.382968604

In [99]:
model(tf.expand_dims(tf.convert_to_tensor(train_df.iloc[0][['tokens', 'pos']]), 0))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[nan]], dtype=float32)>