In [1]:
import numpy as np
import functools
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import string
from nltk.corpus import stopwords
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.data import load
import re

import tensorflow as tf
from transformer import positional_encoding, EncoderLayer
AUTO = tf.data.experimental.AUTOTUNE

In [2]:
data_dir = "kaggle/input/"
train_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/test.csv')
sample_df = pd.read_csv(data_dir + 'feedback-prize-english-language-learning/sample_submission.csv')
print(train_df.shape, test_df.shape, sample_df.shape)

(3911, 8) (3, 2) (3, 7)


In [3]:
train_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [4]:
size = train_df.shape[0]
train, validate = int(0.8*size), int(0.2*size)
valid_df = train_df.tail(validate).copy()
train_df = train_df.head(train).copy()
print(train_df.shape, valid_df.shape)

(3128, 8) (782, 8)


In [5]:
class Preprocessor():
    def __init__(self, init_vocab = set(), truncate=512):
        self.text_vocab = init_vocab
        self.truncate = truncate
        pass

    def preprocess(self, text) :
        text = text.lower()
        text = re.sub(r'https?://\S+|www\.\S+','', text)
        text = re.sub(r'@[0-9a-zA-Z]*\W+',' ' , text)

        text = re.sub(r'\d+', ' ', text)
        text = re.sub(r'\#', ' ', text)
        text = re.sub(r'\'', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)

        list_text = text.split()
        self.text_vocab.update(list_text)
        text = ' '.join(list_text[:self.truncate])
        return text
    
    def reset_vocab(self):
        self.text_vocab = set()

In [6]:
from vocab.word_freqs import WordFreqDataLoader
wfdl = WordFreqDataLoader('./datasets/unigram_freq.csv')
wfdl.df.columns.to_list()

['word', 'count', 'word_freq', 'log_freq']

In [7]:
wfdl.df.head()

Unnamed: 0,word,count,word_freq,log_freq
0,the,23135851162,0.039338,0.93297
1,of,13151942776,0.022363,1.497784
2,and,12997637966,0.0221,1.509585
3,to,12136980858,0.020637,1.578096
4,a,9081174698,0.015441,1.86815


In [8]:
preprocessor = Preprocessor()

def compute_and_pad(text):
    not_padded = wfdl.get_list_logs(text.split())
    padded = np.pad(not_padded, (0, preprocessor.truncate - len(not_padded)), 'constant', constant_values=0)
    return padded


for dataset in [train_df, valid_df, test_df]:
    dataset['full_text'] = dataset['full_text'].apply(lambda text: preprocessor.preprocess(text))
    dataset['log_freqs'] = dataset['full_text'].apply(lambda text:  compute_and_pad(text)) # np array in each index

text_vocab = preprocessor.text_vocab
all_data = pd.concat((train_df, valid_df, test_df)).reset_index(drop=True)

In [9]:
all_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,log_freqs
0,0016926B079C,i think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,"[2.9474044, 5.5982714, 2.8505685, 5.6600685, 4..."
1,0022683E9EA5,when a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,"[4.504181, 1.8681495, 6.033263, 2.5255692, 1.8..."
2,00299B378633,dear principal if u change the school policy o...,3.0,3.5,3.0,3.0,3.0,2.5,"[7.956408, 7.651186, 3.947731, 5.7912893, 5.63..."
3,003885A45F42,the best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,"[0.9329697, 5.0636106, 4.1700873, 1.9378928, 5..."
4,0049B1DF5CCC,small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,"[5.642784, 6.0328083, 1.4977837, 10.056067, 3...."


In [10]:

class VocabModel(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
               max_text_len, text_vocab, dropout_rate=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.max_text_len = max_text_len
        with tf.device("/cpu:0"):
            self.text_vectorization = tf.keras.layers.TextVectorization(output_mode='int', output_sequence_length=max_text_len)
            self.text_vectorization.adapt(text_vocab)
            self.word_embedding = tf.keras.layers.Embedding(self.text_vectorization.vocabulary_size(), d_model) # replace
        
        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.pos_encoding = tf.Variable(positional_encoding(length=max_text_len, depth=d_model), trainable=False)
        self.pos_scalar = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.inter_dense = tf.keras.layers.Dense(d_model, activation='relu')
        self.dense = tf.keras.layers.Dense(1)
    
    def call(self, inputs):
        """
        - inputs is a tuple of length 2.
        first index is the words (batch_size,)
        second index is the log freqs (batch_size, num_words)
        """
        words = inputs[0] # (batch_size,)
        log_freqs = tf.expand_dims(tf.convert_to_tensor(inputs[1], dtype=tf.float32), -1)
        x = self.word_embedding(self.text_vectorization(words))

        # positional encoding
        x = x * self.pos_scalar
        x = x + self.pos_encoding
        # dropout
        x = self.dropout(x)
        # add encoding layers
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        # should be of shape (batch_size, num_words, embed_size)
        # we want to add on a (batch_size, num_words, 1)
        x = tf.concat((x, log_freqs), axis=2)
        # now have (batch_size, num_words, embed_size + 1)
        x = self.inter_dense(x)
        x = self.pooling(x)
        x = self.layernorm(x)
        return self.dense(x)

In [11]:
## Column-wise RMSE
def MCRMSE(y_true, y_pred):
    mcrmse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(mcrmse), axis=-1, keepdims=True)

In [12]:
def create_model():
    num_layers = 2
    d_model = 64
    dff = 256
    num_heads = 8
    dropout_rate = 0.1
    max_text_len = preprocessor.truncate
    model = VocabModel(num_layers, d_model, num_heads, dff, max_text_len, np.array(list(text_vocab)), dropout_rate)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5, clipnorm=1), loss=MCRMSE, metrics=MCRMSE, run_eagerly=True)
    return model

In [13]:
model = create_model()

2022-12-09 11:43:52.202930: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-09 11:43:52.226449: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-09 11:43:52.226871: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-12-09 11:43:52.227544: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [14]:
model((tf.expand_dims(tf.convert_to_tensor(train_df.iloc[0]['full_text']), 0), tf.expand_dims(tf.convert_to_tensor(train_df.iloc[0]['log_freqs']), 0)))

2022-12-09 11:43:55.647714: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8500


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.1230109]], dtype=float32)>

In [15]:
checkpoint_filepath = 'checkpoints/vocab/run_2/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    verbose=1,
    save_weights_only=True,
    mode='min',
    save_best_only=True)
history = model.fit(
                    (train_df['full_text'], np.array(train_df['log_freqs'].values.tolist())),
                    train_df['vocabulary'],
                    validation_data = ((valid_df['full_text'], np.array(valid_df['log_freqs'].values.tolist())), valid_df['vocabulary']),
                    steps_per_epoch= train_df.shape[0]//8,
                    batch_size = 8,
                    epochs= 25,
                    verbose = 1,
                    shuffle= True, callbacks=[model_checkpoint_callback])

Epoch 1/25
 44/391 [==>...........................] - ETA: 4:02 - loss: 0.8540 - MCRMSE: 0.8540

KeyboardInterrupt: 

In [17]:
history.history

{'loss': [0.605384349822998,
  0.43926531076431274,
  0.43117430806159973,
  0.4231126010417938,
  0.4112090766429901,
  0.39481255412101746,
  0.37830406427383423,
  0.3676895201206207,
  0.36019647121429443,
  0.3509618937969208,
  0.3448069095611572,
  0.3376022279262543,
  0.33415818214416504],
 'MCRMSE': [0.605384349822998,
  0.43926531076431274,
  0.43117430806159973,
  0.4231126010417938,
  0.4112090766429901,
  0.39481255412101746,
  0.37830406427383423,
  0.3676895201206207,
  0.36019647121429443,
  0.3509618937969208,
  0.3448069095611572,
  0.3376022279262543,
  0.33415818214416504],
 'val_loss': [0.42399463057518005,
  0.41263145208358765,
  0.4066115915775299,
  0.39771413803100586,
  0.38454118371009827,
  0.3781641125679016,
  0.36460965871810913,
  0.3625567555427551,
  0.3951273262500763,
  0.3822939991950989,
  0.3597392141819,
  0.38560110330581665,
  0.3674958348274231],
 'val_MCRMSE': [0.4237264394760132,
  0.4124107360839844,
  0.4062783718109131,
  0.397385448217

In [18]:
pd.DataFrame(history.history).to_csv(os.path.join(checkpoint_filepath, 'loss_history.csv'))