### Learning Agency Lab - Automated Essay Scoring 2.0

In this notebook, I demonstrate a baseline model for automated essay scoring using TensorFlow.
It builds on my previous work on detecting whether an essay was written by an AI or a human. [Notebook](https://www.kaggle.com/code/umar47/notebooked8b6a149c)

1- Importing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import re
import string
import tensorflow_text as tf_text
from collections import Counter
from sklearn.metrics import cohen_kappa_score
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Fixed seed for TensorFlow's global random generator.
RANDOM_SEED = 3
tf.random.set_seed(RANDOM_SEED)

In [None]:
class Config:
    """Notebook-wide settings: competition file paths and model/training hyper-parameters."""

    # --- Input data (Kaggle competition files) ---
    train = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
    test = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"

    # --- Text vectorization ---
    max_features = 75_000    # vocabulary cap; also the embedding table size
    sequence_length = 512    # token ids kept per essay
    ngram = (1, 4)           # author's note: (2, 4) was also tried, annotated "-0.71"

    # --- Network / training ---
    embedding_dim = 128
    num_classes = 6
    batch_size = 128
    epochs = 15

* Reading train and test data

In [None]:
# Load the competition CSVs from the paths configured in Config.
train = pd.read_csv(Config.train)
test = pd.read_csv(Config.test)

* first look at the data

In [None]:
# Preview the first rows only instead of rendering the whole frame.
train.head()

In [None]:
# Shift the labels down by one so they start at 0 (needed by the sparse
# categorical cross-entropy loss used for training; the submission cell adds
# the 1 back). Guarded so that re-running this cell does not shift the labels
# a second time.
if train['score'].min() >= 1:
    train['score'] = train['score'] - 1

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Preview the frame again now that the score column has been shifted.
train.head()

#### Simple EDA

* Distribution of scores: there seem to be very few examples of the two highest scores (5 and 6, i.e. labels 4 and 5 after the shift to 0-based labels above).

In [None]:
# Share of essays per score label. `normalize=True` makes the values match the
# 'Percentage' axis label, and `sort_index()` orders the bars by score label
# rather than by frequency.
score_share = train['score'].value_counts(normalize=True).sort_index() * 100
score_share.plot(kind = 'bar', color = ['steelblue', 'orange'])
plt.xlabel('Score label')
plt.ylabel('Percentage');

* Distribution of length of essays

In [None]:
# Essay length measured in whitespace-separated tokens.
# (matplotlib is already imported in the import cell at the top of the notebook.)
sequence_lengths = train['full_text'].str.split().str.len()

fig, ax = plt.subplots()
ax.hist(sequence_lengths, bins=30, alpha=0.75, color='blue')
ax.set_title('Distribution of Text Sequence Lengths for Essays')
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Frequency')
plt.show()

* Distribution of essay lengths for each score individually.
The distribution changes noticeably between scores, which suggests that essay length is an important signal here.

In [None]:
# One histogram of essay length (in whitespace-separated tokens) per score label.
# Sorting puts the panels in label order instead of order of first appearance.
scores = np.sort(train['score'].unique())

# 2 x 3 grid of panels, flattened for easy iteration
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# zip() stops at the shorter sequence, so an unexpected extra label cannot
# index past the last panel.
for ax, score in zip(axes, scores):
    # Single .loc call (row mask + column) instead of chained indexing
    sequence_lengths = train.loc[train['score'] == score, 'full_text'].str.split().str.len()

    ax.hist(sequence_lengths, bins=30, alpha=0.75, color='blue')
    ax.set_title(f'Distribution of Text Sequence Lengths for score {score}')
    ax.set_xlabel('Sequence Length')
    ax.set_ylabel('Frequency')

# Hide any remaining empty subplots
for ax in axes[len(scores):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Plain NumPy views of the two columns used by the word-frequency plot.
essays = train['full_text'].to_numpy()
scores = train['score'].to_numpy()  # score label of the essay at the same position
# Built once at definition time: re-creating the stop-word set on every call
# is loop-invariant work when the function is applied to thousands of essays.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Reduce an essay to its lower-cased, non-stop-word alphabetic tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the
    result is lower-cased and split on whitespace, and NLTK's English stop
    words are dropped.

    Args:
        text: raw essay text.

    Returns:
        The remaining words joined by single spaces (empty string if none remain).
    """
    letters_only = re.sub("[^a-zA-Z]", ' ', text)
    kept_words = [w for w in letters_only.lower().split() if w not in _ENGLISH_STOPWORDS]
    return " ".join(kept_words)

# Bar charts of the five most frequent non-stop-words, one panel per score label
def plot_top_words_per_score(essays, scores):
    """Draw a 2 x 3 grid of bar charts with the top-5 words for score labels 0..5.

    Args:
        essays: sequence of raw essay texts.
        scores: sequence of score labels, aligned with ``essays``.
    """
    fig, axs = plt.subplots(2, 3, figsize=(15, 10))
    fig.tight_layout(pad=5.0)

    # axs.flat walks the grid row by row, i.e. panel k sits at [k // 3, k % 3].
    for panel, score in zip(axs.flat, range(0, 6)):
        # Count words essay by essay rather than materialising one big word list.
        word_counts = Counter()
        for essay, essay_score in zip(essays, scores):
            if essay_score == score:
                word_counts.update(preprocess_text(essay).split())

        top_words = word_counts.most_common(5)
        labels = [word for word, _ in top_words]
        frequencies = [count for _, count in top_words]

        panel.bar(labels, frequencies)
        panel.set_title(f"Score {score}")
        panel.set_xlabel("Word")
        panel.set_ylabel("Frequency")

    plt.show()
plot_top_words_per_score(essays, scores)

#### Dataset Prep.

* Let's start preparing the dataset for training.

In [None]:
# Split the labelled data into training, validation, and held-out test sets.
# Stratifying on the score keeps the class proportions (including the rare
# score labels) the same in every split; an unstratified split of an
# imbalanced target can leave a split with very few examples of a label.
train_df, test_df = train_test_split(
    train, test_size=0.3, random_state=42, stratify=train['score']
)
train_df, val_df = train_test_split(
    train_df, test_size=0.3, random_state=42, stratify=train_df['score']
)

batch_size = Config.batch_size


def _to_batched_dataset(df):
    """Wrap a frame's (full_text, score) columns in a batched tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices(
        (df['full_text'].values, df['score'].values)
    ).batch(batch_size)


raw_train_ds = _to_batched_dataset(train_df)
raw_val_ds = _to_batched_dataset(val_df)
raw_test_ds = _to_batched_dataset(test_df)

In [None]:
def tf_lower_and_split_punct(text):
    """Standardize raw essay text before tokenization.

    Lower-cases the text, removes HTML tags, URLs, ``@``-prefixed words and
    digits, collapses repeated periods/commas and whitespace, and wraps the
    result in ``[START]`` / ``[END]`` marker tokens.

    Note: despite the name, punctuation is not split off into separate tokens.

    Args:
        text: string tensor of raw essay text.

    Returns:
        String tensor with the cleaned text.
    """
    # All patterns are raw strings: sequences such as '\d' or '\w' inside a
    # normal string literal are invalid Python escapes.

    # Convert words to lowercase
    text = tf.strings.lower(text)

    # Remove HTML tags
    text = tf.strings.regex_replace(text, r'<.*?>', '')

    # Delete URLs. This runs before the '@' and digit removal so the whole URL
    # is still intact, and uses \S+ so that '://host/path' is removed as well
    # (\w+ stops at the first ':').
    text = tf.strings.regex_replace(text, r'http\S+', '')

    # Delete strings starting with @
    text = tf.strings.regex_replace(text, r'@\w+', '')

    # Delete numbers (including an apostrophe directly in front of them)
    text = tf.strings.regex_replace(text, r"'\d+", '')
    text = tf.strings.regex_replace(text, r'\d+', '')

    # Replace consecutive commas and periods with one comma and period character
    text = tf.strings.regex_replace(text, r'\.+', '.')
    text = tf.strings.regex_replace(text, r'\,+', ',')

    # Replace consecutive whitespace with a single space. Done after the
    # removals above so the gaps they leave behind are collapsed too.
    text = tf.strings.regex_replace(text, r'\s+', ' ')

    # Strip whitespace.
    text = tf.strings.strip(text)

    # Join with '[START]' and '[END]'
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')

    return text


# Text vectorization layer: custom standardization, n-gram tokens, and
# fixed-length integer id sequences.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=Config.max_features,
    standardize=tf_lower_and_split_punct,
    ngrams=Config.ngram,
    output_mode="int",
    output_sequence_length=Config.sequence_length,
    pad_to_max_tokens=True,
)

# Build the vocabulary from the training essays only (labels are dropped).
text_ds = raw_train_ds.map(lambda essay, _score: essay)
vectorize_layer.adapt(text_ds)

# Vectorize the data
def vectorize_text(text, label):
    """Turn raw text into token ids with `vectorize_layer`; the label passes through.

    Args:
        text: string tensor of essay text.
        label: the matching label tensor, returned unchanged.

    Returns:
        Tuple ``(token_ids, label)``.
    """
    # A trailing axis is appended to the text before it goes through the layer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Apply the vectorizer to every batch of each split.
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
# Sanity check: print the shape of the first vectorized training batch and its labels.
for data, labels in train_ds.take(1):
    print("Data shape:", data.shape)
    print("Labels:", labels)

In [None]:
# Show one raw essay from the first training batch next to its vectorized form.
sample_position = 5
text_batch, label_batch = next(iter(raw_train_ds))
first_text = text_batch[sample_position]
first_label = label_batch[sample_position]
print("text: ", first_text)
print("Vectorized text: ", vectorize_text(first_text, first_label))

In [None]:
# Peek at ten entries of the learned vocabulary (positions 3000-3009).
vectorize_layer.get_vocabulary()[3000:3010]

In [None]:
# Grab the first raw training batch (essays and labels) for the demo cells below.
# The previous loop over `take(2024)` always stopped after its first iteration,
# so fetching a single batch directly is equivalent and says what it does.
example_context_strings, example_target_strings = next(iter(raw_train_ds))

In [None]:
# Vectorize the example batch and display the token ids of its first three essays.
example_tokens = vectorize_layer(example_context_strings)
example_tokens[:3, :]

In [None]:
# Padded sequences visualisation: token ids (left) and which positions are non-zero (right)
fig, (ids_ax, mask_ax) = plt.subplots(1, 2)

ids_ax.pcolormesh(example_tokens)
ids_ax.set_title('Token IDs')

mask_ax.pcolormesh(example_tokens != 0)
mask_ax.set_title('Mask')

#### Modelling

In [None]:
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras import Model, Input

* A simple transformer block

In [None]:
# great work: https://www.kaggle.com/code/ichigoe/acc0-921-bidirectionlstm-transformer-cnn-approach
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a feed-forward network, each with dropout,
    a residual connection and layer normalization.

    The last dimension of the input must equal ``embed_dim``: the feed-forward
    network projects back to ``embed_dim`` and its output is added to the
    attention branch.

    Args:
        embed_dim: feature size of the input; also used as the per-head
            ``key_dim`` of the attention layer.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied after the attention and feed-forward parts.
        **kwargs: standard ``tf.keras.layers.Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)  # Adjust output dimension to embed_dim
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block.

        Args:
            inputs: tensor whose last dimension is ``embed_dim``.
            training: forwarded to the dropout layers. Defaults to ``None`` so
                that Keras can supply the current phase when the caller does
                not pass one.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att(inputs, inputs)  # self-attention: query == value
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # Use out1 instead of inputs
        return out2

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

* The model will need a lot of tuning.

In [None]:
# Functional model: embedding -> BiLSTM -> transformer block -> strided Conv1D
# stack -> transformer block -> two BiLSTMs -> max-pool over time -> classifier.
#
# The transformer blocks are called with `training=None` rather than
# `training=True`. An explicit True is baked into the functional graph, which
# keeps the blocks' dropout active during `evaluate()` / `predict()` as well.
# With None, Keras supplies the real phase at run time.
inputs = tf.keras.Input(shape=(Config.sequence_length,), dtype="int64")
x = tf.keras.layers.Embedding(Config.max_features, Config.embedding_dim)(inputs)
x = tf.keras.layers.SpatialDropout1D(0.5)(x)
# 64 units per direction -> 128 features, matching the transformer block's embed_dim
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
transformer_block = TransformerBlock(Config.embedding_dim, 6, 32)
x = transformer_block(x, training=None)
# Four identical strided convolutions shorten the sequence step by step.
for _ in range(4):
    x = tf.keras.layers.Conv1D(128, 4,  activation="relu", strides=3)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
transformer_block = TransformerBlock(Config.embedding_dim, 6, 32)
x = transformer_block(x, training=None)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)

# One softmax probability per score label
predictions = tf.keras.layers.Dense(Config.num_classes, activation="softmax", name="predictions")(x)
model = tf.keras.Model(inputs, predictions)
model.summary()

* testing cohen kappa implementation

In [None]:
class CohenKappaWeightedMetric(tf.keras.metrics.Metric):
    """Streaming quadratic weighted kappa (QWK) for integer class labels.

    A confusion matrix ``O`` is accumulated over batches. With
    ``w[i, j] = (i - j)^2 / (num_classes - 1)^2`` and
    ``E = outer(row_sums(O), col_sums(O)) / sum(O)`` the result is

        kappa = 1 - sum(w * O) / sum(w * E)

    Args:
        num_classes: number of distinct labels (labels are ``0 .. num_classes - 1``).
        name: metric name shown in Keras logs.
        **kwargs: forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, num_classes, name='cohen_kappa_weighted', **kwargs):
        super().__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self.conf_mtx = self.add_weight(
            name='conf_mtx', shape=(num_classes, num_classes), initializer='zeros'
        )

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the running confusion matrix.

        Args:
            y_true: integer labels; flattened to 1-D, so ``(batch,)`` and
                ``(batch, 1)`` are both accepted.
            y_pred: per-class scores with the classes on the last axis.
            sample_weight: optional per-sample weights.
        """
        y_true = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        y_pred = tf.reshape(tf.argmax(y_pred, axis=-1), [-1])
        if sample_weight is not None:
            sample_weight = tf.reshape(tf.cast(sample_weight, tf.float32), [-1])

        # confusion_matrix returns int32 by default, which cannot be added to
        # the float accumulator; request floats and match the variable's dtype.
        batch_conf = tf.math.confusion_matrix(
            labels=y_true,
            predictions=y_pred,
            num_classes=self.num_classes,
            weights=sample_weight,
            dtype=tf.float32,
        )
        self.conf_mtx.assign_add(tf.cast(batch_conf, self.conf_mtx.dtype))

    def result(self):
        """Return the QWK of everything seen so far.

        Returns 0.0 when the score is undefined (no samples yet, or all labels
        and predictions fall into a single class).
        """
        observed = tf.cast(tf.convert_to_tensor(self.conf_mtx), tf.float32)

        # Quadratic disagreement weights, scaled to [0, 1].
        idx = tf.range(self.num_classes, dtype=tf.float32)
        diff = tf.expand_dims(idx, 1) - tf.expand_dims(idx, 0)
        weights = tf.square(diff) / float(max(self.num_classes - 1, 1) ** 2)

        # Confusion matrix expected if labels and predictions were independent.
        true_counts = tf.reduce_sum(observed, axis=1)
        pred_counts = tf.reduce_sum(observed, axis=0)
        total = tf.reduce_sum(observed)
        expected = tf.math.divide_no_nan(
            tf.tensordot(true_counts, pred_counts, axes=0), total
        )

        observed_disagreement = tf.reduce_sum(weights * observed)
        expected_disagreement = tf.reduce_sum(weights * expected)
        ratio = tf.math.divide_no_nan(observed_disagreement, expected_disagreement)
        return tf.where(expected_disagreement > 0, 1.0 - ratio, tf.zeros_like(ratio))

    def reset_state(self):
        """Clear the accumulated confusion matrix (called by Keras between epochs)."""
        self.conf_mtx.assign(
            tf.zeros((self.num_classes, self.num_classes), dtype=self.conf_mtx.dtype)
        )

    def reset_states(self):
        """Alias of `reset_state`, kept for Keras versions that call the old name."""
        self.reset_state()

    def get_config(self):
        """Include ``num_classes`` so the metric can be re-created from its config."""
        config = super().get_config()
        config["num_classes"] = self.num_classes
        return config

#model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[CohenKappaWeightedMetric(num_classes=6)])
#history = model.fit(train_ds, validation_data=val_ds, epochs=Config.epochs, class_weight=dict(enumerate(class_weights)))

* Train, evaluate, and create the submission. (For some reason the model is overfitting; I am investigating it, but if you have any idea why, feel free to comment!)

In [None]:
# Inverse-frequency class weights: total / (n_classes * count) per score label.
class_counts = np.bincount(train['score'].to_numpy())
total_samples = class_counts.sum()
num_classes = class_counts.size
class_weights = total_samples / (class_counts * num_classes)
class_weights

In [None]:
# Adam with a staircase exponential learning-rate decay:
# the rate is multiplied by 0.96 every 1000 optimizer steps.
initial_learning_rate = 1e-4
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [None]:
# Compile with the scheduled Adam optimizer and train with per-class loss weights.
# Author's note: change loss function
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
class_weight_by_label = dict(enumerate(class_weights))
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=Config.epochs,
    class_weight=class_weight_by_label,
)

#model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=[CohenKappaWeightedMetric(num_classes=6)])
#history = model.fit(train_ds, validation_data=val_ds, epochs=Config.epochs, class_weight=dict(enumerate(class_weights)))

In [None]:
# Evaluate the model on the held-out split of the training data
# (test_ds comes from test_df, not from the competition test file).
test_metrics = model.evaluate(test_ds)
test_loss, test_acc = test_metrics
for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):
    print(caption, value)

In [None]:
# Score the competition test essays. The +1 undoes the earlier shift of the
# labels to a 0-based range.
test_text = test['full_text'].values
vectorized_test_text = vectorize_layer(test_text)
predictions = model.predict(vectorized_test_text)
predicted_labels = np.argmax(predictions, axis=1)
test['score'] = predicted_labels + 1
test

In [None]:
# Write the submission file with just the id and predicted score columns.
submission = test[['essay_id', 'score']]
submission.to_csv("submission.csv", index = False)

* To dos


1 - Quadratic Weighted Kappa (QWK)  metric
