# Exploring the Essays

In this notebook, we perform some initial EDA on the essays. We will examine what the essays look like for various scores, then do some feature engineering to extract some meta-properties from the essays.

In [None]:
!pip install -q autocorrect==1.1.0 pyspellchecker sentence-transformers py_readability_metrics

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import re
import random
import string
import shutil
import warnings
import logging
import gc
from tqdm.autonotebook import tqdm
from collections import Counter

from IPython.display import display, Markdown

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from autocorrect import Speller
from spellchecker import SpellChecker
from readability import Readability
# import spacy
from textblob import TextBlob

from transformers import AutoTokenizer

In [None]:
# Seed NumPy's global random number generator so the run is repeatable.
SEED = 42
np.random.seed(SEED)

# Register tqdm with pandas; this is what provides `progress_apply` below.
tqdm.pandas()

# Silence every warning for the remainder of the session.
warnings.simplefilter("ignore")

# Default seaborn colour palette for all plots.
sns.set_palette('crest')

In [None]:
# Location of the competition training data (Kaggle input mount).
TRAIN_CSV_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"

df_train = pd.read_csv(TRAIN_CSV_PATH)
df_train.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Set to an integer to run the notebook on a random subset of essays
# (fast iteration); leave as None to use every row.
SAMPLES_TESTING = None

if SAMPLES_TESTING is not None:
    # Cap at the frame length so an over-large request does not raise, and fix
    # the seed (same value as np.random.seed above) so re-running this cell
    # keeps the same subset.
    df_train = df_train.sample(min(SAMPLES_TESTING, len(df_train)), random_state=42)

In [None]:
# (rows, columns) of the training frame after the optional down-sampling above.
df_train.shape

In [None]:
# How many essays received each score.
fig, ax = plt.subplots()
sns.countplot(data=df_train, x='score', ax=ax)
ax.set_title("Distribution of score")
plt.show()

In [None]:
# Scores for which sample essays are shown.
scores = range(1,7)

def _display(t):
    """Render the string ``t`` as Markdown in the cell output."""
    display(Markdown(t))

def display_essay(row):
    """Show one essay: a bold header with its id and score, then the full text.

    ``row`` must provide the keys ``essay_id``, ``score`` and ``full_text``.
    The text itself is rendered as Markdown as well.
    """
    _display(f"**Essay {row['essay_id']}** (score = {row['score']})")
    _display(row['full_text'])

num_to_sample = 10
for score in scores:
    _display(f"# Sample Essays for Score = {score}")
    essays_with_score = df_train[df_train['score'] == score]
    # A score can have fewer essays than requested (e.g. when SAMPLES_TESTING
    # shrinks the data); asking `.sample` for more rows than exist raises a
    # ValueError, so cap the request at what is available.
    sample = essays_with_score.sample(min(num_to_sample, len(essays_with_score)))
    for _, row in sample.iterrows():
        display_essay(row)

# Feature Engineering

Code is modified from my CommonLit student summaries utility script: https://www.kaggle.com/code/mcpenguin/utility-commonlit-student-summaries#Feature-Engineering

In [None]:
# word difficulty
word_difficulty = pd.read_csv("/kaggle/input/word-difficulty/word-difficulty.csv")

# Keep the distinct words whose I_Zscore is positive.
has_positive_zscore = word_difficulty['I_Zscore'] > 0
difficult_words_list = word_difficulty.loc[has_positive_zscore, 'Word'].unique().tolist()

# Peek at the first few entries.
difficult_words_list[:20]

In [None]:
# nltk pos tags
# Lookup table of part-of-speech tags. Its "abbrev" column is read by
# FeatureEngineer.count_num_pos_tags to create one counter per tag.
tags_df = pd.read_csv("/kaggle/input/ntlk-pos-tags/tags.csv")
tags_df.head()

In [None]:
# set of stop words
# NLTK's English stop-word list, stored as a set. FeatureEngineer uses it to
# drop stop words before looking tokens up in the difficult-word list.
STOP_WORDS = set(stopwords.words('english'))

In [None]:
# When True, FeatureEngineer.create_features first replaces every essay with
# the output of the autocorrect Speller before computing any feature.
USE_AUTOCORRECT = False

In [None]:
class FeatureEngineer:
    """Computes hand-crafted, essay-level features for a DataFrame of essays.

    The entry point is :meth:`create_features`. Besides its own attributes the
    class reads these notebook-level globals: ``STOP_WORDS``,
    ``difficult_words_list``, ``tags_df`` and ``USE_AUTOCORRECT``.
    """

    # Readability scores are only computed for essays with MORE than this many
    # whitespace-separated words; shorter essays get None for every metric.
    # NOTE(review): py-readability-metrics is documented to reject texts under
    # 100 words; 120 presumably leaves a margin for tokenisation differences - confirm.
    MIN_WORDS_FOR_READABILITY = 120

    # Output column -> characters counted for it (None = all of string.punctuation).
    # punctuation_set -> !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
    PUNCTUATION_FEATURES = {
        'punctuation_count': None,
        'full_stop_count': ['.'],
        'comma_count': [','],
        'question_mark_count': ['?'],
        'exclamation_mark_count': ['!'],
        'colon_count': [':'],
        'semicolon_count': [';'],
        'brackets_count': ['(', ')', '[', ']', '{', '}'],
    }

    # Column names, in the order get_readability_metrics returns its values.
    READABILITY_METRICS = (
        "flesch_kincaid_score",
        "flesch_score",
        "dale_chall_score",
        "ari_score",
        "coleman_liau_score",
        "gunning_fog_score",
        "spache_score",
        "linsear_write_score",
    )

    def __init__(self, text_column='full_text', tokenizer_path="/kaggle/input/debertav3base"):
        """Load the spelling tools and the tokenizer.

        Args:
            text_column: name of the DataFrame column holding the essay text.
            tokenizer_path: path or model id passed to ``AutoTokenizer.from_pretrained``.
        """
        self.speller = Speller(lang='en')
        self.spell_checker = SpellChecker()

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        self.text_column = text_column

        # Set membership is O(1); scanning the list for every token of every
        # essay is what made the difficult-word feature slow.
        self._difficult_words = frozenset(difficult_words_list)

    def count_punctuation(self, row, specific_punctuation=None):
        """Count punctuation characters in the essay text of ``row``.

        Args:
            row: mapping / Series with the text under ``self.text_column``.
            specific_punctuation: iterable of characters to count; when None,
                every character of ``string.punctuation`` is counted.

        Returns:
            Number of characters of the text that belong to the chosen set.
        """
        text = row[self.text_column]
        if specific_punctuation is None:
            wanted = set(string.punctuation)
        else:
            wanted = set(specific_punctuation)
        return sum(1 for char in text if char in wanted)

    def count_numbers(self, row):
        """Count the runs of consecutive digits in the essay text of ``row``.

        Each maximal digit run counts once, so "3.50" contributes two
        ("3" and "50"), not three digits.
        """
        text = row[self.text_column]
        return len(re.findall(r'\d+', text))

    def count_text_length(self, df, col, tokenizer):
        """Return, for each value of ``df[col]``, the length of ``tokenizer.encode(value)``."""
        return df[col].progress_apply(lambda x: len(tokenizer.encode(x)))

    def check_is_stop_word(self, word):
        """Return True when ``word`` is in ``STOP_WORDS``.

        The previous implementation returned the negation, contradicting the
        method name. Nothing visible in this notebook calls the method.
        """
        return word in STOP_WORDS

    def get_misspell_count(self, row):
        """Count the alphabetic tokens of the essay that the spell checker does not know.

        NOTE(review): pyspellchecker documents ``unknown`` as returning a set,
        so a misspelling repeated several times would count once - confirm.
        """
        text = row[self.text_column]
        tokens = nltk.word_tokenize(text)
        unknown_tokens = self.spell_checker.unknown(tokens)
        # Ignore numbers, punctuation and other non-alphabetic tokens.
        return sum(1 for token in unknown_tokens if token.isalpha())

    def get_ngrams(self, token, n):
        """Return all n-grams of the token sequence ``token`` as space-joined strings.

        Example: ``get_ngrams(["a", "b", "c"], 2)`` gives ``["a b", "b c"]``.
        """
        # Zipping the sequence with its own shifted copies yields the n-grams.
        shifted = [token[i:] for i in range(n)]
        return [" ".join(ngram) for ngram in zip(*shifted)]

    def filter_difficult_words(self, words):
        """Keep only the entries of ``words`` that appear in the difficult-word list."""
        return [word for word in words if word in self._difficult_words]

    # advanced vocabulary
    def get_difficult_words(self, text):
        """Tokenise ``text``, drop stop words and look up the difficult ones.

        Returns:
            ``(difficult, words)``: the set of distinct difficult words, and
            the list of all non-stop-word tokens.
        """
        words = [word for word in word_tokenize(text) if word not in STOP_WORDS]
        return set(self.filter_difficult_words(words)), words

    def get_difficult_words_stats(self, row):
        """Return a dict with the number of distinct difficult words in the essay of ``row``."""
        difficult_words, _ = self.get_difficult_words(row[self.text_column])
        return {
            "summary_difficult_words_count": len(difficult_words),
        }

    def get_readability_metrics(self, row):
        """Compute the readability scores of the essay in ``row``.

        Returns:
            A tuple ordered like ``READABILITY_METRICS``. Every entry is None
            when the essay has ``MIN_WORDS_FOR_READABILITY`` or fewer
            whitespace-separated words.
        """
        text = row[self.text_column]
        if len(text.split()) <= self.MIN_WORDS_FOR_READABILITY:
            return (None,) * len(self.READABILITY_METRICS)

        # Only analyse essays that pass the length guard.
        r = Readability(text)
        return (
            r.flesch_kincaid().score,
            r.flesch().score,
            r.dale_chall().score,
            r.ari().score,
            r.coleman_liau().score,
            r.gunning_fog().score,
            r.spache().score,
            r.linsear_write().score,
        )

    def count_num_pos_tags(self, row, text_col_name):
        """Count how often each part-of-speech tag occurs in ``row[text_col_name]``.

        Returns:
            Dict keyed ``count_<text_col_name>_<tag>``. Every tag listed in
            ``tags_df["abbrev"]`` (plus ``#``) is present, with 0 if unseen.
        """
        result = {f"count_{text_col_name}_{tag}": 0 for tag in tags_df["abbrev"]}
        result[f"count_{text_col_name}_#"] = 0

        tokens = word_tokenize(row[text_col_name])
        for _, tag in pos_tag(tokens):
            key = f"count_{text_col_name}_{tag}"
            # The tagger can emit tags that are not listed in tags_df; count
            # them as well instead of failing with a KeyError.
            result[key] = result.get(key, 0) + 1

        return result

    def get_polarity_and_subjectivity(self, row):
        """Return TextBlob's sentiment polarity and subjectivity for the essay in ``row``."""
        sentiment = TextBlob(str(row[self.text_column])).sentiment
        return {
            'text_polarity': sentiment.polarity,
            'text_subjectivity': sentiment.subjectivity
        }

    def create_features(self, df):
        """Return a lower-cased copy of ``df`` with the engineered feature columns added.

        The input frame is not modified. Progress is reported through
        ``progress_apply``, which requires ``tqdm.pandas()`` to have been called.
        """
        def to_lower(value):
            return value.lower() if isinstance(value, str) else value

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is
        # deprecated there; use whichever the installed version provides.
        elementwise = df.map if hasattr(df, "map") else df.applymap
        df = elementwise(to_lower)

        # create corrected text summary
        if USE_AUTOCORRECT:
            print("Adding corrected summary text")
            df[self.text_column] = df[self.text_column].progress_apply(self.speller)

        # number of punctuation
        print("Adding number of punctuation")
        for column, chars in self.PUNCTUATION_FEATURES.items():
            df[column] = df.progress_apply(
                lambda row, chars=chars: self.count_punctuation(row, chars), axis=1)

        # number of numbers
        print("Adding number of numbers")
        df['number_count'] = df.progress_apply(self.count_numbers, axis=1)

        # number of misspells
        print("Adding number of misspells")
        df['misspell_count'] = df.progress_apply(self.get_misspell_count, axis=1)

        # text length in tokens
        df['text_length'] = self.count_text_length(df, self.text_column, self.tokenizer)

        # get readability metrics
        print("Adding readabililty metrics data")
        df[list(self.READABILITY_METRICS)] = pd.DataFrame(
            df.progress_apply(self.get_readability_metrics, axis=1).tolist(),
            index=df.index).astype('float64')

        # advanced vocabulary count
        print("Adding advanced vocabulary data")
        difficult_words_stats_df = df.progress_apply(
            lambda x: pd.Series(self.get_difficult_words_stats(x)), axis=1)
        difficult_words_stats_df.columns = [
            f"ADV_{col}" for col in difficult_words_stats_df.columns.values]
        df = pd.concat([df, difficult_words_stats_df], axis=1)

        print(df.shape)

        # Disabled feature groups, kept for reference.
        # NOTE: when re-enabling, build the result frames with `index=df.index`;
        # `from_records` alone creates a fresh 0..n-1 index, which does not
        # line up with a sampled `df` in `pd.concat(..., axis=1)`.
        #
        # get number of occurences of pos tags
        # print("Adding number of pos tags data")
        # pos_tag_results = df.progress_apply(lambda row: self.count_num_pos_tags(row, self.text_column), axis=1).values.tolist()
        # pos_tag_results = pd.DataFrame.from_records(pos_tag_results)
        # df = pd.concat([df, pos_tag_results], axis=1)
        #
        # get polarity and subjectivity data
        # print("Adding polarity and subjectivity data")
        # pol_subj_results = df.progress_apply(lambda row: self.get_polarity_and_subjectivity(row), axis=1).values.tolist()
        # df = pd.concat([df, pd.DataFrame.from_records(pol_subj_results)], axis=1)

        print("Done!")
        return df

In [None]:
# Build the feature engineer (loads the spell checkers and the tokenizer).
fe = FeatureEngineer()

# Compute every engineered feature for the training essays.
feat_eng = fe.create_features(df_train)

# The engineer is not needed after this point; drop it and run a garbage
# collection pass. As the last expression, gc.collect()'s return value is
# shown as the cell output.
del fe
gc.collect()

In [None]:
# First rows of the training frame with the engineered feature columns.
feat_eng.head()

In [None]:
def make_describe(df: pd.DataFrame):
    describe =  pd.DataFrame(feat_eng.columns, columns=['param_name'])
    
    describe['count'] = describe['param_name'].apply(lambda col: df[col].count())
    describe['missing_count'] = describe['param_name'].apply(lambda col: df[col].isna().sum())
    describe['missing_%'] = describe['param_name'].apply(lambda col: df[col].isna().sum() / df[col].isna().count())
    describe['unique_count'] = describe['param_name'].apply(lambda col: len(df[col].unique()))
    describe['unique_values'] = describe['param_name'].apply(lambda col: df[col].unique() if len(df[col].unique()) < 30 else "truncated")
    
    describe['mean'] = describe['param_name'].apply(lambda col: df[col].mean() if is_numeric_dtype(df[col]) else np.nan)
    describe['median'] = describe['param_name'].apply(lambda col: df[col].median() if is_numeric_dtype(df[col]) else np.nan)
    describe['std'] = describe['param_name'].apply(lambda col: df[col].std() if is_numeric_dtype(df[col]) else np.nan)
    describe['min'] = describe['param_name'].apply(lambda col: df[col].min() if is_numeric_dtype(df[col]) else np.nan)
    for p in [5, 25, 50, 75, 95]: 
        describe[f'{p}%'] = describe['param_name'].apply(lambda col: df[col].quantile(p/100) if is_numeric_dtype(df[col]) else np.nan)
    describe['max'] = describe['param_name'].apply(lambda col: df[col].max() if is_numeric_dtype(df[col]) else np.nan)
    
    return describe.style.background_gradient(
        axis=1,
    )

In [None]:
# Per-column summary statistics of the engineered feature frame.
make_describe(feat_eng)

In [None]:
# One boxen plot per engineered feature, split by score.
# Take the features in column order: iterating a set of strings gives a
# different order from run to run, which reshuffled the grid every time.
non_feature_cols = {'essay_id', 'full_text', 'score'}
feats = [col for col in feat_eng.columns if col not in non_feature_cols]

n_cols = 3
n_rows = -(-len(feats) // n_cols)  # ceiling division: no spare row when the count divides evenly

fig, axes = plt.subplots(
    nrows = n_rows, ncols = n_cols, figsize = (16, len(feats) * 1.6),
    squeeze = False,  # always a 2-D array of axes, even for a single row
)
plt.subplots_adjust(hspace = 0.5)

all_axes = axes.ravel()
for feat, ax in zip(feats, all_axes):
    sns.boxenplot(data = feat_eng, x = 'score', y = feat, ax = ax)
    ax.set_title(feat)

# Hide the unused panels of the last row.
for ax in all_axes[len(feats):]:
    ax.set_visible(False)

plt.show()