# Summary: Feature engineering + LGBM
This notebook is modified from the notebook provided by YE_AI [link](https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-cv-0-799-lb-0-799?scriptVersionId=171264491). If you like my notebook, remember to like YE_AI's notebook as well. 

***

### Modification Done:
1. Add sum, kurtosis, Quartile 1 and Quartile 3 to paragraph and sentence features.
2. Add spelling mistake counting and Extra text processing.
3. Add TFIDF vector.
4. Add TFIDF word. 

***


**Extra text processing**:
> * Contraction expansion, e.g. I'll --> i will; this is added as a text processing step.
> * Punctuation removal is applied:
    - When the extra features are generated [**[Link](#extra-feature)**]
> 
> **Note**: The <u>Extra text processing</u> is taken from the notebook [here](https://www.kaggle.com/code/xianhellg/more-feature-engineering-feature-selection-0-817)

***

### References:
* Paragraph, sentence & word based features [source](https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-cv-0-799-lb-0-799).
* Spelling mistake counting [here](https://www.kaggle.com/code/tsunotsuno/updated-debertav3-lgbm-with-spell-autocorrect).
* Extra text processing [here](https://www.kaggle.com/code/xianhellg/more-feature-engineering-feature-selection-0-817?scriptVersionId=173223907&cellId=11).
* TFIDF vector [here](https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-cv-0-799-lb-0-799?scriptVersionId=172203959&cellId=16).
* TFIDF word [here](https://www.kaggle.com/code/guillaums/error-in-tfidf-vectorizer-in-baseline-nbs?scriptVersionId=175110986&cellId=17).

# 1. Import modules

In [None]:
# Install pyspellchecker 0.7.2 from a local wheel file under /kaggle/input.
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
import os
import numpy as np 
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sn

import torch
from glob import glob
from tqdm import tqdm
from typing import List
import logging
import json, string

from scipy.special import softmax
import gc

from datasets import Dataset,load_dataset, load_from_disk
from datasets import load_metric, disable_progress_bar

import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import cohen_kappa_score, accuracy_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from collections import Counter
import spacy
import re
from spellchecker import SpellChecker

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import log_evaluation, early_stopping

import joblib

# logging setting 
import warnings
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

nlp = spacy.load("en_core_web_sm")

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances

# English vocabulary: one entry per line, stripped and lower-cased, kept as a set.
with open('/kaggle/input/english-word-hx/words.txt') as file:
    english_vocab = {line.strip().lower() for line in file}

# Extra stop words, kept as a list in file order.
# See stopwords at: http://mlg.ucd.ie/files/datasets/stopwords.txt?fbclid=IwAR2b3y3IJZ4DWdlbX5xhxlLOW5OW3UeBX8vkRTbBRSFiykQefzZttnkOrEA
with open('/kaggle/input/stop-words/stopwords.txt') as file:
    stopwords_list = [line.strip().lower() for line in file]


In [None]:
# Merge NLTK's English stop words with the custom list (duplicates collapse in the set union).
nltk_stopwords = stopwords.words('english')
print(len(nltk_stopwords))
print(len(stopwords_list))
final_stopwords_list = list(set(nltk_stopwords) | set(stopwords_list))
print(len(final_stopwords_list))

In [None]:
# Extract the WordNet corpus archive into the NLTK corpora directory.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

# **Initial configuration:**

In [None]:
# Competition input files (all live in the same Kaggle input directory).
COMPETITION_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'
train_path = COMPETITION_DIR + '/train.csv'
test_path = COMPETITION_DIR + '/test.csv'
sub_path = COMPETITION_DIR + '/sample_submission.csv'

In [None]:
# Number of cross-validation folds.
n_splits = 5
# Random seed shared by the stochastic steps of the notebook.
seed = 42

# **2. Load dataset:**

In [None]:
# Read the training CSV into a pandas DataFrame and preview the first five rows.
train = pd.read_csv(train_path)
train.head(5)

In [None]:
# Read the test CSV into a pandas DataFrame and preview it.
test = pd.read_csv(test_path)
test.head()

# 3. Feature Engineering

## 3.1 Data preprocessing function definitions

In [None]:

# Contraction -> expansion lookup used by expandContractions().
cList = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> she had or she would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    # Fixed: these two expansions used to read "you you will ..." (duplicated word).
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
    "you're": "you are",  "you've": "you have"
}

# Lower-cased view of cList so that text which was lower-cased before expansion
# (e.g. "i'm") still finds the capitalised keys ("I'm").
_c_lookup = {key.lower(): value for key, value in cList.items()}

# Longest keys first: regex alternation takes the first alternative that matches,
# so "can't've" has to be tried before its prefix "can't". Keys are escaped so
# they are always treated as literal text.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(cList, key=len, reverse=True)),
    flags=re.IGNORECASE,
)

def expandContractions(text):
    """Replace every contraction listed in ``cList`` with its expansion.

    Matching is case-insensitive. A contraction spelled exactly like a key gets
    that key's expansion unchanged; an all-lower-case occurrence gets a
    lower-cased expansion ("i'm" -> "i am"); any other casing gets the stored
    expansion as-is.

    Args:
        text: Input string.

    Returns:
        The string with contractions expanded.
    """
    def replace(match):
        found = match.group(0)
        if found in cList:
            return cList[found]
        expansion = _c_lookup[found.lower()]
        return expansion.lower() if found.islower() else expansion
    return c_re.sub(replace, text)


def removeHTML(x):
    """Return ``x`` with every ``<...>`` span (shortest match, single line) removed."""
    return re.sub(r'<.*?>', '', x)

def dataPreprocessing(x):
    """Light text normalisation that keeps sentence punctuation.

    Steps, in order: lower-case, strip HTML tags, drop @mentions, drop URLs,
    drop numbers, replace non-breaking spaces and underscores with spaces,
    expand contractions, collapse repeated periods / commas / whitespace.

    Args:
        x: Raw text.

    Returns:
        The normalised, stripped text.
    """
    x = x.lower() # Convert words to lowercase
    x = removeHTML(x) # Remove HTML
    x = re.sub(r"@\w+", '', x)     # Delete strings starting with @
    # Delete URLs. The previous pattern "http\w+" stopped at ':' and left the
    # rest of the address ("://host/path") in the text.
    x = re.sub(r"https?://\S+|www\.\S+|http\w+", '', x)
    # Delete numbers. The apostrophe form ('90) must run first, otherwise the
    # plain-digit rule removes the digits and this rule can never match.
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    x = x.replace(u'\xa0',' ') # Remove \xa0
    x = re.sub(r'_+', ' ', x)
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = re.sub(r"\s+", " ", x) # Replace consecutive empty spaces with a single space character
    x = x.strip()
    return x

def remove_punctuation(text):
    """Delete all ``string.punctuation`` characters, then squeeze whitespace runs to one space."""
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r"\s+", ' ', without_punct)

def remove_stop_words (text):
    """Tokenise ``text`` with NLTK and re-join, with single spaces, the tokens not in ``final_stopwords_list``."""
    blocked = set(final_stopwords_list)
    return ' '.join(token for token in nltk.word_tokenize(text) if token not in blocked)

def lemmatization(text):
    """Lemmatise every token longer than one character and join the results with spaces."""
    wordnet = WordNetLemmatizer()
    long_tokens = (token for token in nltk.word_tokenize(text) if len(token) > 1)
    return ' '.join(wordnet.lemmatize(token) for token in long_tokens)

def dataPreprocessing_w_contract_punct_remove(x):
    """Aggressive normalisation used for the TF-IDF / count / extra features.

    Steps, in order: lower-case, strip HTML, drop @mentions and URLs, expand
    contractions, remove stop words, drop numbers and underscores, remove
    punctuation, collapse whitespace, lemmatise.

    Mentions and URLs are removed *before* stop-word removal: that step
    tokenises and re-joins the text with spaces, after which "@name" and
    "http://..." are no longer contiguous and the patterns cannot match.

    Args:
        x: Raw text.

    Returns:
        The cleaned, lemmatised, stripped text.
    """
    x = x.lower() # Convert words to lowercase

    x = removeHTML(x)  # Remove HTML

    x = re.sub(r"@\w+", '', x) # Delete strings starting with @
    x = re.sub(r"https?://\S+|www\.\S+|http\w+", '', x) # Delete URL

    x = expandContractions(x)

    x = remove_stop_words(x) # Remove stopwords

    x = x.replace(u'\xa0',' ') # Remove \xa0
    x = re.sub(r"'\d+", '', x) # Delete Numbers
    x = re.sub(r"\d+", '', x)
    x = re.sub(r'_+', ' ', x)

    x = remove_punctuation(x) # Remove punctuation

    x = re.sub(r"\s+", " ", x) # Replace consecutive empty spaces with a single space character
    x = lemmatization(x) # Lemmatizing
    x = x.strip()
    return x


## 3.2 Paragraph based feature

<a id='paragraph-feature'></a>

In [None]:
def remove_duplicates(text):
    """Drop repeated sentences (split on ``'. '``), keeping first occurrences in order.

    A single trailing period is set aside before splitting and restored
    afterwards. Previously the last sentence kept its period through the
    split, so it never compared equal to an earlier copy, and a second period
    was appended to every text ending in '.'.

    Args:
        text: Input text.

    Returns:
        The text with duplicate sentences removed.
    """
    ends_with_period = text.endswith('.')
    body = text[:-1] if ends_with_period else text

    # dict preserves insertion order, so fromkeys de-duplicates while keeping order
    unique_sentences = list(dict.fromkeys(body.split('. ')))

    result = '. '.join(unique_sentences)
    if ends_with_period:
        result += '.'
    return result


def extract_sentences(text):
    """Split ``text`` on runs of '.', '!' or '?' (plus trailing whitespace) and return the non-empty, stripped pieces."""
    stripped_pieces = (piece.strip() for piece in re.split(r'[.!?]+\s*', text))
    return [piece for piece in stripped_pieces if piece]

def extract_words(text):
    """Return the word tokens of ``text``; apostrophe-joined tokens stay whole unless they hold more than two apostrophes."""
    collected = []
    for token in re.findall(r"\w+(?:[']\w+)*", text):
        pieces = token.split("'")
        # n apostrophes give n + 1 pieces, so "more than two apostrophes" means more than three pieces
        if len(pieces) > 3:
            collected += pieces
        else:
            collected.append(token)
    return collected


def preprocessing_for_paragraphs(text):
    """Turn blank-line breaks that do not look like real paragraph ends into spaces.

    A ``\\n\\n`` is treated as a false break when it is not preceded by '.',
    '!' or '?', or when the next character is a lower-case letter.

    Args:
        text: Raw essay text.

    Returns:
        The stripped text in which only plausible paragraph breaks remain as ``\\n\\n``.
    """
    # If the character before \n\n is not a sentence terminator, this is not the end of a paragraph
    text = re.sub(r'(?<![.!?])\n\n', ' ', text)

    # If \n\n is followed by a lower-case letter, join with a space. The letter
    # is captured and put back; it used to be replaced together with the break.
    text = re.sub(r'\n\n([a-z])', r' \1', text)

    return text.strip()

def extract_paragraphs(text):
    """Clean up false paragraph breaks, then split the text on the remaining blank lines."""
    return preprocessing_for_paragraphs(text).split('\n\n')


In [None]:
# test = pl.from_pandas(test).with_columns([pl.col("full_text").apply(remove_duplicates)])
# test = test.with_columns(columns)

In [None]:
# Expression that splits each essay into a list of paragraphs.
# `map_elements` replaces the deprecated `Expr.apply` and matches the rest of the notebook.
columns = [pl.col("full_text").map_elements(extract_paragraphs).alias("paragraph")]

# Convert the pandas frames to polars once; the guard keeps the cell re-runnable.
if isinstance(train, pd.DataFrame):
    train = pl.from_pandas(train)
if isinstance(test, pd.DataFrame):
    test = pl.from_pandas(test)

# Drop repeated sentences from the raw text, then derive the paragraph lists from it.
dedupe_full_text = pl.col("full_text").map_elements(remove_duplicates)
train = train.with_columns([dedupe_full_text])
test = test.with_columns([dedupe_full_text])

train = train.with_columns(columns)
test = test.with_columns(columns)

In [None]:
# paragraph features
def Paragraph_Preprocess(tmp):
    """Explode the ``paragraph`` list column to one row per paragraph and add per-paragraph measures.

    Adds ``paragraph_len`` (characters after ``dataPreprocessing``),
    ``paragraph_sentence_cnt``, ``paragraph_word_cnt`` and
    ``paragraph_unique_word_cnt``.
    """
    paragraph = pl.col('paragraph')

    # One row per paragraph, each cleaned with dataPreprocessing
    per_paragraph = tmp.explode('paragraph').with_columns(paragraph.map_elements(dataPreprocessing))

    # Character length of the cleaned paragraph
    per_paragraph = per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(text)).alias("paragraph_len")
    )

    # Sentence / word / distinct-word counts
    return per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(extract_sentences(text))).alias('paragraph_sentence_cnt'),
        paragraph.map_elements(lambda text: len(extract_words(text))).alias('paragraph_word_cnt'),
        paragraph.map_elements(lambda text: len(set(extract_words(text)))).alias('paragraph_unique_word_cnt'),
    )


# Inclusive (start, end) character-length buckets counted per essay.
length_ranges = [(1, 100), (101, 200), (201, 300), (301, 400), (401, 500), (501, 600), (601, 800)]

# Per-paragraph measures that are aggregated per essay.
paragraph_fea = ['paragraph_len', 'paragraph_sentence_cnt', 'paragraph_word_cnt', 'paragraph_unique_word_cnt']

def Paragraph_Eng(train_tmp):
    """Aggregate per-paragraph rows into one feature row per ``essay_id``.

    Returns a pandas DataFrame sorted by ``essay_id`` holding, in this order:
    bucket counts, ">= threshold" counts, then max / mean / sum / min of every
    measure in ``paragraph_fea``.
    """
    paragraph = pl.col('paragraph')
    paragraph_len = pl.col('paragraph_len')

    expressions = []

    # Number of paragraphs whose length falls inside each bucket
    for start, end in length_ranges:
        inside = (paragraph_len >= start) & (paragraph_len <= end)
        expressions.append(paragraph.filter(inside).count().alias(f"paragraph_len_between_{start}_{end}_cnt"))

    # Number of paragraphs at least as long as each threshold
    for threshold in [100,150,200,300,350,400,500,600,700]:
        expressions.append(
            paragraph.filter(paragraph_len >= threshold).count().alias(f"paragraph_len_geq_{threshold}_cnt")
        )

    # Summary statistics, statistic-major so the column order is max, mean, sum, min
    for stat in ('max', 'mean', 'sum', 'min'):
        for fea in paragraph_fea:
            expressions.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(expressions)
    return per_essay.sort("essay_id").to_pandas()

In [None]:
# Paragraph-level features for the training essays (one row per essay).
tmp = Paragraph_Preprocess(train)
train_feats = Paragraph_Eng(tmp)

# Every column except the identifier and the label counts as a feature.
feature_names = [column for column in train_feats.columns if column not in ['essay_id','score']]

print('Features Number: ',len(feature_names))
print('Features:', train_feats.columns)
train_feats.head(5)

## 3.3 Sentence based features

source: https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments/notebook#Features-engineering

In [None]:
# sentence feature
def Sentence_Preprocess(tmp):
    """Split each cleaned essay into sentences (one row each) and add length / word-count columns."""
    sentence = pl.col('sentences')

    # Clean full_text, split it into sentences, then explode to one row per sentence
    sentences_expr = pl.col('full_text').map_elements(dataPreprocessing).map_elements(extract_sentences)
    per_sentence = tmp.with_columns(sentences_expr.alias("sentences")).explode('sentences')

    # Character length of each sentence
    per_sentence = per_sentence.with_columns(sentence.map_elements(lambda text: len(text)).alias("sentence_len"))

    # Word count, then distinct-word count
    per_sentence = per_sentence.with_columns(
        sentence.map_elements(lambda text: len(extract_words(text))).alias("sentence_word_cnt")
    )
    return per_sentence.with_columns(
        sentence.map_elements(lambda text: len(set(extract_words(text)))).alias("sentence_unique_word_cnt")
    )
# Inclusive (start, end) character-length buckets counted per essay.
sentence_length_ranges = [(1, 50), (51, 100), (101, 150), (151, 300)]

# Per-sentence measures that are aggregated per essay.
sentence_fea = ['sentence_len','sentence_word_cnt', 'sentence_unique_word_cnt']
def Sentence_Eng(train_tmp):
    """Aggregate per-sentence rows into one feature row per ``essay_id`` (pandas, sorted by id).

    Column order: bucket counts, ">= threshold" counts, then max / mean / min
    of every measure in ``sentence_fea``.
    """
    sentence = pl.col('sentences')
    sentence_len = pl.col('sentence_len')

    expressions = []

    # Number of sentences whose length falls inside each bucket
    for start, end in sentence_length_ranges:
        inside = (sentence_len >= start) & (sentence_len <= end)
        expressions.append(sentence.filter(inside).count().alias(f"sentence_len_between_{start}_{end}_cnt"))

    # Number of sentences at least as long as each threshold
    for threshold in [50,100,150,300]:
        expressions.append(
            sentence.filter(sentence_len >= threshold).count().alias(f"sentence_len_geq_{threshold}_cnt")
        )

    # Summary statistics, statistic-major so the column order is max, mean, min
    for stat in ('max', 'mean', 'min'):
        for fea in sentence_fea:
            expressions.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(expressions)
    return per_essay.sort("essay_id").to_pandas()


In [None]:
# Sentence-level features, left-joined onto the features built so far.
tmp = Sentence_Preprocess(train)
sentence_feats = Sentence_Eng(tmp)
train_feats = train_feats.merge(sentence_feats, on='essay_id', how='left')

feature_names = [column for column in train_feats.columns if column not in ['essay_id','score']]
print('Features Number: ',len(feature_names))
train_feats.head(5)

## 3.4 Word based feature

source: https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments/notebook#Features-engineering

In [None]:
# word feature
def Word_Preprocess(tmp):
    """Split each cleaned essay into words (one row each), add ``word_len`` and drop zero-length rows."""
    word = pl.col('word')

    # Clean full_text, extract its words, then explode to one row per word
    words_expr = pl.col('full_text').map_elements(dataPreprocessing).map_elements(extract_words)
    per_word = tmp.with_columns(words_expr.alias('word')).explode('word')

    # Character length of each word
    per_word = per_word.with_columns(word.map_elements(lambda token: len(token)).alias("word_len"))

    # Keep only rows whose word length is not 0
    return per_word.filter(pl.col('word_len') != 0)

# Inclusive (start, end) word-length buckets counted per essay.
word_length_ranges = [(1, 5), (6, 10), (11, 15)]
def Word_Eng(train_tmp):
    """Aggregate per-word rows into one feature row per ``essay_id`` (pandas, sorted by id).

    Column order: bucket counts, counts of words with length >= 1..15, then
    max / mean / std / sum of ``word_len``.
    """
    word = pl.col('word')
    word_len = pl.col('word_len')

    expressions = []

    # Number of words whose length falls inside each bucket
    for start, end in word_length_ranges:
        inside = (word_len >= start) & (word_len <= end)
        expressions.append(word.filter(inside).count().alias(f"word_len_between_{start}_{end}_cnt"))

    # Number of words with at least `minimum` characters, for minimum = 1..15
    for minimum in range(1, 16):
        expressions.append(word.filter(word_len >= minimum).count().alias(f"word_len_geq_{minimum}_cnt"))

    # Summary statistics of the word length
    expressions.extend([
        word_len.max().alias("word_len_max"),
        word_len.mean().alias("word_len_mean"),
        word_len.std().alias("word_len_std"),
        word_len.sum().alias("word_len_sum"),
    ])

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(expressions)
    return per_essay.sort("essay_id").to_pandas()

In [None]:
# Word-level features, left-joined onto the features built so far.
tmp = Word_Preprocess(train)
word_feats = Word_Eng(tmp)
train_feats = train_feats.merge(word_feats, on='essay_id', how='left')

feature_names = [column for column in train_feats.columns if column not in ['essay_id','score']]
print('Features Number: ',len(feature_names))
train_feats.head(5)

## 3.5 Character TFIDF feature:

##### Note:
* **tokenizer=lambda x: x**: "`words are not tokenized from full-text? Tokenizer should only be overided by identity if text is already tokenized before. Perhaps vectorizer is receiving string (char sequence) instead of word sequence, so it behaves like a char ngram vectorizer`" quoted from the notebook [here](https://www.kaggle.com/code/guillaums/error-in-tfidf-vectorizer-in-baseline-nbs?scriptVersionId=175110986&cellId=11)

In [None]:
# character_tfidf_vectorizer = TfidfVectorizer(
#             tokenizer = lambda x: x,
#             preprocessor = lambda x: x,
#             token_pattern = None,
#             strip_accents = 'unicode',
#             analyzer = 'word',
#             ngram_range = (1,3),
#             min_df = 0.1,
#             max_df = 0.95,
#             sublinear_tf = True,
# )
# # Processed text
# processed_text = train.to_pandas()["full_text"].progress_apply(lambda x: dataPreprocessing_w_contract_punct_remove(x))

# # Fit all datasets into TfidfVector,this may cause leakage and overly optimistic CV scores
# train_tfidf = character_tfidf_vectorizer.fit_transform([i for i in processed_text])

# character_tfidf_feature_names = character_tfidf_vectorizer.get_feature_names_out()
# print(character_tfidf_feature_names[0:110])

# dense_matrix = train_tfidf.toarray()
# df = pd.DataFrame(dense_matrix, columns=[f"tfidf_{name}" for name in character_tfidf_feature_names ])
# df['essay_id'] = train_feats['essay_id']


# # Merge the newly generated feature data with the previously generated feature data
# train_feats = train_feats.merge(df, on='essay_id', how='left')

# feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
# print('Features Number: ',len(feature_names))
# train_feats.head(5)


<a id='tfidf-word-feature'></a>
## 3.6 Word TFIDF feature:

Source notebook [here](https://www.kaggle.com/code/guillaums/error-in-tfidf-vectorizer-in-baseline-nbs?scriptVersionId=175110986&cellId=17)

https://www.kaggle.com/code/batprem/aes2-added-don-t-waste-your-run-time-feature/notebook

In [None]:
# TfidfVectorizer parameter
tfidf_vectorizer = TfidfVectorizer(
    preprocessor = lambda x: x,
    strip_accents = 'unicode',
    analyzer = 'word',
    ngram_range = (1, 3),
    min_df = 0.05,
    max_df = 0.85,
    sublinear_tf = True,
    stop_words = final_stopwords_list,
)

# Processed text
processed_text = train.to_pandas()["full_text"].progress_apply(lambda x: dataPreprocessing_w_contract_punct_remove(x))

# Fit all datasets into TfidfVectorizer
train_tfidf = tfidf_vectorizer.fit_transform([i for i in processed_text])
word_tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(word_tfidf_feature_names[0:100])

dense_matrix = train_tfidf.toarray()
df = pd.DataFrame(dense_matrix, columns=[f"tfidf_{name}" for name in word_tfidf_feature_names])
df['essay_id'] = train_feats['essay_id']

# Merge the newly generated feature data with the previously generated feature data
train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))

train_feats.head(3)

## **3.7 CountVectorizer Features:**

In [None]:
# Bi-gram / tri-gram counts over the same processed text used for TF-IDF.
count_vectorizer = CountVectorizer(
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(2,3),
            min_df=0.05, 
            max_df=0.85,
)

# `processed_text` comes from the TF-IDF cell and follows the row order of `train`
train_count = count_vectorizer.fit_transform(processed_text.tolist())

dense_matrix = train_count.toarray()
word_count_feature_names = count_vectorizer.get_feature_names_out()
print(word_count_feature_names[0:100])

df = pd.DataFrame(dense_matrix,  columns=[f"count_{name}" for name in word_count_feature_names])
# Take the ids from `train` (same row order as the matrix), not from the
# essay_id-sorted `train_feats`, so every count vector joins to its own essay.
df['essay_id'] = train['essay_id'].to_list()
train_feats = train_feats.merge(df, on='essay_id', how='left')

feature_names = list(filter(lambda x: x not in ['essay_id','score'], train_feats.columns))
print('Features Number: ',len(feature_names))
train_feats.head()

## **Add centroid**

In [None]:

# Names of the word TF-IDF columns, and the matching slice of the training features.
tfidf_w_columns = [f'tfidf_{name}' for name in word_tfidf_feature_names]
kmean_test = train_feats[tfidf_w_columns]

In [None]:
# Initialize KMeans with the number of clusters you want
kmeans = KMeans(n_clusters=7, random_state=42)

# Fit the model to the data
kmeans.fit(train_feats[tfidf_w_columns])

# Predict the clusters for the data points
labels = kmeans.labels_

# Get the centroids
centroids = kmeans.cluster_centers_

# Calculate the distance to the centroid
distances = np.sqrt(((kmean_test - centroids[labels]) ** 2).sum(axis=1))

cosine_distances_to_centroid = [
    cosine_distances([kmean_test.iloc[i]], [centroids[label]])[0][0]
    for i, label in enumerate(labels)
]

# Add the distances to the DataFrame
kmean_test['DistanceToCentroid'] = distances
kmean_test['CosineDistanceToCentroid'] = cosine_distances_to_centroid

train_feats['DistanceToCentroid'] = kmean_test['DistanceToCentroid']
train_feats['CosineDistanceToCentroid'] = kmean_test['CosineDistanceToCentroid']


In [None]:
# Preview the feature table after adding the centroid distances.
train_feats.head()

In [None]:
# Smallest cosine distance to the assigned centroid across the training essays.
train_feats['CosineDistanceToCentroid'].min()

<a id='extra-feature'></a>
## 3.7 Extra features:
Reference: https://www.kaggle.com/code/tsunotsuno/updated-debertav3-lgbm-with-spell-autocorrect

In [None]:
# Install Pyphen and textstat from local wheel files.
# `>> none` appends pip's standard output to a file named `none` in the working directory.
!pip install /kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl >> none
!pip install /kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl >> none

In [None]:
import textstat

from textblob import TextBlob

from collections import Counter
from collections import defaultdict
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
class Preprocessor:
    """Adds essay-level linguistic feature columns to a pandas DataFrame.

    An instance holds the merged stop-word set and a ``SpellChecker``.
    ``run`` is the entry point; the other methods compute single features.
    Methods that divide by a token or character count return 0.0 for empty
    input instead of raising ``ZeroDivisionError``.
    """

    def __init__(self) -> None:
        self.STOP_WORDS = set(final_stopwords_list)
        self.spellchecker = SpellChecker()

    def spelling(self, text):
        """Return how many entries ``SpellChecker.unknown`` reports for the whitespace-split text."""
        # Replace everything that is neither a word character nor whitespace with a space
        text_2 = re.sub(r'[^\w\s]', ' ', text)
        return len(self.spellchecker.unknown(text_2.split()))

    def find_wrong_punctuation(self, text):
        """Count punctuation marks (. , ; ? ! :) immediately followed by an ASCII letter."""
        punctuations = ('.', ',', ';', '?', '!', ':')
        letters = set(string.ascii_letters)
        # Pair every character with its successor; the last character has none
        return sum(
            1
            for current, following in zip(text, text[1:])
            if current in punctuations and following in letters
        )

    def noun_verb_adj_adv_adp_others(self, text):
        """Count spaCy coarse POS tags, grouped into six buckets.

        Returns:
            ``(nouns, verbs, adj, adv, adp_conj, others)``.
        """
        doc = nlp(text)
        pos_counts = doc.count_by(spacy.attrs.POS)
        nouns = 0
        verbs = 0
        adj = 0
        adv = 0
        adp_conj = 0
        others = 0

        for pos_id, count in pos_counts.items():
            pos_tag = doc.vocab.strings[pos_id]
            if pos_tag in ['NOUN', 'PROPN', 'PRON']:
                nouns += count
            elif pos_tag in ['VERB', 'AUX']:
                verbs += count
            elif pos_tag == 'ADJ':
                adj += count
            elif pos_tag == 'ADV':
                adv += count
            # Universal POS tags label conjunctions CCONJ / SCONJ; with only the
            # legacy 'CONJ' listed they were all counted under `others`.
            elif pos_tag in ['ADP', 'CONJ', 'CCONJ', 'SCONJ']:
                adp_conj += count
            else:
                others += count

        return nouns, verbs, adj, adv, adp_conj, others

    def count_sym(self, text, sym):
        """Count the characters of ``text`` that are equal to ``sym``."""
        return sum(1 for character in text if character == sym)

    def lexical_diversity(self,text):
        """Type-token ratio (distinct tokens / tokens) of ``text``; 0.0 when there are no tokens."""
        if not text or not text.strip():
            return 0.0

        # Tokenize the text into words
        words = nltk.word_tokenize(text)
        if not words:
            return 0.0

        # Distinct words (types) over total words (tokens)
        return len(set(words)) / len(words)

    def calculate_collocation_diversity(self,text):
        """Number of scored bigrams divided by the token count; 0.0 when there are no tokens."""
        if not text or not text.strip():
            return 0.0

        tokens = nltk.word_tokenize(text)
        if not tokens:
            return 0.0

        finder = BigramCollocationFinder.from_words(tokens)
        return len(finder.score_ngrams(BigramAssocMeasures.mi_like)) / float(len(tokens))

    def calculate_collocation_strength(self,text):
        """Sum of all bigram ``mi_like`` scores divided by the number of top bigrams (at most 10).

        Returns 0.0 when the text yields no bigrams.
        """
        if not text or not text.strip():
            return 0.0

        tokens = nltk.word_tokenize(text)
        finder = BigramCollocationFinder.from_words(tokens)
        # Score once; `nbest(measure, 10)` is the first ten entries of this same list
        scored = finder.score_ngrams(BigramAssocMeasures.mi_like)
        top_count = len(scored[:10])
        if top_count == 0:
            return 0.0
        return sum(score for bigram, score in scored) / float(top_count)

    @staticmethod
    def _char_ratio(text, char):
        """Share of ``text`` made up of ``char``; 0.0 for empty text."""
        return text.count(char) / len(text) if text else 0.0

    def run(self, data: pd.DataFrame, mode:str) -> pd.DataFrame:
        """Add all extra feature columns to ``data`` in place and return it.

        Args:
            data: Frame with a ``full_text`` column.
            mode: Not used by this method; kept for the existing call sites.

        Returns:
            The same DataFrame object, with the feature columns added.
        """
        # preprocessing the text
        data["processed_text"] = data["full_text"].apply(lambda x: dataPreprocessing_w_contract_punct_remove(x))

        # part-of-speech bucket counts
        data[['nouns', 'verbs', 'adj', 'adv', 'adp_conj', 'others']] = data['processed_text'].apply(lambda x: pd.Series(self.noun_verb_adj_adv_adp_others(x)))

        # distinct word count
        data['distinct_word_count'] = data['processed_text'].apply(lambda x: len(set(word_tokenize(x))))

        # coleman
        data['coleman_liau'] = data['processed_text'].apply(lambda x: textstat.coleman_liau_index(x))

        # smog
        data['smog'] = data['processed_text'].apply(lambda x: textstat.smog_index(x))

        # sentiment
        data['sentiment'] = data['processed_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

        # Text tokenization
        data["text_tokens"] = data["processed_text"].apply(lambda x: word_tokenize(x))

        # lexical diversity (computed on the raw text)
        data['lexical_diversity'] = data['full_text'].apply(lambda x: self.lexical_diversity(x))

        # collocation diversity
        data['collocation_diversity'] = data['processed_text'].apply(lambda x: self.calculate_collocation_diversity(x))

        # collocation strength
        data['collocation_strength'] = data['processed_text'].apply(lambda x: self.calculate_collocation_strength(x))

        # essay length
        data["text_length"] = data["processed_text"].apply(lambda x: len(x))

        # essay word count
        data["word_count"] = data["text_tokens"].apply(lambda x: len(x))

        # essay unique word count
        data["unique_word_count"] = data["text_tokens"].apply(lambda x: len(set(x)))

        # essay sentence count
        data["sentence_count"] = data["full_text"].apply(lambda x: len(x.split('.')))

        # essay paragraph count
        data["paragraph_count"] = data["full_text"].apply(lambda x: len(x.split('\n\n')))

        # count misspelling
        data["splling_err_num"] = data["processed_text"].progress_apply(self.spelling)
        data["splling_err_ratio"] = data["splling_err_num"] / data["text_length"]

        print("Spelling mistake count done")

        # ratio fullstop / text_length ** new
        data["fullstop_ratio"] = data["full_text"].apply(lambda x: self._char_ratio(x, "."))

        # ratio comma / text_length ** new
        data["comma_ratio"] = data["full_text"].apply(lambda x: self._char_ratio(x, ","))

        return data

In [None]:
# Extra linguistic features for the training essays, left-joined onto the existing features.
preprocessor = Preprocessor()
tmp = preprocessor.run(train.to_pandas(), mode="train")
train_feats = train_feats.merge(tmp, on='essay_id', how='left')
feature_names = [column for column in train_feats.columns if column not in ['essay_id','score']]

print('Features Number: ',len(feature_names))
train_feats.head(5)

In [None]:
# print(list(train_feats.columns))

## 3.8 Test dataset featurization

In [None]:
# Paragraph
tmp = Paragraph_Preprocess(test)
test_feats = Paragraph_Eng(tmp)

# Sentence
tmp = Sentence_Preprocess(test)
test_feats = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')

# Word
tmp = Word_Preprocess(test)
test_feats = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

# # Character Tfidf
# processed_text = test.to_pandas()["full_text"].progress_apply(lambda x: dataPreprocessing_w_contract_punct_remove(x))
# test_tfidf = character_tfidf_vectorizer.transform([i for i in processed_text])
# feature_names = character_tfidf_vectorizer.get_feature_names_out()
# test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=[f'tfidf_{name}' for name in feature_names])
# test_tfidf_df['essay_id'] = test_feats['essay_id']
# test_feats = test_feats.merge(test_tfidf_df, on='essay_id', how='left')

# The vectorizer outputs follow the row order of `test`, while `test_feats` is
# sorted by essay_id. Take the ids from `test` so each vector joins to its own essay.
test_essay_ids = test["essay_id"].to_list()

# Processed text is computed once and shared by both vectorizers
processed_text = test.to_pandas()["full_text"].progress_apply(lambda x: dataPreprocessing_w_contract_punct_remove(x))

# Word Tfidf
test_tfidf = tfidf_vectorizer.transform(processed_text.tolist())
feature_names = tfidf_vectorizer.get_feature_names_out()
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=[f'tfidf_{name}' for name in feature_names])
test_tfidf_df['essay_id'] = test_essay_ids
test_feats = test_feats.merge(test_tfidf_df, on='essay_id', how='left')

# Word vectorize
test_count = count_vectorizer.transform(processed_text.tolist())
feature_names = count_vectorizer.get_feature_names_out()
test_count_df = pd.DataFrame(test_count.toarray(), columns=[f'count_{name}' for name in feature_names])
test_count_df['essay_id'] = test_essay_ids
test_feats = test_feats.merge(test_count_df, on='essay_id', how='left')

# Assign every test essay to one of the clusters fitted on the training data.
# An explicit copy avoids adding columns to a slice of `test_feats`.
kmean_test = test_feats[tfidf_w_columns].copy()
labels = kmeans.predict(kmean_test)
centroids = kmeans.cluster_centers_
print(len(centroids))
distances = np.sqrt(((kmean_test - centroids[labels]) ** 2).sum(axis=1))
cosine_distances_to_centroid = [
    cosine_distances([kmean_test.iloc[i]], [centroids[label]])[0][0]
    for i, label in enumerate(labels)
]
# Add the distances to the DataFrame
kmean_test['DistanceToCentroid'] = distances
kmean_test['CosineDistanceToCentroid'] = cosine_distances_to_centroid

test_feats['DistanceToCentroid'] = kmean_test['DistanceToCentroid']
test_feats['CosineDistanceToCentroid'] = kmean_test['CosineDistanceToCentroid']


# Extra feature (`mode` is not used by Preprocessor.run; it is passed only as a label)
tmp = preprocessor.run(test.to_pandas(), mode="test")
test_feats = test_feats.merge(tmp, on='essay_id', how='left')

# Features number
feature_names = list(filter(lambda x: x not in ['essay_id','score'], test_feats.columns))
print('Features number: ',len(feature_names))
test_feats.head()

In [None]:
# Vocabulary (n-gram names) of the fitted word TF-IDF vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_feature_names

In [None]:
# Vocabulary (n-gram names) of the fitted count vectorizer.
count_feature_names = count_vectorizer.get_feature_names_out()
count_feature_names

# 4. Data preparation

## 4.1 Add k-fold details

In [None]:
# Stratified folds on the score; driven by the notebook-wide `seed` rather than
# a second hard-coded literal, so changing the config changes the split too.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Record, for every row, the fold in which it is used for validation.
# `val_index` holds positions; `.loc` is used with them, so this presumably
# relies on `train_feats` having a default 0..n-1 index - TODO confirm.
for i, (_, val_index) in enumerate(skf.split(train_feats, train_feats["score"])):
    train_feats.loc[val_index, "fold"] = i
    
print(train_feats.shape)

## 4.2 Feature selection

In [None]:
# Name of the label column.
target = "score"
# Non-feature columns of the training table (ids, fold, raw/intermediate text) plus the label;
# presumably dropped before fitting - the use is outside this section.
train_drop_columns = ["essay_id", "fold", "full_text", "paragraph", "text_tokens", "processed_text"] + [target]

In [None]:
# Non-feature columns of the test table (id and raw/intermediate text).
test_drop_columns = ["essay_id", "full_text", "paragraph", "text_tokens", "processed_text"]

In [None]:
# (rows, columns) of the training feature table.
train_feats.shape

In [None]:
# (rows, columns) of the test feature table.
test_feats.shape

In [None]:
def sanitize_feature_names(df):
    """Replace every non-word character in the column names with ``_``.

    Column labels are converted to ``str`` first, so non-string labels
    (e.g. integer column names) are handled instead of raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed **in place**.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Notes
    -----
    Distinct original names can collapse to the same sanitized name
    (e.g. ``"a b"`` and ``"a-b"``); no de-duplication is attempted.
    """
    # `\W` is the complement of `\w`, i.e. the same class as `[^\w]`.
    df.columns = [re.sub(r'\W', '_', str(col)) for col in df.columns]
    return df

In [None]:
# Replace non-word characters in the training column names with underscores
# (presumably so the model library accepts them as feature names — confirm).
train_feats = sanitize_feature_names(train_feats)

In [None]:
# Apply the same column-name sanitization to the test frame so its column
# names are rewritten by the same rule as the training frame's.
test_feats = sanitize_feature_names(test_feats)

# 5. Training

## 5.1 Evaluation function and loss function definition 

In [None]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
def quadratic_weighted_kappa(y_true, y_pred):
    """Quadratic weighted kappa between shifted labels and shifted predictions.

    Both inputs are expected on the centred scale (score minus the global
    offset ``a``); ``a`` is added back before rounding. Predictions are
    additionally clipped to the 1..6 range.

    Returns a ``(name, value, flag)`` triple: ``('QWK', kappa, True)``.
    The ``True`` flag follows LightGBM's custom-metric convention of marking
    a higher-is-better metric.
    """
    rounded_true = (y_true + a).round()
    rounded_pred = (y_pred + a).clip(1, 6).round()
    kappa = cohen_kappa_score(rounded_true, rounded_pred, weights="quadratic")
    return 'QWK', kappa, True

def quadratic_weighted_kappa_2(y_true, y_pred):
    """QWK metric for booster-style APIs that call ``metric(predictions, dataset)``.

    The parameter names are kept for backward compatibility, but note their
    actual roles: ``y_pred`` must expose ``get_label()`` (a dataset object
    holding the true labels), which makes ``y_true`` the raw model output.

    The global offset ``a`` is added back to both sides. The model output is
    clipped to the 1..6 range before rounding; previously the clip was applied
    to the labels instead, leaving out-of-range predictions unclipped.

    Returns ``('QWK', kappa, True)``.
    """
    # First positional argument: raw predictions on the centred scale.
    preds = (y_true + a).clip(1, 6).round()
    # Second positional argument: dataset carrying the centred labels.
    labels = (y_pred.get_label() + a).round()
    qwk = cohen_kappa_score(labels, preds, weights="quadratic")
    return 'QWK', qwk, True

def qwk_obj(y_true, y_pred):
    """Custom objective returning ``(grad, hess)`` for a QWK-style loss.

    Labels and predictions arrive on the centred scale; the global offset
    ``a`` is added back and predictions are clipped to 1..6. With

        f = 0.5 * sum((pred - label)^2)
        g = 0.5 * sum((pred - a)^2 + b)

    the gradient is that of ``f / g`` with respect to each prediction,
    multiplied by the number of samples. The Hessian is a vector of ones.
    ``a`` and ``b`` are read from module-level globals.
    """
    target = y_true + a
    clipped = (y_pred + a).clip(1, 6)
    n_samples = len(target)

    residual = clipped - target   # d f / d pred
    centred = clipped - a         # d g / d pred

    f = 0.5 * np.sum(residual**2)
    g = 0.5 * np.sum(centred**2 + b)

    grad = (residual/g - f*centred/g**2) * n_samples
    hess = np.ones(n_samples)

    return grad, hess

def qwk_param_calc(y):
    """Return ``(a, b)`` for the QWK objective, each rounded to 4 decimals.

    ``a`` is the mean of ``y``; ``b`` is the mean of ``y**2`` minus ``a**2``.
    """
    mean_score = y.mean()
    spread = (y ** 2).mean() - mean_score**2

    return tuple(np.round(value, 4) for value in (mean_score, spread))

## 5.2 Training LGBMRegressor model

### **K-fold**

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.linear_model import LinearRegression
from sklearn.metrics import ConfusionMatrixDisplay
import lightgbm as lgb
import xgboost as xgb

In [None]:
# Training scores as a float32 NumPy array (displayed for inspection).
actual_score = train_feats['score'].astype(np.float32).values
actual_score

In [None]:
# One fitted model per fold, in fold order.
models = []

# Log every 25 rounds; stop when the first metric has not improved for 75 rounds.
callbacks = [
    lgb.log_evaluation(period=25),
    lgb.early_stopping(stopping_rounds=75, first_metric_only=True),
]

# Hyper-parameters shared by every fold's regressor.
lgb_params = dict(
    objective=qwk_obj,
    metrics='None',
    learning_rate=0.05,
    max_depth=5,
    num_leaves=10,
    colsample_bytree=0.3,
    reg_alpha=0.1,
    reg_lambda=0.8,
    n_estimators=1024,
    random_state=42,
    extra_trees=True,
    verbosity=-1,
)

for fold in range(n_splits):
    model_lgb = lgb.LGBMRegressor(**lgb_params)

    # Rows outside the current fold train the model; rows inside validate it.
    train_part = train_feats[train_feats["fold"] != fold]
    valid_part = train_feats[train_feats["fold"] == fold]

    # `a` and `b` are module-level on purpose: qwk_obj and
    # quadratic_weighted_kappa read them as globals during fitting,
    # so they must be set before `fit` is called.
    a, b = qwk_param_calc(train_part["score"])

    # Features, and labels centred by the training-fold mean `a`.
    X_train = train_part.drop(columns=train_drop_columns)
    y_train = train_part["score"] - a

    X_eval = valid_part.drop(columns=train_drop_columns)
    y_eval = valid_part["score"] - a

    print('\nFold_{} Training ================================\n'.format(fold+1))
    print(f"Fold {fold} a: {a}  ;;  b: {b}")

    # Weight samples inversely to the frequency of their score value.
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    # Training model
    lgb_model = model_lgb.fit(
        X_train,
        y_train,
        sample_weight=sample_weights,
        eval_names=['train', 'valid'],
        eval_set=[(X_train, y_train), (X_eval, y_eval)],
        eval_metric=quadratic_weighted_kappa,
        callbacks=callbacks,
    )

    models.append(lgb_model)


## 5.3 Validating LGBMRegressor model

In [None]:

# Out-of-fold evaluation: each model predicts only the fold it did not train on.
preds, trues = [], []

for fold, model in enumerate(models):
    X_eval_cv = train_feats[train_feats["fold"] == fold].drop(columns=train_drop_columns)
    y_eval_cv = train_feats[train_feats["fold"] == fold]["score"]

    # Each model was trained on labels centred by the mean of *its own*
    # training folds. Recompute that offset here instead of reusing the global
    # `a`, which only holds the value from the last trained fold.
    fold_offset, _ = qwk_param_calc(train_feats[train_feats["fold"] != fold]["score"])

    # Shift back to the score scale and keep predictions inside 1..6.
    pred = np.clip(model.predict(X_eval_cv) + fold_offset, 1, 6)

    trues.extend(y_eval_cv)
    preds.extend(np.round(pred, 0))

    # Cumulative score over all folds evaluated so far (not this fold alone).
    v_score = cohen_kappa_score(trues, preds, weights="quadratic")
    print(f"Validation score {fold} : {v_score}")

v_score = cohen_kappa_score(trues, preds, weights="quadratic")
print(f"Validation score : {v_score}")

In [None]:
# Score classes shown on both axes of the confusion matrix.
score_labels = list(range(1, 7))

cm = confusion_matrix(trues, preds, labels=score_labels)

# Displaying the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=score_labels)
disp.plot()
plt.show()


In [None]:
# Two-column table of out-of-fold results: one row per essay.
model_prec = pd.DataFrame([trues, preds]).T.rename(columns={0: 'trues', 1: 'preds'})
model_prec

In [None]:
def analyze_preds(trues, preds):
    """Print per-score accuracy and plot the prediction distribution per true score.

    Parameters
    ----------
    trues : sequence
        True scores.
    preds : sequence
        Predicted scores, same length as ``trues``.

    Prints one row per true score with its exact-match rate, exact-match count
    and total count, then draws a 2x3 grid of bar charts (true scores 1..6)
    showing how often each value was predicted. The bar where the prediction
    equals the true score is highlighted in orange; if that value was never
    predicted for a score (or the score has no samples) nothing is highlighted
    instead of raising ``KeyError``.
    """
    # Create dataframe
    model_prec = pd.DataFrame([trues, preds]).T

    model_prec.rename(columns = {0: 'trues', 1: 'preds'}, inplace=True)
    model_prec['correct'] = model_prec['trues'] == model_prec['preds']
    model_prec['count'] = model_prec.groupby('trues')['trues'].transform('count')
    model_prec['correct_count'] = model_prec.groupby('trues')['correct'].transform('sum')
    model_prec['correct_rate'] = model_prec['correct_count'] / model_prec['count']

    # Print binary correction rate
    print(model_prec[['trues', 'correct_rate', 'correct_count', 'count']].drop_duplicates().sort_values(by='trues'))

    # Plot predictions by score
    def plot_model(ax, counts, true):
        bars = ax.bar(counts.index, counts.values, color='skyblue')

        # Highlight the bar for the correct prediction, when there is one.
        if true in counts.index:
            bars[counts.index.get_loc(true)].set_color('orange')

        ax.set_xlabel('Predicted Values')
        ax.set_ylabel('Count')
        ax.set_title("Score " + str(true))

    score_list = [1,2,3,4,5,6]

    # Create a figure and six subplots arranged in a 2x3 grid; `axs.flat`
    # walks the grid row by row, matching the order of score_list.
    fig, axs = plt.subplots(2, 3, figsize=(15, 10))
    for ax, score in zip(axs.flat, score_list):
        pred_counts = model_prec[model_prec['trues'] == score]['preds'].value_counts()
        plot_model(ax, pred_counts, score)
    
analyze_preds(trues, preds)


### **Classification with thresholds**

In [None]:
# import numpy as np


# optimal_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]

# def classify_with_thresholds(predicted_value, optimal_thresholds):


#     new_predicted_value = np.empty_like(predicted_value)
    
#     for i, threshold in enumerate(optimal_thresholds):
#         if i == 0:
#             new_predicted_value[predicted_value < threshold] = 1

#         else:
#             new_predicted_value[(predicted_value >= optimal_thresholds[i-1]) & (predicted_value < threshold)] = i + 1

#         if i == 4:
#             new_predicted_value[predicted_value >= threshold] = 6
            
#     return new_predicted_value.astype(int)
            
# new_predicted_value = classify_with_thresholds(preds, optimal_thresholds)
    
# print("New value")
# print(new_predicted_value)

In [None]:
# optimal_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]
# best_kappa_score = v_score

# for i in range(5):
#     i_threshold = optimal_thresholds[i] - 0.5
    
#     for j in range(1000):
#         thresholds = [x for x in optimal_thresholds]
#         thresholds[i] = i_threshold + j / 1000
        
#         classified_oof = classify_with_thresholds(preds, thresholds)
#         threshold_kappa_score = cohen_kappa_score(trues, classified_oof, weights='quadratic')

#         if threshold_kappa_score >= best_kappa_score:
#             best_kappa_score = threshold_kappa_score
#             optimal_thresholds[i] = thresholds[i]
# optimal_thresholds

In [None]:
# new_predicted_value = classify_with_thresholds(preds, optimal_thresholds)
    
# print("New value")
# print(new_predicted_value)

In [None]:
# analyze_preds(trues, new_predicted_value)

## 5.4 Testing and collecting prediction

In [None]:
test_drop_columns

In [None]:
test_feats.head()

In [None]:
# Predict the test set with every fold model.
preds = []

# The feature matrix is identical for every model, so build it once.
X_eval_cv = test_feats.drop(columns=test_drop_columns)

for fold, model in enumerate(models):
    print(X_eval_cv.shape)

    # Each model was trained on labels centred by the mean of its own training
    # folds. Recompute that offset instead of reusing the global `a`, which
    # only holds the value from the last trained fold.
    fold_offset, _ = qwk_param_calc(train_feats[train_feats["fold"] != fold]["score"])

    # Shift back to the score scale and keep predictions inside 1..6.
    pred = np.clip(model.predict(X_eval_cv) + fold_offset, 1, 6)

    preds.append(pred)

In [None]:
# Store each fold model's prediction in its own column.
for i, pred in enumerate(preds):
    test_feats[f"score_pred_{i}"] = pred

# Final score: mean of the per-fold predictions, rounded to the nearest integer.
fold_pred_columns = [f"score_pred_{fold}" for fold in range(n_splits)]
mean_pred = test_feats[fold_pred_columns].mean(axis=1)
test_feats["score"] = np.round(mean_pred, 0).astype('int32')

# new_predicts = classify_with_thresholds(test_predicts, optimal_thresholds)

In [None]:
# new_predicts

In [None]:
# test_predicts

In [None]:

# test_feats["score"] = test_predicts

In [None]:
test_feats.head()

# 6. Submission

#### 

In [None]:
# Write the two submission columns to CSV without the index.
submission = test_feats[["essay_id", "score"]]
submission.to_csv("submission.csv", index=False)


In [None]:

# Plot the first tree (tree_index=0) of the first fold's model.
ax = lgb.plot_tree(
    models[0],
    tree_index=0,
    figsize=(20, 8),
    dpi=300,
)
plt.show()

In [None]:

# Plot the first tree (tree_index=0) of every fold's model, one figure each.
for model in models:
    ax = lgb.plot_tree(
        model,
        tree_index=0,
        figsize=(20, 8),
        dpi=300,
    )
    plt.show()


In [None]:
# Feature importance of the first fold's model, using importance_type="split".
ax = lgb.plot_importance(
    models[0],
    figsize=(20, 300),
    importance_type="split",
)
plt.show()

In [None]:
# ax = lgb.plot_split_value_histogram(models[0], figsize=(20, 8), feature='splling_err_num')
# plt.show()

In [None]:
# Feature importance of the first fold's model, using importance_type="gain".
ax = lgb.plot_importance(
    models[0],
    figsize=(20, 100),
    importance_type="gain",
)
plt.show()