Hi everyone!  
I would be very happy if my notebook is helpful to someone.  
If you like it, please read my EDA and text clustering notebook.

# 1. Import libraries and load data  
First, I import the libraries needed to load and analyze the dataset.

In [None]:
from collections import Counter
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import warnings

# Suppress every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation/usage warnings emitted by pandas, seaborn
# and scikit-learn - consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

Create a class to collect paths.

In [None]:
class PATHS:
    '''Namespace collecting the CSV file locations read by this notebook.'''
    # Training essays
    train_path: str = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
    # Test essays
    test_path: str = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
    # Example submission file
    sample_sub_path: str = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv'

Load the training dataset and take a quick look at it.

In [None]:
# Read the training CSV, then show its first rows and its transposed summary
# statistics (each followed by an empty line) before reporting the shape.
train = pd.read_csv(PATHS.train_path)
for preview in (train.head(), train.describe().T):
    display(preview)
    print()
print(f'Shape of training dataset: {train.shape}')

Load sample submission file and check it.

In [None]:
# Read the sample submission file; the bare name on the last line makes the
# notebook render the whole frame as the cell output.
sample_submission = pd.read_csv(PATHS.sample_sub_path)
sample_submission

Load the test data and check it.  
The real test set is swapped in during inference; the public file is just a placeholder whose content matches the first three rows of the training data.

In [None]:
# Read the test CSV; the bare name on the last line makes the notebook render it.
test = pd.read_csv(PATHS.test_path)
test

# 2. Create basic features  
Second, I create basic features from text.

In [None]:
def count_stopwords(text, stopwords):
    '''Count the whitespace-separated tokens of ``text`` that are stopwords.

    Matching is exact and case-sensitive: a token only counts when it equals an
    entry of ``stopwords`` character for character, so a capitalised word or a
    word with attached punctuation does not match a lowercase entry.

    Args:
        text: String to inspect.
        stopwords: Iterable of stopword strings (list, set, ...).

    Returns:
        Number of tokens in ``text`` found in ``stopwords``.
    '''
    # A set gives O(1) membership tests; scanning a list costs O(len(stopwords)) per token.
    if isinstance(stopwords, (set, frozenset)):
        lookup = stopwords
    else:
        lookup = set(stopwords)
    return sum(1 for token in text.split() if token in lookup)

def create_features(df, stopwords):
    '''Add basic text-statistic columns computed from ``full_text`` to ``df``.

    Columns are added in this order: letters, words, unique_words, sentences,
    paragraph, stopwords, not_stopwords. ``df`` is modified in place and the
    same object is returned.
    '''
    # Column name -> measure applied to each essay string.
    per_essay = {
        # Number of characters
        'letters': len,
        # Number of whitespace-separated tokens
        'words': lambda essay: len(essay.split()),
        # Number of distinct tokens
        'unique_words': lambda essay: len(set(essay.split())),
        # Number of pieces after splitting on '.' (one more than the number of periods)
        'sentences': lambda essay: len(essay.split('.')),
        # Number of pieces after splitting on a blank line
        'paragraph': lambda essay: len(essay.split('\n\n')),
    }
    for column, measure in per_essay.items():
        df[column] = df['full_text'].apply(measure)

    # Tokens that are stopwords, and the remainder of the word count.
    df['stopwords'] = df['full_text'].apply(lambda essay: count_stopwords(essay, stopwords))
    df['not_stopwords'] = df['words'] - df['stopwords']
    return df

Download the stopwords and apply the function defined above to the training dataset.

In [None]:
# Make sure NLTK's stopword corpus is available locally.
nltk.download('stopwords')
# English stopword list, passed to create_features below.
stop_words = stopwords.words('english')

# NOTE: create_features adds its columns to `train` in place and returns that same
# frame, so `data` and `train` refer to one DataFrame from here on.
data = create_features(train, stop_words)
data

# 3. Check distribution and Plot data 
First, I check the distribution of score.

In [None]:
# Number of essays per score value, ordered by score rather than by frequency.
data['score'].value_counts().sort_index()

In [None]:
# Histogram of the number of essays per score.
plt.figure(figsize=(5, 3))
plt.title('Distribution of score')
# discrete=True draws one unit-width bar centred on each score value instead of
# letting the automatic binning split the few distinct scores into arbitrary bins.
sns.histplot(data=data, x='score', discrete=True)

Second, check the distribution of each feature.

In [None]:
# One KDE panel per engineered numeric feature, with a separate curve per score.
num_cols = 2
# Pick the features by dtype/name instead of by position (columns[3:]), so the cell
# still works when it is re-run after later cells have added text or label columns.
excluded = {'essay_id', 'full_text', 'score', 'group'}
feature_cols = [
    col for col in data.select_dtypes(include='number').columns
    if col not in excluded
]
# Ceiling division: always enough rows for every feature (at least one row).
num_rows = max(1, -(-len(feature_cols) // num_cols))
# 5 inches of height per row keeps the original 20x20 size for a 4-row grid.
plt.figure(figsize=(20, 5 * num_rows))
for i, col in enumerate(feature_cols, start=1):
    plt.subplot(num_rows, num_cols, i)
    sns.kdeplot(data=data, x=col, hue='score', palette='bright', fill=False)
    plt.title(col, fontsize=20)

plt.tight_layout()
plt.show()

Next, draw pairwise scatter plots of the features.

In [None]:
# Pairwise scatter plots of score-coloured features.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call is made first: it would only leave an empty extra figure
# in the output and would not affect the size of the pair plot.
sns.pairplot(data=data.drop(columns=['essay_id', 'full_text']), hue='score', palette='bright')

Lastly, check the correlation between the features.

In [None]:
# Pairwise correlation (pandas default method) between score and the engineered
# features; the id and raw-text columns are left out.
data.drop(columns=['essay_id', 'full_text']).corr()

In [None]:
# Same correlation matrix as above, drawn as an annotated heatmap
# (two decimals per cell, thin separator lines between cells).
corr_matrix = data.drop(columns=['essay_id', 'full_text']).corr()
plt.figure(figsize=(5, 4))
sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", linewidth=.5)

# 4. Text Clustering to get prompt_name
The training data does not have `prompt_name`, so I try text clustering with KMeans to recover it.  
The text preprocessing and feature creation part is based on this great notebook.  
[https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments](https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments)

In [None]:
def removeHTML(x):
    '''Return ``x`` with every ``<...>`` span (shortest match, single line) removed.'''
    return re.sub(r'<.*?>', r'', x)
def dataPreprocessing(x):
    '''Normalise one raw essay string before stopword removal and vectorisation.

    Steps, in order: lowercase, strip ``<...>`` markup, delete URLs, delete
    ``@mentions``, delete numbers, collapse whitespace, collapse repeated
    periods/commas, trim both ends.

    Args:
        x: Raw text.

    Returns:
        The cleaned text.
    '''
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete URLs. This runs before the number step so the URL is still intact,
    # and it consumes the whole address up to the next whitespace (a pattern
    # that only matches "http" plus word characters leaves "://host/path" behind).
    x = re.sub(r"https?://\S+|www\.\S+", '', x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete numbers (first those written with a leading apostrophe, then the rest)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive periods and commas with a single period / comma
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove whitespace at the beginning and end
    x = x.strip()
    return x

def remove_stopwords(df, stopwords):
    '''Drop stopword tokens from every entry of ``df['cleaned_text']``.

    Each text is split on whitespace, tokens equal to an entry of ``stopwords``
    are discarded, and the remaining tokens are re-joined with single spaces.

    Args:
        df: DataFrame with a string column ``cleaned_text``.
        stopwords: Iterable of stopword strings.

    Returns:
        The same ``df`` object, with ``cleaned_text`` overwritten in place.
    '''
    # Build the lookup once: set membership is O(1), a list scan is O(len(stopwords)) per token.
    lookup = set(stopwords)
    df['cleaned_text'] = [
        ' '.join(token for token in text.split() if token not in lookup)
        for text in df['cleaned_text']
    ]
    return df

In [None]:
# Clean every essay with dataPreprocessing and keep the result in a new column ...
data['cleaned_text'] = data['full_text'].apply(dataPreprocessing)
# ... then drop the stopword tokens from that column.
data = remove_stopwords(data, stop_words)

In [None]:
# TF-IDF over 1- to 3-grams, keeping terms that occur in at least 5% and at most
# 95% of the essays, with sublinear term-frequency scaling.
# NOTE(review): `tokenizer` and `preprocessor` are identity functions, yet every
# document handed to fit_transform below is a plain string. Iterating a string
# yields characters, so the n-grams are presumably built from single characters
# rather than words - confirm with vectorizer.get_feature_names_out().
# NOTE(review): with a custom `preprocessor`, `strip_accents` is presumably
# ignored - verify against the TfidfVectorizer documentation.
# Do not change the tokenisation casually: the cluster ids produced further down
# are labelled by hand, and different features would yield different clusters.
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(1,3),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)

# Learn the vocabulary from the cleaned essays and build the document-term matrix.
train_tfid = vectorizer.fit_transform([i for i in data['cleaned_text']])

In [None]:
# Elbow method: fit KMeans for k = 1..10 on the TF-IDF matrix and plot the
# inertia (within-cluster sum of squared distances) against k.
num_k = range(1, 11)
# Densify once instead of rebuilding the dense array on every iteration.
tfid_dense = train_tfid.toarray()
inertia_values = []
for k in num_k:
    # Fixed seed (the one used for the final model) so the curve is reproducible.
    kmeans = KMeans(n_clusters=k, random_state=1278).fit(tfid_dense)
    inertia_values.append(kmeans.inertia_)

plt.figure(figsize=(6, 4))
# Reuse num_k for the x axis so the two ranges cannot drift apart.
x = list(num_k)
plt.plot(x, inertia_values)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Distance from cluster: Elbow curve')
plt.show()

In [None]:
# Final clustering: seven clusters with a fixed seed. The model is fitted on the
# dense TF-IDF matrix and each essay's predicted cluster id is stored in `group`.
n_clusters = 7
tfid_dense = train_tfid.toarray()
kmeans = KMeans(n_clusters=n_clusters, random_state=1278)
kmeans.fit(tfid_dense)
labels = kmeans.predict(tfid_dense)
data['group'] = labels

In [None]:
# Number of essays per cluster, as a two-column frame: `group`, `count`.
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas versions; the plot below
# relies on exactly these two names.
plot_data = (
    data['group']
    .value_counts()
    .sort_index()
    .rename_axis('group')
    .reset_index(name='count')
)
plt.figure(figsize=(6, 4))
sns.barplot(data=plot_data, x='group', y='count')
plt.title('A number of data of each groups')

I think the title of each group is as follows.  
  
Group0: Car-free cities  
Group1: Exploring Venus  
Group2: Does the electoral college work?  
Group3: The Face on Mars  
Group4: "A Cowboy Who Rode the Waves"    
Group5: Driverless cars    
Group6: Facial action coding system  

In [None]:
# Print up to two example essays per cluster so each cluster can be matched to a prompt.
# The cluster ids come from the data instead of a hard-coded range(7).
for i in sorted(data['group'].unique()):
    print(f'================================== group_{i} ==================================')
    print()
    # head(2) yields at most two essays, so a cluster with fewer than two
    # members does not raise an IndexError.
    for essay in data.loc[data['group'] == i, 'full_text'].head(2):
        print(essay)
        print()
        print('----------------------------------------------------------------------------------')
    print()

In [None]:
# Cluster id -> prompt title, assigned by reading sample essays of each cluster.
# Every cluster has its own prompt: group 0 is 'Car-free cities' (it must not
# repeat the title of group 1).
prompt_mapping = {
    0: 'Car-free cities',
    1: 'Exploring Venus',
    2: 'Does the electoral college work?',
    3: 'The Face on Mars',
    4: '"A Cowboy Who Rode the Waves"',
    5: 'Driverless cars',
    6: 'Facial action coding system',
}

# Translate each numeric cluster id into its prompt title
# (ids missing from the mapping would become NaN).
data['prompt_name'] = data['group'].map(prompt_mapping)

In [None]:
# Keep only the id, text and score columns plus the recovered prompt labels;
# every other column of `data` is dropped here.
data = data[['essay_id', 'full_text', 'score', 'prompt_name', 'group']]
# Write the labelled training set to the working directory, without the index column.
data.to_csv('train_containing_groups.csv', index=False)
data.head()

I really appreciate that you read my notebook till the end.    
Let's enjoy Kaggle!!