In [None]:
import polars as pl

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the competition training set from the Kaggle input directory.
# Later cells read its `full_text` and `score` columns.
train = pl.read_csv(
    "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
)

# Score vs. Text Length and Word Count

In [None]:
# Add two length features per essay:
#   text_length - number of characters in `full_text`
#   word_length - number of whitespace-separated tokens in `full_text`
#
# The token count uses the native regex expression instead of a Python lambda
# passed to `map_elements`, so the work stays inside the Polars engine rather
# than calling back into Python once per row. `\S+` matches each maximal run
# of non-whitespace characters, which is what `len(x.split())` counts.
# `count_matches` returns UInt32; the cast keeps the column Int64 as before.
train = train.with_columns(
    pl.col("full_text").str.len_chars().alias("text_length"),
    pl.col("full_text")
    .str.count_matches(r"\S+")
    .cast(pl.Int64)
    .alias("word_length"),
)

In [None]:
# Character count of each essay against its score; alpha=0.5 makes
# overlapping points visible as darker regions.
sns.scatterplot(data=train.to_pandas(), x="text_length", y="score", alpha=0.5);

In [None]:
# Word count of each essay against its score; alpha=0.5 makes
# overlapping points visible as darker regions.
sns.scatterplot(data=train.to_pandas(), x="word_length", y="score", alpha=0.5);

In [None]:
# Distribution of essay character counts within each score level.
sns.boxplot(data=train.to_pandas(), x="score", y="text_length");

In [None]:
# Distribution of essay word counts within each score level.
sns.boxplot(data=train.to_pandas(), x="score", y="word_length");

# Word Cloud (All Text)

In [None]:
# Concatenate every essay into one space-separated string and render a
# single word cloud from it.
full_text_combined = " ".join(train["full_text"].to_list())

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
).generate(full_text_combined)

# Show the cloud as an image; the axes carry no information, so hide them.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.axis("off");

# Word Cloud (Text per Score)

In [None]:
# One word cloud per score level (1-6), laid out on a 3x2 grid.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 6))

# `axes.flat` walks the grid row by row, so score 1 lands top-left,
# score 2 top-right, score 3 on the second row, and so on.
for score, ax in enumerate(axes.flat, start=1):
    # Join all essays that received this score into one string.
    essays_for_score = train.filter(pl.col("score") == score)["full_text"].to_list()
    subset_text = " ".join(essays_for_score)

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
    ).generate(subset_text)

    ax.imshow(wordcloud)
    ax.axis("off")
    ax.set_title(f"Word Cloud for Score {score}")

plt.tight_layout()
plt.show()

# Topic Modeling

In [None]:
# Build a document-term matrix of token counts with CountVectorizer.
# max_df=0.95 drops terms found in more than 95% of essays, min_df=2 drops
# terms found in fewer than 2 essays, and scikit-learn's built-in English
# stop-word list is removed.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(train['full_text'])

# Fit an LDA model with 5 topics; random_state is fixed so re-runs give the
# same topics.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(dtm)

# Fetch the vocabulary once. get_feature_names_out() builds a fresh array on
# every call, so calling it inside the loop repeated that work for every
# term of every topic.
feature_names = cv.get_feature_names_out()

# For each topic keep its 10 highest-weighted terms. argsort is ascending,
# so each list runs from the 10th-ranked term up to the top-ranked term.
topics = {
    index: feature_names[topic.argsort()[-10:]].tolist()
    for index, topic in enumerate(LDA.components_)
}

topics

The topic names below were generated by GPT-4:

- 0: Planetary Geography and Exploration
- 1: Electoral Politics
- 2: Environmental Concerns
- 3: Educational Technology in the Classroom
- 4: Automotive Technology and Innovation

# Score by Dominant Topic

In [None]:
# Score every document against the fitted topics, then label each document
# with the index of its highest-weighted topic.
topic_results = LDA.transform(dtm)
dominant_topic = pl.Series("dominant_topic", topic_results.argmax(axis=1))
train = train.with_columns(dominant_topic)

In [None]:
# Box plot of essay score for each LDA dominant topic.
# Human-readable names for topic ids 0-4, in id order.
topic_labels = [
    'Planetary Geography and Exploration',
    'Electoral Politics',
    'Environmental Concerns',
    'Educational Technology in the Classroom',
    'Automotive Technology and Innovation',
]

plt.figure(figsize=(12, 6))
# Tick labels are applied by position, so `order` pins position i to topic
# id i. Without it seaborn lays out only the topic ids present in the data,
# and the names would shift onto the wrong boxes if any topic were never
# dominant.
sns.boxplot(
    data=train.to_pandas(),
    x='dominant_topic',
    y='score',
    order=list(range(len(topic_labels))),
)
plt.title('Score Distribution by Dominant Topic')
plt.xlabel('Topic')
plt.ylabel('Score')
# ha='right' anchors the end of each rotated label at its tick so long names
# line up with their box.
plt.xticks(
    np.arange(len(topic_labels)),
    topic_labels,
    rotation=45,
    ha='right',
);

In [None]:
# Violin plot of essay score for each LDA dominant topic.
# Human-readable names for topic ids 0-4, in id order.
topic_labels = [
    'Planetary Geography and Exploration',
    'Electoral Politics',
    'Environmental Concerns',
    'Educational Technology in the Classroom',
    'Automotive Technology and Innovation',
]

plt.figure(figsize=(12, 6))
# Tick labels are applied by position, so `order` pins position i to topic
# id i. Without it seaborn lays out only the topic ids present in the data,
# and the names would shift onto the wrong violins if any topic were never
# dominant.
sns.violinplot(
    data=train.to_pandas(),
    x='dominant_topic',
    y='score',
    order=list(range(len(topic_labels))),
)
plt.title('Score Distribution by Dominant Topic')
plt.xlabel('Topic')
plt.ylabel('Score')
# ha='right' anchors the end of each rotated label at its tick so long names
# line up with their violin.
plt.xticks(
    np.arange(len(topic_labels)),
    topic_labels,
    rotation=45,
    ha='right',
);