In [None]:
%%html
<!-- Pin the left margin of every rendered table to 0; !important lets this rule override other style rules. -->
<style>
  table {margin-left: 0 !important;}
</style>

# AES2: What are the essays about?

We know that **12871** of the 17307 Kaggle training essays are from Persuade 2.0 (total=25992). These 12871 essays are exact (100%) text matches. A small number of other essays only match by similarity; they are not taken into account here.

Below is a quick EDA using topic modeling to learn what the essays are about, separately for the Kaggle-only essays, the Persuade-only essays, and the essays present in both.


| Dataset      | Dataset             | Dataset               |
| -----------  | ------------------- | --------------------- |
|  Kaggle-only (4436) | Kaggle + Persuade (12871)   | Persuade-only (13125)         |

Note: _Persuade comes with `prompt_name` and `assignment` details that give information about the topic._

Kaggle training dataset is about:
- Solar planets exploration (Venus conditions, origin of Mars).
- Driverless cars: safety, cost and legal issues.
- Voting/Election.
- Emotions/facial expressions recognition.
- ...


Some topics in the full Persuade 2.0 are not available in the Kaggle training data. 

Persuade-only training dataset is about:
- Cell phone use while driving.
- Student extracurricular activities.
- Asking advice/opinion.
- Cell phone use in school.
- Student summer project.
- ...

Finally, some topics are available in both datasets (Kaggle + Persuade):
- Online course/classes.
- Limiting car usage for cleaner air.
- Community services benefits.


In [None]:
!pip install sentence-transformers==2.2.2
!pip install InstructorEmbedding
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install bertopic
!pip install openai
!pip install ctransformers[cuda]

In [None]:
import os, sys, gc, re, json
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import random, math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import transformers
import torch
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from InstructorEmbedding import INSTRUCTOR
from bertopic import BERTopic
from wordcloud import WordCloud
import openai
import hashlib
from bertopic.representation import OpenAI
print("Pytorch:", torch.__version__)
print("Transformers:", transformers.__version__)

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices), exports ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning (benchmark=True) may select a different convolution algorithm on each
    # run, which defeats the deterministic flag set above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False
    
def generate_id(txt):
    """Return the hexadecimal MD5 digest of ``txt``.

    The string is encoded with the default ``str.encode()`` codec before being
    hashed, so identical texts always map to the same 32-character id.

    Args:
        txt: Text to fingerprint.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.
    """
    return hashlib.md5(txt.encode()).hexdigest()

def cleanup(txt):
    """Collapse an essay onto a single line with single spaces.

    Leading/trailing whitespace is stripped, line feeds become spaces and every
    remaining run of whitespace is reduced to one space.

    Args:
        txt: Text to normalise, or ``None``.

    Returns:
        The normalised text, or ``None`` when ``txt`` is ``None``.
    """
    if txt is None:
        return txt
    # Line feeds are turned into spaces first, then whitespace runs are squeezed.
    single_line = txt.strip().replace('\n', ' ')
    return re.sub(r"\s+", " ", single_line)

In [None]:
# Reproducibility: a single seed shared by every stochastic step of the notebook.
SEED = 42
seed_everything(SEED)

# Input files (Kaggle dataset mounts).
DATA_HOME = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
TRAIN_FILE = os.path.join(DATA_HOME, "train.csv")
PERSUADE2_FILE = '/kaggle/input/persuade-2-0/persuade_2.0_human_scores_demo_id_github.csv'

# Load data and extract source

In [None]:
# Kaggle essays. The uid hashes the raw text, so it is computed before cleanup()
# rewrites full_text.
train_pd = pd.read_csv(TRAIN_FILE)
train_pd["uid"] = train_pd["full_text"].apply(generate_id)
train_pd["full_text"] = train_pd["full_text"].apply(cleanup)
train_pd["src"] = "kaggle-only"

# Persuade 2.0 essays, reduced to the columns used here and renamed to the Kaggle names.
persuade_columns = ["essay_id_comp", "full_text", "holistic_essay_score", "prompt_name", "assignment"]
extra = pd.read_csv(PERSUADE2_FILE)[persuade_columns]
extra = extra.rename(columns={'essay_id_comp': 'essay_id', 'holistic_essay_score': 'score'})
extra["uid"] = extra["full_text"].apply(generate_id)
extra["src"] = "persuade-only"
extra["full_text"] = extra["full_text"].apply(cleanup)

# Left-join Persuade onto Kaggle by uid: a Persuade row without a Kaggle partner has a
# missing essay_id_y (the Kaggle-side essay_id after the merge suffixes are applied).
common = extra.merge(train_pd, on="uid", how="left")
unmatched = common["essay_id_y"].isna()
new_items = common.loc[unmatched, "uid"].unique()
common_items = common.loc[~unmatched, "uid"].unique()
print("Extra:", extra.shape, "New items:", len(new_items), "Common items:", len(common_items))

# Append the Persuade-only essays, then relabel the essays found in both sources.
train_pd = pd.concat([train_pd, extra[extra["uid"].isin(new_items)]], ignore_index=True)
train_pd.loc[train_pd["uid"].isin(common_items), "src"] = "kaggle-persuade"

# Some stats
# full_text has already been through cleanup(), which leaves no line feed and only
# single spaces, so the text is split once on " " and the token list is reused for
# the word count instead of replacing newlines and splitting a second time.
train_pd["words"] = train_pd["full_text"].str.split(" ")
train_pd["total_words"] = train_pd["words"].str.len()
# Essay length in characters.
train_pd["length"] = train_pd["full_text"].str.len()

print("Data:", train_pd.shape)
train_pd

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(32, 4))
score_ax, length_ax = ax

# Left panel: number of essays per score, one bar group per source, each bar labelled.
sns.countplot(data=train_pd, x="score", hue="src", ax=score_ax)
for bars in score_ax.containers:
    score_ax.bar_label(bars, fmt='%d', fontsize=9)
kaggle_rows = train_pd[train_pd["src"].str.contains("kaggle")]
score_ax.set_title("Label breakdown per source, total=%d. Kaggle=%d" % (len(train_pd), len(kaggle_rows)))

# Right panel: distribution of the word count, split by source.
sns.histplot(data=train_pd, x="total_words", hue="src", ax=length_ax);

In [None]:
# One frame per provenance label; reset_index() keeps the original row label in an "index" column.
kaggle_only, kaggle_persuade, persuade_only = (
    train_pd[train_pd["src"].str.contains(label)].reset_index()
    for label in ("kaggle-only", "kaggle-persuade", "persuade-only")
)
kaggle_only.shape, kaggle_persuade.shape, persuade_only.shape

# Word cloud

In [None]:
def display_wordcloud(df, min_word_length=5):
    """Draw a word cloud of the most frequent long words of a set of essays.

    Words are lower-cased before counting, and only words strictly longer than
    ``min_word_length`` characters are counted.

    Args:
        df: DataFrame with a ``words`` column holding one list of tokens per essay.
        min_word_length: A word is counted only when ``len(word) > min_word_length``.
            The default (5) reproduces the original hard-coded threshold.
    """
    from collections import Counter

    # Single pass over all tokens. The previous version also copied every token into a
    # list that was never read, which only cost memory.
    frequency = Counter(
        word.lower()
        for tokens in df["words"].values
        for word in tokens
        if len(word) > min_word_length
    )

    fig, ax = plt.subplots(1, 1, figsize=(16, 7))

    # Generate wordclouds
    cloud_freq = WordCloud(
        background_color="white",
        contour_width=3,
        contour_color='steelblue',
        width=800, height=400,
        max_words=1000,
        random_state=SEED,
    ).generate_from_frequencies(frequency)

    ax.imshow(cloud_freq, interpolation="bilinear")
    ax.axis("off")
    ax.set_title("Most frequent words")

    plt.tight_layout(pad=4)
    plt.show()

## All data: Kaggle + Persuade 2.0

In [None]:
# Word cloud over the combined frame (Kaggle essays plus the appended Persuade 2.0 essays).
display_wordcloud(train_pd)

## Kaggle only word cloud

In [None]:
# Word cloud restricted to the essays whose src label is "kaggle-only".
display_wordcloud(kaggle_only)

## Kaggle-persuade word cloud

In [None]:
# Word cloud restricted to the essays whose src label is "kaggle-persuade".
display_wordcloud(kaggle_persuade)

## Persuade only word cloud

In [None]:
# Word cloud restricted to the essays whose src label is "persuade-only".
display_wordcloud(persuade_only)

In [None]:
# Optional truncation of an essay before it is embedded
def prepare_document(m, limit=-1):
    """Return the essay text, optionally truncated.

    Args:
        m: Essay text, or ``None``.
        limit: ``-1`` (default) returns ``m`` untouched. Any other value keeps
            the slice ``m[:limit + 1]`` and strips surrounding whitespace from it.

    Returns:
        The (possibly truncated) text, or ``""`` when ``m`` is ``None``.
    """
    if m is None:
        return ""
    if limit == -1:
        return m
    return m[:limit + 1].strip()

# Only full_text is needed, so apply the function to that column directly instead of
# building a row object for every essay with DataFrame.apply(axis=1).
# Series.apply forwards the extra keyword (limit) to prepare_document.
train_pd["cleaned_text"] = train_pd["full_text"].apply(prepare_document, limit=-1)
train_pd[["essay_id", "cleaned_text", "score"]].head()

# Generate embeddings to prepare topic modeling.

In [None]:
# Data (for a quick dry run, subsample first, e.g. train_pd = train_pd.head(1000))
essays = train_pd["cleaned_text"]

# Embedding model
EMB_MODEL = 'hkunlp/instructor-xl' # 'BAAI/bge-large-en'

if 'instructor' not in EMB_MODEL:
    # Plain sentence-transformers model: the essays are encoded as they are.
    embedding_model = SentenceTransformer(EMB_MODEL)
    embeddings = embedding_model.encode(essays, show_progress_bar=True)

else:
    embedding_model = INSTRUCTOR(EMB_MODEL)

    # The INSTRUCTOR model is fed [instruction, text] pairs, one per essay.
    # Original note: max_seq_length = 512, embeddings size = 784.
    # NOTE(review): instructor-xl is presumably 768-dimensional; confirm against the shape printed below.
    instruction = "Represent the essay statement: "
    documents = [[instruction, essay] for essay in essays]

    embeddings = embedding_model.encode(documents, show_progress_bar=True, batch_size=32)

print(embeddings.shape)

# Topic modeling: reduction, clustering, n-gram extraction

In [None]:
# Dimensionality reduction: UMAP down to 5 components with the cosine metric.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=SEED,
)

# Clustering: we don't want small clusters, hence the minimum size.
min_cluster_size = 30
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Topic representation: 1- to 3-grams seen in at least 2 documents, English stop words removed.
top_n_words = 25
vectorizer_model = CountVectorizer(
    ngram_range=(1,3),
    min_df=2,
    stop_words="english",
    strip_accents="unicode",
)
representation_model = KeyBERTInspired(
    top_n_words=top_n_words,
    nr_repr_docs=5,
    random_state=SEED,
)

# Assemble the pipeline and fit it on the essays, reusing the precomputed embeddings.
topic_model = BERTopic(
    language="english",
    top_n_words=top_n_words,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = topic_model.fit_transform(documents=essays, embeddings=embeddings)

In [None]:
# Render BERTopic's per-topic bar chart for 24 topics, showing 9 words per topic.
chart = topic_model.visualize_barchart(top_n_topics=24, n_words=9, width=512)
chart.show()

In [None]:
# Attach the topic id returned by fit_transform to each essay; `topics` follows the row
# order of train_pd because the essays passed to the model were train_pd["cleaned_text"].
train_pd["topic"] = topics
# For every essay, keep the first element of each entry returned by get_topic() for its
# topic (presumably the keyword of a (keyword, score) pair; confirm against the BERTopic API).
train_pd["topic_keyword"] = train_pd["topic"].apply(lambda x: [t[0] for t in topic_model.get_topic(int(x))])
# Persist the annotated frame to a parquet file in the working directory.
train_pd.to_parquet("aes2_topics.parquet")
# NOTE(review): this displays the whole frame; train_pd.head() may be enough here.
train_pd

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(25,7))

# Number of essays per topic id, with the count written on top of every bar.
sns.countplot(x="topic", data=train_pd, ax=ax, palette = "Set1")
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', fontsize=10)

n_topics = len(np.unique(topics))
ax.set_title("Total topics: %d" % (n_topics));

In [None]:
# Separate 2-component UMAP projection of the embeddings, used only as plot coordinates.
reducer_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=SEED,
)
umap_2d = reducer_2d.fit_transform(embeddings)
train_pd["x"], train_pd["y"] = umap_2d[:,0], umap_2d[:,1]

In [None]:
# Rebuild the per-source frames so that they include the topic and x/y columns added
# to train_pd since the first split.
kaggle_only, kaggle_persuade, persuade_only = (
    train_pd[train_pd["src"].str.contains(label)].reset_index()
    for label in ("kaggle-only", "kaggle-persuade", "persuade-only")
)
kaggle_only.shape, kaggle_persuade.shape, persuade_only.shape

## Topics related to source

We can see that some Persuade topics are not among the Kaggle (train) topics.

In [None]:
fig, ax = plt.subplots(1,2, figsize=(32,12), sharex=True, sharey=True)

# One panel per Kaggle subset, coloured by topic; the axes are shared so both panels
# use the same 2-D coordinates.
panels = (
    (ax[0], kaggle_only, "kaggle_only topics: %d"),
    (ax[1], kaggle_persuade, "kaggle_persuade topics: %d"),
)
for panel_ax, frame, title_format in panels:
    sns.scatterplot(data=frame, x="x", y="y", hue="topic", s=10, ax=panel_ax, palette='Set1')
    panel_ax.set_title(title_format % (frame["topic"].nunique()))
    panel_ax.grid()
plt.show()

In [None]:
fig, ax = plt.subplots(1,2, figsize=(32,12), sharex=True, sharey=True)
topic_ax, source_ax = ax

# Left panel: Persuade-only essays coloured by topic.
sns.scatterplot(data=persuade_only, x="x", y="y", hue="topic", s=10, ax=topic_ax, palette='Set1')
topic_ax.set_title("persuade_only topics: %d" % (persuade_only["topic"].nunique()))
topic_ax.grid()

# Right panel: every essay, coloured by its source label.
sns.scatterplot(data=train_pd, x="x", y="y", hue="src", s=10, ax=source_ax, palette='Set1')
source_ax.set_title("By source topics")
source_ax.grid()

plt.show()

In [None]:
# Distinct topic ids seen in any Kaggle essay (kaggle-only and kaggle-persuade rows combined).
kaggle_topics = pd.concat([kaggle_only, kaggle_persuade], ignore_index=True)["topic"].astype(int).unique()
sorted(kaggle_topics)

In [None]:
# Distinct topic ids seen in the essays that come only from Persuade 2.0.
persuade_only_topics = persuade_only["topic"].astype(int).unique()
sorted(persuade_only_topics)

In [None]:
# Common topics: ids present both in the Kaggle essays and in the Persuade-only essays.
# NOTE(review): if the model produced an outlier topic (presumably id -1), it is treated
# like any other id here; confirm whether it should be excluded.
np.intersect1d(kaggle_topics, persuade_only_topics)

In [None]:
# True when the essay's topic id also occurs in at least one Kaggle essay; isin() already
# yields booleans, so no extra mapping to True/False is needed.
train_pd["kaggle_topic"] = train_pd["topic"].astype(int).isin(kaggle_topics)

In [None]:
fig, ax = plt.subplots(1,1, figsize=(26,13))

# Every essay in the 2-D projection, coloured by its topic id.
n_topics = train_pd["topic"].nunique()
sns.scatterplot(data=train_pd, x="x", y="y", hue="topic", s=10, ax=ax, palette='Set1')
ax.set_title("All topics: %d" % (n_topics));

In [None]:
fig, ax = plt.subplots(1,1, figsize=(26,13))

# Every essay in the 2-D projection, coloured by whether its topic occurs in Kaggle data.
sns.scatterplot(data=train_pd, x="x", y="y", hue="kaggle_topic", s=10, ax=ax)

# Topic ids of the essays whose topic never occurs in a Kaggle essay.
topics_not_in_kaggle = train_pd[train_pd["kaggle_topic"] == False]["topic"]
ax.set_title(
    "Missing topics in Kaggle training: %d, Topics: %s"
    % (topics_not_in_kaggle.nunique(), sorted(topics_not_in_kaggle.unique()))
);

In [None]:
# Topic ids of the rows where kaggle_topic is False, in ascending order.
missing_topics = sorted(train_pd[train_pd["kaggle_topic"] == False]["topic"].unique())
missing_topics

In [None]:
# Print the keyword list of each missing topic. topic_keyword is derived from the topic id
# alone, so the first row of a topic is representative of all its rows.
for t in missing_topics:
    kw = train_pd[train_pd["topic"] == t]["topic_keyword"].values[0]
    print("Topic:", t, ":", kw)
    print()

In [None]:
# Count the non-null essay_id values per (src, topic) pair, then reshape to one row per
# source and one column per topic. Pairs with no essay are shown as empty strings.
gp = train_pd.groupby(["src", "topic"])["essay_id"].count().reset_index()
gp.pivot(columns=["topic"], index='src').fillna("")