# What does this notebook cover?
The Persuade 2.0 dataset contains essays that overlap with the train dataset of this competition. @mpware published a great EDA notebook on the essays in the Persuade 2.0 dataset ([link](https://www.kaggle.com/code/mpware/aes2-what-are-the-essays-about)). Thanks, @mpware!


So, in this notebook, I focus on EDA of the columns other than "full_text".

In [None]:
import pandas as pd 
import plotly.express as px

# Load & Process Data

## Load

In [None]:
# Competition training set.
df_train = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")

# Persuade 2.0 corpus; its score and id columns are renamed to "score" and
# "essay_id" so they line up with the names used for the train data below.
persuade_column_map = {"holistic_essay_score": "score", "essay_id_comp": "essay_id"}
df_persuade_2 = pd.read_csv(
    "/kaggle/input/persuade-2-0/persuade_2.0_human_scores_demo_id_github.csv"
).rename(columns=persuade_column_map)

## Merge the provided training data with persuade 2.0 data

In [None]:
# Flag every Persuade 2.0 essay whose text also appears verbatim in train.csv.
# Series.isin uses hashed lookups, so this is a single pass over the data rather
# than a linear scan of the whole list for each row (`x in list` inside apply).
train_essays = df_train.full_text.tolist()
df_persuade_2["exist_in_train"] = df_persuade_2["full_text"].isin(train_essays)

# What fraction of the Persuade 2.0 samples overlaps with the given train dataset?
# (the mean of a boolean column is the share of True values)
df_persuade_2["exist_in_train"].mean()

In [None]:
# Persuade 2.0 essays that are not in train.csv, reduced to id / text / score
# and tagged as not coming from the original competition data.
kept_columns = ["essay_id", "full_text", "score"]
non_exist_in_train_persuade_2 = (
    df_persuade_2
    .loc[~df_persuade_2["exist_in_train"], kept_columns]
    .assign(is_original=False)
)

# Competition rows are tagged as original, then both sets are stacked under a
# fresh 0..n-1 index.
df_train["is_original"] = True
df_merged = pd.concat([df_train, non_exist_in_train_persuade_2], ignore_index=True)

print(df_merged.shape)

## Fix non-uniqueness of essay ids
There are two essays with the same essay_id yet different texts.

In [None]:
# Widen text columns so the essay bodies are readable in the output.
pd.set_option('display.max_colwidth', 500)
# Show the texts of the rows that share the essay_id "3.25E+11".
shares_duplicate_id = df_merged["essay_id"] == "3.25E+11"
df_merged.loc[shares_duplicate_id, ["full_text"]]

In [None]:
import uuid
# Give the row labelled 20303 a fresh random id so "3.25E+11" is no longer shared.
# NOTE(review): 20303 is a hard-coded row label in df_merged; it is only valid
# for this exact input data and concat order. Confirm if the inputs change.
replacement_id = str(uuid.uuid4())
df_merged.loc[20303, "essay_id"] = replacement_id
# check: list the rows that still carry the old id
still_has_old_id = df_merged["essay_id"] == "3.25E+11"
df_merged.loc[still_has_old_id, ["full_text"]]

### plot histogram

In [None]:
# Score histogram of the merged data, split by whether a row came from train.csv.
fig = px.histogram(
    df_merged,
    x="score",
    color="is_original",
    nbins=6,
)
fig.show()

# Explore persuade 2.0

In [None]:
# Check the columns in Persuade 2.0 (the output is this cell's displayed value).
df_persuade_2.columns

## Missing Values

In [None]:
# Missing-value count per column, over all Persuade 2.0 samples.
df_persuade_2.isna().sum()

In [None]:
# Missing-value count per column, restricted to samples that also appear in the train data.
in_train_rows = df_persuade_2.loc[df_persuade_2["exist_in_train"]]
in_train_rows.isna().sum()

In [None]:
# Reorder rows by ascending score. Every later cell sees this row order.
# NOTE(review): presumably done so score-coloured plots list scores in order — confirm.
df_persuade_2 = df_persuade_2.sort_values("score")

## Grade Level                    

In [None]:
# Turn grade_level into string labels, with missing grades shown as "Unknown".
# The NaN mask must come from the ORIGINAL column: astype(str) converts NaN into
# the literal string "nan", so a fillna() applied afterwards has nothing to fill.
grade_level_raw = df_persuade_2["grade_level"]
df_persuade_2["grade_level"] = grade_level_raw.astype(str).where(
    grade_level_raw.notna(), "Unknown"
)

In [None]:
# grade X score: essay counts per grade level, coloured by score
fig = px.histogram(
    df_persuade_2,
    x="grade_level",
    color="score",
)
fig.show()

## Race Ethnicity & Gender & Economically Disadvantaged & Student Disability Status
Understanding the relationships between demographic features and scores may give hints for generating synthetic essays.

In [None]:
# count data: race_ethnicity X prompt_name
fig = px.histogram(
    df_persuade_2,
    x="race_ethnicity",
    color="prompt_name",
)
fig.show()

In [None]:
# count data: race_ethnicity X score
fig = px.histogram(
    df_persuade_2,
    x="race_ethnicity",
    color="score",
)
fig.show()

In [None]:
# count data: gender X score
fig = px.histogram(
    df_persuade_2,
    x="gender",
    color="score",
)
fig.show()

In [None]:
# count data: economically_disadvantaged X score
fig = px.histogram(
    df_persuade_2,
    x="economically_disadvantaged",
    color="score",
)
fig.show()

In [None]:
# count data: student_disability_status X score
fig = px.histogram(
    df_persuade_2,
    x="student_disability_status",
    color="score",
)
fig.show()

## Task

In [None]:
# Count the task types among the Persuade 2.0 essays that also appear in train.
# Essays in both Persuade 2.0 and the train dataset are only "text-dependent."
df_persuade_2.loc[df_persuade_2["exist_in_train"], "task"].value_counts()

In [None]:
# Analysis: the counts for 'Independent' and 'Text Dependent' are roughly equal.
fig = px.histogram(
    df_persuade_2,
    x="task",
    color="exist_in_train",
)
fig.show()

In [None]:
# count data: task X score
fig = px.histogram(
    df_persuade_2,
    x="task",
    color="score",
)
fig.show()

## Prompt Name & Assignment & Source Text
The three features are related to the contents of essays. Analyzing these features may reveal insights into essays.

In [None]:
# Replace missing source texts with the placeholder "No Source Text" so that
# rows without a source text share one explicit value instead of NaN.
df_persuade_2["source_text"] = df_persuade_2["source_text"].fillna("No Source Text")

In [None]:
# Number of distinct prompt names, source texts and assignments (printed in that order).
content_columns = ["prompt_name", "source_text", "assignment"]

# all rows
for column in content_columns:
    print(df_persuade_2[column].nunique())

# rows included in the train dataset
in_train_rows = df_persuade_2.loc[df_persuade_2["exist_in_train"]]
for column in content_columns:
    print(in_train_rows[column].nunique())

In [None]:
# Collect one record per prompt: its name, assignment, source text, and whether
# ANY essay written for it also appears in the train data.
prompts = []
assignments = []
source_texts = []
exist_in_train_flags = []

# sort=False keeps prompts in order of first appearance; groupby skips rows whose
# prompt_name is missing instead of producing an empty group.
for prompt, prompt_rows in df_persuade_2.groupby("prompt_name", sort=False):
    # Validate before recording anything: taking the first assignment/source text
    # is only meaningful when each prompt maps to exactly one of each.
    assert (
        prompt_rows["assignment"].nunique(dropna=False) == 1
        and prompt_rows["source_text"].nunique(dropna=False) == 1
    ), f"prompt {prompt!r} has more than one assignment or source text"

    prompts.append(prompt)
    assignments.append(prompt_rows["assignment"].iloc[0])
    source_texts.append(prompt_rows["source_text"].iloc[0])
    # Use any() rather than the first row's flag: the first row is arbitrary
    # (the frame is sorted by score), and a prompt counts as present in train as
    # soon as one of its essays is.
    exist_in_train_flags.append(bool(prompt_rows["exist_in_train"].any()))

In [None]:
from IPython.display import HTML

def display_with_html(prompt_names, assignments, source_texts, exist_in_train_flags):
    """Build an HTML snippet with one bordered card per prompt.

    The four arguments are parallel iterables and are zipped together, so
    iteration stops at the shortest one. Each card shows the prompt name,
    assignment and source text side by side. The border is red when the flag
    is truthy and green otherwise.

    Args:
        prompt_names: Prompt names, one per card.
        assignments: Assignment text for each prompt.
        source_texts: Source text for each prompt.
        exist_in_train_flags: Truthy when the prompt occurs in the train data.

    Returns:
        str: A ``<style>`` block followed by one ``<div>`` card per prompt,
        suitable for ``IPython.display.HTML``.
    """
    # Local import keeps this cell self-contained without editing the import cell.
    from html import escape

    html_content = """
    <style>
        .custom-container {
            padding: 10px;
            margin: 10px 0;
            display: flex;
            justify-content: space-between;
        }
        .custom-box {
            width: 30%;
            padding: 10px;
            margin: 5px;
            display: flex;
            flex-direction: column;
            align-items: center;
        }
    </style>
    """
    # Collect the pieces and join once instead of repeated string concatenation.
    chunks = [html_content]
    records = zip(prompt_names, assignments, source_texts, exist_in_train_flags)
    for i, (prompt_name, assignment, source_text, exist_in_train) in enumerate(records):
        border_color = "red" if exist_in_train else "green"
        # The texts come straight from the dataset; escape them so characters
        # such as "<" or "&" are shown literally instead of being parsed as markup.
        prompt_name = escape(str(prompt_name))
        assignment = escape(str(assignment))
        source_text = escape(str(source_text))
        chunks.append(f"""
        <div class="custom-container" style='border: 2px solid {border_color};'>
            <p><strong>Topic{i+1}</strong><br>exist_in_train={exist_in_train}</p>
            <div class="custom-box">
                <p><strong>Prompt Name:</strong></p>
                <p>{prompt_name}</p>
            </div>
            <div class="custom-box">
                <p><strong>Assignment:</strong></p>
                <p>{assignment}</p>
            </div>
            <div class="custom-box">
                <p><strong>Source Texts:</strong></p>
                <p>{source_text}</p>
            </div>
        </div>
        """)
    return "".join(chunks)

In [None]:
# Display all combinations of prompt names, assignments and source texts.
# red: exist_in_train is True
# green: exist_in_train is False

prompt_cards_html = display_with_html(prompts, assignments, source_texts, exist_in_train_flags)
HTML(prompt_cards_html)

In [None]:
# How many essays does each prompt_name have, and what is the score mix within each prompt_name?
fig = px.histogram(
    df_persuade_2,
    x="prompt_name",
    color="score",
)
fig.show()

## Exist In Train

### Score Distribution 

In [None]:
# Score histogram of Persuade 2.0, split by whether the essay is also in train.
fig = px.histogram(
    df_persuade_2,
    x="score",
    color="exist_in_train",
    nbins=6,
)
fig.show()

### Violin Plot

In [None]:
# Violin plot (with an inner box plot) of score for essays in / not in train.
fig = px.violin(
    df_persuade_2,
    y="score",
    x="exist_in_train",
    color="exist_in_train",
    box=True,
)
fig.show()