# PERSUADE -> train essays analysis

The PERSUADE corpus is available on Kaggle [here](https://www.kaggle.com/datasets/nbroad/persaude-corpus-2/).

In [None]:
!pip install polyleven -q

In [None]:
import torch
import pandas as pd

# Load the PERSUADE 2.0 corpus and the competition training set.
persuade_df = pd.read_csv(
    "/kaggle/input/persaude-corpus-2/persuade_2.0_human_scores_demo_id_github.csv"
)
train_df = pd.read_csv(
    "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
)
# Shorter column names for the train set
train_df = train_df.rename(columns={"full_text": "text", "essay_id": "id"})

# Two-way lookups between essay text and essay id for each corpus.
# When a text (or an id) occurs more than once, the last occurrence wins.
train_text2id = dict(zip(train_df["text"], train_df["id"]))
train_id2text = {essay_id: text for text, essay_id in train_text2id.items()}
persuade_text2id = dict(zip(persuade_df["full_text"], persuade_df["essay_id_comp"]))
persuade_id2text = {essay_id: text for text, essay_id in persuade_text2id.items()}

persuade_df.shape, train_df.shape

In [None]:
# Number of train essays whose whitespace-stripped text appears verbatim
# (after the same stripping) in the PERSUADE corpus
(
    train_df["text"].str.strip()
    .isin(persuade_df["full_text"].str.strip())
    .sum()
)

In [None]:
# Check that essays present in both corpora carry the same score.
# The join key is the whitespace-stripped text, matching how exact matches are
# detected elsewhere in this notebook; joining on the raw text would miss pairs
# that differ only in leading/trailing whitespace.
common_essays = (
    train_df.assign(_stripped_text=train_df.text.str.strip())
    .merge(
        persuade_df.assign(_stripped_text=persuade_df.full_text.str.strip()),
        on="_stripped_text",
        how="inner",
    )
    .drop(columns="_stripped_text")  # helper key only, not part of the result
)
(common_essays.score == common_essays.holistic_essay_score).all()

In [None]:
# Remove, from each corpus, every essay whose stripped text also occurs in the other one
train_text_stripped = train_df["text"].str.strip()
persuade_text_stripped = persuade_df["full_text"].str.strip()

persuade_in_train = persuade_text_stripped.isin(train_text_stripped)
train_in_persuade = train_text_stripped.isin(persuade_text_stripped)

persuade_filtered = persuade_df[~persuade_in_train]
train_filtered = train_df[~train_in_persuade]

# Write the de-duplicated frames next to the notebook (input of the embedding script)
persuade_filtered.to_csv("persuade.csv", index=False)
train_filtered.to_csv("train.csv", index=False)

len(persuade_filtered), len(train_filtered)

# Create embeddings

In [None]:
%%writefile get_embeddings.py

from argparse import ArgumentParser
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch


def parse_args():
    """Parse the command-line options of the embedding script.

    ``--csv_path``, ``--text_col`` and ``--output_path`` have no usable
    default, so they are required: leaving one out makes argparse print a
    usage message and exit (status 2) instead of failing later on a ``None``.

    Returns:
        argparse.Namespace with the attributes ``model_name``, ``csv_path``,
        ``text_col``, ``max_length``, ``batch_size``, ``num_proc`` and
        ``output_path``.
    """
    parser = ArgumentParser(
        description="Embed one text column of a CSV file with a transformer model."
    )

    parser.add_argument(
        "--model_name",
        type=str,
        default="BAAI/bge-base-en-v1.5",
        help="Model name or path passed to from_pretrained.",
    )
    parser.add_argument(
        "--csv_path", type=str, required=True, help="CSV file to read."
    )
    parser.add_argument(
        "--text_col",
        type=str,
        required=True,
        help="Name of the CSV column holding the texts to embed.",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=512,
        help="Tokenizer max_length; longer texts are truncated.",
    )
    parser.add_argument(
        "--batch_size", type=int, default=128, help="Per-device eval batch size."
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=2,
        help="Number of processes used by the dataset map calls.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="File the embedding tensor is saved to.",
    )

    return parser.parse_args()


def main():
    """Embed one text column of a CSV file and save the normalized embeddings.

    Steps:
      1. Load the tokenizer and model named by ``--model_name``.
      2. Read ``--csv_path``, strip the ``--text_col`` texts and tokenize them
         with truncation to ``--max_length``.
      3. Run ``Trainer.predict`` and keep position 0 of the first model output
         (presumably the [CLS] hidden state for BERT-style encoders such as
         the default model -- confirm when switching model families).
      4. L2-normalize the vectors and save them to ``--output_path``.
    """
    args = parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModel.from_pretrained(args.model_name)
    model.eval()

    targs = TrainingArguments(
        ".",
        report_to="none",
        per_device_eval_batch_size=args.batch_size,
        dataloader_num_workers=1,
    )

    ds = Dataset.from_pandas(pd.read_csv(args.csv_path))

    # strip whitespace from both ends of every text
    ds = ds.map(
        lambda x: {args.text_col: x[args.text_col].strip()}, num_proc=args.num_proc
    )

    def tokenize(batch):
        """Tokenize a batch of texts without padding, truncating to max_length."""
        return tokenizer(
            batch[args.text_col],
            padding=False,
            truncation=True,
            max_length=args.max_length,
        )

    with targs.main_process_first(desc="dataset map pre-processing"):
        ds = ds.map(tokenize, batched=True, num_proc=args.num_proc)

    trainer = Trainer(model=model, args=targs, tokenizer=tokenizer)

    embeddings = trainer.predict(ds).predictions[0][:, 0]

    # unit-length rows, so a plain dot product between them is the cosine similarity
    embeddings = torch.nn.functional.normalize(
        torch.tensor(embeddings), p=2, dim=1
    ).cpu()

    # When launched with several processes (e.g. torchrun) each of them runs
    # this function; only one may write, otherwise they all write the same
    # output file at the same time.
    if trainer.is_world_process_zero():
        torch.save(embeddings, args.output_path)


if __name__ == "__main__":
    main()


In [None]:
# run locally due to kaggle memory constraints


# # Use both GPUs for faster inference

# !torchrun --nproc_per_node 2 get_embeddings.py \
#   --model_name "BAAI/bge-small-en-v1.5" \
#   --csv_path "./persuade.csv" \
#   --text_col "full_text" \
#   --output_path "persuade_embeddings.pt" 

# !torchrun --nproc_per_node 2 get_embeddings.py \
#   --model_name "BAAI/bge-small-en-v1.5" \
#   --csv_path "./train.csv" \
#   --text_col "text" \
#   --output_path "train_embeddings.pt" 

# Take cosine similarity between train and PERSUADE embeddings

In [None]:
import torch

device = 0 if torch.cuda.is_available() else torch.device("cpu")

# NOTE: torch.load unpickles the file -- only load embedding files you trust.
# move to gpu to make dot product faster
train_embeds = torch.load("/kaggle/input/aes-x-persuade/train_embeddings.pt").to(device)
persuade_embeds = torch.load("/kaggle/input/aes-x-persuade/persuade_embeddings.pt").to(device)

# dot product of normalized vectors is the same as cosine similarity
# rows: train essays, columns: PERSUADE essays
cos_sim_matrix = torch.mm(train_embeds, persuade_embeds.transpose(0, 1))

# Keep the 300 most similar PERSUADE essays for every train essay.
# torch.topk fails when k is larger than the dimension it selects from, so cap
# k at the number of PERSUADE embeddings.
topk_results = torch.topk(cos_sim_matrix, k=min(300, cos_sim_matrix.shape[1]))

topk_results

# Levenshtein distance for top cosine similarity scores

In [None]:
from polyleven import levenshtein
from tqdm.auto import tqdm

train_texts = train_filtered.text.values
persuade_texts = persuade_filtered.full_text.values
topk_idxs = topk_results.indices.cpu().numpy()

# Row i of the top-k result must belong to train_texts[i]; zip() below would
# otherwise silently truncate to the shorter input.
assert len(topk_idxs) == len(train_texts), (
    f"{len(topk_idxs)} embedding rows vs {len(train_texts)} train texts"
)

# Strip every PERSUADE text once up front instead of once per comparison
persuade_stripped = [p_txt.strip() for p_txt in persuade_texts]

# all_lev_scores[i][j]: Levenshtein distance between train essay i and its j-th
# most similar PERSUADE essay, divided by the length of the longer text
all_lev_scores = []
for topk, t_txt in tqdm(
    zip(topk_idxs, train_texts), total=len(train_texts), desc="levenshtein"
):
    t_clean = t_txt.strip()

    lev_scores = []

    for idx in topk:
        p_clean = persuade_stripped[idx]
        longest = max(len(t_clean), len(p_clean))
        # two empty texts are identical: score 0 instead of dividing by zero
        lev_scores.append(levenshtein(t_clean, p_clean) / longest if longest else 0.0)

    all_lev_scores.append(lev_scores)

### When the normalized Levenshtein distance is < 0.3, the two texts are clearly the same essay

In [None]:
import difflib
import html


# Created using gpt4
def compare_texts(text1, text2):
    """Render a character-level diff of two strings as an HTML ``<pre>`` block.

    Args:
        text1: first string.
        text2: second string.

    Returns:
        An HTML string. Characters found only in ``text2`` are wrapped in a
        light blue (#AFEEEE) ``<mark>``, characters found only in ``text1``
        in a light pink (#DDA0DD) ``<mark>``, and shared characters are
        emitted without highlight. Every character is HTML-escaped.
    """

    def render(entry):
        # every diff entry is a 2-character prefix followed by the character itself
        payload = html.escape(entry[2:])
        if entry.startswith("+"):
            # only in text2
            return f'<mark style="background-color: #AFEEEE;">{payload}</mark>'
        if entry.startswith("-"):
            # only in text1
            return f'<mark style="background-color: #DDA0DD;">{payload}</mark>'
        # present in both texts
        return payload

    # strings are sequences of characters, so the diff runs character by character
    html_content = "".join(map(render, difflib.Differ().compare(text1, text2)))

    return f"<pre style='font-size: 14px !important;'>{html_content}</pre>"

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.display import HTML

# ts[k] / ps[k]: a train essay and the closest of its PERSUADE candidates,
# kept only when the pair counts as a match
ts, ps = [], []

for t, idxs, scores in zip(train_texts, topk_idxs, all_lev_scores):

    best = int(np.argmin(scores))

    # A pair is a match only when its normalized distance is strictly below
    # 0.3 -- the same rule as the `min_score < 0.3` split used later on.
    if scores[best] >= 0.3:
        continue

    ts.append(t)
    ps.append(persuade_texts[idxs[best]])

len(ts), len(ps)

# Visualizing differences

#### <mark style="background-color: #AFEEEE;">LIGHT BLUE</mark>: PERSUADE text has it, train text does NOT  
#### <mark style="background-color: #DDA0DD;">LIGHT PINK</mark>: train text has it, PERSUADE text does NOT

No highlight means both have it

In [None]:
i = 0  # which matched pair to render
HTML(compare_texts(text1=ts[i], text2=ps[i]))

In [None]:
i = 1  # which matched pair to render
HTML(compare_texts(text1=ts[i], text2=ps[i]))

In [None]:
# Split the train essays into those with a near-duplicate among their PERSUADE
# candidates (best normalized distance < 0.3) and those without one.
t_matches, p_matches, matching_scores = [], [], []
no_matches_t = []

for train_text, candidate_idxs, candidate_scores in zip(
    train_texts, topk_idxs, all_lev_scores
):
    # first position holding the smallest score, together with that score
    best_pos, best_score = min(enumerate(candidate_scores), key=lambda pair: pair[1])

    if not best_score < 0.3:
        no_matches_t.append(train_text)
        continue

    t_matches.append(train_text)
    p_matches.append(persuade_texts[candidate_idxs[best_pos]])
    matching_scores.append(best_score)

In [None]:
# (matched train essays, all train essays considered, unmatched train essays)
tuple(map(len, (t_matches, train_texts, no_matches_t)))

In [None]:
# Best (smallest) normalized distance of every train essay WITHOUT a match,
# sorted ascending. "No match" is the complement of the `< 0.3` match rule, so
# a score of exactly 0.3 belongs here too.
l_scores = (
    pd.Series(
        [best for best in map(min, all_lev_scores) if best >= 0.3],
        dtype="float64",  # keeps an empty result numeric
    )
    .sort_values()
    .reset_index(drop=True)
)

In [None]:
# Display the sorted best distances of the train essays that had no match
l_scores

In [None]:
# Line plot of the sorted best distances of the unmatched essays
ax = l_scores.plot(kind="line")
ax.set(
    xlabel="essays sorted by levenshtein distance",
    ylabel="levenshtein distance",
)
plt.show()