# Imports

In [None]:
!pip install --upgrade -qq pandas

In [None]:
import pandas as pd
import numpy as np
import warnings
import random
from typing import Dict, Any, List, Optional
import os
import gc
import re

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display


# Turn automatic garbage collection on; the cells below additionally call
# gc.collect() by hand after each dataset is loaded.
gc.enable()
# Install an "ignore" filter so warnings of every category are suppressed
# from here on.
warnings.simplefilter("ignore")

# Utilities

In [None]:
# Headings searched for by remove_reference_content when no explicit
# ``titles`` list is given.
DEFAULT_REFERENCES_TITLES = [
    "Reference",
    "References",
    "Work Cited",
    "Works Cited",
    "Work cited",
    "Works cited",
]

# Regex patterns substituted by remove_special_symbols when no explicit
# ``special_symbols`` list is given.
DEFAULT_SPECIAL_SYMBOLS = [
    r"\n",
    r"\xa0",
    r"\t",
    r"\r",
]


def read_file(path: str, mode: str = "r", encoding: str = "utf-8", **kwargs) -> str:
    """Read the whole file at ``path`` and return its contents.

    Args:
        path: Location of the file to read.
        mode: Mode handed to ``open``; defaults to text read mode.
        encoding: Text encoding handed to ``open``.
        **kwargs: Any further keyword arguments, forwarded to ``open``.

    Returns:
        Whatever ``file.read()`` yields, i.e. the full text of the file for
        the default text mode.
    """
    with open(path, mode=mode, encoding=encoding, **kwargs) as file:
        return file.read()

def remove_urls(text: str) -> str:
    """Delete URL-like tokens from ``text``.

    A token is removed when it starts with ``http`` (which also covers
    ``https``) or with a literal ``www.``, and it extends up to the next
    whitespace character. Nothing is inserted in its place, so surrounding
    spaces are left as they were.

    Args:
        text: Text to clean.

    Returns:
        ``text`` without the matched tokens.
    """
    # The dot after "www" is escaped on purpose: unescaped it matches any
    # character, so "www is" (space after www) would be swallowed together
    # with the following word.
    pattern = r"http\S+|www\.\S+"
    return re.sub(pattern, "", text)

def remove_special_symbols(
    text: str,
    special_symbols: List[str] = DEFAULT_SPECIAL_SYMBOLS,
    replace_value: str = " ",
) -> str:
    """Substitute every match of each pattern with ``replace_value``.

    Args:
        text: Text to clean.
        special_symbols: Patterns handed to ``re`` one after another, in
            list order.
        replace_value: Replacement used for every match.

    Returns:
        The text after all substitutions have been applied.
    """
    cleaned = text
    for symbol_pattern in special_symbols:
        cleaned = re.compile(symbol_pattern).sub(replace_value, cleaned)

    return cleaned


def remove_brackets(text: str) -> str:
    """Remove bracketed spans, brackets included, from ``text``.

    A span opens at ``(`` or ``[`` and closes at the nearest following ``)``
    or ``]`` on the same line (``.`` does not cross newlines). The opening
    and closing characters are not required to be of the same kind.

    Args:
        text: Text to clean.

    Returns:
        ``text`` with every matched span deleted.
    """
    # Raw string: in a normal literal "\(" and "\[" are invalid escape
    # sequences that newer Python versions warn about.
    return re.sub(r"[\(\[].*?[\)\]]", "", text)

def remove_extra_spaces(text: str, remove_spaces_before_punctionations: bool = True) -> str:
    """Collapse runs of spaces and optionally close the gap before punctuation.

    Args:
        text: Text to normalise.
        remove_spaces_before_punctionations: When true, one whitespace
            character directly in front of ``?``, ``.``, ``!`` or ``"`` is
            dropped, provided the mark is followed by whitespace or ends the
            text.

    Returns:
        The normalised text.
    """
    text = re.sub(r" +", " ", text)

    if remove_spaces_before_punctionations:
        # Put the captured mark (and its trailing whitespace) back so that
        # only the leading whitespace disappears; an empty replacement would
        # delete the punctuation itself.
        text = re.sub(r'\s([?.!"](?:\s|$))', r"\1", text)

    return text

def remove_reference_content(text: str, titles: List[str] = DEFAULT_REFERENCES_TITLES) -> str:
    """Cut ``text`` off at the first occurrence of each reference heading.

    The titles are processed in list order; each one is searched for in the
    text as already shortened by the previous titles. The search is a plain
    case-sensitive substring search.

    Args:
        text: Text to truncate.
        titles: Headings that mark the start of the content to drop.

    Returns:
        The (possibly shortened) text. A value that does not support the
        search, such as a non-string, is returned unchanged.
    """
    for title in titles:
        try:
            title_index = text.find(title)
        except (AttributeError, TypeError):
            # Best effort, as before: a non-string text or title is skipped
            # instead of aborting the clean-up. Only the errors the lookup
            # itself can raise are tolerated.
            continue

        # Strictly positive on purpose: -1 means "not found", and 0 would
        # truncate the text to an empty string.
        if title_index > 0:
            text = text[:title_index]

    return text

def get_num_words(text: str) -> int:
    """Count the whitespace-separated tokens in ``text``."""
    words = text.split()
    return len(words)

# Config

In [None]:
# Directory the merged dataset is written to; the Summary cell joins it with
# "external_data.csv".
output_directory = "./"

# Competition data

In [None]:
# Locations of the competition train / test tables.
feedback_prize_ell_directory = "../input/feedback-prize-english-language-learning/"
feedback_prize_ell_train_path = os.path.join(feedback_prize_ell_directory, "train.csv")
feedback_prize_ell_test_path = os.path.join(feedback_prize_ell_directory, "test.csv")

# Load both tables and tag every row with its origin.
feedback_prize_ell_train = pd.read_csv(feedback_prize_ell_train_path).assign(source="Feedback Prize ELL")
feedback_prize_ell_test = pd.read_csv(feedback_prize_ell_test_path).assign(source="Feedback Prize ELL")
gc.collect()

display(feedback_prize_ell_test.head())

# Feedback Prize 2021

In [None]:
feedback_prize_2021_directory = "../input/feedback-prize-2021/"
feedback_prize_2021_train_directory = os.path.join(feedback_prize_2021_directory, "train")
feedback_prize_2021_train_path = os.path.join(feedback_prize_2021_directory, "train.csv")

# keep a single row (the first one) per essay id
feedback_prize_2021_train = (
    pd.read_csv(feedback_prize_2021_train_path)
    .drop_duplicates(subset=["id"], keep="first")
    .reset_index(drop=True)
)

# drop essays whose id also appears in the competition training set
feedback_prize_ell_ids = feedback_prize_ell_train["text_id"].values
feedback_prize_intersect_mask = feedback_prize_2021_train["id"].isin(feedback_prize_ell_ids)
feedback_prize_2021_train = feedback_prize_2021_train.loc[~feedback_prize_intersect_mask].reset_index(drop=True)

# only the id is needed from train.csv; the essay text is read from "<train directory>/<id>.txt"
columns = ["id"]
feedback_prize_2021_train = feedback_prize_2021_train[columns]
get_essay_path = lambda id, directory: os.path.join(directory, f"{id}.txt")
feedback_prize_2021_train["path"] = feedback_prize_2021_train["id"].map(
    lambda essay_id: get_essay_path(essay_id, feedback_prize_2021_train_directory)
)
feedback_prize_2021_train["full_text"] = feedback_prize_2021_train["path"].map(read_file)

# bring the frame to the common text_id / full_text / source layout
feedback_prize_2021_train = feedback_prize_2021_train.rename(columns={"id": "text_id"})
feedback_prize_2021_train["source"] = "Feedback Prize 2021"
columns = ["text_id", "full_text", "source"]
feedback_prize_2021_train = feedback_prize_2021_train[columns]
gc.collect()

display(feedback_prize_2021_train)

# Essayscsv

In [None]:
essayscsv_path = "../input/essayscsv/essays.csv"

# load, keep the author id and text columns, and give them the common names
essayscsv = pd.read_csv(essayscsv_path, encoding="cp1252")
essayscsv = essayscsv[["#AUTHID", "TEXT"]].rename(columns={"#AUTHID": "text_id", "TEXT": "full_text"})

# pre-processing: keep only the part of the id in front of the first "."
essayscsv["text_id"] = essayscsv["text_id"].map(lambda author_id: author_id.split(".")[0])
essayscsv["source"] = "essayscsv"
gc.collect()

display(essayscsv)

# IvyPanda

In [None]:
def preprocess_ivypanda_text(text: str) -> str:
    """Run the IvyPanda clean-up steps over ``text`` and return the result.

    Steps, in order: URL removal, truncation at a reference heading, removal
    of bracketed spans, and space normalisation.
    """
    cleaning_steps = (
        remove_urls,
        remove_reference_content,
        remove_brackets,
        remove_extra_spaces,
    )
    for step in cleaning_steps:
        text = step(text)

    return text

In [None]:
ivypanda_directory = "../input/ivypanda/"

# one CSV per essay length, from 1 to 6 pages
ivypanda_1pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_1pages_essays.csv")
ivypanda_2pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_2pages_essays.csv")
ivypanda_3pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_3pages_essays.csv")
ivypanda_4pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_4pages_essays.csv")
ivypanda_5pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_5pages_essays.csv")
ivypanda_6pages_essays_path = os.path.join(ivypanda_directory, "ivypanda_6pages_essays.csv")

ivypanda_1pages_essays = pd.read_csv(ivypanda_1pages_essays_path)
ivypanda_2pages_essays = pd.read_csv(ivypanda_2pages_essays_path)
ivypanda_3pages_essays = pd.read_csv(ivypanda_3pages_essays_path)
ivypanda_4pages_essays = pd.read_csv(ivypanda_4pages_essays_path)
ivypanda_5pages_essays = pd.read_csv(ivypanda_5pages_essays_path)
ivypanda_6pages_essays = pd.read_csv(ivypanda_6pages_essays_path)

ivypanda_datasets = [
    ivypanda_1pages_essays,
    ivypanda_2pages_essays,
    ivypanda_3pages_essays,
    ivypanda_4pages_essays,
    ivypanda_5pages_essays,
    ivypanda_6pages_essays,
]

# stack the files, then remove NaN values and duplicated texts
ivypanda = (
    pd.concat(ivypanda_datasets, axis=0)
    .dropna(subset=["text", "type"], axis=0)
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)

# type selection: keep rows whose type mentions "essay" (case-insensitive)
essays_mask = ivypanda["type"].str.lower().str.contains("essay")
ivypanda = ivypanda[essays_mask]

# common column names; text_id is then replaced by a running number
ivypanda = ivypanda.rename(columns={"index": "text_id", "text": "full_text"})
ivypanda["source"] = "IvyPanda"
ivypanda["text_id"] = range(len(ivypanda))

# pre-processing
ivypanda["full_text"] = ivypanda["full_text"].apply(preprocess_ivypanda_text)

columns = ["text_id", "full_text", "source"]
ivypanda = ivypanda[columns]
gc.collect()

display(ivypanda)

# StudentShare

In [None]:
def preprocess_studentshare_text(text: str) -> str:
    """Run the StudentShare clean-up steps over ``text`` and return the result.

    Steps, in order: tabs and non-breaking spaces become plain spaces,
    bracketed spans are removed, the text is truncated at a reference
    heading, URLs are removed, and finally spaces are normalised.
    """
    text = remove_special_symbols(text, special_symbols=[r"\t", r"\xa0"])
    text = remove_brackets(text)
    text = remove_reference_content(text)
    text = remove_urls(text)
    # Space normalisation goes last: every earlier step can leave gaps where
    # it deleted something, and URL removal used to run after it, leaving
    # double spaces in the output.
    text = remove_extra_spaces(text)

    return text

In [None]:
studentshare_path = "../input/studentshare/studentshare_essays.csv"

# load, tag the origin, and use the common name for the text column
studentshare = (
    pd.read_csv(studentshare_path)
    .assign(source="StudentShare")
    .rename(columns={"text": "full_text"})
    .dropna(axis=0, subset=["full_text", "type"])
)

# type selection: keep rows whose type mentions "essay" (case-insensitive)
essays_mask = studentshare["type"].str.lower().str.contains("essay")
studentshare = studentshare[essays_mask]

# removing duplicates (rows with NaN text or type were already dropped above,
# and the type filter cannot introduce new ones)
studentshare = studentshare.drop_duplicates(subset=["full_text"]).reset_index(drop=True)

# pre-processing
studentshare["full_text"] = studentshare["full_text"].apply(preprocess_studentshare_text)

# running number as id, then the common column layout
studentshare["text_id"] = range(0, len(studentshare))
columns = ["text_id", "full_text", "source"]
studentshare = studentshare[columns]
gc.collect()

display(studentshare)

# EssayForum

In [None]:
essayforum_path = "../input/essayforum-essays/EssayForum_Essays_cleaned.csv"

# load, discard the "Correct Grammar" column and use the common text column name
essayforum = (
    pd.read_csv(essayforum_path)
    .drop(columns=["Correct Grammar"])
    .rename(columns={"Cleaned Essay": "full_text"})
)
essayforum["text_id"] = range(len(essayforum))
essayforum["source"] = "EssayForum Essays"

columns = ["text_id", "full_text", "source"]
essayforum = essayforum[columns]
gc.collect()

display(essayforum)

# Back translation

In [None]:
columns = ["text_id", "back_translated_text"]
back_translation_directory = "../input/fp3-back-translation/"

# Per-language files: keep id + back-translated text, rename the text column
# to the common name and tag the origin.

# English -> French -> English
english_french_path = os.path.join(back_translation_directory, "english_french.csv")
english_french = (
    pd.read_csv(english_french_path)[columns]
    .rename(columns={"back_translated_text": "full_text"})
    .assign(source="Back translation (English-French-English)")
)

# English -> German -> English
english_german_path = os.path.join(back_translation_directory, "english_german.csv")
english_german = (
    pd.read_csv(english_german_path)[columns]
    .rename(columns={"back_translated_text": "full_text"})
    .assign(source="Back translation (English-German-English)")
)

# English -> Russian -> English
english_russian_path = os.path.join(back_translation_directory, "english_russian.csv")
english_russian = (
    pd.read_csv(english_russian_path)[columns]
    .rename(columns={"back_translated_text": "full_text"})
    .assign(source="Back translation (English-Russian-English)")
)

# # English -> Italian -> English
# english_italian_path = os.path.join(back_translation_directory, "english_italian.csv")
# english_italian = pd.read_csv(english_italian_path)[columns]
# english_italian["source"] = "Back translation (English-Italian-English)"

# # English -> Chinese -> English
# english_chinese_path = os.path.join(back_translation_directory, "english_chinese.csv")
# english_chinese = pd.read_csv(english_chinese_path)[columns]
# english_chinese["source"] = "Back translation (English-Chinese-English)"

# Multi-language file: one "<lang>_backtrans_text" column per language.
# https://www.kaggle.com/datasets/mujrush/multiple-language-backtranslation-dataset
back_translated_path = "../input/fp3-back-translation/backtranslate_df.csv"
back_translated = pd.read_csv(back_translated_path)

# English -> Japanese -> English
english_japanese = (
    back_translated[["text_id", "ja_backtrans_text"]]
    .rename(columns={"ja_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Japanese-English)")
)

# English -> Korean -> English
english_korean = (
    back_translated[["text_id", "ko_backtrans_text"]]
    .rename(columns={"ko_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Korean-English)")
)

# English -> Thai -> English
english_thai = (
    back_translated[["text_id", "th_backtrans_text"]]
    .rename(columns={"th_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Thai-English)")
)

# English -> Greek -> English
english_greek = (
    back_translated[["text_id", "el_backtrans_text"]]
    .rename(columns={"el_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Greek-English)")
)

# English -> Portuguese -> English
english_portuguese = (
    back_translated[["text_id", "pt_backtrans_text"]]
    .rename(columns={"pt_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Portuguese-English)")
)

# English -> Spanish -> English
english_spanish = (
    back_translated[["text_id", "es_backtrans_text"]]
    .rename(columns={"es_backtrans_text": "full_text"})
    .assign(source="Back translation (English-Spanish-English)")
)

In [None]:
# every back-translated variant that is currently enabled
back_translation_datasets = [
    english_french,
    english_german,
    english_russian,
    english_japanese,
    english_korean,
    english_thai,
    english_greek,
    english_portuguese,
    english_spanish,
    # english_italian, english_chinese,
]

# stack them row-wise into a single frame
back_translation = pd.concat(objs=back_translation_datasets, axis=0)
gc.collect()

display(back_translation)

# Synthetic data

In [None]:
def remove_t5_transform_special_tokens(text: str) -> str:
    """Delete ``<pad>``, ``<s>`` and ``</s>`` markers and trim the result.

    The markers are removed one kind after another, in that order, and
    leading/trailing whitespace is stripped afterwards.
    """
    for token_pattern in (r"<pad>", r"<s>", r"</s>"):
        text = re.sub(token_pattern, "", text)

    return text.strip()

def preprocess_synthetic_text(text: str) -> str:
    """Clean one synthetic sample; currently this only removes the T5 special tokens."""
    return remove_t5_transform_special_tokens(text)

In [None]:
synthetic_data_path = "../input/fp3-synthetic-data/synthetic_data.csv"

# load and tag the origin
synthetic_data = pd.read_csv(synthetic_data_path).assign(source="Synthetic data")

# pre-processing
synthetic_data["full_text"] = synthetic_data["full_text"].apply(preprocess_synthetic_text)

# common column layout
columns = ["text_id", "full_text", "source"]
synthetic_data = synthetic_data.loc[:, columns]
gc.collect()

display(synthetic_data)

# Summary

In [None]:
# every external source prepared in the cells above
all_datasets = [
    feedback_prize_2021_train, essayscsv, ivypanda, studentshare, 
    back_translation, essayforum, synthetic_data,
]

# Stack all sources into one frame. ignore_index=True already yields a fresh
# 0..n-1 index, so neither an empty seed frame nor an extra reset_index is
# needed.
external_dataset = pd.concat(all_datasets, axis=0, ignore_index=True)

# removing NaN and duplicates (duplicates are judged on the text alone)
external_dataset = external_dataset.dropna(subset=["text_id", "full_text"], axis=0)
external_dataset = external_dataset.drop_duplicates(subset=["full_text"]).reset_index(drop=True)

# length selection: both bounds are exclusive
min_num_words, max_num_words = 10, 1300
external_dataset["num_words"] = external_dataset["full_text"].apply(get_num_words)
num_words_mask = (min_num_words < external_dataset["num_words"]) & (external_dataset["num_words"] < max_num_words)
external_dataset = external_dataset[num_words_mask].reset_index(drop=True)
# num_words was only a helper for the filter
external_dataset = external_dataset.drop(["num_words"], axis=1)
# the per-source ids are replaced by one running number over the merged data
external_dataset["text_id"] = range(0, len(external_dataset))

# saving
external_dataset_output_path = os.path.join(output_directory, "external_data.csv")
external_dataset.to_csv(external_dataset_output_path, index=False, encoding="utf-8")
gc.collect()

display(external_dataset)

In [None]:
# loading: read the saved CSV back and drop rows that lack an id or a text
external_dataset = (
    pd.read_csv(external_dataset_output_path)
    .dropna(subset=["text_id", "full_text"], axis=0)
    .reset_index(drop=True)
)
external_dataset

# Exploratory Data Analysis

In [None]:
# Colours of seaborn's "mako" palette; the first one is used for the overall
# word-count density plot below.
palette = sns.color_palette("mako")

In [None]:
# word count per text, used by the plots below
external_dataset["num_words"] = external_dataset["full_text"].map(get_num_words)

In [None]:
# density of the word counts over the whole dataset
fig, ax = plt.subplots(figsize=(10, 7))
ax.grid(axis="x", color="lightgrey", zorder=0)
sns.kdeplot(
    data=external_dataset,
    x="num_words",
    color=palette[0],
    fill=True,
    alpha=0.9,
    zorder=2,
    ax=ax,
)
fig.show()

In [None]:
# density of the word counts, one curve per source
fig, ax = plt.subplots(figsize=(10, 7))
ax.grid(axis="x", color="lightgrey", zorder=0)
sns.kdeplot(
    data=external_dataset,
    x="num_words",
    hue="source",
    palette="mako",
    fill=True,
    alpha=0.9,
    zorder=2,
    ax=ax,
)
fig.show()

In [None]:
# number of texts per source
fig, ax = plt.subplots(figsize=(10, 7))
ax.grid(axis="x", color="lightgrey", zorder=0)
sns.countplot(
    data=external_dataset,
    y="source",
    palette="mako",
    zorder=2,
    ax=ax,
)
fig.show()