### Prepare folds

In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm

# Load the training table. The fold-assignment cell below reads its
# `discourse_effectiveness` and `essay_id` columns.
df = pd.read_csv("../data/train.csv")

In [None]:
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
import numpy as np

# Assign every row to one of 5 validation folds. Splits are stratified on the
# effectiveness label and grouped by essay id, so rows sharing an essay id are
# never split across folds.
splitter = StratifiedGroupKFold(n_splits=5)
splits = list(
    splitter.split(
        np.arange(len(df)),
        df.discourse_effectiveness,
        groups=df["essay_id"],
    )
)

for fold, (_, val_idx) in enumerate(splits):
    # Only the validation half of each split is used: it names the rows that
    # belong to this fold.
    df.loc[val_idx, "fold"] = fold

# Store the fold ids as integers.
df["fold"] = df["fold"].astype(int)

In [None]:
# Count of each effectiveness label per fold, to eyeball the stratification.
pd.crosstab(index=df["fold"], columns=df["discourse_effectiveness"])

In [None]:
# df.to_csv("../data/train_folded.csv", index=False)

### Prepare previous competition data

In [None]:
from pathlib import Path

# Load the 2021 competition annotations and rename the columns positionally so
# that the first column is called `essay_id`, as in the current training data.
df = pd.read_csv("../data/feedback_2021/train.csv")

df.columns = ['essay_id', 'discourse_id', 'discourse_start', 'discourse_end',
       'discourse_text', 'discourse_type', 'discourse_type_num',
       'predictionstring']
new = pd.read_csv("../data/train_folded.csv")

# Keep only essays that do not appear in the current (folded) training set.
df = df[~df.essay_id.isin(new.essay_id)].reset_index(drop=True)

# Map essay id (file name without the ".txt" suffix) -> full essay text.
essay_texts = {}
for fname in tqdm(glob("../data/feedback_2021/train/*.txt")):
    essay_path = Path(fname)
    # Decode explicitly as UTF-8: `pd.read_csv` decodes the CSV as UTF-8 by
    # default, and the discourse texts are later searched for inside these
    # essay texts, so both must be decoded the same way regardless of the
    # platform's locale encoding. `Path.stem` also avoids assuming "/" as the
    # path separator.
    essay_texts[essay_path.stem] = essay_path.read_text(encoding="utf-8")

df["essay_text"] = df.essay_id.map(essay_texts)

# df.to_csv("../data/old_competition_data.csv", index=False)

In [None]:
# Split every essay into consecutive text segments with one label each:
# "A" + discourse type for an annotated discourse, "O" for any text that lies
# before, between or after the discourses.
all_obs = []

for name, gr in tqdm(df.groupby("essay_id", sort=False)):
    essay_text_start_end = gr.essay_text.values[0]
    token_labels = []
    token_obs = []

    # Offset just past the previously matched discourse; each search resumes
    # from here so discourses are matched in row order.
    end_pos = 0
    for discourse_type, discourse_text in zip(gr["discourse_type"], gr["discourse_text"]):
        target_text = discourse_text.strip()

        start_pos = essay_text_start_end.find(target_text, end_pos)
        if start_pos == -1:
            raise ValueError(
                f"discourse text not found in essay {name!r} "
                f"at or after offset {end_pos}: {target_text[:50]!r}"
            )

        # Unlabelled text in front of this discourse. A single check covers
        # both the text before the first discourse and the gaps between
        # discourses, including the case where earlier discourses were empty
        # and left `end_pos` at 0.
        if start_pos > end_pos:
            token_labels.append("O")
            token_obs.append(essay_text_start_end[end_pos:start_pos])

        end_pos = start_pos + len(target_text)
        token_labels.append("A" + discourse_type)
        token_obs.append(essay_text_start_end[start_pos:end_pos])

    # Unlabelled text after the last discourse.
    if end_pos < len(essay_text_start_end):
        token_labels.append("O")
        token_obs.append(essay_text_start_end[end_pos:])

    all_obs.append((name, token_labels, token_obs))

tt = pd.DataFrame(all_obs, columns=["essay_id", "tokens", "essay_text"])
# tt.to_parquet("../data/feedback_2021_pretrain.pq", index=False)

### Prepare train data

In [None]:
import pandas as pd

# Load the training table that carries the `fold` column.
df = pd.read_csv("../data/train_folded.csv")
# Overwrite the text of one discourse with a hard-coded literal.
# NOTE(review): presumably the CSV value for this row does not occur verbatim
# in the essay text, which would make the substring search in the following
# cells raise ValueError — confirm against the raw data.
df.loc[df.discourse_id == "56744a66949a", "discourse_text"] = "This whole thing is point less how they have us in here for two days im missing my education. We could have finished this in one day and had the rest of the week to get back on the track of learning. I've missed both days of weight lifting, algebra, and my world history that i do not want to fail again! If their are any people actually gonna sit down and take the time to read this then\n\nDO NOT DO THIS NEXT YEAR\n\n.\n\nThey are giving us cold lunches. ham and cheese and an apple, I am 16 years old and my body needs proper food. I wouldnt be complaining if they served actual breakfast. but because of Michelle Obama and her healthy diet rule they surve us 1 poptart in the moring. How does the school board expect us to last from 7:05-12:15 on a pop tart? then expect us to get A's, we are more focused on lunch than anything else. I am about done so if you have the time to read this even though this does not count. Bring PROPER_NAME a big Mac from mc donalds, SCHOOL_NAME, (idk area code but its in LOCATION_NAME)       \xa0    "

In [None]:
# Split every essay into consecutive text segments with one label each. Each
# discourse is rewritten in place as "<discourse_type> <discourse_text>" and
# labelled with its effectiveness; all remaining text is labelled "O".
all_obs = []

for name, gr in tqdm(df.groupby("essay_id", sort=False)):
    essay_text_start_end = gr.essay_text.values[0]
    token_labels = []
    token_obs = []

    # Offset just past the previously matched discourse; each search resumes
    # from here so discourses are matched in row order.
    end_pos = 0
    rows = zip(gr["discourse_type"], gr["discourse_text"], gr["discourse_effectiveness"])
    for discourse_type, discourse_text, effectiveness in rows:
        source_text = discourse_text.strip()
        target_text = discourse_type + " " + source_text

        # Prefix the first remaining occurrence of the discourse with its
        # type, leaving everything before `end_pos` untouched.
        essay_text_start_end = (
            essay_text_start_end[:end_pos]
            + essay_text_start_end[end_pos:].replace(source_text, target_text, 1)
        )

        start_pos = essay_text_start_end.find(target_text, end_pos)
        if start_pos == -1:
            raise ValueError(
                f"discourse text not found in essay {name!r} "
                f"at or after offset {end_pos}: {source_text[:50]!r}"
            )

        # Unlabelled text in front of this discourse. A single check covers
        # both the text before the first discourse and the gaps between
        # discourses, including the case where earlier matches were empty
        # and left `end_pos` at 0.
        if start_pos > end_pos:
            token_labels.append("O")
            token_obs.append(essay_text_start_end[end_pos:start_pos])

        end_pos = start_pos + len(target_text)
        token_labels.append(effectiveness)
        token_obs.append(essay_text_start_end[start_pos:end_pos])

    # Unlabelled text after the last discourse.
    if end_pos < len(essay_text_start_end):
        token_labels.append("O")
        token_obs.append(essay_text_start_end[end_pos:])

    # Fold of the essay, read from the group's last row instead of relying on
    # a loop variable that outlives the inner loop.
    all_obs.append((name, token_labels, token_obs, int(gr["fold"].iloc[-1])))

tt = pd.DataFrame(all_obs, columns=["essay_id", "tokens", "essay_text", "fold"])
# tt.to_parquet("../data/feedback_text_token_classification_v5.pq", index=False)

In [None]:
# Same segmentation as the previous cell, but every essay additionally starts
# with an "O"-labelled segment that lists the discourse types of all of its
# discourses, joined by spaces.
all_obs = []

for name, gr in tqdm(df.groupby("essay_id", sort=False)):
    essay_text_start_end = gr.essay_text.values[0]

    # Leading segment: the essay's discourse types in row order.
    token_obs = [" ".join(gr.discourse_type.to_list())]
    token_labels = ["O"]

    # Offset just past the previously matched discourse; each search resumes
    # from here so discourses are matched in row order.
    end_pos = 0
    rows = zip(gr["discourse_type"], gr["discourse_text"], gr["discourse_effectiveness"])
    for discourse_type, discourse_text, effectiveness in rows:
        source_text = discourse_text.strip()
        target_text = discourse_type + " " + source_text

        # Prefix the first remaining occurrence of the discourse with its
        # type, leaving everything before `end_pos` untouched.
        essay_text_start_end = (
            essay_text_start_end[:end_pos]
            + essay_text_start_end[end_pos:].replace(source_text, target_text, 1)
        )

        start_pos = essay_text_start_end.find(target_text, end_pos)
        if start_pos == -1:
            raise ValueError(
                f"discourse text not found in essay {name!r} "
                f"at or after offset {end_pos}: {source_text[:50]!r}"
            )

        # Unlabelled text in front of this discourse. A single check covers
        # both the text before the first discourse and the gaps between
        # discourses, including the case where earlier matches were empty
        # and left `end_pos` at 0.
        if start_pos > end_pos:
            token_labels.append("O")
            token_obs.append(essay_text_start_end[end_pos:start_pos])

        end_pos = start_pos + len(target_text)
        token_labels.append(effectiveness)
        token_obs.append(essay_text_start_end[start_pos:end_pos])

    # Unlabelled text after the last discourse.
    if end_pos < len(essay_text_start_end):
        token_labels.append("O")
        token_obs.append(essay_text_start_end[end_pos:])

    # Fold of the essay, read from the group's last row instead of relying on
    # a loop variable that outlives the inner loop.
    all_obs.append((name, token_labels, token_obs, int(gr["fold"].iloc[-1])))

tt = pd.DataFrame(all_obs, columns=["essay_id", "tokens", "essay_text", "fold"])
# tt.to_parquet("../data/feedback_text_token_classification_types.pq", index=False)

### Prepare pseudo data

In [None]:
import pandas as pd
from tqdm import tqdm

# Pseudo-label table; swap in one of the commented alternatives to build the
# other pseudo-label variants.
labels = pd.read_csv("../data/pseudo_75_ff_raw.csv")
# labels = pd.read_csv("../data/pseudo_104_ff_raw.csv")
# labels = pd.read_csv("../data/pseudo_140_ff_raw.csv")

# Attach the pseudo labels to the 2021 discourse rows. No join keys are
# passed, so pandas joins on the columns the two tables have in common.
df = pd.read_csv("../data/old_competition_data.csv").merge(labels)

In [None]:
# Split every pseudo-labelled essay into consecutive text segments. Each
# discourse is rewritten in place as "<discourse_type> <discourse_text>" and
# carries the soft label [Adequate, Effective, Ineffective]; all remaining text
# carries the placeholder label [-1, -1, -1].
all_obs = []

for name, gr in tqdm(df.groupby("essay_id", sort=False)):
    essay_text_start_end = gr.essay_text.values[0]
    token_labels = []
    token_obs = []
    # Offset just past the previously matched discourse; each search resumes
    # from here so discourses are matched in row order.
    end_pos = 0

    # Pseudo with types in the beginning (pseudo 104 and 140)
    # token_obs.append(" ".join(gr.discourse_type.to_list()))
    # token_labels.append([-1, -1, -1])

    rows = zip(
        gr["discourse_type"],
        gr["discourse_text"],
        gr["Adequate"],
        gr["Effective"],
        gr["Ineffective"],
    )
    for discourse_type, discourse_text, adequate, effective, ineffective in rows:
        source_text = discourse_text.strip()
        target_text = discourse_type + " " + source_text

        # Prefix the first remaining occurrence of the discourse with its
        # type, leaving everything before `end_pos` untouched.
        essay_text_start_end = (
            essay_text_start_end[:end_pos]
            + essay_text_start_end[end_pos:].replace(source_text, target_text, 1)
        )

        start_pos = essay_text_start_end.find(target_text, end_pos)
        if start_pos == -1:
            raise ValueError(
                f"discourse text not found in essay {name!r} "
                f"at or after offset {end_pos}: {source_text[:50]!r}"
            )

        # Unlabelled text in front of this discourse. A single check covers
        # both the text before the first discourse and the gaps between
        # discourses, including the case where earlier matches were empty
        # and left `end_pos` at 0.
        if start_pos > end_pos:
            # Soft_Labels
            token_labels.append([-1, -1, -1])
            token_obs.append(essay_text_start_end[end_pos:start_pos])

        end_pos = start_pos + len(target_text)
        # Soft_Labels
        token_labels.append([adequate, effective, ineffective])
        token_obs.append(essay_text_start_end[start_pos:end_pos])

    # Unlabelled text after the last discourse.
    if end_pos < len(essay_text_start_end):
        # Soft_Labels
        token_labels.append([-1, -1, -1])
        token_obs.append(essay_text_start_end[end_pos:])

    all_obs.append((name, token_labels, token_obs))

tt = pd.DataFrame(all_obs, columns=["essay_id", "tokens", "essay_text"])

# tt.to_parquet("../data/pseudo_75_ff.pq", index=False)
# tt.to_parquet("../data/pseudo_104_ff.pq", index=False)
# tt.to_parquet("../data/pseudo_140_ff.pq", index=False)