# Imports and Setup

In [1]:
import argparse
import glob
import os
import random
import re
import pandas as pd
import numpy as np
import csv
import json
from IPython.core.display import HTML

%load_ext autoreload
%autoreload 2

In [None]:
# Spacy setup. Need to be done once.
!pip -q install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm
## !pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

In [3]:
import spacy
from spacy.lang.es.examples import sentences 
from spacy import displacy

print(spacy.__version__)
print(pd.__version__)

from nltk.parse import stanford, corenlp
NLP_ES = spacy.load("es_core_news_sm")
NLP_EN = spacy.load("en_core_web_sm")

3.7.2
1.4.2


In [4]:
from config import Config
args = Config()

In [6]:
# Helper methods
def get_neighbors(idx, id_to_head, head_to_ids, verbosity):
    as_list = []
    as_map = dict()

    if idx in id_to_head:
        upstream = id_to_head[idx]
        downstream = []
        if idx in head_to_ids:
            downstream = head_to_ids[idx]

        # Add to neighbors list
        as_list = [upstream, *downstream]
        # Add to the neighbors map
        as_map[f'{idx}'] = upstream
        for i in downstream:
            as_map[f'{i}'] = idx

    if verbosity > 1:
        print(f'idx:{idx}, as_list: {as_list}, as_map: {as_map}')

    return (as_list, as_map)

def dep_parsing_tree(nlp, text, correction_index, verbosity=0):
    doc = nlp(text)
    js_str = doc.to_json()
    id_to_head = dict() #dict([(token["id"], token["head"]) for token in js_str["tokens"]])
    head_to_ids = dict()
    for token in js_str["tokens"]:
        id = token["id"]
        head = token["head"]
        if token['pos'] != 'PUNCT':
            id_to_head[id] = head
            if head in head_to_ids:
                head_to_ids[head].append(id)
            else:
                head_to_ids[head] = [id]
    if verbosity > 0:
        print(f'id -> head: {id_to_head}')
        print(f'head -> ids: {head_to_ids}')            
        
    pred_parse_order = dict()
    for cor_idx in correction_index:
        # Get the head for the current correction index
        (as_list, as_map) = get_neighbors(cor_idx, id_to_head, head_to_ids, verbosity)
        for i in as_list:
            (_, sub_map) = get_neighbors(i, id_to_head, head_to_ids, verbosity)
            as_map.update(sub_map)          
        pred_parse_order.update(as_map)

#         if cor_idx in id_to_head:
#             first_order = id_to_head[cor_idx] # Do we need the downstream dependencies as well here?
#             tt = head_to_ids[cor_idx]
#             for t in tt:
#                 # get second order
#             second_order = head_to_ids[first_order]
#             pred_parse_order[f'{cor_idx}'] = first_order
#             for i in second_order:
#                 pred_parse_order[f'{i}'] = first_order

    if verbosity > 0:
        df_dep_tree = pd.DataFrame()
        df_dep_tree['text'] = [token.text for token in doc]
        df_dep_tree['lemma'] = [token.lemma_ for token in doc]
        df_dep_tree['is_punctuation'] = [token.is_punct for token in doc]
        df_dep_tree['is_space'] = [token.is_space for token in doc]
        df_dep_tree['shape'] = [token.shape_ for token in doc]
        df_dep_tree['part_of_speech'] = [token.pos_ for token in doc]
        df_dep_tree['pos_tag'] = [token.tag_ for token in doc]
        df_dep_tree['head'] = [token.head.text for token in doc] 
        df_dep_tree['dep'] = [token.dep_ for token in doc]
        html = displacy.render(doc, style="dep")
        HTML(html)
    return pred_parse_order

# Load Data and do Sanity Tests

## Sanity on Full Data

In [7]:
def load():
    converters = {'correction_index': pd.eval, 'source': pd.eval, 'target': pd.eval}
    # Converters not working
    df = pd.read_csv(args.PROCESSED_DATA_FOLDER + "/final_dataset.csv", index_col=None, header=0)
    return df

df = load()

  df = pd.read_csv(args.PROCESSED_DATA_FOLDER + "/final_dataset.csv", index_col=None, header=0)


In [8]:
# Check the columns
df.columns

Index(['Unnamed: 0', 'paragraph_id', 'sentence_id', 'course', 'essay_new',
       'annotated_sentence', 'full_error_tag', 'order_tag', 'error_tag',
       'correction', 'error', 'anno_error_cnt', 'error_type',
       'correction_index', 'source', 'target', 'check_target', 'check_source'],
      dtype='object')

In [9]:
# All rows should have an error_type
df[df.error_type.isnull()]

Unnamed: 0.1,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,correction,error,anno_error_cnt,error_type,correction_index,source,target,check_target,check_source


In [10]:
# Check for null values
df[df.notnull()].count()

Unnamed: 0            47083
paragraph_id          47083
sentence_id           47083
course                47083
essay_new             47083
annotated_sentence    47083
full_error_tag         5622
order_tag              5622
error_tag              5622
correction             5573
error                  5208
anno_error_cnt        47083
error_type            47083
correction_index      47083
source                47083
target                47083
check_target          47083
check_source          47083
dtype: int64

In [11]:
df.isna().sum()

Unnamed: 0                0
paragraph_id              0
sentence_id               0
course                    0
essay_new                 0
annotated_sentence        0
full_error_tag        41461
order_tag             41461
error_tag             41461
correction            41510
error                 41875
anno_error_cnt            0
error_type                0
correction_index          0
source                    0
target                    0
check_target              0
check_source              0
dtype: int64

In [13]:
# The srouce and target as loaded as string version of lists. Convert them to actual lists.
def convert_strings(row):
    row.source = eval(row.source)
    row.target = eval(row.target)
    
print(type(df.iloc[1].source))
df.apply(convert_strings, axis=1)
print(type(df.iloc[1].source))

<class 'str'>
<class 'str'>


In [14]:
# Add additional fields
df['evidence_index'] = np.empty((len(df), 0)).tolist()
df['predicted_parsing_order'] = ""
df['origin'] = 'A'

In [15]:
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,correction,...,anno_error_cnt,error_type,correction_index,source,target,check_target,check_source,evidence_index,predicted_parsing_order,origin
32010,510,892,3647b755b894305c3a2615dc73f6b59378ce6373524572...,SPA 1,No se gusta la gente de los paises otros.,No se gusta la gente de los paises otros.,,,,,...,0,,[],"['No', 'se', 'gusta', 'la', 'gente', 'de', 'lo...","['No', 'se', 'gusta', 'la', 'gente', 'de', 'lo...",0,0,[],,A
16898,794,1313,eb30407b6726daafa085ee2080cd8116736545cac7b138...,SPA 2,"Cuando llego al calor, me voy al océano y nada...","Cuando llego al calor, me voy al océano y nada...",,,,,...,0,,[],"['Cuando', 'llego', 'al', 'calor', ',', 'me', ...","['Cuando', 'llego', 'al', 'calor', ',', 'me', ...",0,0,[],,A
31556,488,856,c867082507f7171f105c452ae746b8b73f99f27d925713...,SPA 2,Joan Rivers muere en 2014.,Joan Rivers muere en 2014.,,,,,...,0,,[],"['Joan', 'Rivers', 'muere', 'en', '2014', '.']","['Joan', 'Rivers', 'muere', 'en', '2014', '.']",0,0,[],,A
15253,717,1196,64066d56fe484c254637e58ffb318ada4211062e886da8...,SPA 3,Habrá mucha gente.,Habrá mucha gente.,,,,,...,0,,[],"['Habrá', 'mucha', 'gente', '.']","['Habrá', 'mucha', 'gente', '.']",0,0,[],,A
43565,574,995,9f300b60229662a786240c6b92829bc748b3b89bbf842e...,SPA 3,Comen para mucho tiempo para mucho cursos de c...,Comen para mucho tiempo para [mucho]{muchos}<n...,[mucho]{muchos}<na:ps:det:inan>,0.0,na:ps:det:inan,muchos,...,1,na:ps,"[5, 16]","['Comen', 'para', 'mucho', 'tiempo', 'para', '...","['Comen', 'para', 'mucho', 'tiempo', 'para', '...",0,0,[],,A


## Split Dataset into With Errors and Without Errors

In [16]:
# Split dataset into two dataframes one with errors and one without
df_no_errors = df.query('error_type == "None"').copy()
df_no_errors.reset_index(inplace=True)
df_no_errors.rename(columns={'index': "orig_index"}, inplace=True)
df_errors = df.query('error_type != "None"').copy()
df_errors.reset_index(inplace=True)
df_errors.rename(columns={'index': "orig_index"}, inplace=True)
df_errors


Unnamed: 0.1,orig_index,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,...,anno_error_cnt,error_type,correction_index,source,target,check_target,check_source,evidence_index,predicted_parsing_order,origin
0,41461,0,1,5643cbe8e41322b9157848dae95bfc1d33e531b4d5e98d...,SPA 2,Las niñas de todo el país miran a Lauren y enc...,Las niñas de todo el país miran [a]{a}<aa:do:a...,[a]{a}<aa:do:an>,0.0,aa:do:an,...,1,aa:do,"[7, 33]","['Las', 'niñas', 'de', 'todo', 'el', 'país', '...","['Las', 'niñas', 'de', 'todo', 'el', 'país', '...",0,0,[],,A
1,41462,2,5,888d8bf63a6f44b4b925c61492c0240c21c55e8503c2d8...,SPA 3,Me gusta mucho Taylor Swift porque soy similar...,Me gusta mucho Taylor Swift porque soy similar...,[a]{a}<aa:do:an>,0.0,aa:do:an,...,1,aa:do,"[9, 22]","['Me', 'gusta', 'mucho', 'Taylor', 'Swift', 'p...","['Me', 'gusta', 'mucho', 'Taylor', 'Swift', 'p...",0,0,[],,A
2,41463,4,8,96d3244e9824d903da11192ce6b046b84c44b2da910561...,SPA 1,Ella enseña a muchas personas con sus conocimi...,Ella enseña [a]{a}<aa:do:an> muchas personas c...,[a]{a}<aa:do:an>,0.0,aa:do:an,...,1,aa:do,"[2, 12]","['Ella', 'enseña', 'a', 'muchas', 'personas', ...","['Ella', 'enseña', 'a', 'muchas', 'personas', ...",0,0,[],,A
3,41464,4,8,cd60f77dc5a4d52091b7ec9c8e47bd0c6f0c71a084908a...,SPA 1,Ella está feliz de ayudar a otras personas.,Ella está feliz de ayudar [a]{a}<aa:do:an> otr...,[a]{a}<aa:do:an>,0.0,aa:do:an,...,1,aa:do,"[5, 15]","['Ella', 'está', 'feliz', 'de', 'ayudar', 'a',...","['Ella', 'está', 'feliz', 'de', 'ayudar', 'a',...",0,0,[],,A
4,41465,4,8,e928d9f779186cc01fe17161ca830c8b6cd64d47e8b355...,SPA 1,Maya angelou escribe muchas historias para ayu...,Maya angelou escribe muchas historias para ayu...,[a]{a}<aa:do:an>,0.0,aa:do:an,...,1,aa:do,"[7, 19]","['Maya', 'angelou', 'escribe', 'muchas', 'hist...","['Maya', 'angelou', 'escribe', 'muchas', 'hist...",0,0,[],,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5617,47078,44,1492,f6f91450d7fd28e3df5266da689a9ee900195ee3d9a435...,SPA 3,Llevaria mi familia al Museo Nacional para mir...,Llevaria mi familia al Museo Nacional para mir...,[los]{las}<ga:fm:det:inan>,1.0,ga:fm:det:inan,...,4,ga:fm,"[9, 77]","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...",0,0,[],,A
5618,47079,44,1492,f6f91450d7fd28e3df5266da689a9ee900195ee3d9a435...,SPA 3,Llevaria mi familia al Museo Nacional para mir...,Llevaria mi familia al Museo Nacional para mir...,[todo]{todas}<ga:na:fm:ps:det:inan>,2.0,ga:na:fm:ps:det:inan,...,4,ga:na:fm:ps,"[41, 109]","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...",0,0,[],,A
5619,47080,44,1492,f6f91450d7fd28e3df5266da689a9ee900195ee3d9a435...,SPA 3,Llevaria mi familia al Museo Nacional para mir...,Llevaria mi familia al Museo Nacional para mir...,[los]{las}<ga:fm:det:inan>,3.0,ga:fm:det:inan,...,4,ga:fm,"[42, 110]","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...","['Llevaria', 'mi', 'familia', 'al', 'Museo', '...",0,0,[],,A
5620,47081,47,1623,d136aa5a6cdd9ba0c56a12b3e4b24b37f0f8293d013291...,SPA 2,"Entonces será certificado como “perfecto"".",Entonces será [certificado]{certificadas}<ga:f...,[certificado]{certificadas}<ga:fm:adj:inan>,0.0,ga:fm:adj:inan,...,2,ga:fm,"[2, 11]","['Entonces', 'será', 'certificado', 'como', '""...","['Entonces', 'será', 'certificadas', 'como', '...",0,0,[],,A


In [17]:
df_errors.to_csv(args.PROCESSED_DATA_FOLDER + "/final_with_errors.csv", index=False)

In [18]:
df_errors.index[(df_errors.essay_new == 'Ella está feliz de ayudar a otras personas.')]

Int64Index([3], dtype='int64')

In [19]:
df_errors[df_errors.notnull()].count()

orig_index                 5622
Unnamed: 0                 5622
paragraph_id               5622
sentence_id                5622
course                     5622
essay_new                  5622
annotated_sentence         5622
full_error_tag             5622
order_tag                  5622
error_tag                  5622
correction                 5573
error                      5208
anno_error_cnt             5622
error_type                 5622
correction_index           5622
source                     5622
target                     5622
check_target               5622
check_source               5622
evidence_index             5622
predicted_parsing_order    5622
origin                     5622
dtype: int64

In [20]:
df_no_errors[df_no_errors.notnull()].count()

orig_index                 41461
Unnamed: 0                 41461
paragraph_id               41461
sentence_id                41461
course                     41461
essay_new                  41461
annotated_sentence         41461
full_error_tag                 0
order_tag                      0
error_tag                      0
correction                     0
error                          0
anno_error_cnt             41461
error_type                 41461
correction_index           41461
source                     41461
target                     41461
check_target               41461
check_source               41461
evidence_index             41461
predicted_parsing_order    41461
origin                     41461
dtype: int64

In [21]:
df_errors.full_error_tag.unique()

array(['[a]{a}<aa:do:an>', '[]{a}<az:do:an>', '[a]{}<za:do:inan>', ...,
       '[fresco]{frescos}<na:ps:adj:inan>',
       '[encurtido]{encurtidos}<na:ps:adj:inan>',
       '[certificado]{certificadas}<ga:fm:adj:inan>'], dtype=object)

## Sanity Check Lines with Errors

In [22]:
def compute_status(row):
    num_len_mismatch = 0
    num_extra_corrections = 0
    source = eval(row.source)
    target = eval(row.target)
    correction_index = set(eval(row.correction_index))
    row_status = 'GOOD'
    if len(source) != len(target):
        num_len_mismatch = num_len_mismatch + 1
        row_status = 'LEN_MISMATCH'
    else:
        mismatches = map(lambda x: x[0] == x[1], zip(source, target))
        for (i, flag) in enumerate(mismatches):
            if not flag and i not in correction_index:
                # print('Extra corrections')
                num_extra_corrections = num_extra_corrections + 1
                row_status = 'EXTRA_CORRECTIONS'
                break
    # row['status'] = row_status
    return row_status
    # print(f'Found {num_len_mismatch} entries and {num_extra_corrections} entries with extra corrections')

def check_errors():
    df_tt = df_errors[['orig_index', 'sentence_id', 'essay_new', 'source', 'target', 'correction_index']].copy(deep=True)
    df_tt['status'] = df_tt.apply(compute_status, axis=1)
    df_tt['has_annotation'] = df_tt.apply(lambda row: '<' in row.source or '<' in row.target, axis=1)
    return df_tt
    
df_error_status = check_errors()

In [24]:
print('Should have just GOOD rows')
print(df_error_status.status.value_counts())

print()
print('Should not have annotations')
print(df_error_status.has_annotation.value_counts())

with pd.option_context('display.max_colwidth', None):
    display(df_error_status[df_error_status['status'] == 'EXTRA_CORRECTIONS'].sample(2))
    print("----------------------")
    display(df_error_status[df_error_status.has_annotation == True].sample(2))

Should have just GOOD rows
GOOD                 3364
EXTRA_CORRECTIONS    2258
Name: status, dtype: int64

Should not have annotations
False    5188
True      434
Name: has_annotation, dtype: int64


Unnamed: 0,orig_index,sentence_id,essay_new,source,target,correction_index,status,has_annotation
4571,46032,8eadccd43198fb2567027d22de3448f40de9a7a20f75d141e68f62c072831588,Mi vacacion perfecto es un viaje al Costa Rica.,"['Mi', 'vacacion', 'perfecto', 'es', 'un', 'viaje', 'al', 'Costa', 'Rica', '.']","['Mi', 'vacaciones', 'perfecta', 'es', 'un', 'viaje', 'al', 'Costa', 'Rica', '.']","[1, 12]",EXTRA_CORRECTIONS,False
4592,46053,6291916380a7faeca808d4cf8350fd69ea27ceebec686991dcf31e738aba7caa,"Yo vería los pájaros, las plantas, los flores, los animales, las personas de Hawaii, la playa, el mar, las pescados y casas muy interesante.","['Yo', 'vería', 'los', 'pájaros', ',', 'las', 'plantas', ',', 'los', 'flores', ',', 'los', 'animales', ',', 'las', 'personas', 'de', 'Hawaii', ',', 'la', 'playa', ',', 'el', 'mar', ',', 'los', 'pescados', 'y', 'casas', 'muy', 'interesante', '.']","['Yo', 'vería', 'los', 'pájaros', ',', 'las', 'plantas', ',', 'los', 'flores', ',', 'los', 'animales', ',', 'las', 'personas', 'de', 'Hawaii', ',', 'la', 'playa', ',', 'el', 'mar', ',', 'las', 'pescados', 'y', 'casas', 'muy', 'interesantes', '.']","[30, 63]",EXTRA_CORRECTIONS,False


----------------------


Unnamed: 0,orig_index,sentence_id,essay_new,source,target,correction_index,status,has_annotation
5391,46852,4089a1722b4d7e5c89c7fc303519c44de7b2177ec5f2fca7209cf9b62625467c,Me gustaría mucho pasear el tiempo en Europa con mi esposo y relajarme en las paises lindas!,"['Me', 'gustaría', 'mucho', 'pasear', 'el', 'tiempo', 'en', 'Europa', 'con', 'mi', 'esposo', 'y', 'relajarme', 'en', 'las', 'países', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'lindas', '!']","['Me', 'gustaría', 'mucho', 'pasear', 'el', 'tiempo', 'en', 'Europa', 'con', 'mi', 'esposo', 'y', 'relajarme', 'en', 'los', 'países', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'lindos', '!']","[14, 40]",EXTRA_CORRECTIONS,True
3524,44985,2fd3744a9b6ebe37adb4ab3a66e033cc899d03b34278e0f46094f49d253c40d6,El historia de aquí jugador es muchos motivador.,"['El', 'historia', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'de', 'aquí', 'jugador', 'es', 'muchos', 'motivador', '.']","['la', 'historia', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'de', 'aquí', 'jugador', 'es', 'muchos', 'motivadora', '.']","[0, 17]",EXTRA_CORRECTIONS,True


# Explore EXPECT dataset

In [25]:
def load_expect_data(num_lines=None):
    expect_train_file = f"{args.EXPLAINABLE_GEC_DATA_FOLDER}/train.json"
    with open(expect_train_file, "r") as file1:
        count = 0
        rows = file1.readlines()
        df = pd.DataFrame([eval(row) for row in rows])
        df['source_org'] = df.source.map(lambda v: ' '.join(v).replace(" .", "."))        
        df['target_org'] = df.target.map(lambda v: ' '.join(v).replace(" .", "."))
        return df

df_expect = load_expect_data()


In [26]:
with pd.option_context('display.max_colwidth', None):
    display(df_expect.sample(5))

Unnamed: 0,target,source,evidence_index,correction_index,error_type,predicted_parsing_order,origin,source_org,target_org
8524,"[First, of, all, ,, transport, is, the, most, significant, carbon, dioxide, [NONE], emitter, .]","[First, of, all, ,, transport, is, the, most, significant, carbon, dioxide, 's, emitter, .]","[9, 10, 12, 24, 25, 27]","[11, 26]",POS Confusion,"{'11': 1, '26': 1}",B,"First of all , transport is the most significant carbon dioxide 's emitter.","First of all , transport is the most significant carbon dioxide [NONE] emitter."
5919,"[To, put, it, simply, ,, the, new, generation, has, decided, to, take, their, driving, licence, and, I, have, seen, this, around, me, ,, with, my, colleagues, ,, my, friends, ,, my, family, .]","[To, put, it, simply, ,, the, new, generation, has, decided, to, take, their, driving, licence, and, I, have, seen, this, around, me, ,, [NONE], my, colleagues, ,, my, friends, ,, my, family, .]","[24, 25, 58, 59]","[23, 57]",Preposition,"{'18': 3, '22': 2, '23': 1, '25': 3, '26': 2, '28': 3, '29': 3, '52': 3, '56': 2, '57': 1, '59': 3, '60': 2, '62': 3, '63': 3}",A,"To put it simply , the new generation has decided to take their driving licence and I have seen this around me , [NONE] my colleagues , my friends , my family.","To put it simply , the new generation has decided to take their driving licence and I have seen this around me , with my colleagues , my friends , my family."
5849,"[Basketball, is, recognized, by, people, all, over, the, world, .]","[Basketball, is, recognize, by, people, all, over, the, world, .]","[0, 1, 3, 4, 11, 12, 14, 15]","[2, 13]",Participle,"{'0': 2, '1': 2, '2': 1, '3': 2, '4': 3, '9': 2, '11': 2, '12': 2, '13': 1, '14': 2, '15': 3, '20': 2}",A,Basketball is recognize by people all over the world.,Basketball is recognized by people all over the world.
9125,"[Anyway, ,, I, still, love, him, and, still, have, the, hope, that, this, is, just, a, temporary, period, in, our, life, and, he, will, be, back, to, the, man, I, loved, once, ,, especially, since, we, have, the, most, amazing, reason, to, live, for, ;, it, 's, our, beautiful, son, ,, our, angel, and, the, amazing, love, of, our, lives, .]","[Anyway, ,, I, still, love, him, and, still, have, the, hope, that, this, is, just, a, temporary, period, in, our, life, and, he, will, be, back, to, the, man, I, loved, once, ,, especially, that, we, have, the, most, amazing, reason, to, live, for, ;, it, 's, our, beautiful, son, ,, our, angel, and, the, amazing, love, of, our, lives, .]",[],"[34, 96]",Others,"{'24': 3, '33': 2, '34': 1, '86': 3, '95': 2, '96': 1}",B,"Anyway , I still love him and still have the hope that this is just a temporary period in our life and he will be back to the man I loved once , especially that we have the most amazing reason to live for ; it 's our beautiful son , our angel and the amazing love of our lives.","Anyway , I still love him and still have the hope that this is just a temporary period in our life and he will be back to the man I loved once , especially since we have the most amazing reason to live for ; it 's our beautiful son , our angel and the amazing love of our lives."
7122,"[If, you, want, to, play, this, sport, ,, you, have, to, make, a, really, big, effort, and, try, to, practise, as, much, as, possible, .]","[If, you, want, to, play, this, sport, ,, I, have, to, make, a, really, big, effort, and, try, to, practise, as, much, as, possible, .]",[],"[8, 34]",Possessive,"{'2': 2, '8': 1, '9': 3, '28': 2, '34': 1, '35': 3}",B,"If you want to play this sport , I have to make a really big effort and try to practise as much as possible.","If you want to play this sport , you have to make a really big effort and try to practise as much as possible."


In [27]:
df_expect.isna().sum()

target                     0
source                     0
evidence_index             0
correction_index           0
error_type                 0
predicted_parsing_order    0
origin                     0
source_org                 0
target_org                 0
dtype: int64

## Dependency parsing tree on EXPECT

In [28]:
with pd.option_context('display.max_colwidth', None):
    display(df_expect.iloc[11872])

target                                 [The, periodic, movement, of, the, minute, hand, compared, to, the, smooth, path, of, the, second, hand, .]
source                                     [The, periodic, move, of, the, minute, hand, compared, to, the, smooth, path, of, the, second, hand, .]
evidence_index                                                                                                                      [0, 1, 18, 19]
correction_index                                                                                                                           [2, 20]
error_type                                                                                                                           POS Confusion
predicted_parsing_order    {'0': 2, '1': 2, '2': 1, '3': 2, '5': 3, '6': 3, '7': 2, '18': 2, '19': 2, '20': 1, '21': 2, '23': 3, '24': 3, '25': 2}
origin                                                                                                                

In [29]:
def show_one_EXPECT(nlp, row_idx):
    row = df_expect.iloc[row_idx]
    parsing_order = dep_parsing_tree(nlp, row.target_org, row.correction_index, verbosity=1)
    print()
    print(f'Orig: {row.source_org}')
    print(f'Corr: {row.target_org}')
    concacted = [*row.source, 'SEP', *row.target]
    print(f'Concatenated: {concacted}')
    print(f'Corr: {row.target_org}')
    print()
    print(f'Correction index: {row.correction_index}')
    print()
    print(f'Corrected offset: {len(row.target) + 1}')
    print(f'Original pred_parse_order: {row.predicted_parsing_order}')
    print()
    print(f'computed pred_parse_order: {parsing_order}')

# show_one_EXPECT(11872)
show_one_EXPECT(NLP_EN, 1314)
    

id -> head: {0: 1, 1: 3, 2: 3, 3: 3, 4: 5, 5: 3}
head -> ids: {1: [0], 3: [1, 2, 3, 5], 5: [4]}



Orig: Castle also has a courtyard.
Corr: The castle also has a courtyard.
Concatenated: ['Castle', 'also', 'has', 'a', 'courtyard', '.', 'SEP', 'The', 'castle', 'also', 'has', 'a', 'courtyard', '.']
Corr: The castle also has a courtyard.

Correction index: [0, 1, 8]

Corrected offset: 8
Original pred_parse_order: {'0': 1, '1': 1, '3': 2, '8': 1, '10': 2}

computed pred_parse_order: {'0': 1, '1': 3, '3': 3, '2': 3, '5': 3}


In [30]:
print("Evidence words are important for GEC")
print(f'computed pred_parse_order: {dep_parsing_tree(spacy.load("en_core_web_sm"), "Evidence words are important for GEC", [2])}')


Evidence words are important for GEC
computed pred_parse_order: {'2': 2, '1': 2, '3': 2, '4': 2, '0': 1, '5': 4}


In [31]:
show_one_EXPECT(NLP_EN, 1314)

id -> head: {0: 1, 1: 3, 2: 3, 3: 3, 4: 5, 5: 3}
head -> ids: {1: [0], 3: [1, 2, 3, 5], 5: [4]}



Orig: Castle also has a courtyard.
Corr: The castle also has a courtyard.
Concatenated: ['Castle', 'also', 'has', 'a', 'courtyard', '.', 'SEP', 'The', 'castle', 'also', 'has', 'a', 'courtyard', '.']
Corr: The castle also has a courtyard.

Correction index: [0, 1, 8]

Corrected offset: 8
Original pred_parse_order: {'0': 1, '1': 1, '3': 2, '8': 1, '10': 2}

computed pred_parse_order: {'0': 1, '1': 3, '3': 3, '2': 3, '5': 3}


In [32]:
show_one_EXPECT(NLP_EN, 11872)

id -> head: {0: 2, 1: 2, 2: 7, 3: 2, 4: 6, 5: 6, 6: 3, 7: 7, 8: 7, 9: 11, 10: 11, 11: 8, 12: 11, 13: 15, 14: 15, 15: 12}
head -> ids: {2: [0, 1, 3], 7: [2, 7, 8], 6: [4, 5], 3: [6], 11: [9, 10, 12], 8: [11], 15: [13, 14], 12: [15]}



Orig: The periodic move of the minute hand compared to the smooth path of the second hand.
Corr: The periodic movement of the minute hand compared to the smooth path of the second hand.
Concatenated: ['The', 'periodic', 'move', 'of', 'the', 'minute', 'hand', 'compared', 'to', 'the', 'smooth', 'path', 'of', 'the', 'second', 'hand', '.', 'SEP', 'The', 'periodic', 'movement', 'of', 'the', 'minute', 'hand', 'compared', 'to', 'the', 'smooth', 'path', 'of', 'the', 'second', 'hand', '.']
Corr: The periodic movement of the minute hand compared to the smooth path of the second hand.

Correction index: [2, 20]

Corrected offset: 18
Original pred_parse_order: {'0': 2, '1': 2, '2': 1, '3': 2, '5': 3, '6': 3, '7': 2, '18': 2, '19': 2, '20': 1, '21': 2, '23': 3, '24': 3, '25': 2}

computed pred_parse_order: {'2': 7, '0': 2, '1': 2, '3': 2, '7': 7, '8': 7, '6': 3}


In [33]:
show_one_EXPECT(NLP_EN, 1289)

id -> head: {0: 1, 1: 2, 2: 2, 3: 2, 4: 2, 5: 6, 6: 4}
head -> ids: {1: [0], 2: [1, 2, 3, 4], 6: [5], 4: [6]}



Orig: The waiter come and took the order.
Corr: The waiter came and took the order.
Concatenated: ['The', 'waiter', 'come', 'and', 'took', 'the', 'order', '.', 'SEP', 'The', 'waiter', 'came', 'and', 'took', 'the', 'order', '.']
Corr: The waiter came and took the order.

Correction index: [2, 11]

Corrected offset: 9
Original pred_parse_order: {'0': 3, '1': 2, '2': 1, '3': 2, '4': 2, '6': 3, '7': 2, '9': 3, '10': 2, '11': 1, '12': 2, '13': 2, '15': 3, '16': 2}

computed pred_parse_order: {'2': 2, '1': 2, '3': 2, '4': 2, '0': 1, '6': 4}


# Process Data

## Add Dependency Parse Tree (Spacy)

In [34]:
def show_one_COWSL2H(nlp, df, row_idx):
    row = df.iloc[row_idx]
    parsing_order = dep_parsing_tree(nlp, row.essay_new, eval(row.correction_index), verbosity=2)
    print()
    print(f'Orig: {" ".join(eval(row.source))}')
    print(f'Corr: {row.essay_new}')
    print(f'row.source: {row.source}')
    print(f'row.target: {row.target}')
    concacted = [*eval(row.source), 'SEP', *eval(row.target)]
    print(f'Concatenated: {concacted}')
    print()
    print(f'Correction index: {row.correction_index}')
    print()
    print(f'Corrected offset: {len(eval(row.target)) + 1}')
    print(f'Original pred_parse_order: {row.predicted_parsing_order}')
    print()
    print(f'computed pred_parse_order: {parsing_order}')

In [64]:
%%time
def add_dependencies(df):
    def add_parsing_order(row):
        return dep_parsing_tree(NLP_ES, row.essay_new, eval(row.correction_index), verbosity=0)        
        
    df['predicted_parsing_order'] = df.apply(add_parsing_order, axis=1)
    df['parsing_order_len'] = df_errors.apply(lambda row: len(row.predicted_parsing_order), axis=1)

add_dependencies(df_errors)

CPU times: user 21.7 s, sys: 30.5 ms, total: 21.7 s
Wall time: 21.7 s


In [68]:
df_errors.parsing_order_len.value_counts()

3     1459
4      981
2      841
5      612
6      481
7      342
8      247
9      166
0      149
10     126
11      76
12      48
13      32
14      26
16       8
15       7
1        7
17       6
18       5
19       2
22       1
Name: parsing_order_len, dtype: int64

In [71]:
with pd.option_context('display.max_colwidth', None):
    display(df_errors[df_errors.parsing_order_len == 0].sample(10))

Unnamed: 0.1,orig_index,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,...,error_type,correction_index,source,target,check_target,check_source,evidence_index,predicted_parsing_order,origin,parsing_order_len
4759,46220,662,1128,08d5192771bff67b27c42d81af9dd72086875f88fd624f8f6f63717493b9ef84,SPA 3,Su col picante llamado kimchi es sabroso.,Su col<gat:noun:inan> picante [llamado]{llamada}<ga:fm:adj:inan> kimchi es [sabroso]{sabrosa}<ga:fm:adj:inan>.,[sabroso]{sabrosa}<ga:fm:adj:inan>,1.0,ga:fm:adj:inan,...,ga:fm,"[13, 29]","['Su', 'col', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'picante', 'llamada', 'kimchi', 'es', 'sabroso', '.']","['Su', 'col', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'picante', 'llamado', 'kimchi', 'es', 'sabrosa', '.']",0,0,[],{},A,0
4033,45494,309,461,98fc7a6fa7a0e81afa69cf05b66ee7a50326215b27de53954c21f4286da0974b,SPA 3,"Streep es una actriz muy popular en todo el mundo porque todos los películas de Streep son muy bien, emocional y interesante.","Streep es una actriz muy popular en todo el mundo porque [todos]{todas}<ga:fm:det:inan> [los]{las}<ga:fm:det:inan> películas<gat:noun:inan> de Streep son muy bien, [emocional]{emocionales}<na:ps:adj:inan> y [interesante]{interesantes}<na:ps:adj:inan>.",[interesante]{interesantes}<na:ps:adj:inan>,3.0,na:ps:adj:inan,...,na:ps,"[29, 61]","['Streep', 'es', 'una', 'actriz', 'muy', 'popular', 'en', 'todo', 'el', 'mundo', 'porque', 'todas', 'las', 'películas', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'de', 'Streep', 'son', 'muy', 'bien', ',', 'emocionales', 'y', 'interesante', '.']","['Streep', 'es', 'una', 'actriz', 'muy', 'popular', 'en', 'todo', 'el', 'mundo', 'porque', 'todos', 'los', 'películas', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'de', 'Streep', 'son', 'muy', 'bien', ',', 'emocional', 'y', 'interesantes', '.']",0,0,[],{},A,0
3570,45031,67,110,bb84cd896bc9fbf0af8b811dad07223cd6076b76a1c4534f8460e45285e3ce1d,SPA 2,Bush promovió los ideas conservados en el tiempo de presidente.,Bush promovió [los]{las}<ga:fm:det:inan> ideas<gat:noun:inan> [conservados]{conservadas}<ga:fm:adj:inan> en el tiempo de presidente.,[conservados]{conservadas}<ga:fm:adj:inan>,1.0,ga:fm:adj:inan,...,ga:fm,"[11, 30]","['Bush', 'promovió', 'las', 'ideas', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'conservados', 'en', 'el', 'tiempo', 'de', 'presidente', '.']","['Bush', 'promovió', 'los', 'ideas', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'conservadas', 'en', 'el', 'tiempo', 'de', 'presidente', '.']",0,0,[],{},A,0
3812,45273,205,302,ffa70021aa2af28a1c1d0133ff2f7a767da0fb669b48b132bc1a51d15ff240e4,SPA 2,Porque es popular en los Estados Unidos por el público invaden el vida privado de Kanye.,Porque es popular en los Estados Unidos por el público invaden [el]{la}<ga:fm:det:inan> vida<gat:noun:inan> [privado]{privada}<ga:fm:adj:inan> de Kanye.,[privado]{privada}<ga:fm:adj:inan>,1.0,ga:fm:adj:inan,...,ga:fm,"[20, 45]","['Porque', 'es', 'popular', 'en', 'los', 'Estados', 'Unidos', 'por', 'el', 'público', 'invaden', 'la', 'vida', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'privado', 'de', 'Kanye', '.']","['Porque', 'es', 'popular', 'en', 'los', 'Estados', 'Unidos', 'por', 'el', 'público', 'invaden', 'el', 'vida', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'privada', 'de', 'Kanye', '.']",0,0,[],{},A,0
4942,46403,743,1224,85aa15e953b0e1419745efeb66ba222ba5747de3f8c561d0226822c6f6ecffd6,SPA 3,"En general, me gustaría que mis vacaciones para ser divertido y emocionante pero tranquilo.","En general, me gustaría que mis vacaciones<gat:noun:inan> para ser [divertido]{divertidas}<ga:na:fm:ps:adj:inan> y [emocionante]{emociantes}<na:ps:adj:inan> pero [tranquilo]{tranquilas}<ga:na:fm:ps:adj:inan>.",[emocionante]{emociantes}<na:ps:adj:inan>,1.0,na:ps:adj:inan,...,na:ps,"[19, 43]","['En', 'general', ',', 'me', 'gustaría', 'que', 'mis', 'vacaciones', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'para', 'ser', 'divertidas', 'y', 'emocionante', 'pero', 'tranquilo', '.']","['En', 'general', ',', 'me', 'gustaría', 'que', 'mis', 'vacaciones', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'para', 'ser', 'divertido', 'y', 'emociantes', 'pero', 'tranquilas', '.']",0,0,[],{},A,0
4089,45550,353,555,41f0899c758def448ca8fc40d89165b9de4492f4b112ae434d40b613f175a61b,SPA 3,"Me gusta Chris porque en cada película, Chris Pratt tiene un personalidad maravilloso y divertido, y es muy encantador.","Me gusta Chris porque en cada película, Chris Pratt tiene [un]{una}<ga:fm:det:inan> personalidad<gat:noun:inan> [maravilloso]{maravillosa}<ga:fm:adj:inan> y [divertido]{divertida}<ga:fm:adj:inan>, y es muy encantador.",[divertido]{divertida}<ga:fm:adj:inan>,2.0,ga:fm:adj:inan,...,ga:fm,"[22, 52]","['Me', 'gusta', 'Chris', 'porque', 'en', 'cada', 'película', ',', 'Chris', 'Pratt', 'tiene', 'una', 'personalidad', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'maravillosa', 'y', 'divertido', ',', 'y', 'es', 'muy', 'encantador', '.']","['Me', 'gusta', 'Chris', 'porque', 'en', 'cada', 'película', ',', 'Chris', 'Pratt', 'tiene', 'un', 'personalidad', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'maravilloso', 'y', 'divertida', ',', 'y', 'es', 'muy', 'encantador', '.']",0,0,[],{},A,0
3321,44782,240,350,69ee9c42605d18dd2342d1528adb040030fdd41125451085bf22b91751c0036e,SPA 2,"Cuando era un bebe, Señor Voldemort asesinó los padres de Harry y intentó asesinar Harry.","Cuando era un bebe, Señor Voldemort asesinó []{a}<az:do:an> los padres de Harry y intentó asesinar []{a}<az:do:an> Harry.",[]{a}<az:do:an>,1.0,az:do:an,...,az:do,"[16, 36]","['Cuando', 'era', 'un', 'bebe', ',', 'Señor', 'Voldemort', 'asesinó', 'a', 'los', 'padres', 'de', 'Harry', 'y', 'intentó', 'asesinar', '[NONE]', 'Harry', '.']","['Cuando', 'era', 'un', 'bebe', ',', 'Señor', 'Voldemort', 'asesinó', '[NONE]', 'los', 'padres', 'de', 'Harry', 'y', 'intentó', 'asesinar', 'a', 'Harry', '.']",0,0,[],{},A,0
4833,46294,693,1166,15c6d9ec7a41a5c262c873d45d22bbc1346aa709dd3a807abbe4c7332fd0738a,SPA 2,El Alpes Suzios es un cordillera magnifico.,[El]{los}<na:ps:det:inan> Alpes Suzios es [un]{una}<ga:fm:det:inan> cordillera<gat:noun:inan> [magnifico]{magnífica}<ga:fm:adj:inan>.,[magnifico]{magnífica}<ga:fm:adj:inan>,2.0,ga:fm:adj:inan,...,ga:fm,"[13, 29]","['los', 'Alpes', 'Suzios', 'es', 'una', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnifico', '.']","['El', 'Alpes', 'Suzios', 'es', 'un', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnífica', '.']",0,0,[],{},A,0
3398,44859,719,1198,94acf628361ec20601a3028d3ad9b4777f8e20540e66f96f61aaa858eee03c52,SPA 1,"Yo traigo mi novio, los padres, la hermana mayor, y perro pequeno.","Yo traigo []{a}<az:do:an> mi novio, []{a}<az:do:an> los padres, []{a}<az:do:an> la hermana mayor, y []{a}<az:do:an> perro pequeno.",[]{a}<az:do:an>,3.0,az:do:an,...,az:do,"[16, 37]","['Yo', 'traigo', 'a', 'mi', 'novio', ',', 'a', 'los', 'padres', ',', 'a', 'la', 'hermana', 'mayor', ',', 'y', '[NONE]', 'perro', 'pequeno', '.']","['Yo', 'traigo', '[NONE]', 'mi', 'novio', ',', '[NONE]', 'los', 'padres', ',', '[NONE]', 'la', 'hermana', 'mayor', ',', 'y', 'a', 'perro', 'pequeno', '.']",0,0,[],{},A,0
800,42261,40,65,139d6a79730caee9d68ad994e3fb047e5c0b6b31d42450bcf838d3943b5e33dc,SPA 3,Empezó con el grupo se llama Destiny's Child antes de realizar solo.,Empezó con el grupo se llama Destiny's Child antes de realizar [solo]{sola}<ga:fm:adj:an>.,[solo]{sola}<ga:fm:adj:an>,0.0,ga:fm:adj:an,...,ga:fm,"[13, 29]","['Empezó', 'con', 'el', 'grupo', 'se', 'llama', 'Destiny', ""'"", 's', 'Child', 'antes', 'de', 'realizar', 'solo', '.']","['Empezó', 'con', 'el', 'grupo', 'se', 'llama', 'Destiny', ""'"", 's', 'Child', 'antes', 'de', 'realizar', 'sola', '.']",0,0,[],{},A,0


In [74]:
df_errors[df_errors.essay_new == 'El Alpes Suzios es un cordillera magnifico.']

Unnamed: 0.1,orig_index,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,...,error_type,correction_index,source,target,check_target,check_source,evidence_index,predicted_parsing_order,origin,parsing_order_len
4831,46292,693,1166,15c6d9ec7a41a5c262c873d45d22bbc1346aa709dd3a80...,SPA 2,El Alpes Suzios es un cordillera magnifico.,[El]{los}<na:ps:det:inan> Alpes Suzios es [un]...,[El]{los}<na:ps:det:inan>,0.0,na:ps:det:inan,...,na:ps,"[0, 16]","['El', 'Alpes', 'Suzios', 'es', 'un', 'cordill...","['los', 'Alpes', 'Suzios', 'es', 'una', 'cordi...",0,0,[],"{'0': 1, '1': 5, '2': 1}",A,3
4832,46293,693,1166,15c6d9ec7a41a5c262c873d45d22bbc1346aa709dd3a80...,SPA 2,El Alpes Suzios es un cordillera magnifico.,[El]{los}<na:ps:det:inan> Alpes Suzios es [un]...,[un]{una}<ga:fm:det:inan>,1.0,ga:fm:det:inan,...,ga:fm,"[4, 20]","['los', 'Alpes', 'Suzios', 'es', 'un', 'cordil...","['El', 'Alpes', 'Suzios', 'es', 'una', 'cordil...",0,0,[],"{'4': 5, '5': 5, '1': 5, '3': 5, '6': 5}",A,5
4833,46294,693,1166,15c6d9ec7a41a5c262c873d45d22bbc1346aa709dd3a80...,SPA 2,El Alpes Suzios es un cordillera magnifico.,[El]{los}<na:ps:det:inan> Alpes Suzios es [un]...,[magnifico]{magnífica}<ga:fm:adj:inan>,2.0,ga:fm:adj:inan,...,ga:fm,"[13, 29]","['los', 'Alpes', 'Suzios', 'es', 'una', 'cordi...","['El', 'Alpes', 'Suzios', 'es', 'un', 'cordill...",0,0,[],{},A,0


In [75]:
ids_to_check = [4833, 5621]
# print(df_errors.iloc[id_to_check])
# print(type(df_errors.iloc[id_to_check].source))
# print(type(df_errors.iloc[id_to_check].source))
# print(type(df_errors.iloc[id_to_check].essay_new))
for idx in ids_to_check:
    show_one_COWSL2H(NLP_ES, df_errors, idx)
# parsing_order = dep_parsing_tree(NLP_ES, df_errors.iloc[id_to_check].essay_new, eval(df_errors.iloc[id_to_check].correction_index), verbosity=2)


id -> head: {0: 1, 1: 5, 2: 1, 3: 5, 4: 5, 5: 5, 6: 5}
head -> ids: {1: [0, 2], 5: [1, 3, 4, 5, 6]}
idx:13, as_list: [], as_map: {}
idx:29, as_list: [], as_map: {}



Orig: los Alpes Suzios es una cordillera < gat : noun : inan > magnifico .
Corr: El Alpes Suzios es un cordillera magnifico.
row.source: ['los', 'Alpes', 'Suzios', 'es', 'una', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnifico', '.']
row.target: ['El', 'Alpes', 'Suzios', 'es', 'un', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnífica', '.']
Concatenated: ['los', 'Alpes', 'Suzios', 'es', 'una', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnifico', '.', 'SEP', 'El', 'Alpes', 'Suzios', 'es', 'un', 'cordillera', '<', 'gat', ':', 'noun', ':', 'inan', '>', 'magnífica', '.']

Correction index: [13, 29]

Corrected offset: 16
Original pred_parse_order: {}

computed pred_parse_order: {}
id -> head: {0: 2, 1: 2, 2: 2, 3: 4, 4: 2, 5: 4}
head -> ids: {2: [0, 1, 2, 4], 4: [3, 5]}
idx:5, as_list: [4], as_map: {'5': 4}
idx:4, as_list: [2, 3, 5], as_map: {'4': 2, '3': 4, '5': 4}
idx:14, as_list: [], as_map: {}



Orig: Entonces será certificadas como " perfecto " .
Corr: Entonces será certificado como “perfecto".
row.source: ['Entonces', 'será', 'certificadas', 'como', '"', 'perfecto', '"', '.']
row.target: ['Entonces', 'será', 'certificado', 'como', '"', 'perfectas', '"', '.']
Concatenated: ['Entonces', 'será', 'certificadas', 'como', '"', 'perfecto', '"', '.', 'SEP', 'Entonces', 'será', 'certificado', 'como', '"', 'perfectas', '"', '.']

Correction index: [5, 14]

Corrected offset: 9
Original pred_parse_order: {'5': 4, '4': 2, '3': 4}

computed pred_parse_order: {'5': 4, '4': 2, '3': 4}


In [76]:
df_errors.sample(2)

Unnamed: 0.1,orig_index,Unnamed: 0,paragraph_id,sentence_id,course,essay_new,annotated_sentence,full_error_tag,order_tag,error_tag,...,error_type,correction_index,source,target,check_target,check_source,evidence_index,predicted_parsing_order,origin,parsing_order_len
882,42343,71,116,5c8071881e570ad308b7390e6470c20db583ff77099482...,SPA 2,Cher tiene muchas conciertos en los Estados Un...,Cher tiene [muchas]{muchos}<ga:mf:det:inan> co...,[muchas]{muchos}<ga:mf:det:inan>,0.0,ga:mf:det:inan,...,ga:mf,"[2, 12]","['Cher', 'tiene', 'muchas', 'conciertos', 'en'...","['Cher', 'tiene', 'muchos', 'conciertos', 'en'...",0,0,[],"{'2': 3, '3': 1, '8': 3}",A,3
4148,45609,383,623,cc547a94cb94ef5741e4c7950e628a73d796b935ad88a6...,SPA 3,Me gusta las programas de television con ella ...,Me gusta [las]{los}<ga:mf:det:aty:inan> progra...,[mucha]{mucho}<ga:mf:pron:inan>,1.0,ga:mf:pron:inan,...,ga:mf,"[15, 39]","['Me', 'gusta', 'los', 'programas', 'de', 'tel...","['Me', 'gusta', 'las', 'programas', 'de', 'tel...",0,0,[],"{'15': 14, '17': 15, '14': 12, '13': 14, '20':...",A,6


## To JSON representation of each row

In [78]:
def to_json_df(in_df):
    df_proj = in_df[['target', 'source', 'evidence_index', 'correction_index', 'error_type', 'predicted_parsing_order', 'origin']]
    df_proj = df_proj.dropna()
    df_json = df_proj.copy(deep=True)
    df_json['json'] = df_proj.apply(lambda x: x.to_json(), axis=1)
    return df_json[['json']]


In [80]:
def export_as_json(df):
    json_df = to_json_df(df)
    # tt.to_csv(args.PROCESSED_DATA_FOLDER + "/rows_as_json.csv", index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=" ")
    with open(args.PROCESSED_DATA_FOLDER + "/rows_as_json.csv", "w+") as file1:
        # Writing data to a file

        # json.dumps([row.dropna().to_dict() for index,row in df.iterrows()])

        for i, row in json_df.iterrows():
            # file1.write(f'{json.dumps(row.dropna().to_dict())}\n')
            dumdum = eval(row.json)
            dumdum['target'] = eval(dumdum['target'])
            dumdum['source'] = eval(dumdum['source'])
            dumdum['correction_index'] = eval(dumdum['correction_index'])
            file1.write(f'{json.dumps(dumdum)}\n')

# Export after remove lines with empty predicted parsing order until the underlying data is fixed.
export_as_json(df_errors[df_errors.parsing_order_len > 0])