## Import Libraries

In [42]:
import pandas as pd
from tqdm import tqdm
import numpy as np

## Master function: convert an M2 file to a DataFrame

In [43]:
#https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py
def m2_to_df(m2_file_path, id=0):
    '''Convert an M2-format annotation file into a pandas DataFrame.

    Each blank-line-separated block of the file is expected to hold one
    source line ("S <tokens>") followed by zero or more edit lines
    ("A <start> <end>|||<type>|||<correction>|||...|||<coder id>").
    The edits of the selected coder are applied to the source tokens to
    build the corrected sentence.

    Parameters
    ----------
    m2_file_path : str or path-like
        Path to the UTF-8 encoded M2 file.
    id : int, default 0
        Coder (annotator) id whose edits are applied; edits from other
        coders are ignored. (The name shadows the builtin ``id`` but is kept
        so existing keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame
        One row per sentence block with two string columns: ``correct``
        (source with the coder's edits applied) and ``incorrect`` (the
        original source sentence). Tokens are joined with single spaces.
        An empty file yields an empty frame with both columns.
    '''
    # Context manager guarantees the file handle is closed, even on error.
    with open(m2_file_path, encoding='utf-8') as m2_file:
        raw_text = m2_file.read().strip()

    # Blocks are separated by blank lines. Drop empty blocks so an empty file
    # or extra blank lines do not turn into bogus empty-string rows.
    blocks = [block.strip() for block in raw_text.split("\n\n")]
    blocks = [block for block in blocks if block]

    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}

    correct_sent_array = []
    incorrect_sent_array = []

    for block in tqdm(blocks):
        lines = block.split("\n")
        incor_sent = lines[0].split()[1:]  # Ignore "S "
        cor_sent = incor_sent.copy()

        # Spans index the ORIGINAL tokens; `offset` tracks how far earlier
        # edits have shifted positions in the partially corrected sentence.
        offset = 0
        for edit_line in lines[1:]:
            fields = edit_line.split("|||")
            if fields[1] in skip:
                continue  # Ignore certain edits
            if int(fields[-1]) != id:
                continue  # Ignore other coders
            span = fields[0].split()[1:]  # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            replacement = fields[2].split()  # empty list == deletion
            cor_sent[start + offset:end + offset] = replacement
            offset += len(replacement) - (end - start)

        incorrect_sent_array.append(' '.join(incor_sent))
        correct_sent_array.append(' '.join(cor_sent))

    return pd.DataFrame({
        "correct": correct_sent_array,
        "incorrect": incorrect_sent_array,
    })

In [44]:
# Raw string: "\A" is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a SyntaxWarning on recent Python versions.
# The runtime value of the path is unchanged.
m2_file_path = r'data\ABC.train.gold.bea19.m2'
final_df = m2_to_df(m2_file_path)

100%|████████████████████████████████████████████████████████████████████████| 34308/34308 [00:00<00:00, 253168.48it/s]


## Check data

In [45]:
# Peek at five randomly chosen sentence pairs.
final_df.sample(n=5)

Unnamed: 0,correct,incorrect
2762,Hongdae is most appropriate to feel a free and...,Hongdae is most appropriate to feel free and v...
11494,Write to me as soon as possible .,Write to me as soon as possible .
7613,The Great Wall is a great military constructio...,Great Wall is a great military construction in...
13039,"At that moment , my dog was playing with a bal...","At that moment , my dog was kidding with a bal..."
24087,I am a qualified first aider and I am able to ...,I am a qualified first aider and I am able to ...


In [46]:
def show_random_datapoints(n_samples,df):
    '''Print randomly chosen (correct, incorrect) sentence pairs.

    Only pairs whose corrected sentence has more than 5 whitespace-separated
    tokens and differs from the incorrect sentence are eligible. Eligible
    pairs are selected first and then sampled without replacement, so exactly
    ``min(n_samples, number of eligible pairs)`` distinct pairs are printed
    (instead of silently printing fewer whenever a random draw hit an
    ineligible row).

    Parameters
    ----------
    n_samples : int
        Maximum number of pairs to print; values <= 0 print nothing.
    df : pandas.DataFrame
        Frame with string columns ``correct`` and ``incorrect``.

    Returns
    -------
    None
        Output is written to stdout; each pair is followed by a line of
        100 asterisks.
    '''
    eligible = [
        (correct, incorrect)
        for correct, incorrect in zip(df['correct'], df['incorrect'])
        if len(correct.split()) > 5 and correct != incorrect
    ]

    # Clamp so negative requests print nothing and large requests are capped.
    n_to_show = min(max(n_samples, 0), len(eligible))

    # permutation(0) is an empty array, so an empty frame is handled safely.
    for pos in np.random.permutation(len(eligible))[:n_to_show]:
        correct, incorrect = eligible[pos]
        print(f"CORRE: {correct}")
        print(f"INCOR: {incorrect}")
        print('*'*100)

In [47]:
# Print up to ten random sentence pairs for a manual quality check.
show_random_datapoints(n_samples=10, df=final_df)

CORRE: I pack my backpack , pick up my garbage bag and look around my room for one last time to make sure I have n't forgotten anything . I go downstairs and put the garbage bag into the trash bin .
INCOR: I pack my backpack , take my garbage bag and look around my room for one last time to make sure : I do n't forget anything . I go downstairs and put the garbage bag into the trash bin .
****************************************************************************************************
CORRE: Inside the " Monumento a La Bandera " you can see historical objects about the history of Argentina , and in the underground there is the hall of honor of American Flags .
INCOR: Inside the " Monumento a La Bandera " you can see historical objects about the history of Argentina , in the underground it 's the hall of honor of American Flags .
****************************************************************************************************
CORRE: And , of course , to add an extra activity to my

In [48]:
# Sanity check: (number of sentence pairs, number of columns).
final_df.shape

(34308, 2)

In [49]:
# Persist the sentence pairs as CSV; the DataFrame index is not written.
final_df.to_csv(path_or_buf='data\\final_df.csv', index=False)