## Import Libraries

In [13]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

## Convert the M2 file to a DataFrame

In [14]:
def m2_to_df(m2_file_path,id=0):
    '''Convert an M2 annotation file into a DataFrame of sentence pairs.

    The file is split into blank-line-separated blocks. Within a block the
    first line is the source sentence ("S tok tok ...") and every following
    line is an edit of the form
    "A start end|||type|||correction|||...|||coder". The edits belonging to
    the selected coder are applied to the source tokens to rebuild the
    corrected sentence.

    Parameters
    ----------
    m2_file_path : str or path-like
        Location of the M2 file. It is read as UTF-8.
    id : int, default 0
        Coder (annotator) id whose edits are applied; edits from any other
        coder are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with two columns: "correct" (source tokens with
        the edits applied) and "incorrect" (the unmodified source tokens),
        both joined with single spaces. An empty file yields a DataFrame
        with these two columns and no rows.
    '''

    # Read inside a context manager so the handle is closed deterministically,
    # and pin the encoding instead of relying on the platform default.
    with open(m2_file_path, encoding="utf-8") as m2_file:
        m2 = m2_file.read().strip().split("\n\n")
    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}

    correct_sent_array = []
    incorrect_sent_array = []

    for sent in tqdm(m2):
        # An empty file, or extra blank lines between sentences, leaves empty
        # or newline-prefixed chunks after the split; normalise them so they
        # neither create bogus rows nor get parsed as edit lines.
        block = sent.strip()
        if not block: continue
        lines = block.split("\n")
        incor_sent = lines[0].split()[1:] # Ignore "S "
        incorrect_sent_array.append(' '.join(incor_sent))
        cor_sent = incor_sent.copy()

        # Edit spans index the ORIGINAL tokens; `offset` tracks how far the
        # already-applied edits have shifted positions in `cor_sent`.
        offset = 0
        for edit in lines[1:]:
            edit = edit.split("|||")
            if edit[1] in skip: continue # Ignore certain edits
            coder = int(edit[-1])
            if coder != id: continue # Ignore other coders
            span = edit[0].split()[1:] # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            cor = edit[2].split() # Empty correction == deletion of the span
            cor_sent[start+offset:end+offset] = cor
            offset = offset-(end-start)+len(cor)
        correct_sent_array.append(' '.join(cor_sent))

    # Build the frame in one step; column order matches the original output.
    df = pd.DataFrame({
        "correct": correct_sent_array,
        "incorrect": incorrect_sent_array,
    })
    return df

In [15]:
# Parse the M2 file at this path into the correct/incorrect sentence table.
m2_file_path = 'lang8.bea19/lang8.train.auto.bea19.m2'
final_df = m2_to_df(m2_file_path=m2_file_path)

100%|██████████| 1037561/1037561 [00:03<00:00, 305356.51it/s]


## Check data

In [16]:
# Eyeball five randomly chosen rows of the parsed table.
final_df.sample(n=5)

Unnamed: 0,correct,incorrect
549510,"March 15 , 2010","March 15 , 2010"
122932,I want to say ' I 'm looking forward to it ! '...,I want to say ' I 'm looking forward to it ! '...
432641,My favorite players are Beckham and Keisuke Ho...,My favorite players are Beckham and Keisuke Ho...
733161,Thank you for reading my writing .,thanks you for reading my writing .
972647,great memories of my homestay,great memories of my homestay


In [17]:
def show_random_datapoints(n_samples,df,min_words=5):
    '''Print randomly drawn sentence pairs that actually contain a correction.

    Draws `n_samples` row positions uniformly at random (with replacement)
    and prints a pair only when its corrected sentence has more than
    `min_words` whitespace-separated tokens and differs from the incorrect
    sentence. Draws that fail this filter are skipped, not re-drawn, so
    fewer than `n_samples` pairs may be printed.

    Parameters
    ----------
    n_samples : int
        Number of random draws to make.
    df : pandas.DataFrame
        Frame with string columns "correct" and "incorrect".
    min_words : int, default 5
        A pair is shown only if the corrected sentence has strictly more
        than this many tokens.

    Returns
    -------
    None
        Output is written to stdout only.
    '''
    n_rows = len(df)
    # Nothing to draw from; indexing position 0 of an empty frame would raise.
    if n_rows == 0:
        return

    correct_col = df['correct']
    incorrect_col = df['incorrect']
    for _ in range(n_samples):
        # Uniform integer position in [0, n_rows).
        row_pos = np.random.randint(n_rows)
        correct = correct_col.iloc[row_pos]
        incorrect = incorrect_col.iloc[row_pos]

        if len(correct.split())>min_words and correct != incorrect:
            print(f"CORRE: {correct}")
            print(f"INCOR: {incorrect}")
            print('*'*100)

In [18]:
# Make 10 random draws; only pairs that pass the function's filter are printed.
show_random_datapoints(n_samples=10, df=final_df)

CORRE: I 've cooked rice with a clay pot for these past 4 months , for my rice cooker was broken by the electric leakage that 3 . 11 quake had caused .
INCOR: I 've cooked rice with a clay pot for these 4 months , for my rice cooker was broken by the electric leakage that 3 . 11 quake caused .
****************************************************************************************************
CORRE: However , I have a short - term goal .
INCOR: But , If a short term goal , yes , I have .
****************************************************************************************************
CORRE: Attending my English school , I get to know many people who work for different companies .
INCOR: Attending my English school , I get to know many people who work different companies .
****************************************************************************************************


In [19]:
# (number of rows, number of columns) of the parsed table.
final_df.shape

(1037561, 2)

In [20]:
# Persist the sentence pairs without the index column. The output directory is
# created first so the write also succeeds on a fresh checkout where it does
# not exist yet.
output_csv_path = Path('data/final_df_20211027.csv')
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_csv_path,index=False)