## Import libraries & data

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
final_df = pd.read_csv('data/final_df_20211027.csv')

In [4]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
1073,because I have to go the ANZ bank to get a sav...,"m , because I have to go the ANZ bank to get t..."
1000358,I 'm getting ' a ' cold .,I 'm getting ' a ' cold .
283244,I 'm writing this diary using 100 yen netbook ...,I 'm writing this diary by its netbook in the ...
589876,( 1 ) sentence,( 1 ) sentence
212870,Study abroad through the Internet,Study abroad through Internet


In [5]:
final_df.shape

(1037561, 2)

### Adding length features

In [6]:
final_df['correct_char_count'] = final_df['correct'].astype('str').apply(lambda x:len(x))
final_df['incorrect_char_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x))

In [7]:
final_df['correct_word_count'] = final_df['correct'].astype('str').apply(lambda x:len(x.split()))
final_df['incorrect_word_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x.split()))

In [8]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
727438,"In my opinion , as a teacher especially , a co...","To my opinion , as a teacher especially a coll...",108,104,22,20
412249,I 'm 19year 's old .,I 'm 19year 's old .,20,20,6,6
974149,"On the other hand , a best friend will try to ...","On the other hand , a best friend will try to ...",111,111,23,23
736324,To be continued later . . .,To be continued later . . .,27,27,7,7
419353,"However , upon arriving there , one of my frie...","however , in arriving here , one of my friend ...",134,108,25,22


## Preprocessing

### Removing Missing/NA 

In [9]:
pd.DataFrame(final_df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
correct,1
incorrect,1
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [10]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
222211,,,3,3,1,1


In [11]:
final_df = final_df.dropna().reset_index(drop=True)

In [12]:
final_df.shape

(1037560, 6)

In [13]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
238009,1 .,1 .,3,3,2,2
950592,I visited MARUNOUCHI .,I visited MARUNOUCHI .,22,22,4,4
114436,I 'm working in Tokyo in Japan .,I 'm working in Tokyo in Japan .,32,32,8,8
574040,2 ) Does your brother ever wait us ?,2 ) Does your brother ever wait us ?,36,36,9,9
14025,Do you like snow scenes ? ?,Do you like snow scenes ? ?,27,27,7,7


### Keep unique sentence pairs

In [14]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 539201


In [15]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 539201


In [16]:
final_df[final_df['correct']==final_df['incorrect']].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
378092,It attracted me .,It attracted me .,17,17,4,4
425881,Smoking !,Smoking !,9,9,2,2
401691,Ukrainians - Cossacks .,Ukrainians - Cossacks .,23,23,4,4
467939,"After my husband went to work , I decided to l...","After my husband went to work , I decided to l...",103,103,24,24
1029810,I like this song very much .,I like this song very much .,28,28,7,7
579522,"If anyone knows adout good shops in paris , pl...","If anyone knows adout good shops in paris , pl...",84,84,17,17
823888,But they are expensive .,But they are expensive .,24,24,5,5
868363,I will do my best .,I will do my best .,19,19,6,6
763725,I 'll definitely not miss any opportunities co...,I 'll definitely not miss any opportunities co...,74,74,14,14
997042,"Above , a pic of Tokyo Execution chamber .","Above , a pic of Tokyo Execution chamber .",42,42,9,9


In [17]:
final_df = final_df[final_df['correct']!=final_df['incorrect']]

In [18]:
final_df.shape

(498359, 6)

In [19]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
232463,Although the wedding party was held at a churc...,"Although the wedding party held at church , it...",84,78,17,15
213546,I bought a used pair of Levis for my daughter .,I bought an used pair of levis for my daughter .,47,48,11,11
863293,It was only after a second that it started to ...,It was only after a second that it started to ...,60,59,13,13
304109,After I went to the library to get my English ...,After I went to the library to get my English ...,117,111,24,23
488637,"At first , we thought everything was okay and ...","At first , we thought everything was ok and th...",87,80,17,16


### Remove Duplicates

In [20]:
print(f'total number of duplicates: {final_df.duplicated().sum()}')

total number of duplicates: 2021


In [21]:
final_df[final_df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
717379,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
1027462,"( I seriously want to escape , all the way , t...","( I seriously want to escape , all the way , t...",88,85,19,19
802135,: - ),: - (,5,5,3,3
800388,: - ),: - (,5,5,3,3
161743,A : How much did it cost ?,A : How much does is cost ?,26,27,8,8
...,...,...,...,...,...,...
350827,to be continued . . .,to be continue . . .,21,20,6,6
17343,to be continued . . .,to be continue . . .,21,20,6,6
633235,to be continued . . .,to be continue . . .,21,20,6,6
767284,today was a bad day .,today is a bad day .,21,20,6,6


In [22]:
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [23]:
final_df.shape

(496338, 6)

In [24]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
79969,I made a small group for studying Economics to...,I made a small group for studying from Economi...,62,67,11,12
407628,"These include Mexican food ( burrito , fajita ...","These include mexican food ( burrito , fajita ...",409,410,81,81
181486,I 'm trying to love English and that is why I ...,I try to love English that is why I 'm here .,55,45,14,12
487216,"I went to Sendai again , because my older sist...","I went to Sendai again , because my older sist...",62,66,13,14
319807,It was that I and some of adult compared our c...,It was I and some of adult compared our childr...,77,69,17,15


### Remove Small sentences



In [25]:
final_df[final_df['incorrect_char_count']<2].shape

(5, 6)

In [26]:
final_df = final_df[final_df['incorrect_char_count']>2].reset_index(drop=True)

In [27]:
final_df.shape

(496326, 6)

In [28]:
final_df[final_df['correct_char_count']<2].shape

(27, 6)

In [29]:
final_df[final_df['correct_char_count']<2].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
164087,?,' ? ?,1,5,1,3
241609,.,is japanese holiday .,1,21,1,4
91009,!,Do ours best !,1,14,1,4
318100,.,noddle .,1,8,1,2
461049,.,had them .,1,10,1,3
80604,.,to near park .,1,14,1,4
301056,.,I am hungry .,1,13,1,4
193002,.,I am young .,1,12,1,4
439410,.,on face .,1,9,1,3
153477,.,For Tech support .,1,18,1,4


In [30]:
final_df = final_df[final_df['correct_char_count']>2].reset_index(drop=True)

In [31]:
final_df.shape

(496287, 6)

### Clean text

In [32]:

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [33]:

def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*\(.*?\)\s', '', text)
    text = re.sub('\s*\[.*?\]\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [34]:
final_df['correct'] = final_df['correct'].progress_apply(clean)
final_df['correct'] = final_df['correct'].progress_apply(expand_contractions)

100%|██████████| 496287/496287 [00:06<00:00, 82079.57it/s]
100%|██████████| 496287/496287 [00:05<00:00, 91005.82it/s]


In [35]:
final_df['incorrect'] = final_df['incorrect'].progress_apply(clean)
final_df['incorrect'] = final_df['incorrect'].progress_apply(expand_contractions)

100%|██████████| 496287/496287 [00:06<00:00, 79081.27it/s]
100%|██████████| 496287/496287 [00:05<00:00, 95226.15it/s]


In [36]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
422710,"Also say , I think accidents are really unexpe...",I think we ca not expect an accident .,52,38,10,9
81533,I know my grammar is very very bad.,I know my grammar very very bad.,52,49,16,15
429328,I have only seen `` Colors `` and the beginnin...,I have seen only `` Colors `` and the beginnin...,95,97,20,20
29366,I am a bad student then .,"I am a bad student , then .",25,27,7,8
54983,"Firstly , family in most cases consists of the...","Firstly , family in most cases consist of near...",68,63,13,12


In [37]:
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [41]:
final_df = final_df.drop(['correct_char_count','incorrect_char_count','correct_word_count','incorrect_word_count'], axis=1)

In [42]:
final_df.to_csv('data/final_df_preprocessed_20211028.csv')