## Import libraries & data

In [149]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [150]:
final_df = pd.read_csv('data\\final_df.csv')

In [151]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
33991,She looked at the phone surprised and tried ca...,She looked at the phone surprised and tried ca...
12579,I really like the school and it is going well ...,I really like the school and it is going good ...
14465,In my spare time I like reading English books ...,In my spare time I like reading English books ...
15869,They were not a good representation of their i...,They are not a good representation of their id...
32191,This story tells us that sometimes we can not ...,This story tells us that sometimes we can not ...


In [152]:
final_df.shape

(34308, 2)

### Adding length features

In [153]:
final_df['correct_char_count'] = final_df['correct'].astype('str').apply(lambda x:len(x))
final_df['incorrect_char_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x))

In [154]:
final_df['correct_word_count'] = final_df['correct'].astype('str').apply(lambda x:len(x.split()))
final_df['incorrect_word_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x.split()))

In [155]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
5393,There were so many trendy clothes at attractiv...,There was so many trendy clothes in attractive...,56,55,10,10
8626,"Nevertheless , I would recommend you to wear a...","Nevertheless , I would recommend you to wear a...",98,105,18,18
27483,Taking a year off after finishing school can b...,Taking a year off after finishing school can b...,111,111,22,22
20462,Maybe by doing this we will be able to find ot...,Maybe by doing this we would be able to find o...,101,102,18,18
1886,"If we use public transport services ,",If we use our public transport services .,37,41,7,8


## Preprocessing

### Removing Missing/NA 

In [156]:
pd.DataFrame(final_df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
correct,4
incorrect,0
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [157]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
5517,,yes becouse yes,3,15,1,3
5518,,fsdjgdofg,3,9,1,1
13824,,Ou,3,2,1,1
24908,,.,3,1,1,1


In [158]:
final_df = final_df.dropna().reset_index(drop=True)

In [159]:
final_df.shape

(34304, 6)

In [160]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
33894,"After all , a successful career is best measur...",After all a successful career is best measured...,192,190,36,35
1095,It is difficult to go to an unknown place with...,It is difficult to go to a place unknown with ...,104,98,22,20
22258,"Of course , you can look up to celebrities but...",Of course you can look up to the celebrities b...,89,91,19,19
6931,"This sport is very famous all over the world ,...","This sport is very famous all over the world ,...",136,141,29,30
15279,"Concerning your work plans , I have an uncle w...","Concernig your working plans , I have an uncle...",112,114,26,26


### Keep unique sentence pairs

In [161]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 11571


In [162]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 11571


In [163]:
final_df[final_df['correct']==final_df['incorrect']].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
15001,You should .,You should .,12,12,3,3
9959,"Hi Jan ,","Hi Jan ,",8,8,3,3
32190,I think this period when we start to write is ...,I think this period when we start to write is ...,113,113,22,22
4053,I think .,I think .,9,9,3,3
32033,Anna knew that it was going to be a very speci...,Anna knew that it was going to be a very speci...,62,62,15,15
31242,The next one is boat .,The next one is boat .,22,22,6,6
7672,The wizard attacked him and he went away from ...,The wizard attacked him and he went away from ...,52,52,11,11
12242,It was really cool .,It was really cool .,20,20,5,5
25848,We 're expecting you !,We 're expecting you !,22,22,5,5
16230,It was really hard not to burst out laughing i...,It was really hard not to burst out laughing i...,58,58,13,13


In [164]:
final_df = final_df[final_df['correct']!=final_df['incorrect']]

In [165]:
final_df.shape

(22733, 6)

In [166]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
1810,Some people do n't even care and just do n't t...,Some people do n't even care and just the do n...,112,109,23,23
30697,The municipal board of Clayton does an excelle...,The municipal board of Clayton does an excelle...,121,121,22,22
24608,I witnessed and learned from his ability to wo...,I witnessed and learn form his ability to work...,172,165,32,30
33712,"At the end of the video , the screen shows a t...","At the last of the video , the screen shows a ...",81,82,18,18
4208,I ran to help them and we also called an ambul...,I ran to help them and also we called the ambu...,52,53,12,12


### Remove Duplicates

In [167]:
print(f'total number of duplicates: {final_df.duplicated().sum()}')

total number of duplicates: 27


In [168]:
final_df[final_df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
10053,"All in all , using renewable energy and improv...","All in all , using renewable energy and improv...",125,122,24,23
10062,"All in all , using renewable energy and improv...","All in all , using renewable energy and improv...",125,122,24,23
31709,"Dear Mrs Ashby ,","Dear Mrs Kate Ashby ,",16,21,4,5
17444,"Dear Mrs Ashby ,","Dear Mrs Kate Ashby ,",16,21,4,5
9235,"Dear Mrs Ashby ,","Dear Mrs Kate Ashby ,",16,21,4,5
31802,"Dear Sir ,",Dear Sir :,10,10,3,3
13264,"Dear Sir ,",Dear Sir :,10,10,3,3
18331,"Dear Sir / Madam ,","Dear sir / madam ,",18,18,5,5
28481,"Dear Sir / Madam ,","Dear sir / madam ,",18,18,5,5
24205,"Faithfully ,","Sincerely ,",12,11,2,2


In [169]:
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [170]:
final_df.shape

(22706, 6)

In [171]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
7234,The only thing they found was a pair of goggle...,The only thing they found was a pair of google...,79,84,18,18
1445,"Sometimes , many people want to travel far awa...",sometimes many people want to travel faraway a...,91,82,18,14
1304,After spending two weeks with a fairy in Franc...,After spending two weeks with a fairy in Franc...,101,95,23,22
3513,"What is more , you will not be satisfied with ...","What is more , you will not be satisfied with ...",135,136,29,29
16930,"However , I recommend travelling to Extremadur...","However , I recommend travelling to Extremadur...",101,101,19,19


### Remove Small sentences



In [172]:
final_df[final_df['incorrect_char_count']<2].shape

(0, 6)

In [173]:
final_df = final_df[final_df['incorrect_char_count']>2].reset_index(drop=True)

In [174]:
final_df.shape

(22706, 6)

In [175]:
final_df[final_df['correct_char_count']<2].shape

(1, 6)

In [176]:
final_df[final_df['correct_char_count']<2].sample(10)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [177]:
final_df = final_df[final_df['correct_char_count']>2].reset_index(drop=True)

In [178]:
final_df.shape

(22704, 6)

### Clean text

In [179]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [180]:
# https://stackoverflow.com/a/47091490/4084039
def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*\(.*?\)\s', '', text)
    text = re.sub('\s*\[.*?\]\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [181]:
final_df['correct'] = final_df['correct'].progress_apply(clean)
final_df['correct'] = final_df['correct'].progress_apply(expand_contractions)

  0%|          | 0/22704 [00:00<?, ?it/s]

  0%|          | 0/22704 [00:00<?, ?it/s]

In [182]:
final_df['incorrect'] = final_df['incorrect'].progress_apply(clean)
final_df['incorrect'] = final_df['incorrect'].progress_apply(expand_contractions)

  0%|          | 0/22704 [00:00<?, ?it/s]

  0%|          | 0/22704 [00:00<?, ?it/s]

In [183]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
998,"In the middle of the main street , which is ne...","In the middle of the principal street , that i...",115,119,27,27
8062,"Using your own car , you can take the decision...",Using your own car you can take the decision o...,121,115,25,24
4548,"I got home from school , only to find a gift o...","I came home from school , only to found a gift...",58,60,15,15
10199,The Sforzesco Castle is another suggestive his...,The Sforzesco Castle is another suggestive his...,186,204,35,37
7987,"a summer job in a travel agency , because they...","a summer work in a travel agency , because the...",62,63,13,13


In [184]:
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [185]:
final_df.to_csv('data\\final_df_preprocessed.csv',index=False)