In [2]:
!pip install --upgrade gspread
# Install Java
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

!pip install language_tool_python




## read in the data on google sheet

In [3]:
 from google.colab import drive
 drive.mount('/content/drive')
 %cd /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [4]:
from google.colab import auth
import gspread
from google.auth import default
# autenticating to google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [42]:
import pandas as pd

worksheet = gc.open('text_auto_correction').sheet1
#get_all_values gives a list of rows
rows = worksheet.get_all_values()
#Convert to a DataFrame
df = pd.DataFrame(rows)

df.columns = df.iloc[0]
df = df.iloc[1:]

In [6]:
org_text = df[['text']]
org_text

Unnamed: 0,text
1,mole animal small.it lives at underground.mole...
2,dolphins and whales communicate using pings of...
3,A weather report of mountains may tell you to ...
4,An earthquake occurs when the ground suddenly ...
5,you have seen tall mountains in real life or i...
...,...
4686,The results from many studies indicate that vi...
4687,People are being advised to spend less time in...
4688,We are thus in a situation where people are re...
4689,"There are two types of cancers, melanoma and b..."


pass the `is_first_round=True` for the first time running the correction function on the original text. It will add a space after each period if there isn't one. For instance, modifying "...animal small.it lives..." to "...animal small. it lives...".

In [30]:
import language_tool_python
import re
tool = language_tool_python.LanguageTool('en-US')

def add_space_after_period(text):
    corrected_text = re.sub(r'(?<=\w)\.(?=\w)', '. ', text)
    return corrected_text

def correct_text(text, is_first_round = True):
    if is_first_round:
      corrected_text = tool.correct(add_space_after_period(text))
    else:
      corrected_text = tool.correct(text)
    return corrected_text

### Process the text


#### the first 1000 rows

In [28]:
org_text_part = org_text.head(1000)
org_text_part['corrected_text'] = org_text_part['text'].apply(correct_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  org_text_part['corrected_text'] = org_text_part['text'].apply(correct_text)


* There are some incorrect correction


For example, "11 Dolphins and whales communicate using **prings** of sound " prings should be **pings**, but it was changed to **brings**

In [29]:
org_text_part

Unnamed: 0,text,corrected_text
1,mole animal small.it lives at underground.mole...,Mole animal small. It lives at underground. Mo...
2,dolphins and whales communicate using pings of...,Dolphins and whales communicate using pings of...
3,A weather report of mountains may tell you to ...,A weather report of mountains may tell you to ...
4,An earthquake occurs when the ground suddenly ...,An earthquake occurs when the ground suddenly ...
5,you have seen tall mountains in real life or i...,You have seen tall mountains in real life or i...
...,...,...
996,so many of job is for government. as well as p...,So many of job is for government. As well as p...
997,DISEASES CAN CAUSE PROBLEMS. ALZHEIMER'S MAKES...,DISEASES CAN CAUSE PROBLEMS. ALZHEIMER'S MAKES...
998,Diseases can cause many problems. Alzheimer's ...,Diseases can cause many problems. Alzheimer's ...
999,Civil service offers jobs to thousands of men ...,Civil service offers jobs to thousands of men ...


##### Things might be better if we run the function twice

In [31]:
org_text_part['R2_corrected_text'] = org_text_part['corrected_text'].apply(lambda x: correct_text(x, is_first_round=False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  org_text_part['R2_corrected_text'] = org_text_part['corrected_text'].apply(lambda x: correct_text(x, is_first_round=False))


In [32]:
org_text_part

Unnamed: 0,text,corrected_text,R2_corrected_text
1,mole animal small.it lives at underground.mole...,Mole animal small. It lives at underground. Mo...,Mole animal small. It lives at underground. Mo...
2,dolphins and whales communicate using pings of...,Dolphins and whales communicate using pings of...,Dolphins and whales communicate using pings of...
3,A weather report of mountains may tell you to ...,A weather report of mountains may tell you to ...,A weather report of mountains may tell you to ...
4,An earthquake occurs when the ground suddenly ...,An earthquake occurs when the ground suddenly ...,An earthquake occurs when the ground suddenly ...
5,you have seen tall mountains in real life or i...,You have seen tall mountains in real life or i...,You have seen tall mountains in real life or i...
...,...,...,...
996,so many of job is for government. as well as p...,So many of job is for government. As well as p...,So many of job is for government. As well as p...
997,DISEASES CAN CAUSE PROBLEMS. ALZHEIMER'S MAKES...,DISEASES CAN CAUSE PROBLEMS. ALZHEIMER'S MAKES...,DISEASES CAN CAUSE PROBLEMS. ALZHEIMER'S MAKES...
998,Diseases can cause many problems. Alzheimer's ...,Diseases can cause many problems. Alzheimer's ...,Diseases can cause many problems. Alzheimer's ...
999,Civil service offers jobs to thousands of men ...,Civil service offers jobs to thousands of men ...,Civil service offers jobs to thousands of men ...


#### process the rest data

In [35]:
the_rest = org_text.iloc[1000:, :]
the_rest['corrected_text'] = the_rest['text'].apply(correct_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  the_rest['corrected_text'] = the_rest['text'].apply(correct_text)


In [37]:
the_rest['R2_corrected_text'] = the_rest['corrected_text'].apply(lambda x: correct_text(x, is_first_round=False))
the_rest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  the_rest['R2_corrected_text'] = the_rest['corrected_text'].apply(lambda x: correct_text(x, is_first_round=False))


Unnamed: 0,text,corrected_text,R2_corrected_text
1001,Civil service jobs are jobs that work with dif...,Civil service jobs are jobs that work with dif...,Civil service jobs are jobs that work with dif...
1002,They're civil jobs for women and men.. These j...,They're civil jobs for women and men. These jo...,They're civil jobs for women and men. These jo...
1003,Learn How a Hidden Vitamin D Deficiency Could ...,Learn How a Hidden vitamin D Deficiency Could ...,Learn How a Hidden vitamin D Deficiency Could ...
1004,Non-living parts of the environment are also p...,Non-living parts of the environment are also p...,Non-living parts of the environment are also p...
1005,Diseases Is a big problem in our life's. Alzhe...,Diseases Is a big problem in our life's. Alzhe...,Diseases Is a big problem in our life's. Alzhe...
...,...,...,...
4686,The results from many studies indicate that vi...,The results from many studies indicate that vi...,The results from many studies indicate that vi...
4687,People are being advised to spend less time in...,People are being advised to spend less time in...,People are being advised to spend less time in...
4688,We are thus in a situation where people are re...,We are thus in a situation where people are re...,We are thus in a situation where people are re...
4689,"There are two types of cancers, melanoma and b...","There are two types of cancers, melanoma and b...","There are two types of cancers, melanoma and b..."


#### concat the 2 dataframes together

In [41]:
res_df = pd.concat([org_text_part, the_rest])
res_df

Unnamed: 0,text,corrected_text,R2_corrected_text
1,mole animal small.it lives at underground.mole...,Mole animal small. It lives at underground. Mo...,Mole animal small. It lives at underground. Mo...
2,dolphins and whales communicate using pings of...,Dolphins and whales communicate using pings of...,Dolphins and whales communicate using pings of...
3,A weather report of mountains may tell you to ...,A weather report of mountains may tell you to ...,A weather report of mountains may tell you to ...
4,An earthquake occurs when the ground suddenly ...,An earthquake occurs when the ground suddenly ...,An earthquake occurs when the ground suddenly ...
5,you have seen tall mountains in real life or i...,You have seen tall mountains in real life or i...,You have seen tall mountains in real life or i...
...,...,...,...
4686,The results from many studies indicate that vi...,The results from many studies indicate that vi...,The results from many studies indicate that vi...
4687,People are being advised to spend less time in...,People are being advised to spend less time in...,People are being advised to spend less time in...
4688,We are thus in a situation where people are re...,We are thus in a situation where people are re...,We are thus in a situation where people are re...
4689,"There are two types of cancers, melanoma and b...","There are two types of cancers, melanoma and b...","There are two types of cancers, melanoma and b..."


merge the corrected text dataframe to the original dataframe

In [45]:
merged_df = pd.merge(df, res_df, on='text')
merged_df.to_csv('output.csv', index=False)

### some tests

In [22]:
test = 'finelly Dolphins and whales communicate using prings of sound at high frewuncies.'
tool.correct(test)

'Finelly Dolphins and whales communicate using brings of sound at high frequencies.'

In [26]:
tool.check(test)

[Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter.', 'replacements': ['Finelly'], 'offsetInContext': 0, 'context': 'finelly Dolphins and whales communicate using p...', 'offset': 0, 'errorLength': 7, 'category': 'CASING', 'ruleIssueType': 'typographical', 'sentence': 'finelly Dolphins and whales communicate using prings of sound at high frewuncies.'}),
 Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['brings', 'prints', 'rings', 'springs', 'prongs', 'pings', 'parings', 'wrings', 'prigs', 'prangs'], 'offsetInContext': 43, 'context': '...y Dolphins and whales communicate using prings of sound at high frewuncies.', 'offset': 46, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'finelly Dolphins and whales communicate using prings of sound at high frewuncies.'}),
 Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mis

In [23]:
tool.correct(tool.correct(test))

'Finally Dolphins and whales communicate using brings of sound at high frequencies.'

In [24]:
tool.correct(tool.correct(tool.correct(test)))

'Finally, Dolphins and whales communicate using brings of sound at high frequencies.'

In [25]:
tool.correct(tool.correct(tool.correct(tool.correct(test))))

'Finally, Dolphins and whales communicate using brings of sound at high frequencies.'