## Imports

In [1]:
import sys
import warnings
from string import punctuation

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m0:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Load data

In [3]:
# Read the essay table from disk; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="./Dataset/processed_dataset_1.csv")

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,word_count,final_score
0,0,1,1,"Dear local newspaper, I think effects computer...",338,6
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",419,7
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",279,5
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",524,8
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",465,6


In [5]:
# Remove the leftover "Unnamed: 0" column (presumably an index saved by an
# earlier CSV round-trip -- TODO confirm).
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [6]:
# Preview the first five rows after dropping the column.
df.head(n=5)

Unnamed: 0,essay_id,essay_set,essay,word_count,final_score
0,1,1,"Dear local newspaper, I think effects computer...",338,6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",419,7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",279,5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",524,8
4,5,1,"Dear @LOCATION1, I know having computers has a...",465,6


## Essay Processing
It is divided into 3 steps:
1. Language Correction
2. Sentence tokenization, Sentence count and length, Word tokenization
3. Word token classification (punctuation, stop words and anonymized entities, pos, ent)

### 1. Language Correction

Students' essays contain many grammar and spelling errors. Part-of-speech (POS) tagging and named-entity recognition (NER) are hampered in part by the lack of consistent spelling and punctuation. Therefore, the essays will be corrected using LanguageTool, and the NLP parsing will be performed with spaCy on the corrected essays.

In [7]:
!pip install language-tool-python



In [8]:
from  datetime import datetime
import language_tool_python

In [28]:
# Spell/grammar-check every essay with LanguageTool and build a corrected copy.
#
# Adds three columns to df:
#   matches     - list of LanguageTool Match objects returned for the essay
#   corrections - number of matches found in the essay
#   corrected   - essay text with the matches' suggested corrections applied
tool = language_tool_python.LanguageTool('en-US')

t0 = datetime.now()
try:
    df['matches'] = df['essay'].apply(tool.check)
finally:
    # LanguageTool runs a background server process; shut it down even if a
    # check fails so it does not linger after this cell.
    tool.close()

# Column-wise operations avoid building a full row Series per essay.
df['corrections'] = df['matches'].apply(len)
df['corrected'] = [
    language_tool_python.utils.correct(text, found)
    for text, found in zip(df['essay'], df['matches'])
]

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Downloading LanguageTool 5.7: 100%|██████████████████████████████████████████████████████████████████████████| 225M/225M [00:29<00:00, 7.73MB/s]
Unzipping /tmp/tmpat77pdnd.zip to /home/surajkarki/.cache/language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /home/surajkarki/.cache/language_tool_python.


Processing time: 0:24:41.084574


Let's look at an extreme example of poor spelling, where even the automatic correction struggles to recover the intended words.

In [29]:
# Show essay 18 before and after correction, one item per line.
print(
    'Original:',
    df['essay'][18],
    'Corrected with languagetool',
    df['corrected'][18],
    sep='\n',
)

Original:
I aegre waf the evansmant ov tnachnolage. The evansmant ov tnachnolige is being to halp fined a kohar froi alnsas. Tnanchnolage waf ont ot we wod not go to the moon. Tnachnologe evans as we maech at. The people are in tnacholege to the frchr fror the good ov live. Famas invanyor ues tnacholage leki lena orde dvanse and his fling mashine. Tnachologe is the grat
Corrected with languagetool
I Segre weigh the evanescent of tnachnolage. The evanescent of tnachnolige is being to half fined a Zohar from Kansas. Tnanchnolage weigh on tot we won not go to the moon. Technology Evans as we match at. The people are in tnacholege to the arch for the good of live. FAMAS inventor UES anchorage Levi Lena order dance and his fling machine. Tnachologe is the great


### 2. Sentence tokenization, Sentence count and length, Word tokenization

In [10]:
# Parse every corrected essay with spaCy and store the annotations on df.
#
# Adds five columns, each holding one list per essay (or None when the
# document carries no dependency parse):
#   tokens - token texts
#   lemma  - token lemmas
#   pos    - coarse part-of-speech tags
#   sents  - sentence texts, stripped of surrounding whitespace
#   ner    - texts of the recognised entities
sents = []
tokens = []
lemma = []
pos = []
ner = []

# Stop-word vocabulary: spaCy's English stop words plus every character of
# string.punctuation (update() on a str adds its characters one by one).
stop_words = set(STOP_WORDS)
stop_words.update(punctuation)

nlp = spacy.load('en_core_web_sm')

t0 = datetime.now()

# Suppress warnings for the rest of the session. The former `np.warnings`
# alias of the standard-library module was removed in NumPy 1.24, so the
# stdlib module is used directly.
warnings.filterwarnings('ignore')

for essay in nlp.pipe(df['corrected'], batch_size=100):
    # Doc.is_parsed is deprecated in spaCy v3; has_annotation("DEP") is the
    # documented replacement.
    if essay.has_annotation("DEP"):
        tokens.append([tok.text for tok in essay])
        sents.append([sent.text.strip() for sent in essay.sents])
        pos.append([tok.pos_ for tok in essay])
        ner.append([ent.text for ent in essay.ents])
        lemma.append([tok.lemma_ for tok in essay])
    else:
        # Keep the lists aligned with df's rows when a document is unparsed.
        # NOTE(review): later cells call len() on these values and would fail
        # on None -- confirm this branch is never hit in practice.
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
        sents.append(None)
        ner.append(None)

df['tokens'] = tokens
df['lemma'] = lemma
df['pos'] = pos
df['sents'] = sents
df['ner'] = ner

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Processing time: 0:05:34.070084


In [11]:
# Preview the first five rows with the new annotation columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,word_count,final_score,matches,corrections,corrected,tokens,lemma,pos,sents,ner
0,0,1,1,"Dear local newspaper, I think effects computer...",338,6,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",16,"Dear local newspaper, I think effects computer...","[Dear, local, newspaper, ,, I, think, effects,...","[dear, local, newspaper, ,, I, think, effect, ...","[ADJ, ADJ, NOUN, PUNCT, PRON, VERB, NOUN, NOUN...","[Dear local newspaper, I think effects compute...","[Facebook, MySpace]"
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",419,7,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",25,"Dear @CAPS1 @CAPS2, I believe that using compu...","[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...","[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...","[PROPN, PROPN, PROPN, PUNCT, PRON, VERB, SCONJ...","[Dear @CAPS1 @CAPS2, I believe that using comp...","[Facebook and MySpace, millions, one, MySpace,..."
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",279,5,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",17,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...","[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...","[dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...","[ADJ, PUNCT, PROPN, PROPN, PROPN, ADJ, CCONJ, ...","[Dear, @CAPS1 @CAPS2 @CAPS3 More and more peop...","[today, one, a thousand]"
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",524,8,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",29,"Dear Local Newspaper, @CAPS1 I have found that...","[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...","[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...","[PROPN, PROPN, PROPN, PUNCT, PROPN, PRON, AUX,...","[Dear Local Newspaper, @CAPS1 I have found tha...","[Dear Local Newspaper, @PERSON1, A+, @CAPS7, N..."
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",465,6,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",17,"Dear @LOCATION1, I know having computers has a...","[Dear, @LOCATION1, ,, I, know, having, compute...","[dear, @LOCATION1, ,, I, know, have, computer,...","[ADJ, PROPN, PUNCT, PRON, VERB, VERB, NOUN, VE...","[Dear @LOCATION1, I know having computers has ...","[First, one, Secondly, one, only one]"


## Count various features

In [12]:
# Derive integer count features from the parsed essays.
#
# Every feature is computed column-wise (Series.apply on a single column)
# instead of row-wise df.apply(axis=1), and each feature is computed exactly
# once (the original assigned df['noun'] twice). Columns are created in the
# same order as before.
t0 = datetime.now()

# Sizes of the token / sentence / entity lists.
df['token_count'] = df['tokens'].apply(len)
df['unique_token_count'] = df['tokens'].apply(lambda toks: len(set(toks)))
df['nostop_count'] = df['tokens'].apply(
    lambda toks: sum(tok not in stop_words for tok in toks)
)
df['sent_count'] = df['sents'].apply(len)
df['ner_count'] = df['ner'].apply(len)

# Punctuation marks counted as literal substrings of the corrected text.
PUNCTUATION_FEATURES = {
    'comma': ',',
    'question': '?',
    'exclamation': '!',
}
for column, needle in PUNCTUATION_FEATURES.items():
    # needle=needle binds the current value; a plain closure would see only
    # the last one if evaluation were ever deferred.
    df[column] = df['corrected'].apply(lambda text, needle=needle: text.count(needle))

# Double and single quote characters are summed into one feature.
df['quotation'] = df['corrected'].apply(
    lambda text: text.count('"') + text.count("'")
)

# '@...' placeholder prefixes counted as literal substrings (e.g. '@CAPS'
# matches '@CAPS1', '@CAPS2', ...).
PLACEHOLDER_FEATURES = {
    'organization': r'@ORGANIZATION',
    'caps': r'@CAPS',
    'person': r'@PERSON',
    'location': r'@LOCATION',
    'money': r'@MONEY',
    'time': r'@TIME',
    'date': r'@DATE',
    'percent': r'@PERCENT',
}
for column, needle in PLACEHOLDER_FEATURES.items():
    df[column] = df['corrected'].apply(lambda text, needle=needle: text.count(needle))

# Occurrences of each coarse POS tag in the essay's tag list.
POS_FEATURES = {
    'noun': 'NOUN',
    'adj': 'ADJ',
    'pron': 'PRON',
    'verb': 'VERB',
    'cconj': 'CCONJ',
    'adv': 'ADV',
    'det': 'DET',
    'propn': 'PROPN',
    'num': 'NUM',
    'part': 'PART',
    'intj': 'INTJ',
}
for column, tag in POS_FEATURES.items():
    df[column] = df['pos'].apply(lambda tags, tag=tag: tags.count(tag))

t1 = datetime.now()

print('Processing time: {}'.format(t1 - t0))

Processing time: 0:00:03.164669


In [13]:
# Preview the first five rows with the new count features.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,word_count,final_score,matches,corrections,corrected,tokens,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,0,1,1,"Dear local newspaper, I think effects computer...",338,6,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",16,"Dear local newspaper, I think effects computer...","[Dear, local, newspaper, ,, I, think, effects,...",...,17,48,51,14,15,21,6,0,16,2
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",419,7,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",25,"Dear @CAPS1 @CAPS2, I believe that using compu...","[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...",...,20,51,70,18,18,30,12,5,10,0
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",279,5,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",17,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...","[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...",...,19,27,40,16,11,25,6,3,10,0
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",524,8,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",29,"Dear Local Newspaper, @CAPS1 I have found that...","[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...",...,42,33,73,17,20,42,37,0,23,0
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",465,6,"[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'me...",17,"Dear @LOCATION1, I know having computers has a...","[Dear, @LOCATION1, ,, I, know, having, compute...",...,28,41,61,16,33,49,3,4,20,0


In [14]:
# Persist the enriched frame. index=True is the pandas default, spelled out
# here because it writes the row index as an extra unnamed first column,
# which shows up as "Unnamed: 0" when the file is read back with read_csv.
df.to_csv("./Dataset/processed_dataset_2.csv", index=True)