In [1]:
import pandas as pd
import matplotlib.pyplot 
%matplotlib inline
import numpy as np
import seaborn as sns
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
import pickle

from datetime import datetime

In [2]:
df=pd.read_pickle('new_dataset.pickle')
df.head()

Unnamed: 0,index,essay_id,essay_set,essay,domain1_score,mistake_count,sentence_count
0,0,1,1,"Dear local newspaper, I think effects computer...",8.0,15,11
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0,23,19
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7.0,17,15
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10.0,27,25
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",8.0,17,31


## Counting the number of nouns, verbs, adjectives, etc. 

In [3]:
nlp = spacy.load('en_core_web_sm')

### POS (Parts of Speech) tagging 

In [None]:
# list to store all the pos tags, NER and tokens
pos = [] 
ner = []
tokens = []

# there are predefined stop words, so we are just using them here 
stop_words = set(STOP_WORDS)

t0 = datetime.now()

# iterating over each essay 
for essay in nlp.pipe(df['essay']):
    # idk about this, i just looked it up 
    if essay.has_annotation("DEP"):
        # add the pos tag for each word in the esssay to the list 
        pos.append([e.pos_ for e in essay])
        # add the NER present in the essay tothe list
        ner.append([e.text for e in essay.ents])
        # add the tokens present in the essay to the list
        tokens.append([e.text for e in essay])
        
t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0)) 

# so basically, each essay now has a list of the pos tag for every word in it and also the 
# lists of NERs and tokens.

In [None]:
type(df['essay'])

In [None]:
# i was getting some warning saying i can't directly add a new col to the existing df like i did below.
# this is to suppress it.
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
# creating a new col for the pos 
df['pos'] = pos
df['ner'] = ner 
df['token'] = tokens

In [7]:
# counting the number of a specific pos tag in each essay (in the 'pos' col) and adding a new col for it in the df 
df['noun'] = df.apply(lambda x: x['pos'].count('NOUN'), axis=1)
df['verb'] = df.apply(lambda x: x['pos'].count('VERB'), axis=1)
df['adj'] = df.apply(lambda x: x['pos'].count('ADJ'), axis=1)
df['punct'] = df.apply(lambda x: x['pos'].count('PUNCT'), axis=1)
df['adv'] = df.apply(lambda x: x['pos'].count('ADV'), axis=1)
df['pron'] = df.apply(lambda x: x['pos'].count('PRON'), axis=1) 
# length of the NER list will itself be the no. of NERs
df['ner_count'] = df.apply(lambda x: len(x['ner']), axis=1)
# if a word from the list of token is present in the stop words list of an essay then add it to the temp list 
# and just take the len of that list as it will be the count of stop words in that particular essay 
df['stop_words_count'] = df.apply(lambda x: len([word for word in x['token'] if word in stop_words]), axis=1)

In [8]:
df.head()

Unnamed: 0,index,essay_id,essay_set,essay,domain1_score,mistake_count,sentence_count,pos,ner,token,noun,verb,adj,punct,adv,pron,ner_count,stop_words_count
0,0,1,1,"Dear local newspaper, I think effects computer...",8.0,15,11,"[ADJ, ADJ, NOUN, PUNCT, PRON, VERB, NOUN, NOUN...","[@ORGANIZATION2, @CAPS1, @CAPS2]","[Dear, local, newspaper, ,, I, think, effects,...",82,50,19,38,16,47,3,189
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0,23,19,"[ADJ, NOUN, ADV, PUNCT, PRON, VERB, SCONJ, VER...","[@CAPS1, millions, one, one, millions, @LOCATI...","[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...",98,71,24,32,16,49,11,218
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7.0,17,15,"[ADJ, PUNCT, PROPN, ADV, PROPN, ADJ, CCONJ, AD...","[today, @CAPS4, one, a thousand]","[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...",72,38,19,23,13,25,4,151
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10.0,27,25,"[PROPN, PROPN, PROPN, PUNCT, PROPN, PRON, AUX,...","[@CAPS1, @PERSON1, @PERSON2, @CAPS3, @PERCENT2...","[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...",136,72,39,43,23,31,15,246
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",8.0,17,31,"[ADJ, PROPN, PUNCT, PRON, VERB, VERB, NOUN, VE...","[First, one, Secondly, one, only one, @CAPS1]","[Dear, @LOCATION1, ,, I, know, having, compute...",107,61,28,43,35,41,6,266


In [9]:
df.columns

Index(['index', 'essay_id', 'essay_set', 'essay', 'domain1_score',
       'mistake_count', 'sentence_count', 'pos', 'ner', 'token', 'noun',
       'verb', 'adj', 'punct', 'adv', 'pron', 'ner_count', 'stop_words_count'],
      dtype='object')

In [10]:
#df.to_pickle('final.pickle')

In [11]:
df2 = df.drop(columns=['token', 'pos', 'ner'])
df2.head(2)

Unnamed: 0,index,essay_id,essay_set,essay,domain1_score,mistake_count,sentence_count,noun,verb,adj,punct,adv,pron,ner_count,stop_words_count
0,0,1,1,"Dear local newspaper, I think effects computer...",8.0,15,11,82,50,19,38,16,47,3,189
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9.0,23,19,98,71,24,32,16,49,11,218


In [12]:
df2.to_pickle('final.pickle')