In [18]:
# Notebook configuration: which spaCy model to load, where the document
# manifest lives, and which optional processing steps to switch on.

model = 'en_core_web_sm'
manifest_dir, manifest_file = (
    'data',
    '2006_08_humanities_student_major_0_reddit_com.json',
)
options = dict(
    merge_noun_chunks=False,
    merge_subtokens=False,
    collect_readability_scores=True,
)

In [50]:
import json
import os
import nltk
import pandas as pd
import spacy
from collections import Counter
from nltk.stem.porter import *  
from nltk.stem.snowball import SnowballStemmer

# Load the language model (`model` is set in the configuration cell)
nlp = spacy.load(model)

# Test for the optional spacy-readability module. Only a failed import is
# handled here, so that unrelated errors are not silently hidden.
try:
    from spacy_readability import Readability
except ImportError:
    msg = 'The spacy-readability module is not installed on your system, so readability scores will be unavailable unless you `pip install spacy-readability`.'
    print(msg)

# The Document class
class Document:
    """Model a document's features.

    Parameters:
    - manifest_dir: the path to the manifest directory
    - manifest_file: the name of the manifest file.
    - content_property: the name of the manifest property from which to extract the content
    - kwargs: processing options, passed either as a single dict
      (`kwargs=options`) or directly as keyword arguments. The options read by
      this class are `merge_noun_chunks`, `merge_subtokens`, and
      `collect_readability_scores`.

    Creating the object reads the manifest, adds any requested optional pipes
    to the module-level spaCy pipeline `nlp`, processes the content, and
    builds the feature list.

    NOTE: pipes are added to the shared `nlp` object and are not removed again,
    so they stay active for any `Document` created afterwards.
    """

    # Optional spaCy pipes that can be switched on through the options
    _MERGE_PIPES = ('merge_noun_chunks', 'merge_subtokens')

    # (option name, span attribute) pairs in the order used in entity tuples
    _ENTITY_ATTRS = (
        ('text', 'text'),
        ('start', 'start'),
        ('end', 'end'),
        ('label', 'label_'),
    )

    # Whether readability scores are part of the feature list. The real value
    # is determined in `__init__`.
    _readability_enabled = False

    def __init__(self, manifest_dir, manifest_file, content_property, **kwargs):
        """Initialize the object."""
        self.manifest_filepath = os.path.join(manifest_dir, manifest_file)
        self.manifest_dict = self._read_manifest()
        self.manifest_json = json.dumps(self.manifest_dict, indent=2)
        self.doc_string = self._get_docstring(content_property)
        # Accept both `Document(..., kwargs=options)` and `Document(..., **options)`.
        # The options are copied so later changes to the caller's dict do not leak in.
        self.options = dict(kwargs.get('kwargs', kwargs) or {})
        # The pipeline must be configured before the text is processed,
        # otherwise the optional pipes have no effect on this document.
        self._readability_enabled = self._configure_pipeline()
        self.content = nlp(self.doc_string)
        self.feature_list = self.get_feature_list()

    def _read_manifest(self):
        """Read a JSON file and return a Python dict."""
        with open(self.manifest_filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _get_docstring(self, content_property):
        """Extract a document string from a manifest property.

        Raises a KeyError if the manifest does not have the property.
        """
        return self.manifest_dict[content_property]

    def _option_enabled(self, name):
        """Return True if the named option is present and truthy."""
        return bool(self.options.get(name, False))

    def _configure_pipeline(self):
        """Add the optional pipes requested in the options to `nlp`.

        Uses the pipeline API as called in the original notebook
        (`create_pipe` followed by `add_pipe(component)`).

        Returns True if readability scores can be collected, otherwise False.
        """
        for pipe_name in self._MERGE_PIPES:
            # Adding a pipe twice raises an error, so skip pipes that a
            # previous Document has already added.
            if self._option_enabled(pipe_name) and not nlp.has_pipe(pipe_name):
                nlp.add_pipe(nlp.create_pipe(pipe_name), name=pipe_name)
        if not self._option_enabled('collect_readability_scores'):
            return False
        # `Readability` only exists if the optional import succeeded.
        readability_cls = globals().get('Readability')
        if readability_cls is None:
            print('Readability scores were requested but the spacy-readability module is not available; they will be skipped.')
            return False
        try:
            nlp.add_pipe(readability_cls())
        except ValueError:
            # The component is already in the pipeline from an earlier Document.
            pass
        return True

    def _readability_scores(self):
        """Return the readability scores as a tuple, or None if not collected."""
        if not self._readability_enabled:
            return None
        extensions = self.content._
        return (
            extensions.flesch_kincaid_grade_level,
            extensions.flesch_kincaid_reading_ease,
            extensions.dale_chall,
        )

    def _filter_tokens(self, remove=None, pos=None):
        """Return the token texts left after applying the filters.

        Parameters:
        - remove: a list containing features to be removed ('punctuation', 'stopwords', or both)
        - pos: a valid grammatical category to filter by (e.g. 'NOUN').
        """
        remove = remove or ()
        drop_punct = 'punctuation' in remove
        drop_stops = 'stopwords' in remove
        return [
            token.text for token in self.content
            if not (drop_punct and token.is_punct)
            and not (drop_stops and token.is_stop)
            and (pos is None or token.pos_ == pos)
        ]

    @staticmethod
    def _as_list_or_df(values, column, as_df):
        """Return `values` unchanged, or as a one-column dataframe if `as_df` is set."""
        if as_df:
            return pd.DataFrame(values, columns=[column])
        return values

    def get_feature_list(self):
        """Build the feature list from the processed document.

        If readability scores are being collected, the Flesch-Kincaid grade
        level, Flesch-Kincaid reading ease and Dale-Chall scores are appended
        to each token's features as a tuple in that order. Other formulas are
        available (see https://github.com/mholtzscher/spacy_readability).

        Returns a list of tuples, one per token:
        `(text, lemma, pos, tag, (entity IOB, entity type)[, readability])`.
        """
        readability = self._readability_scores()
        feature_list = []
        for token in self.content:
            # Get named entity info (I=Inside, O=Outside, B=Begin)
            ner = (token.ent_iob_, token.ent_type_)
            row = [token.text, token.lemma_, token.pos_, token.tag_, ner]
            if readability is not None:
                row.append(readability)
            feature_list.append(tuple(row))
        return feature_list

    def get_df(self):
        """Convert the list of features to a Pandas dataframe."""
        columns = ['Text', 'Lemma', 'POS', 'Tag', 'Entities']
        # The column is only present if the scores were actually collected.
        if self._readability_enabled:
            columns.append('Readability')
        return pd.DataFrame(self.feature_list, columns=columns)

    def token_count(self, remove=None, pos=None):
        """Calculate number of tokens in the spaCy document.

        Parameters:
        - remove: a list containing features to be removed ('punctuation', 'stopwords', or both)
        - pos: a valid grammatical category to filter by (e.g. 'NOUN').

        Returns the number of remaining tokens as an integer.
        """
        return len(self._filter_tokens(remove, pos))

    def get_lemmas(self, as_df=False):
        """Get a list of lemmas from the document.

        Parameters:
        - as_df: return the list as a dataframe.
        """
        lemmas = [token.lemma_ for token in self.content]
        return self._as_list_or_df(lemmas, 'Lemmas', as_df)

    def get_pos(self, as_df=False):
        """Get a list of parts of speech from the document.

        Parameters:
        - as_df: return the list as a dataframe.
        """
        pos = [token.pos_ for token in self.content]
        return self._as_list_or_df(pos, 'POS', as_df)

    def get_tags(self, as_df=False):
        """Get a list of grammatical tags from the document.

        Parameters:
        - as_df: return the list as a dataframe.
        """
        tags = [token.tag_ for token in self.content]
        return self._as_list_or_df(tags, 'Tags', as_df)

    def get_entities(self, options=('text', 'label'), as_df=False):
        """Get a list of entities from the document.

        Parameters:
        - options: a list of attributes ('text', 'start', 'end', 'label')
        - as_df: return the list as a dataframe.

        Each entity is a tuple holding the requested attributes, always in
        the order text, start, end, label. The dataframe columns follow the
        same order, whatever order `options` is given in.
        """
        selected = [(name, attr) for name, attr in self._ENTITY_ATTRS if name in options]
        ents = [
            tuple(getattr(ent, attr) for _, attr in selected)
            for ent in self.content.ents
        ]
        if as_df:
            ents = pd.DataFrame(ents, columns=[name.title() for name, _ in selected])
        return ents

    def get_readability_scores(self, columns=('Flesch-Kincaid Readability',
        'Flesch-Kincaid Reading Ease', 'Dale-Chall'), as_df=False):
        """Get a list of readability scores from the document.

        Parameters:
        - columns: a list of labels for the score types
        - as_df: return the list as a dataframe.

        Returns one list of scores per token. Raises a KeyError if
        readability scores were not collected for this document.
        """
        if not self._readability_enabled:
            raise KeyError('Readability scores were not collected; set `collect_readability_scores` to True and make sure spacy-readability is installed.')
        # The readability tuple is the last item of each feature tuple.
        scores = [list(row[-1]) for row in self.feature_list]
        if as_df:
            scores = pd.DataFrame(scores, columns=list(columns))
        return scores

    def bagify(self, remove=None, pos=None, as_counter=False):
        """Convert a spaCy document into a dict containing feature frequencies.

        Parameters:
        - remove: a list containing features to be removed ('punctuation', 'stopwords', or both)
        - pos: a valid grammatical category to filter by (e.g. 'NOUN').
        - as_counter: returns the result as a counter object

        Returns a dict unless as_counter is set to `True`. This enables the following:

        `bag = doc.bagify(as_counter=True).most_common(10)`
        """
        bag = Counter(self._filter_tokens(remove, pos))
        return bag if as_counter else dict(bag)

    def get_stems(self, stemmer='porter', as_df=True):
        """Convert the tokens in a spaCy document to stems.

        Parameters:
        - stemmer: the stemming algorithm ('porter' or 'snowball'). Any other
          value falls back to 'porter'.
        - as_df: return the list as a dataframe. Unlike the other methods,
          this defaults to True.

        Returns a list of stems or a dataframe.
        """
        if stemmer == 'snowball':
            engine = SnowballStemmer(language='english')
        else:
            engine = PorterStemmer()
        stems = [engine.stem(token.text) for token in self.content]
        return self._as_list_or_df(stems, 'Stems', as_df)


In [51]:
# Build the Document (and its feature table) from the configured manifest
doc = Document(
    manifest_dir,
    manifest_file,
    content_property='content_scrubbed',
    kwargs=options,
)

In [55]:
## Uncomment lines below to try various class methods.
## See the method docstrings for options not shown in the examples below.
## Only the last expression in the cell is displayed, so uncomment one
## example at a time (or wrap several in `display(...)`).

# Get all the doc features in a dataframe
doc.get_df()

# Same thing, except return a list of tuples
# doc.get_feature_list()

# Get lemmas
# doc.get_lemmas(as_df=True)

# Get entities
# doc.get_entities(as_df=True)

# Get stems
# doc.get_stems(stemmer='porter', as_df=True)

# Get readability scores
# doc.get_readability_scores(as_df=True)

# Bagify
# doc.bagify()

# Get most common nouns
# doc.bagify(pos='NOUN', as_counter=True).most_common(5)

# Get a count of all tokens, excluding punctuation and stop words
# doc.token_count(remove=['punctuation', 'stopwords'])

# Get the original document text
# doc.doc_string


Unnamed: 0,Text,Lemma,POS,Tag,Entities,Readability
0,Ideas,idea,NOUN,NNS,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
1,for,for,ADP,IN,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
2,replacing,replace,VERB,VBG,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
3,it,-PRON-,PRON,PRP,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
4,?,?,PUNCT,.,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
5,Yeah,yeah,INTJ,UH,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
6,",",",",PUNCT,",","(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
7,I,-PRON-,PRON,PRP,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
8,'ll,will,VERB,MD,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."
9,bite,bite,VERB,VB,"(O, )","(7.039373695386168, 70.15675434571368, 7.96144..."


### To do:

- Method to add a column (e.g. stems) to the feature list.
- Methods to update the document manifest.