In [87]:
# Configure the language model, data source, and options

# The special cases below are keyed on spaCy attribute IDs, so they must be
# imported in this cell; otherwise the cell fails with a NameError when the
# notebook is run top to bottom on a fresh kernel.
from spacy.symbols import ORTH, LEMMA, POS, TAG

# Name of the spaCy language model to load
model = 'en_core_web_sm'
# Location and name of the JSON manifest that holds the document
manifest_dir = 'data'
manifest_file = '2008_10_humanities_student_major_22_economics.json'
# Processing options handed to the Document class
options = {'merge_noun_chunks': False, 'merge_subtokens': False, 'collect_readability_scores': True}
# Words to flag / unflag as stop words in the model vocabulary
# NOTE(review): the entry below is the literal string 'nt' including the quote
# characters; possibly "n't" was intended -- confirm before changing.
add_stopwords = ["'nt'"]
remove_stopwords = []
# Tokenizer special cases: maps a surface string to the token attributes it should receive
lemmatization_cases = {
    "humanities": [{ORTH: u'humanities', LEMMA: u'humanities', POS: u'NOUN', TAG: u'NNS'}],
    "aren't": [{ORTH: "are"}, {ORTH: "n't", LEMMA: "not"}],
    "isn't": [{ORTH: "is"}, {ORTH: "n't", LEMMA: "not"}]
}

In [96]:
import json
import os
import nltk
import pandas as pd
import re
import spacy
import time
import unicodedata
from collections import Counter
from ftfy import fix_text
from nltk.stem.porter import *  
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from spacy.tokenizer import Tokenizer

# Constants for the preprocessing functions
# One or more consecutive line breaks (\r\n, \n or vertical tab)
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+')
# A run of whitespace whose first character is not a newline
NONBREAKING_SPACE_REGEX = re.compile(r'(?!\n)\s+')
# A single bracket, quote or punctuation character at the start of a string
PREFIX_RE = re.compile(r'''^[\[\]\("'\.,;:-]''')
# A single bracket, quote or punctuation character at the end of a string
SUFFIX_RE = re.compile(r'''[\[\]\)"'\.,;:-]$''')
# The only character treated as an infix is the tilde
INFIX_RE = re.compile(r'''[~]''')
# Strings that begin with http:// or https://
SIMPLE_URL_RE = re.compile(r'''^https?://''')

# Load the language model
# `model` is the model name set in the configuration cell; the 'sentencizer'
# component is excluded from the loaded pipeline.
nlp = spacy.load(model, disable=['sentencizer'])

# Add Custom Tokenizer
def custom_tokenizer(nlp):
    """Build a spaCy Tokenizer driven by the module-level regex constants.

    Parameters:
    - nlp: the loaded language object whose vocabulary the tokenizer shares.

    Returns a `Tokenizer` instance.
    """
    # Collect the matching rules by keyword so each one is easy to spot.
    rules = {
        'prefix_search': PREFIX_RE.search,
        'suffix_search': SUFFIX_RE.search,
        'infix_finditer': INFIX_RE.finditer,
        'token_match': SIMPLE_URL_RE.match,
    }
    return Tokenizer(nlp.vocab, **rules)
# Replace the model's tokenizer with the regex-driven one built above
nlp.tokenizer = custom_tokenizer(nlp)

# Handle lemmatisation exceptions
# These symbols are also needed by the configuration cell that builds `lemmatization_cases`.
from spacy.symbols import ORTH, LEMMA, POS, TAG
# Register every configured special case (surface string -> token attributes) with the tokenizer
for k, v in lemmatization_cases.items():
    nlp.tokenizer.add_special_case(k, v)

# Add and remove custom stop words
# The stop-word flag is set on the vocabulary entry of each configured word.
for word in add_stopwords:
    nlp.vocab[word].is_stop = True
for word in remove_stopwords:
    nlp.vocab[word].is_stop = False

# Test for the spacy-readability module
# Only a failed import means the module is missing; a failure while adding the
# component (for example when this cell is re-run and the pipe already exists)
# is reported separately so the message is not misleading.
try:
    from spacy_readability import Readability
except ImportError:
    msg = 'The spacy-readability module is not installed on your system, so readability scores will be unavailable unless you `pip install spacy-readability`.'
    print(msg)
else:
    try:
        nlp.add_pipe(Readability())
    except Exception as err:
        # Stay best-effort: keep the notebook running, but say what actually went wrong.
        msg = 'The spacy-readability component could not be added to the pipeline, so readability scores may be unavailable: {}'.format(err)
        print(msg)

# The Document class
class Document():
    """Model a document's features.

    The document text is read from a property of a JSON manifest file,
    scrubbed, and parsed with the module-level spaCy pipeline (`nlp`).

    Parameters:
    - manifest_dir: the path to the manifest directory
    - manifest_file: the name of the manifest file.
    - content_property: the name of the property from which to extract the content
    - kwargs: processing options, passed either wrapped as `kwargs=options_dict`
      or as individual keyword arguments (e.g. `merge_noun_chunks=True`).

    Attributes set on the instance:
    - manifest_filepath, manifest_dict, manifest_json: the manifest location and content
    - doc_string: the scrubbed document text
    - content: the parsed spaCy document
    - options: the processing options
    - features: a dataframe of token features
    """

    # Optional spaCy pipes that can be switched on or off through the options.
    _OPTIONAL_PIPES = ('merge_noun_chunks', 'merge_subtokens')
    # Entity options in the order their values are emitted, with the attribute each one reads.
    _ENTITY_ATTRS = (('text', 'text'), ('start', 'start'), ('end', 'end'), ('label', 'label_'))

    def __init__(self, manifest_dir, manifest_file, content_property, **kwargs):
        """Initialize the object."""
        self.manifest_filepath = os.path.join(manifest_dir, manifest_file)
        self.manifest_dict = self._read_manifest()
        self.manifest_json = json.dumps(self.manifest_dict, indent=2)
        # Options may arrive wrapped (`kwargs=options`) or as plain keyword arguments.
        self.options = kwargs.get('kwargs', kwargs) or {}
        self.doc_string = self.scrub(self._get_docstring(content_property))
        # The optional pipes must be in place before the text is parsed,
        # otherwise they cannot affect `self.content`.
        self._configure_pipeline()
        self.content = nlp(self.doc_string)
        # Re-do this to deserialise a list of lists.
        # NOTE(review): these column labels differ from the ones produced by
        # get_features() (TOKEN, LEMMA, POS, TAG, STOPWORD, ENTITIES), which
        # filter() relies on -- confirm the schema stored in the manifest.
        if 'features' in self.manifest_dict:
            self.features = pd.DataFrame(
                self.manifest_dict['features'],
                columns=['Text', 'Lemma', 'POS', 'Tag', 'Entities']
            )
        else:
            self.features = self.get_features()

    def _configure_pipeline(self):
        """Make the shared pipeline match the optional-pipe settings in the options.

        A pipe named in the options is added when enabled and not yet present,
        and removed when explicitly disabled but still present from an earlier
        document. Pipes not mentioned in the options are left untouched.
        """
        for pipe_name in self._OPTIONAL_PIPES:
            if pipe_name not in self.options:
                continue
            enabled = bool(self.options[pipe_name])
            present = pipe_name in nlp.pipe_names
            if enabled and not present:
                nlp.add_pipe(nlp.create_pipe(pipe_name), name=pipe_name)
            elif present and not enabled:
                nlp.remove_pipe(pipe_name)

    def _remove_accents(self, text, method='unicode'):
        """Remove accents from any accented unicode characters in a string.

        Either transforms them into ascii equivalents or removes them entirely.

        Parameters:
        - text (str): raw text
        - method ({'unicode', 'ascii'}): if 'unicode', remove accented
            char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
            remove accented char for any unicode symbol.
            NB: the 'ascii' method is notably faster but less effective than 'unicode'.
        Returns:
            str
        Raises:
            ValueError: if ``method`` is not in {'unicode', 'ascii'}
        """
        if method == 'unicode':
            # Decompose characters, then drop the combining marks.
            return ''.join(
                c
                for c in unicodedata.normalize('NFKD', text)
                if not unicodedata.combining(c)
            )
        if method == 'ascii':
            # Decompose characters, then drop everything that is not ASCII.
            return (
                unicodedata.normalize('NFKD', text)
                .encode('ascii', errors='ignore')
                .decode('ascii')
            )
        msg = '`method` must be either "unicode" or "ascii", not {}'.format(method)
        raise ValueError(msg)

    def scrub(self, text, unicode_normalization='NFC', accent_removal_method='unicode'):
        """Normalize whitespace and bad unicode, and remove accents.

        Parameters:
        - text: The raw text to clean.
        - unicode_normalization: The ftfy.fix_text() `normalization` parameter.
        - accent_removal_method: The Document._remove_accents() `method` parameter.
        Returns str
        """
        # Change multiple spaces to one and multiple line breaks to one.
        # Also strip leading/trailing whitespace.
        text = NONBREAKING_SPACE_REGEX.sub(' ', LINEBREAK_REGEX.sub(r'\n', text)).strip()
        # Combine characters and diacritics written using separate code points
        text = fix_text(text, normalization=unicode_normalization)
        text = self._remove_accents(text, method=accent_removal_method)
        return text

    def _read_manifest(self):
        """Read a JSON file and return a Python dict."""
        with open(self.manifest_filepath, 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    def _write_manifest(self):
        """Write the manifest dict back to the manifest file as JSON.

        IMPORTANT: May not work if the manifest file has binary content.
        """
        # Serialise before opening the file so that a serialisation error
        # cannot leave a truncated manifest behind.
        payload = json.dumps(self.manifest_dict)
        with open(self.manifest_filepath, 'w', encoding='utf-8') as f:
            f.write(payload)

    def _get_docstring(self, content_property):
        """Extract a document string from a manifest property."""
        return self.manifest_dict[content_property]

    def get_features(self):
        """Collect the token features of the parsed document in a pandas dataframe.

        The dataframe has one row per token and the columns TOKEN, LEMMA, POS,
        TAG, STOPWORD (the stop-word flag as a string) and ENTITIES (a tuple of
        the entity IOB code and entity type).

        Readability scores are not part of this table; use
        `readability_scores()` to obtain them.
        """
        columns = ['TOKEN', 'LEMMA', 'POS', 'TAG', 'STOPWORD', 'ENTITIES']
        rows = [
            (
                token.text,
                token.lemma_,
                token.pos_,
                token.tag_,
                str(token.is_stop),
                # Named entity info (I=Inside, O=Outside, B=Begin)
                (token.ent_iob_, token.ent_type_),
            )
            for token in self.content
        ]
        return pd.DataFrame(rows, columns=columns)

    def filter(self, pattern=None, column='TOKEN', skip_punct=False, skip_stopwords=False, skip_linebreaks=False, case=True, flags=0, na=False, regex=True):
        """Return a new dataframe with filtered rows.

        Parameters:
        - pattern: The string or regex pattern on which to filter.
        - column: The column where the string is to be searched.
        - skip_punct: Do not include punctuation marks.
        - skip_stopwords: Do not include stopwords.
        - skip_linebreaks: Do not include linebreaks.
        - case: Perform a case-sensitive match.
        - flags: Regex flags.
        - na: Filler for empty cells.
        - regex: Set to True; otherwise absolute values will be matched.

        The last four parameters are from `pandas.Series.str.contains`.
        """
        # Filter based on column content
        new_df = self.features
        if pattern is not None:
            new_df = new_df[new_df[column].str.contains(pattern, case=case, flags=flags, na=na, regex=regex)]
        # Filter based on token type
        if skip_punct:
            new_df = new_df[~new_df['POS'].str.contains('PUNCT', case=True, flags=0, na=False, regex=True)]
        if skip_stopwords:
            # STOPWORD holds the flag as text, hence the case-insensitive match.
            new_df = new_df[~new_df['STOPWORD'].str.contains('TRUE', case=False, flags=0, na=False, regex=True)]
        if skip_linebreaks:
            new_df = new_df[~new_df['POS'].str.contains('SPACE', case=True, flags=0, na=False, regex=True)]
        return new_df

    def lemmas(self, as_list=False):
        """Return a dataframe (or a list if `as_list`) containing just the lemmas."""
        values = [token.lemma_ for token in self.content]
        if as_list:
            return values
        return pd.DataFrame(values, columns=['LEMMA'])

    def punctuation(self, as_list=False):
        """Return a dataframe (or a list if `as_list`) containing just the punctuation marks."""
        values = [token.text for token in self.content if token.is_punct]
        if as_list:
            return values
        return pd.DataFrame(values, columns=['PUNCTUATION'])

    def pos(self, as_list=False):
        """Return a dataframe (or a list if `as_list`) containing just the parts of speech."""
        values = [token.pos_ for token in self.content]
        if as_list:
            return values
        return pd.DataFrame(values, columns=['POS'])

    def tags(self, as_list=False):
        """Return a dataframe (or a list if `as_list`) containing just the tags."""
        values = [token.tag_ for token in self.content]
        if as_list:
            return values
        return pd.DataFrame(values, columns=['TAG'])

    def entities(self, options=('text', 'label'), as_list=False):
        """Return a dataframe containing just the entities from the document.

        Values and column labels always follow the order text, start, end,
        label, whatever order the options are given in.

        Parameters:
        - options: a list of attributes ('text', 'start', 'end', 'label')
        - as_list: return the entities as a list of tuples.
        """
        # Derive values and labels from the same selection so they cannot drift apart.
        selected = [(option, attr) for option, attr in self._ENTITY_ATTRS if option in options]
        ents = [
            tuple(getattr(ent, attr) for _, attr in selected)
            for ent in self.content.ents
        ]
        if as_list:
            return ents
        return pd.DataFrame(ents, columns=[option.title() for option, _ in selected])

    def readability_scores(self, columns=('Flesch-Kincaid Readability',
        'Flesch-Kincaid Reading Ease', 'Dale-Chall'), as_list=False):
        """Get the readability scores of the document.

        The scores are the Flesch-Kincaid grade level, the Flesch-Kincaid
        reading ease and the Dale-Chall score, in that order, so that they line
        up with the default column labels. They are read from the document
        extensions, so the readability component must be in the pipeline.

        Parameters:
        - columns: a list of labels for the score types
        - as_list: return the scores as a list holding one tuple instead of a dataframe.
        """
        extensions = self.content._
        scores = [(
            extensions.flesch_kincaid_grade_level,
            extensions.flesch_kincaid_reading_ease,
            extensions.dale_chall,
        )]
        if as_list:
            return scores
        return pd.DataFrame(scores, columns=list(columns))

    def stems(self, stemmer='porter', as_list=False):
        """Convert the tokens in a spaCy document to stems.

        Parameters:
        - stemmer: the stemming algorithm ('porter' or 'snowball'); any other
          value falls back to the Porter stemmer.
        - as_list: return the stems as a list instead of a dataframe.
        """
        if stemmer == 'snowball':
            engine = SnowballStemmer(language='english')
        else:
            engine = PorterStemmer()
        stems = [engine.stem(token.text) for token in self.content]
        if as_list:
            return stems
        return pd.DataFrame(stems, columns=['Stems'])

    def ngrams(self, n=2, as_list=False):
        """Convert the tokens in a spaCy document to ngrams.

        Parameters:
        - n: The number of tokens in an ngram.
        - as_list: return the ngrams as a list of tuples instead of a dataframe.
        """
        # Inside the method body `ngrams` refers to the module-level function
        # imported from nltk.util, not to this method.
        ngram_tokens = list(ngrams([token.text for token in self.content], n))
        if as_list:
            return ngram_tokens
        # Column label: 'Bigrams', 'Trigrams', otherwise e.g. '4-grams'.
        prefix = {2: 'Bi', 3: 'Tri'}.get(n, str(n) + '-')
        label = prefix + 'grams'
        return pd.DataFrame({label: pd.Series(ngram_tokens)})

    def remove_property(self, property):
        """Remove a property from the manifest and rewrite the manifest file.

        Parameters:
        - property: The property or a list of properties to be removed from the manifest.

        Raises KeyError if a property is not in the manifest.
        """
        # Wrap a single name in a list; list('abc') would split it into characters.
        names = [property] if isinstance(property, str) else list(property)
        for name in names:
            del self.manifest_dict[name]
        # Write the json to the manifest file
        self._write_manifest()

    def serialize(self, df, indent=None):
        """Serialize a dataframe as a list of lists with the column headers as the first element.

        Parameters:
        - df: The dataframe to serialize.
        - indent: An integer indicating the number of spaces to indent the json string. Default is None.

        Returns a JSON string.
        """
        rows = json.loads(df.to_json(orient='values'))
        rows.insert(0, df.columns.tolist())
        return json.dumps(rows, indent=indent)

    def deserialize(self, j):
        """Deserialize a list of lists to a dataframe using the first element as the headers.

        Parameters:
        - j: A JSON string (or an already decoded list of lists) as produced by `serialize()`.
        """
        rows = json.loads(j) if isinstance(j, (str, bytes, bytearray)) else list(j)
        if not rows:
            return pd.DataFrame()
        return pd.DataFrame(rows[1:], columns=rows[0])

    def save(self, property=None, series=None):
        """Convert a series of values and save them to the manifest file.

        Over-writes the original manifest file, so not to be used lightly.

        Parameters:
        - property: A string naming the JSON property to save to.
        - series: The list, dict or dataframe to save.

        Raises ValueError if no property name is given.
        """
        if property is None:
            raise ValueError('A property name is required to save to the manifest.')
        if isinstance(series, (dict, list)):
            self.manifest_dict[property] = series
        else:
            # Round-trip through JSON so the manifest only holds plain Python types.
            self.manifest_dict[property] = json.loads(series.to_json(orient='columns'))
        # Write the json to the manifest file
        self._write_manifest()

# Not part of the Document class for ease of access.
# Create bags as separate dicts and then save them to the manifest.
def bagify(series, as_counter=False):
    """Convert a list of values to a dict of value frequencies.

    Parameters:
    - series: A list, a pandas Series, or a dataframe with exactly one column.
    - as_counter: If True, returns a Python Counter object enabling its most_common() method.

    Raises ValueError if a dataframe with more than one column (or none) is given.
    """
    # Make sure we are working with a list of values
    if isinstance(series, pd.DataFrame):
        # Counting a dataframe directly would count its column labels, not its values.
        if series.shape[1] != 1:
            raise ValueError('Please select only one column from the dataframe.')
        series = series.iloc[:, 0]
    if isinstance(series, pd.Series):
        # tolist() yields plain Python values, which keeps the bag JSON-friendly.
        series = series.tolist()
    counts = Counter(series)
    return counts if as_counter else dict(counts)


In [97]:
# Initialise the feature table
# Build the Document from the configured manifest and report how long it took.
start = time.time()
doc = Document(manifest_dir, manifest_file, 'content_scrubbed', kwargs=options)
end = time.time()
t = end - start
print('Executed in {} seconds.'.format(t))

Executed in 0.8055734634399414seconds.


In [101]:
start = time.time()
## Uncomment lines below to try various class methods.
## See the method docstrings for options not shown in the examples below.
# doc.doc_string
# Get all the doc features in a dataframe
# doc.get_features()

# Count the number of unfiltered tokens in the feature table
# len(doc.get_features())

# Filter the tokens
# filtered = doc.filter(pattern='NOUN', column='POS', skip_punct=True, skip_stopwords=True, skip_linebreaks=True)
# pd.DataFrame(filtered['TOKEN'])
# len(filtered)

# Get Various Features
# doc.lemmas()
# doc.punctuation(as_list=True)
# doc.pos()
# doc.tags()
# doc.entities()
# doc.readability_scores()
# doc.stems(stemmer='porter')
# doc.ngrams(n=3)

# Insert a column using pandas
# stems = doc.stems()
# features = doc.get_features()
# features.insert(3, 'STEM', stems)
# print(features)

# Sort the features using pandas
# The table is built here so the cell does not depend on a `features`
# variable left over from an earlier, since-commented-out run.
features = doc.get_features()
features.insert(3, 'STEM', doc.stems(as_list=True))
# Sort into a new frame instead of in place so re-running the cell is harmless.
features = features.sort_values(by=['TOKEN'])
print(features)

# Save a column to the manifest file. Overwrites the original file, so not to be used lightly.
# stems = doc.stems()
# doc.save('stems', stems)

# Bagify the tokens, skipping punctuation and stop words
# filtered = doc.filter(column='TOKEN', skip_punct=True, skip_stopwords=True, skip_linebreaks=True)
# bag = bagify(filtered['TOKEN'])
# bag
# Save the results to the manifest

# Get 10 most common nouns
# nouns = doc.filter(pattern='NOUN', column='POS', skip_punct=True, skip_stopwords=True, skip_linebreaks=True)
# most_common = bagify(nouns['TOKEN'], as_counter=True).most_common(10)
# doc.save('bag', dict(most_common))
# dict(most_common)
end = time.time()
t = end - start
print('Executed in ' + str(t) + ' seconds.')

        TOKEN   LEMMA    POS      STEM   TAG STOPWORD  ENTITIES
87         \n      \n  SPACE        \n          False  (B, GPE)
33         \n      \n  SPACE        \n          False     (O, )
489        \n      \n  SPACE        \n          False     (O, )
399        \n      \n  SPACE        \n          False     (O, )
350        \n      \n  SPACE        \n          False     (O, )
495         !       !  PUNCT         !     .    False     (O, )
411         !       !  PUNCT         !     .    False     (O, )
196         "       "  PUNCT         "    ''    False     (O, )
194         "       "  PUNCT         "    ``    False     (O, )
287         "       "  PUNCT         "    ''    False     (O, )
284         "       "  PUNCT         "    ``    False     (O, )
282         "       "  PUNCT         "    ''    False     (O, )
280         "       "  PUNCT         "    ``    False     (O, )
276         "       "  PUNCT         "    ``    False     (O, )
278         "       "  PUNCT         "  

### To do:

- Test bagification and saving more extensively.
- Fine tune tokenisation and lemmatisation rules.
- Call the actual methods we want for preprocessing (including saving token counts). Don't filter stop words.


### Algorithm

1. Get the feature table and sort it.
2. Serialise the feature table and store it in self.manifest_dict.
3. Count the feature table rows and update self.manifest_dict with the token count.
4. Bagify the non-punctuation tokens and update self.manifest_dict.
5. Get the readability scores and add them to self.manifest_dict.
6. Save the manifest.