In [357]:
# Run settings: the spaCy model to load, where the manifest lives, which
# optional pipeline features to enable, and stop-word adjustments.

# Stop words to add to / remove from the model's defaults
add_stopwords = ['schmuck', 'ridiculous']
remove_stopwords = ['of']

# Optional pipeline behaviour, passed to Document as `kwargs=options`
options = dict(
    merge_noun_chunks=False,
    merge_subtokens=False,
    collect_readability_scores=True,
)

# Manifest location and language model
manifest_file = '2007_10_humanities_student_major_9_reddit_com.json'
manifest_dir = 'data'
model = 'en_core_web_sm'

In [424]:
import json
import os
import nltk
import pandas as pd
import re
import spacy
import unicodedata
from collections import Counter
from ftfy import fix_text
from nltk.stem.porter import *  
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams

# Constants for the preprocessing functions
# One or more consecutive line breaks (\r\n, \n or vertical tab)
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+')
# A run of whitespace whose first character is not a newline
NONBREAKING_SPACE_REGEX = re.compile(r'(?!\n)\s+')

# Load the language model
nlp = spacy.load(model)

# Add and remove custom stop words
for word in add_stopwords:
    nlp.vocab[word].is_stop = True
for word in remove_stopwords:
    nlp.vocab[word].is_stop = False

# Test for the spacy-readability module. Only a failed import is treated as
# "not installed"; any other error raised while importing is left to surface.
try:
    from spacy_readability import Readability
except ImportError:
    msg = 'The spacy-readability module is not installed on your system, so readability scores will be unavailable unless you `pip install spacy-readability`.'
    print(msg)

# The Document class
class Document():
    """Model a document's features.

    The document text is read from a property of a JSON manifest file,
    scrubbed, and parsed with the module-level spaCy pipeline (`nlp`).
    Token-level features are stored as a pandas dataframe in `self.features`.

    Parameters:
    - manifest_dir: the path to the manifest directory
    - manifest_file: the name of the manifest file.
    - content_property: the name of the property from which to extract the content
    - kwargs: a dict of options (`merge_noun_chunks`, `merge_subtokens`,
        `collect_readability_scores`), passed as `kwargs=options`. The options
        may also be passed directly as keyword arguments.

    """

    # Fixed order in which `entities()` emits the requested attributes
    _ENTITY_ATTRS = ('text', 'start', 'end', 'label')

    def __init__(self, manifest_dir, manifest_file, content_property, **kwargs):
        """Initialize the object."""
        self.manifest_filepath = os.path.join(manifest_dir, manifest_file)
        self.manifest_dict = self._read_manifest()
        self.manifest_json = json.dumps(self.manifest_dict, indent=2)
        self.doc_string = self.scrub(self._get_docstring(content_property))
        # Accept both `Document(..., kwargs=options)` and `Document(..., **options)`
        self.options = kwargs.get('kwargs', kwargs) or {}
        # Optional pipes only affect documents parsed after they are added,
        # so they have to be in place before the text is run through `nlp`.
        self._configure_pipeline()
        self.content = nlp(self.doc_string)
        if 'features' in self.manifest_dict:
            # NOTE(review): these column names differ from the ones built by
            # `get_features()` (TOKEN, LEMMA, ..., STOPWORD), which `filter()`
            # relies on. Confirm the manifest schema before harmonising them.
            self.features = pd.DataFrame(
                self.manifest_dict['features'],
                columns=['Text', 'Lemma', 'POS', 'Tag', 'Entities']
            )
        else:
            self.features = self.get_features()

    def _configure_pipeline(self):
        """Add the optional pipes requested in `self.options` to `nlp`.

        A pipe that is already in the pipeline is not added again, so creating
        several documents with the same options is safe. The pipeline is a
        module-level object, so pipes added here stay active for documents
        created later.
        """
        for pipe_name in ('merge_noun_chunks', 'merge_subtokens'):
            if self.options.get(pipe_name) and pipe_name not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe_name))
        if self.options.get('collect_readability_scores'):
            try:
                component = Readability()
            except NameError:
                # spacy-readability could not be imported, so the name is
                # undefined; readability scores are simply unavailable.
                return
            if getattr(component, 'name', 'readability') not in nlp.pipe_names:
                nlp.add_pipe(component)

    @staticmethod
    def _as_output(values, column, as_list):
        """Return `values` as is, or wrapped in a one-column dataframe."""
        if as_list:
            return values
        return pd.DataFrame(values, columns=[column])

    def _remove_accents(self, text, method='unicode'):
        """Remove accents from any accented unicode characters in a string.

        Either transforms them into ascii equivalents or removes them entirely.

        Parameters:
        - text (str): raw text
        - method ({'unicode', 'ascii'}): if 'unicode', remove accented
            char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
            remove accented char for any unicode symbol.
            NB: the 'ascii' method is notably faster but less effective than 'unicode'.
        Returns:
            str
        Raises:
            ValueError: if ``method`` is not in {'unicode', 'ascii'}
        """
        if method == 'unicode':
            # Decompose, then drop the combining marks and keep the base characters
            return ''.join(
                c
                for c in unicodedata.normalize('NFKD', text)
                if not unicodedata.combining(c)
            )
        elif method == 'ascii':
            # Decompose, then drop every character that is not plain ASCII
            return (
                unicodedata.normalize('NFKD', text)
                .encode('ascii', errors='ignore')
                .decode('ascii')
            )
        else:
            msg = '`method` must be either "unicode" or "ascii", not {}'.format(method)
            raise ValueError(msg)

    def scrub(self, text, unicode_normalization='NFC', accent_removal_method='unicode'):
        """Normalize whitespace and bad unicode, and remove accents.

        Parameters:
        - text: The string to scrub.
        - unicode_normalization: The ftfy.fix_text() `normalization` parameter.
        - accent_removal_method: The Document._remove_accents() `method` parameter.
        Returns str
        """
        # Change multiple spaces to one and multiple line breaks to one.
        # Also strip leading/trailing whitespace.
        text = NONBREAKING_SPACE_REGEX.sub(' ', LINEBREAK_REGEX.sub(r'\n', text)).strip()
        # Combine characters and diacritics written using separate code points
        text = fix_text(text, normalization=unicode_normalization)
        text = self._remove_accents(text, method=accent_removal_method)
        return text

    def _read_manifest(self):
        """Read a JSON file and return a Python dict."""
        with open(self.manifest_filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _get_docstring(self, content_property):
        """Extract a document string from a manifest property."""
        return self.manifest_dict[content_property]

    def get_features(self):
        """Build a pandas dataframe of token features from the parsed document.

        Each row holds a token's text, lemma, part of speech, tag, stop word
        flag (as the string 'True' or 'False') and a named entity tuple
        `(IOB code, entity type)`.

        Readability scores are not part of this table; use
        `readability_scores()` to retrieve them.
        """
        columns = ['TOKEN', 'LEMMA', 'POS', 'TAG', 'STOPWORD', 'ENTITIES']
        feature_list = [
            (
                token.text,
                token.lemma_,
                token.pos_,
                token.tag_,
                str(token.is_stop),
                # Named entity info (I=Inside, O=Outside, B=Begin)
                (token.ent_iob_, token.ent_type_),
            )
            for token in self.content
        ]
        return pd.DataFrame(feature_list, columns=columns)

    def filter(self, pattern=None, column='TOKEN', skip_punct=False, skip_stopwords=False, skip_linebreaks=False, case=True, flags=0, na=False, regex=True):
        """Return a new dataframe with filtered rows.

        Parameters:
        - pattern: The string or regex pattern on which to filter.
        - column: The column where the string is to be searched.
        - skip_punct: Do not include punctuation marks.
        - skip_stopwords: Do not include stopwords.
        - skip_linebreaks: Do not include linebreaks.
        - case: Perform a case-sensitive match.
        - flags: Regex flags.
        - na: Filler for empty cells.
        - regex: Set to True; otherwise absolute values will be matched.

        The last four parameters are from `pandas.Series.str.contains`.
        """
        # Filter based on column content
        new_df = self.features
        if pattern is not None:
            new_df = new_df[new_df[column].str.contains(pattern, case=case, flags=flags, na=na, regex=regex)]
        # Filter based on token type
        if skip_punct:
            new_df = new_df[~new_df['POS'].str.contains('PUNCT', case=True, flags=0, na=False, regex=True)]
        if skip_stopwords:
            # The STOPWORD column stores the flag as text ('True'/'False')
            new_df = new_df[~new_df['STOPWORD'].str.contains('TRUE', case=False, flags=0, na=False, regex=True)]
        if skip_linebreaks:
            new_df = new_df[~new_df['POS'].str.contains('SPACE', case=True, flags=0, na=False, regex=True)]
        return new_df

    def lemmas(self, as_list=False):
        """Return a dataframe (or a list if `as_list` is set) containing just the lemmas."""
        return self._as_output([token.lemma_ for token in self.content], 'LEMMA', as_list)

    def punctuation(self, as_list=False):
        """Return a dataframe (or a list if `as_list` is set) containing just the punctuation marks."""
        marks = [token.text for token in self.content if token.is_punct]
        return self._as_output(marks, 'PUNCTUATION', as_list)

    def pos(self, as_list=False):
        """Return a dataframe (or a list if `as_list` is set) containing just the parts of speech."""
        return self._as_output([token.pos_ for token in self.content], 'POS', as_list)

    def tags(self, as_list=False):
        """Return a dataframe (or a list if `as_list` is set) containing just the tags."""
        return self._as_output([token.tag_ for token in self.content], 'TAG', as_list)

    def entities(self, options=('text', 'label'), as_list=False):
        """Return a dataframe containing just the entities from the document.

        Attributes are always emitted in the order text, start, end, label,
        whatever the order of `options`, and the dataframe columns are
        labelled in that same order. Unrecognised options are ignored.

        Parameters:
        - options: a list of attributes ('text', 'start', 'end', 'label')
        - as_list: return the entities as a list of tuples.
        """
        selected = [attr for attr in self._ENTITY_ATTRS if attr in options]
        ents = [
            # The label is exposed by spaCy as `label_`; the others by name
            tuple(getattr(ent, 'label_' if attr == 'label' else attr) for attr in selected)
            for ent in self.content.ents
        ]
        if as_list:
            return ents
        return pd.DataFrame(ents, columns=[attr.title() for attr in selected])

    def readability_scores(self, columns=('Flesch-Kincaid Readability',
        'Flesch-Kincaid Reading Ease', 'Dale-Chall'), as_list=False):
        """Get a list of readability scores from the document.

        The scores are ordered to match the default labels: Flesch-Kincaid
        grade level, Flesch-Kincaid reading ease, Dale-Chall.

        Parameters:
        - columns: a list of labels for the score types
        - as_list: return the scores as a list containing one tuple instead
            of a dataframe.
        """
        fkg = self.content._.flesch_kincaid_grade_level
        fkr = self.content._.flesch_kincaid_reading_ease
        dc = self.content._.dale_chall
        scores = [(fkg, fkr, dc)]
        if as_list:
            return scores
        return pd.DataFrame(scores, columns=list(columns))

    def stems(self, stemmer='porter', as_list=False):
        """Convert the tokens in a spaCy document to stems.

        Parameters:
        - stemmer: the stemming algorithm ('porter' or 'snowball'). Any value
            other than 'snowball' selects the Porter stemmer.
        - as_list: return the dataframe as a list.
        """
        if stemmer == 'snowball':
            algorithm = SnowballStemmer(language='english')
        else:
            algorithm = PorterStemmer()
        stems = [algorithm.stem(token.text) for token in self.content]
        return self._as_output(stems, 'Stems', as_list)

    def ngrams(self, n=2, as_list=False):
        """Convert the tokens in a spaCy document to ngrams.

        Parameters:
        - n: The number of tokens in an ngram.
        - as_list: return the dataframe as a list.
        """
        # `ngrams` here is the module-level nltk.util function, not this method
        ngram_tokens = list(ngrams([token.text for token in self.content], n))
        if as_list:
            return ngram_tokens
        # Column label: 'Bigrams', 'Trigrams', otherwise e.g. '4-grams'
        label = {2: 'Bi', 3: 'Tri'}.get(n, str(n) + '-') + 'grams'
        return pd.DataFrame({label: pd.Series(ngram_tokens)})

def bagify(df, skip_punct=False, skip_stopwords=False, pos=None, as_counter=False):
    """Convert a feature dataframe into a dict containing token frequencies.

    `df` must have the `TOKEN`, `POS` and `STOPWORD` columns, as in the
    dataframe built by `Document.get_features()`. The `STOPWORD` column may
    hold booleans or their string form ('True'/'False').

    Parameters:
    - df: the feature dataframe to count tokens from
    - skip_punct: do not count tokens whose part of speech is 'PUNCT'
    - skip_stopwords: do not count tokens flagged as stop words
    - pos: a valid grammatical category to filter by (e.g. 'NOUN').
    - as_counter: returns the result as a counter object

    Returns a dict unless as_counter is set to `True`. This enables the following:

    `bag = bagify(features, as_counter=True).most_common(10)`
    """
    rows = df
    if skip_punct:
        rows = rows[rows['POS'] != 'PUNCT']
    if skip_stopwords:
        # Normalise to lowercase text so both True and 'True' are recognised
        rows = rows[rows['STOPWORD'].astype(str).str.lower() != 'true']
    if pos is not None:
        rows = rows[rows['POS'] == pos]
    bag = Counter(rows['TOKEN'])
    if as_counter:
        return bag
    return dict(bag)

In [425]:
# Build the Document, and with it the feature table, from the configured manifest
doc = Document(
    manifest_dir,
    manifest_file,
    content_property='content_scrubbed',
    kwargs=options,
)

In [427]:
## Uncomment lines below to try various class methods.
## See the method docstrings for options not shown in the examples below.
## The only active line displays the document text after scrubbing.
doc.doc_string
# Get all the doc features in a dataframe
# doc.get_features()

# Count the number of unfiltered tokens
# len(doc.get_features())

# Filter the tokens
# filtered = doc.filter(pattern='NOUN', column='POS', skip_punct=True, skip_stopwords=True, skip_linebreaks=True)
# pd.DataFrame(filtered['TOKEN'])
# len(filtered)

# Get Various Features
# doc.lemmas()
# doc.punctuation(as_list=True)
# doc.pos()
# doc.tags()
# doc.entities()
# doc.readability_scores()
# doc.stems(stemmer='porter')
# doc.ngrams(n=2)

# Insert a column - this changes only the local copy, not `doc.features`
# stems = doc.stems()
# features = doc.get_features()
# features.insert(3, 'STEM', stems)
# features[0:10]

# Bagify
# NOTE: `bagify` is defined at module level, not as a Document method, so the
# two `doc.bagify(...)` calls below raise AttributeError as written.
# doc.bagify()

# Get most common nouns
# doc.bagify(pos='NOUN', as_counter=True).most_common(5)

# Get a count of all tokens
# NOTE: Document defines no `token_count` method; this call is a placeholder.
# doc.token_count(remove=['punctuation', 'stopwords'])

# Get the scrubbed document text (whitespace and unicode normalised, accents removed)
# doc.doc_string


'1. Don\'t listen to any of these schmucks about how you should/should not have ridiculous amounts of sex. That[.] your personal decision and yours alone.\n2. Visit the schools you apply to. Only so much can be conveyed about a school through a pamphlet/website/phone recruiter - and you\'d better be sure as hell that perspective is a meticulously crafted one. Go and see it for yourself before you commit.\n3. Don\'t worry too much about going to a competitive school with a world-renowned reputation... even if you are worried about getting into other schools after you graduate. I went to a very small liberal_arts college for my undergrad and now I am a PhD candidate at an Ivy League university in a highly competitive biology program. Truth be told, your grades that you earn during your four years, as well as the recommendations you get from people for your graduate school applications are what truly count. 4. Don\'t get scared about picking the "right" major, and don\'t let anyone muscle

### To do:

- Add custom tokenisation rules.
- `bagify()` needs work (it is a module-level function, not yet a `Document` method).
- Method to add a column (e.g. stems) to the feature list.
- Methods to update the document manifest.